from google.colab import auth
from google.oauth2 import service_account
from googleapiclient.discovery import build

auth.authenticate_user()

import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd

scope = ['https://www.googleapis.com/auth/spreadsheets']
creds = ServiceAccountCredentials.from_json_keyfile_name('credentials.json', scope)

client = gspread.authorize(creds)

spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1CatLlB4axdBW0uJRoioR-x00RsPzQ54CFYq1PqzgV7E/edit?usp=drive_web&ouid=107190373735789167396'
sheet = client.open_by_url(spreadsheet_url)

worksheet = sheet.get_worksheet(0)
data = worksheet.get_all_records()
df_synthetic = pd.DataFrame(data)

df_synthetic.head()

import pandas as pd

# Load the data file (adjust file name and method if it's an Excel file)
df = pd.read_csv('prolific_responses_survey1.csv')

# Define the mapping from Likert-scale text to numbers
likert_mapping = {
    "Not at all harmful": 1,
    "Slightly harmful": 2,
    "Moderately harmful": 3,
    "Very harmful": 4,
    "Extremely harmful": 5
}

# List the columns that contain Likert-scale responses.
# Update this list to include all columns that need conversion.
likert_columns = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]

# Replace the text values with numeric values in those columns
for col in likert_columns:
    if col in df.columns:  # only replace if column exists in the dataframe
        df[col] = df[col].replace(likert_mapping)

/tmp/ipython-input-1113943154.py:22: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df[col] = df[col].replace(likert_mapping)

import pandas as pd

# Assuming the DataFrame is called df and has columns:
# ["ProlificID", "SyntheticPersonID", "AttnCheck", "Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8", "Condition"]

# 1) Group by SyntheticPersonID and compute the mean for Q1..Q8
df_avg = (
    df.groupby("SyntheticPersonID")[["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]]
      .mean()  # average across repeated rows for each SyntheticPersonID
      .reset_index()
)

# 2) (Optional) If expecting a single "avg_harm" column across Q1..Q8
df_avg["avg_harm"] = df_avg[["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]].mean(axis=1)

print(df_avg.head())

   SyntheticPersonID        Q1        Q2        Q3        Q4        Q5  \
0                 57  4.000000  4.333333  3.333333  3.333333  2.666667   
1                 89  3.400000  3.400000  3.400000  3.600000  3.400000   
2                141  3.750000  3.750000  3.750000  3.750000  3.750000   
3                153  5.000000  2.000000  3.000000  2.000000  4.000000   
4                166  4.333333  4.333333  4.000000  4.333333  4.333333   

         Q6        Q7        Q8  avg_harm  
0  2.666667  3.666667  4.333333  3.541667  
1  3.400000  3.600000  3.800000  3.500000  
2  3.750000  4.250000  4.500000  3.906250  
3  4.000000  3.000000  5.000000  3.500000  
4  4.000000  4.000000  4.333333  4.208333

import pandas as pd
import itertools
import scipy.stats as stats
from statsmodels.stats.multitest import multipletests

# 1) Group by SyntheticPersonID and Condition, averaging Q1..Q8
#    for repeated rows of the same SyntheticPersonID.
grouped_df = (
    df.groupby(["SyntheticPersonID", "Condition"])[["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]]
      .mean()
      .reset_index()
)

# 2) Identify the questions to test separately
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]

# 3) Get the unique conditions
conditions = grouped_df["Condition"].unique()

# 4) Prepare a list to store results from each test
results = []

# 5) For each question, do pairwise t-tests across all condition pairs
for question in questions:
    for cond1, cond2 in itertools.combinations(conditions, 2):
        # Extract data for this question, for each condition
        group1 = grouped_df.loc[grouped_df["Condition"] == cond1, question].dropna()
        group2 = grouped_df.loc[grouped_df["Condition"] == cond2, question].dropna()

        # Perform Welch's t-test (equal_var=False)
        t_stat, p_val = stats.ttest_ind(group1, group2, equal_var=False)

        # Store the results
        results.append((question, cond1, cond2, t_stat, p_val))

# 6) Convert results to a DataFrame
results_df = pd.DataFrame(
    results,
    columns=["Question", "Condition1", "Condition2", "t_stat", "p_value"]
)

# 7) Apply Bonferroni correction across all tests (all questions, all pairs)
adjusted = multipletests(results_df["p_value"], method="bonferroni")
results_df["p_adjusted"] = adjusted[1]

# 8) Print the final results
print("Separate t-test results for each question (with Bonferroni correction):")
print(results_df)

Separate t-test results for each question (with Bonferroni correction):
   Question Condition1 Condition2    t_stat       p_value    p_adjusted
0        Q1     Health  Financial -3.587197  3.495687e-04  1.677930e-02
1        Q1     Health    Control -4.902863  1.092902e-06  5.245930e-05
2        Q1     Health  Sensitive  2.168184  3.037270e-02  1.000000e+00
3        Q1  Financial    Control -0.980510  3.270632e-01  1.000000e+00
4        Q1  Financial  Sensitive  5.636677  2.235649e-08  1.073112e-06
5        Q1    Control  Sensitive  7.097109  2.386361e-12  1.145453e-10
6        Q2     Health  Financial  3.229347  1.278838e-03  6.138424e-02
7        Q2     Health    Control  7.193386  1.209643e-12  5.806285e-11
8        Q2     Health  Sensitive  6.622718  5.664077e-11  2.718757e-09
9        Q2  Financial    Control  3.890444  1.063662e-04  5.105578e-03
10       Q2  Financial  Sensitive  3.563337  3.830099e-04  1.838447e-02
11       Q2    Control  Sensitive -0.010752  9.914238e-01  1.000000e+00
12       Q3     Health  Financial  1.677820  9.367899e-02  1.000000e+00
13       Q3     Health    Control  4.875548  1.252754e-06  6.013218e-05
14       Q3     Health  Sensitive  7.272086  7.006189e-13  3.362971e-11
15       Q3  Financial    Control  3.346283  8.477206e-04  4.069059e-02
16       Q3  Financial  Sensitive  5.868525  5.916269e-09  2.839809e-07
17       Q3    Control  Sensitive  2.556409  1.071798e-02  5.144631e-01
18       Q4     Health  Financial  4.356913  1.447577e-05  6.948371e-04
19       Q4     Health    Control  4.663416  3.513352e-06  1.686409e-04
20       Q4     Health  Sensitive  5.180697  2.656909e-07  1.275316e-05
21       Q4  Financial    Control  0.179708  8.574161e-01  1.000000e+00
22       Q4  Financial  Sensitive  0.980023  3.273059e-01  1.000000e+00
23       Q4    Control  Sensitive  0.836422  4.031156e-01  1.000000e+00
24       Q5     Health  Financial  8.191967  7.446464e-16  3.574303e-14
25       Q5     Health    Control  6.830895  1.458423e-11  7.000431e-10
26       Q5     Health  Sensitive  8.988436  1.250018e-18  6.000086e-17
27       Q5  Financial    Control -1.883475  5.991251e-02  1.000000e+00
28       Q5  Financial  Sensitive  2.136642  3.288764e-02  1.000000e+00
29       Q5    Control  Sensitive  3.755913  1.841998e-04  8.841592e-03
30       Q6     Health  Financial  9.017429  9.001603e-19  4.320769e-17
31       Q6     Health    Control  5.551441  3.585966e-08  1.721264e-06
32       Q6     Health  Sensitive  7.846908  1.063415e-14  5.104394e-13
33       Q6  Financial    Control -3.565948  3.787077e-04  1.817797e-02
34       Q6  Financial  Sensitive -0.522283  6.015889e-01  1.000000e+00
35       Q6    Control  Sensitive  2.756823  5.941880e-03  2.852102e-01
36       Q7     Health  Financial  6.845701  1.286144e-11  6.173493e-10
37       Q7     Health    Control  3.078666  2.132678e-03  1.023685e-01
38       Q7     Health  Sensitive  7.967726  4.248489e-15  2.039275e-13
39       Q7  Financial    Control -3.756255  1.818530e-04  8.728946e-03
40       Q7  Financial  Sensitive  1.128150  2.595183e-01  1.000000e+00
41       Q7    Control  Sensitive  4.879113  1.233358e-06  5.920117e-05
42       Q8     Health  Financial -4.672662  3.356588e-06  1.611162e-04
43       Q8     Health    Control -6.334550  3.510638e-10  1.685106e-08
44       Q8     Health  Sensitive  0.923002  3.562217e-01  1.000000e+00
45       Q8  Financial    Control -1.799504  7.222315e-02  1.000000e+00
46       Q8  Financial  Sensitive  5.752353  1.159085e-08  5.563608e-07
47       Q8    Control  Sensitive  7.436152  2.172397e-13  1.042750e-11

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st

# Dictionary mapping question codes to descriptive labels
custom_labels = {
    'Q1': 'Hackers/Cybercriminals',
    'Q2': 'Government',
    'Q3': 'Corporations',
    'Q4': 'Employer/Colleagues',
    'Q5': 'Family',
    'Q6': 'Close Friends',
    'Q7': 'Acquaintances',
    'Q8': 'Publicly Available'
}

# Assume `grouped_df` has one row per (SyntheticPersonID, Condition),
# with columns: ["SyntheticPersonID", "Condition", "Q1", "Q2", ..., "Q8"].

questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
conditions = grouped_df["Condition"].unique()

for cond in conditions:
    # Filter data for this condition
    cond_data = grouped_df[grouped_df["Condition"] == cond]

    # Compute means and standard deviations for each question
    mean_vals = cond_data[questions].mean(axis=0)
    std_vals  = cond_data[questions].std(axis=0)

    # Number of SyntheticPersonIDs in this condition
    n = cond_data.shape[0]

    # Standard error for each question
    se_vals = std_vals / np.sqrt(n)  # If n <= 1, be mindful of dividing by zero

    # 95% CI using the t-distribution (two-tailed)
    if n > 1:
        t_multiplier = st.t.ppf(1 - 0.025, df=n-1)
        ci_vals = t_multiplier * se_vals
    else:
        # If there's only one data point in this condition, CI is not meaningful
        ci_vals = np.zeros_like(se_vals)

    # X positions for plotting
    x_positions = np.arange(len(questions))

    # Create the plot
    plt.figure(figsize=(10,6))
    plt.errorbar(
        x_positions,
        mean_vals,
        yerr=ci_vals,
        fmt='o',
        capsize=5,
        color='blue',
        ecolor='black'
    )

    # Use custom labels for the questions
    question_labels = [custom_labels[q] for q in questions]

    plt.xticks(x_positions, question_labels, rotation=45, ha='right')
    plt.xlabel("Question")
    plt.ylabel("Mean Response")
    plt.title(f"Mean Response (95% CI) for Condition: {cond}")
    plt.tight_layout()
    plt.show()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st

# Assume grouped_df has one row per (SyntheticPersonID, Condition),
# including averaged columns ["Q1", ..., "Q8"].

questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
conditions = grouped_df["Condition"].unique()

for question in questions:
    # Prepare lists to store the mean and CI for each condition
    means = []
    ci_vals = []

    for cond in conditions:
        # Extract data for this question in the current condition
        cond_data = grouped_df.loc[grouped_df["Condition"] == cond, question].dropna()
        mean_val = cond_data.mean()
        std_val = cond_data.std()
        n = cond_data.shape[0]

        # Compute 95% CI using the t-distribution
        if n > 1:
            se_val = std_val / np.sqrt(n)
            t_multiplier = st.t.ppf(1 - 0.025, df=n-1)  # 95% CI
            ci = t_multiplier * se_val
        else:
            # If there's only one data point, the CI can't be computed meaningfully
            ci = 0.0

        means.append(mean_val)
        ci_vals.append(ci)

    # X positions for plotting
    x_positions = np.arange(len(conditions))

    plt.figure(figsize=(8,6))
    plt.errorbar(
        x_positions,
        means,
        yerr=ci_vals,
        fmt='o',
        capsize=5,
        color='blue',
        ecolor='black'
    )

    plt.xticks(x_positions, conditions, rotation=45, ha='right')
    plt.xlabel("Condition")
    plt.ylabel("Mean Response")
    plt.title(f"Mean {question} by Condition (95% CI)")
    plt.tight_layout()
    plt.show()

score_types = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]

import pandas as pd

# long-form summary: mean across participants × certainty
harm_summary = (
    grouped_df
    .groupby("Condition")[score_types]
    .mean()
    .T
    .sort_values(by="Health", ascending=False)
)
print(harm_summary)

Condition   Control  Financial    Health  Sensitive
Q8         4.145363   4.070019  3.870771   3.831052
Q1         3.728383   3.684524  3.518264   3.420635
Q7         3.315320   3.159649  3.445865   3.113724
Q2         3.122807   3.288252  3.436435   3.123280
Q3         3.133615   3.271147  3.344236   3.029200
Q4         3.141291   3.148747  3.342168   3.105423
Q6         2.917137   2.766071  3.168640   2.789848
Q5         2.749373   2.677694  3.029386   2.574603

df_responses = pd.read_csv('prolific_responses_survey1.csv')  # Adjust filename/path as needed
df_prolific  = pd.read_csv('prolific_export.csv')  # Adjust filename/path as needed
df_responses = df_responses.rename(columns={"ProlificID": "Participant id"})

demo_cols = ["Age","Sex","Ethnicity simplified","Country of birth","Country of residence",
             "Nationality","Language","Student status","Employment status"]
keep_cols = ["Participant id"] + [c for c in demo_cols if c in df_prolific.columns]

df_prolific = df_prolific[keep_cols].copy()
df_prolific = df_prolific.replace({"DATA_EXPIRED": pd.NA, "Data expired": pd.NA})
for c in keep_cols[1:]:
    if c in df_prolific:
        df_prolific[c] = df_prolific[c].astype("string").str.strip()

df_prolific = df_prolific.applymap(
    lambda v: pd.NA if isinstance(v, str) and v.strip().lower().replace(" ", "_") == "data_expired" else v
)

# Define the mapping from Likert-scale text to numbers
likert_mapping = {
    "Not at all harmful": 1,
    "Slightly harmful": 2,
    "Moderately harmful": 3,
    "Very harmful": 4,
    "Extremely harmful": 5
}

# List the columns that contain Likert-scale responses.
# Update this list to include all columns that need conversion.
likert_columns = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]

# Replace the text values with numeric values in those columns
for col in likert_columns:
    if col in df.columns:  # only replace if column exists in the dataframe
        df_responses[col] = df_responses[col].replace(likert_mapping)

merged_df = df_responses.merge(df_prolific, on="Participant id", how="left")

# Now merged_df contains columns from both datasets.
print("Merged DataFrame:")
print(merged_df.head())

questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]

Merged DataFrame:
             Participant id  SyntheticPersonID AttnCheck  Q1  Q2  Q3  Q4  Q5  \
0  674bfd855d300ddd3a4628ab               1545       NaN   3   2   3   3   4   
1  674bfd855d300ddd3a4628ab               8617       NaN   3   3   3   4   4   
2  674bfd855d300ddd3a4628ab              14105       NaN   3   3   3   3   4   
3  674bfd855d300ddd3a4628ab              12001       NaN   3   3   3   4   4   
4  674bfd855d300ddd3a4628ab               2586       NaN   3   3   3   3   3   

   Q6  Q7  ...  Condition Age     Sex Ethnicity simplified Country of birth  \
0   4   4  ...  Financial  28  Female                White    United States   
1   4   4  ...  Financial  28  Female                White    United States   
2   4   4  ...  Financial  28  Female                White    United States   
3   4   4  ...  Financial  28  Female                White    United States   
4   4   4  ...  Financial  28  Female                White    United States   

  Country of residence    Nationality Language Student status  \
0        United States  United States  English            Yes   
1        United States  United States  English            Yes   
2        United States  United States  English            Yes   
3        United States  United States  English            Yes   
4        United States  United States  English            Yes   

  Employment status  
0         Full-Time  
1         Full-Time  
2         Full-Time  
3         Full-Time  
4         Full-Time  

[5 rows x 21 columns]

/tmp/ipython-input-1346675010.py:15: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
  df_prolific = df_prolific.applymap(
/tmp/ipython-input-1346675010.py:35: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df_responses[col] = df_responses[col].replace(likert_mapping)

import pandas as pd
import numpy as np
import scipy.stats as stats
import itertools

# Define the questions and the demographic columns to analyze
questions = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]

# List of demographic columns (categorical) from the Prolific export.
# Update this list based on the columns available in your prolific_export.
demographic_vars = ["Sex", "Ethnicity simplified", "Country of residence", "Language", "Employment status"]

# Prepare a list to store the t-test results
results = []

# Loop over each demographic variable
for demo in demographic_vars:
    # Get the unique groups in this demographic (drop missing values)
    groups = merged_df[demo].dropna().unique()

    # If there are less than 2 groups, skip this demographic.
    if len(groups) < 2:
        continue

    # For each question, perform pairwise comparisons between groups in this demographic.
    for question in questions:
        # Use itertools.combinations to get all unique pairwise comparisons.
        for group1, group2 in itertools.combinations(groups, 2):
            # Filter data for each group for the given question
            data1 = merged_df.loc[merged_df[demo] == group1, question].dropna()
            data2 = merged_df.loc[merged_df[demo] == group2, question].dropna()

            # Only perform the t-test if both groups have at least 2 observations
            if len(data1) >= 2 and len(data2) >= 2:
                t_stat, p_val = stats.ttest_ind(data1, data2, equal_var=False)
            else:
                t_stat, p_val = np.nan, np.nan  # Not enough data

            results.append((demo, question, group1, group2, t_stat, p_val))

# Convert the results list to a DataFrame for easier viewing
results_df = pd.DataFrame(results, columns=["Demographic", "Question", "Group1", "Group2", "t_stat", "p_value"])

# Optionally, adjust p-values for multiple comparisons using Bonferroni correction.
from statsmodels.stats.multitest import multipletests
adjusted = multipletests(results_df["p_value"].dropna(), method="bonferroni")
# Place adjusted p-values back into the results DataFrame.
# (needing to match the indices since multipletests was applied to non-NaN values.)
results_df.loc[results_df["p_value"].notna(), "p_adjusted"] = adjusted[1]

# Display the results
print("Pairwise t-test results for each question by demographic variable:")
print(results_df)
# get df to csv
results_df.to_csv('t-tests.csv', index=False)

/usr/local/lib/python3.12/dist-packages/scipy/stats/_axis_nan_policy.py:579: RuntimeWarning: Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.
  res = hypotest_fun_out(*samples, **kwds)

Pairwise t-test results for each question by demographic variable:
           Demographic Question  \
0                  Sex       Q1   
1                  Sex       Q1   
2                  Sex       Q1   
3                  Sex       Q2   
4                  Sex       Q2   
..                 ...      ...   
387  Employment status       Q8   
388  Employment status       Q8   
389  Employment status       Q8   
390  Employment status       Q8   
391  Employment status       Q8   

                                                Group1  \
0                                               Female   
1                                               Female   
2                                                 Male   
3                                               Female   
4                                               Female   
..                                                 ...   
387  Not in paid work (e.g. homemaker', 'retired or...   
388  Not in paid work (e.g. homemaker', 'retired or...   
389                       Unemployed (and job seeking)   
390                       Unemployed (and job seeking)   
391       Due to start a new job within the next month   

                                           Group2     t_stat       p_value  \
0                                            Male  -1.797785  7.226345e-02   
1                               Prefer not to say  -1.639231  1.120220e-01   
2                               Prefer not to say  -1.159118  2.558171e-01   
3                                            Male   6.125198  9.659961e-10   
4                               Prefer not to say -12.554503  3.037793e-14   
..                                            ...        ...           ...   
387  Due to start a new job within the next month   0.121551  9.033545e-01   
388                                         Other  -6.745376  5.085067e-11   
389  Due to start a new job within the next month   4.802704  3.055997e-06   
390                                         Other  -0.982692  3.264334e-01   
391                                         Other  -5.108832  6.592380e-07   

       p_adjusted  
0    1.000000e+00  
1    1.000000e+00  
2    1.000000e+00  
3    3.786705e-07  
4    1.190815e-11  
..            ...  
387  1.000000e+00  
388  1.993346e-08  
389  1.197951e-03  
390  1.000000e+00  
391  2.584213e-04  

[392 rows x 7 columns]

from sklearn.preprocessing import LabelEncoder

feature_list = ["GENDER", "RACETHN", "EDUCCAT5", "DIVISION", "MARITAL_ACS",
                "CHILDRENCAT", "CITIZEN_REC", "BORN_ACS", "AGE_INT",
                "HIV_STAT", "PREG_STAT", "NumChronicIllness",
                "FAMINC5", "CC_NUM", "FDSTMP_CPS",
                "SEXUALITY", "OWNGUN_GSS", "RELIGCAT"
]

df_synthetic.head()

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import pandas as pd

likert_columns = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]

df_merged = pd.merge(
    df_synthetic,
    df_avg,
    how="left",
    left_on="id",
    right_on="SyntheticPersonID",
    suffixes=("", "_drop")
)

cols_to_drop = [col for col in df_merged.columns if col.endswith("_drop")]
df_final = df_merged.drop(columns=cols_to_drop)
df_final[likert_columns] = df_final[likert_columns].fillna(0)

print(df_final.head())

                         NAME  id  GENDER  AGE             RACETHN  \
0  0               Luke Walsh   1    Male   25  White non-Hispanic   
1  1  Matilde Izaguirre Checa   2  Female   70            Hispanic   
2  2               Ryan Smith   3    Male   85  White non-Hispanic   
3  3           Matthew Grimes   4    Male   59  White non-Hispanic   
4  4              Miraan Rama   5  Female   19               Asian   

       EDUCCAT5            DIVISION    MARITAL_ACS HHSIZECAT  ...  \
0  Some college            Mountain  Never married        3+  ...   
1       HS Grad  West South Central       Divorced         1  ...   
2  Less than HS     Middle Atlantic    Now married         2  ...   
3       HS Grad            Mountain    Now married         2  ...   
4  Some college             Pacific  Never married         1  ...   

  SyntheticPersonID   Q1   Q2   Q3   Q4   Q5   Q6   Q7   Q8 avg_harm  
0               NaN  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0      NaN  
1               NaN  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0      NaN  
2               NaN  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0      NaN  
3               NaN  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0      NaN  
4               NaN  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0      NaN  

[5 rows x 197 columns]

train_data = df_final[df_final['Q1'] > 0]
predict_data = df_final[df_final['Q1'] == 0]

X_train = train_data[feature_list]
y_train = train_data['Q1']

onehot_encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
    drop='first'
)

X_train_encoded = onehot_encoder.fit_transform(X_train.astype(str))

scaler = StandardScaler()
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1)).flatten()

models = [
    ('RandomForest', RandomForestRegressor(n_estimators=100)),
    ('GradientBoosting', GradientBoostingRegressor()),
    ('LinearRegression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('DecisionTree', DecisionTreeRegressor()),
    ('SVR', SVR())
]

def evaluate_model(model_name, model, X_train, y_train):
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    mse = mean_squared_error(y_train, y_train_pred)
    r2 = r2_score(y_train, y_train_pred)
    print(f'Model: {model_name}')
    print(f'MSE on training data: {mse}')
    print(f'R^2 on training data: {r2}\n')

for name, model in models:
    evaluate_model(name, model, X_train_encoded, y_train_scaled)

Model: RandomForest
MSE on training data: 0.14696793433025557
R^2 on training data: 0.8530320656697444

Model: GradientBoosting
MSE on training data: 0.8793177007084634
R^2 on training data: 0.12068229929153684

Model: LinearRegression
MSE on training data: 0.12943632782427547
R^2 on training data: 0.8705636721757246

Model: Ridge
MSE on training data: 0.3482815208841059
R^2 on training data: 0.6517184791158941

Model: Lasso
MSE on training data: 1.0000000000000002
R^2 on training data: 0.0

Model: DecisionTree
MSE on training data: 5.869500782894434e-35
R^2 on training data: 1.0

Model: SVR
MSE on training data: 0.5913284111031373
R^2 on training data: 0.40867158889686284

questions = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]
for q in questions:

  train_data = df_final[df_final[q] > 0]
  predict_data = df_final[df_final[q] == 0]

  X_train = train_data[feature_list]
  y_train = train_data[q]

  onehot_encoder = OneHotEncoder(
      sparse_output=False,
      handle_unknown='ignore',
      drop='first'
  )

  X_train_encoded = onehot_encoder.fit_transform(X_train.astype(str))

  scaler = StandardScaler()
  y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1)).flatten()

  models = [
      ('DecisionTree', DecisionTreeRegressor())
  ]

  def evaluate_model(model_name, model, X_train, y_train):
      model.fit(X_train, y_train)
      y_train_pred = model.predict(X_train)
      mse = mean_squared_error(y_train, y_train_pred)
      r2 = r2_score(y_train, y_train_pred)
      print(f'Model: {model_name}')
      print(f'MSE on training data: {mse}')
      print(f'R^2 on training data: {r2}\n')

  for name, model in models:
      evaluate_model(name, model, X_train_encoded, y_train_scaled)


  best_model = DecisionTreeRegressor(
      max_depth=30,
      max_features=None,
      min_samples_leaf=4,
      min_samples_split=5,
      random_state=42
  )
  best_model.fit(X_train_encoded, y_train_scaled)

  X_predict = predict_data[feature_list]
  X_predict_encoded = onehot_encoder.transform(X_predict.astype(str))
  predicted_scores_scaled = best_model.predict(X_predict_encoded)

  predicted_scores = scaler.inverse_transform(predicted_scores_scaled.reshape(-1, 1)).flatten()

  df_final.loc[df_final[q] == 0, q] = predicted_scores

  print(df_final.head())

Model: DecisionTree
MSE on training data: 2.934750391447217e-35
R^2 on training data: 1.0

/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(

                         NAME  id  GENDER  AGE             RACETHN  \
0  0               Luke Walsh   1    Male   25  White non-Hispanic   
1  1  Matilde Izaguirre Checa   2  Female   70            Hispanic   
2  2               Ryan Smith   3    Male   85  White non-Hispanic   
3  3           Matthew Grimes   4    Male   59  White non-Hispanic   
4  4              Miraan Rama   5  Female   19               Asian   

       EDUCCAT5            DIVISION    MARITAL_ACS HHSIZECAT  ...  \
0  Some college            Mountain  Never married        3+  ...   
1       HS Grad  West South Central       Divorced         1  ...   
2  Less than HS     Middle Atlantic    Now married         2  ...   
3       HS Grad            Mountain    Now married         2  ...   
4  Some college             Pacific  Never married         1  ...   

  SyntheticPersonID        Q1   Q2   Q3   Q4   Q5   Q6   Q7   Q8 avg_harm  
0               NaN  3.416667  0.0  0.0  0.0  0.0  0.0  0.0  0.0      NaN  
1               NaN  3.952381  0.0  0.0  0.0  0.0  0.0  0.0  0.0      NaN  
2               NaN  2.916667  0.0  0.0  0.0  0.0  0.0  0.0  0.0      NaN  
3               NaN  3.458333  0.0  0.0  0.0  0.0  0.0  0.0  0.0      NaN  
4               NaN  3.786667  0.0  0.0  0.0  0.0  0.0  0.0  0.0      NaN  

[5 rows x 197 columns]
Model: DecisionTree
MSE on training data: 1.0024006805786898e-34
R^2 on training data: 1.0

/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(

                         NAME  id  GENDER  AGE             RACETHN  \
0  0               Luke Walsh   1    Male   25  White non-Hispanic   
1  1  Matilde Izaguirre Checa   2  Female   70            Hispanic   
2  2               Ryan Smith   3    Male   85  White non-Hispanic   
3  3           Matthew Grimes   4    Male   59  White non-Hispanic   
4  4              Miraan Rama   5  Female   19               Asian   

       EDUCCAT5            DIVISION    MARITAL_ACS HHSIZECAT  ...  \
0  Some college            Mountain  Never married        3+  ...   
1       HS Grad  West South Central       Divorced         1  ...   
2  Less than HS     Middle Atlantic    Now married         2  ...   
3       HS Grad            Mountain    Now married         2  ...   
4  Some college             Pacific  Never married         1  ...   

  SyntheticPersonID        Q1        Q2   Q3   Q4   Q5   Q6   Q7   Q8 avg_harm  
0               NaN  3.416667  3.392857  0.0  0.0  0.0  0.0  0.0  0.0      NaN  
1               NaN  3.952381  3.833333  0.0  0.0  0.0  0.0  0.0  0.0      NaN  
2               NaN  2.916667  2.833333  0.0  0.0  0.0  0.0  0.0  0.0      NaN  
3               NaN  3.458333  3.466667  0.0  0.0  0.0  0.0  0.0  0.0      NaN  
4               NaN  3.786667  3.516667  0.0  0.0  0.0  0.0  0.0  0.0      NaN  

[5 rows x 197 columns]
Model: DecisionTree
MSE on training data: 1.1555579666323415e-34
R^2 on training data: 1.0

/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(

                         NAME  id  GENDER  AGE             RACETHN  \
0  0               Luke Walsh   1    Male   25  White non-Hispanic   
1  1  Matilde Izaguirre Checa   2  Female   70            Hispanic   
2  2               Ryan Smith   3    Male   85  White non-Hispanic   
3  3           Matthew Grimes   4    Male   59  White non-Hispanic   
4  4              Miraan Rama   5  Female   19               Asian   

       EDUCCAT5            DIVISION    MARITAL_ACS HHSIZECAT  ...  \
0  Some college            Mountain  Never married        3+  ...   
1       HS Grad  West South Central       Divorced         1  ...   
2  Less than HS     Middle Atlantic    Now married         2  ...   
3       HS Grad            Mountain    Now married         2  ...   
4  Some college             Pacific  Never married         1  ...   

  SyntheticPersonID        Q1        Q2        Q3   Q4   Q5   Q6   Q7   Q8  \
0               NaN  3.416667  3.392857  2.604167  0.0  0.0  0.0  0.0  0.0   
1               NaN  3.952381  3.833333  3.426667  0.0  0.0  0.0  0.0  0.0   
2               NaN  2.916667  2.833333  3.458333  0.0  0.0  0.0  0.0  0.0   
3               NaN  3.458333  3.466667  3.660000  0.0  0.0  0.0  0.0  0.0   
4               NaN  3.786667  3.516667  3.250000  0.0  0.0  0.0  0.0  0.0   

  avg_harm  
0      NaN  
1      NaN  
2      NaN  
3      NaN  
4      NaN  

[5 rows x 197 columns]
Model: DecisionTree
MSE on training data: 1.2619426683223032e-34
R^2 on training data: 1.0

/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(

                         NAME  id  GENDER  AGE             RACETHN  \
0  0               Luke Walsh   1    Male   25  White non-Hispanic   
1  1  Matilde Izaguirre Checa   2  Female   70            Hispanic   
2  2               Ryan Smith   3    Male   85  White non-Hispanic   
3  3           Matthew Grimes   4    Male   59  White non-Hispanic   
4  4              Miraan Rama   5  Female   19               Asian   

       EDUCCAT5            DIVISION    MARITAL_ACS HHSIZECAT  ...  \
0  Some college            Mountain  Never married        3+  ...   
1       HS Grad  West South Central       Divorced         1  ...   
2  Less than HS     Middle Atlantic    Now married         2  ...   
3       HS Grad            Mountain    Now married         2  ...   
4  Some college             Pacific  Never married         1  ...   

  SyntheticPersonID        Q1        Q2        Q3        Q4   Q5   Q6   Q7  \
0               NaN  3.416667  3.392857  2.604167  3.041667  0.0  0.0  0.0   
1               NaN  3.952381  3.833333  3.426667  2.538095  0.0  0.0  0.0   
2               NaN  2.916667  2.833333  3.458333  4.027778  0.0  0.0  0.0   
3               NaN  3.458333  3.466667  3.660000  3.511905  0.0  0.0  0.0   
4               NaN  3.786667  3.516667  3.250000  3.290476  0.0  0.0  0.0   

    Q8 avg_harm  
0  0.0      NaN  
1  0.0      NaN  
2  0.0      NaN  
3  0.0      NaN  
4  0.0      NaN  

[5 rows x 197 columns]
Model: DecisionTree
MSE on training data: 1.1739001565788867e-34
R^2 on training data: 1.0

/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(

# drop syntheticpersonid column
df_final = df_final.drop(columns=['SyntheticPersonID'])
df_final = df_final.drop(columns=['avg_harm'])

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

def plot_barplot_per_question(df, questions, save=False, outdir=".", fontsize=12, figsize=(12, 8), dpi=100, save_dpi=300):
    palette = sns.color_palette("Set2")
    pdf = None
    if save:
        pdf_path = f"{outdir}/Barplots.pdf"
        pdf = PdfPages(pdf_path)

    q = 1
    for score in questions:
        fig = plt.figure(figsize=figsize, dpi=dpi)
        ax = sns.barplot(x='RACETHN', y=score, hue='SEXUALITY', data=df.groupby(['RACETHN', 'SEXUALITY'])[score].median().reset_index(), palette=palette)
        ax.set_title(f'Average Scores by Race/Ethnicity and Sexuality for Question {q}', fontsize=fontsize)
        q += 1
        ax.set_xlabel('Race/Ethnicity', fontsize=fontsize)
        ax.set_ylabel('Average Score', fontsize=fontsize)
        ax.tick_params(axis='both', labelsize=fontsize)
        plt.xticks(rotation=45)
        leg = ax.legend(title='Sexuality')
        if leg:
            leg.get_title().set_fontsize(fontsize)
            for t in leg.get_texts():
                t.set_fontsize(fontsize)
        plt.tight_layout()

        if save:
            pdf.savefig(fig, bbox_inches="tight", dpi=save_dpi)

        plt.show()

    if save:
        pdf.close()
        print(f"Saved {pdf_path}")

# --- Example usage ---
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
plot_barplot_per_question(df_final, questions, save=True, outdir=".")

/tmp/ipython-input-2728256619.py:15: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax = sns.barplot(x='RACETHN', y=score, hue='SEXUALITY', data=df.groupby(['RACETHN', 'SEXUALITY'])[score].median().reset_index(), palette=palette)

/tmp/ipython-input-2728256619.py:15: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax = sns.barplot(x='RACETHN', y=score, hue='SEXUALITY', data=df.groupby(['RACETHN', 'SEXUALITY'])[score].median().reset_index(), palette=palette)

/tmp/ipython-input-2728256619.py:15: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax = sns.barplot(x='RACETHN', y=score, hue='SEXUALITY', data=df.groupby(['RACETHN', 'SEXUALITY'])[score].median().reset_index(), palette=palette)

/tmp/ipython-input-2728256619.py:15: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax = sns.barplot(x='RACETHN', y=score, hue='SEXUALITY', data=df.groupby(['RACETHN', 'SEXUALITY'])[score].median().reset_index(), palette=palette)

/tmp/ipython-input-2728256619.py:15: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax = sns.barplot(x='RACETHN', y=score, hue='SEXUALITY', data=df.groupby(['RACETHN', 'SEXUALITY'])[score].median().reset_index(), palette=palette)

/tmp/ipython-input-2728256619.py:15: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax = sns.barplot(x='RACETHN', y=score, hue='SEXUALITY', data=df.groupby(['RACETHN', 'SEXUALITY'])[score].median().reset_index(), palette=palette)

/tmp/ipython-input-2728256619.py:15: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax = sns.barplot(x='RACETHN', y=score, hue='SEXUALITY', data=df.groupby(['RACETHN', 'SEXUALITY'])[score].median().reset_index(), palette=palette)

/tmp/ipython-input-2728256619.py:15: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax = sns.barplot(x='RACETHN', y=score, hue='SEXUALITY', data=df.groupby(['RACETHN', 'SEXUALITY'])[score].median().reset_index(), palette=palette)

Saved ./Barplots.pdf

import itertools
import scipy.stats as stats

score_types = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]

# Loop over each question
for score in score_types:
    print(f"\nResults for {score}:")

    # Pairwise t-tests for SEXUALITY
    print("Comparisons by SEXUALITY:")
    sexuality_groups = df_final['SEXUALITY'].dropna().unique()
    for group1, group2 in itertools.combinations(sexuality_groups, 2):
        data1 = df_final[df_final['SEXUALITY'] == group1][score].dropna()
        data2 = df_final[df_final['SEXUALITY'] == group2][score].dropna()
        if len(data1) >= 2 and len(data2) >= 2:
            t_stat, p_val = stats.ttest_ind(data1, data2, equal_var=False)
            print(f"  {group1} vs {group2}: t = {t_stat:.3f}, p = {p_val:.3e}")

Results for Q1:
Comparisons by SEXUALITY:
  Heterosexual vs Gay: t = 0.020, p = 9.841e-01
  Heterosexual vs Bisexual: t = -4.631, p = 4.155e-06
  Heterosexual vs Lesbian: t = 1.024, p = 3.075e-01
  Gay vs Bisexual: t = -2.687, p = 7.365e-03
  Gay vs Lesbian: t = 0.850, p = 3.962e-01
  Bisexual vs Lesbian: t = 2.876, p = 4.462e-03

Results for Q2:
Comparisons by SEXUALITY:
  Heterosexual vs Gay: t = -0.255, p = 7.985e-01
  Heterosexual vs Bisexual: t = -3.453, p = 5.799e-04
  Heterosexual vs Lesbian: t = -0.574, p = 5.671e-01
  Gay vs Bisexual: t = -1.804, p = 7.161e-02
  Gay vs Lesbian: t = -0.383, p = 7.021e-01
  Bisexual vs Lesbian: t = 0.707, p = 4.806e-01

Results for Q3:
Comparisons by SEXUALITY:
  Heterosexual vs Gay: t = 1.721, p = 8.594e-02
  Heterosexual vs Bisexual: t = -5.635, p = 2.325e-08
  Heterosexual vs Lesbian: t = -1.538, p = 1.262e-01
  Gay vs Bisexual: t = -4.778, p = 2.100e-06
  Gay vs Lesbian: t = -2.202, p = 2.862e-02
  Bisexual vs Lesbian: t = 0.750, p = 4.540e-01

Results for Q4:
Comparisons by SEXUALITY:
  Heterosexual vs Gay: t = -1.942, p = 5.287e-02
  Heterosexual vs Bisexual: t = -6.565, p = 8.627e-11
  Heterosexual vs Lesbian: t = -3.053, p = 2.706e-03
  Gay vs Bisexual: t = -2.252, p = 2.462e-02
  Gay vs Lesbian: t = -1.700, p = 9.045e-02
  Bisexual vs Lesbian: t = -0.369, p = 7.128e-01

Results for Q5:
Comparisons by SEXUALITY:
  Heterosexual vs Gay: t = 0.591, p = 5.548e-01
  Heterosexual vs Bisexual: t = 3.610, p = 3.227e-04
  Heterosexual vs Lesbian: t = 0.671, p = 5.033e-01
  Gay vs Bisexual: t = 1.460, p = 1.447e-01
  Gay vs Lesbian: t = 0.265, p = 7.910e-01
  Bisexual vs Lesbian: t = -0.691, p = 4.905e-01

Results for Q6:
Comparisons by SEXUALITY:
  Heterosexual vs Gay: t = -3.237, p = 1.308e-03
  Heterosexual vs Bisexual: t = -5.511, p = 4.613e-08
  Heterosexual vs Lesbian: t = -2.668, p = 8.519e-03
  Gay vs Bisexual: t = -0.468, p = 6.397e-01
  Gay vs Lesbian: t = -0.736, p = 4.627e-01
  Bisexual vs Lesbian: t = -0.491, p = 6.240e-01

Results for Q7:
Comparisons by SEXUALITY:
  Heterosexual vs Gay: t = -0.108, p = 9.143e-01
  Heterosexual vs Bisexual: t = 0.563, p = 5.739e-01
  Heterosexual vs Lesbian: t = -0.567, p = 5.716e-01
  Gay vs Bisexual: t = 0.406, p = 6.847e-01
  Gay vs Lesbian: t = -0.440, p = 6.604e-01
  Bisexual vs Lesbian: t = -0.734, p = 4.637e-01

Results for Q8:
Comparisons by SEXUALITY:
  Heterosexual vs Gay: t = -0.327, p = 7.441e-01
  Heterosexual vs Bisexual: t = -4.607, p = 4.658e-06
  Heterosexual vs Lesbian: t = -2.006, p = 4.672e-02
  Gay vs Bisexual: t = -2.553, p = 1.086e-02
  Gay vs Lesbian: t = -1.640, p = 1.024e-01
  Bisexual vs Lesbian: t = -0.195, p = 8.458e-01

import itertools
import scipy.stats as stats

score_types = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]

# Loop over each harm question
for score in score_types:
    print(f"\n=== T-test Results for {score} ===")

    # 1) Pairwise t-tests by HIV_STAT
    print("Comparisons by HIV_STAT:")
    hiv_groups = df_final['HIV_STAT'].dropna().unique()
    for grp1, grp2 in itertools.combinations(hiv_groups, 2):
        data1 = df_final.loc[df_final['HIV_STAT'] == grp1, score].dropna()
        data2 = df_final.loc[df_final['HIV_STAT'] == grp2, score].dropna()

        # Ensure both groups have at least 2 values to run a t-test
        if len(data1) >= 2 and len(data2) >= 2:
            t_stat, p_val = stats.ttest_ind(data1, data2, equal_var=False)
            print(f"  {grp1} vs {grp2}: t = {t_stat:.3f}, p = {p_val:.3e}")
        else:
            print(f"  {grp1} vs {grp2}: Not enough data.")

=== T-test Results for Q1 ===
Comparisons by HIV_STAT:
  negative vs positive: t = -0.171, p = 8.642e-01

=== T-test Results for Q2 ===
Comparisons by HIV_STAT:
  negative vs positive: t = 0.110, p = 9.130e-01

=== T-test Results for Q3 ===
Comparisons by HIV_STAT:
  negative vs positive: t = -0.233, p = 8.158e-01

=== T-test Results for Q4 ===
Comparisons by HIV_STAT:
  negative vs positive: t = 1.262, p = 2.094e-01

=== T-test Results for Q5 ===
Comparisons by HIV_STAT:
  negative vs positive: t = 0.426, p = 6.709e-01

=== T-test Results for Q6 ===
Comparisons by HIV_STAT:
  negative vs positive: t = -1.796, p = 7.498e-02

=== T-test Results for Q7 ===
Comparisons by HIV_STAT:
  negative vs positive: t = 0.204, p = 8.385e-01

=== T-test Results for Q8 ===
Comparisons by HIV_STAT:
  negative vs positive: t = 0.523, p = 6.018e-01

import itertools
import scipy.stats as stats

score_types = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]

# Loop over each harm question
for score in score_types:
    print(f"\n=== T-test Results for {score} ===")

    # 1) Pairwise t-tests by HIV_STAT
    print("Comparisons by FAMINC5:")
    hiv_groups = df_final['FAMINC5'].dropna().unique()
    for grp1, grp2 in itertools.combinations(hiv_groups, 2):
        data1 = df_final.loc[df_final['FAMINC5'] == grp1, score].dropna()
        data2 = df_final.loc[df_final['FAMINC5'] == grp2, score].dropna()

        # Ensure both groups have at least 2 values to run a t-test
        if len(data1) >= 2 and len(data2) >= 2:
            t_stat, p_val = stats.ttest_ind(data1, data2, equal_var=False)
            print(f"  {grp1} vs {grp2}: t = {t_stat:.3f}, p = {p_val:.3e}")
        else:
            print(f"  {grp1} vs {grp2}: Not enough data.")

=== T-test Results for Q1 ===
Comparisons by FAMINC5:
  $150K or more vs Less than $20K: t = 10.354, p = 6.751e-25
  $150K or more vs $40K to less than $75K: t = 2.848, p = 4.419e-03
  $150K or more vs $20K to less than $40K: t = 5.689, p = 1.340e-08
  $150K or more vs $75K to less than $150K: t = -3.250, p = 1.162e-03
  Less than $20K vs $40K to less than $75K: t = -9.091, p = 1.339e-19
  Less than $20K vs $20K to less than $40K: t = -5.221, p = 1.838e-07
  Less than $20K vs $75K to less than $150K: t = -15.307, p = 8.899e-52
  $40K to less than $75K vs $20K to less than $40K: t = 3.646, p = 2.681e-04
  $40K to less than $75K vs $75K to less than $150K: t = -7.657, p = 2.070e-14
  $20K to less than $40K vs $75K to less than $150K: t = -10.312, p = 9.442e-25

=== T-test Results for Q2 ===
Comparisons by FAMINC5:
  $150K or more vs Less than $20K: t = 1.803, p = 7.145e-02
  $150K or more vs $40K to less than $75K: t = -0.908, p = 3.638e-01
  $150K or more vs $20K to less than $40K: t = -2.290, p = 2.208e-02
  $150K or more vs $75K to less than $150K: t = -2.442, p = 1.465e-02
  Less than $20K vs $40K to less than $75K: t = -2.996, p = 2.746e-03
  Less than $20K vs $20K to less than $40K: t = -4.251, p = 2.164e-05
  Less than $20K vs $75K to less than $150K: t = -4.614, p = 4.033e-06
  $40K to less than $75K vs $20K to less than $40K: t = -1.694, p = 9.033e-02
  $40K to less than $75K vs $75K to less than $150K: t = -1.848, p = 6.465e-02
  $20K to less than $40K vs $75K to less than $150K: t = 0.091, p = 9.278e-01

=== T-test Results for Q3 ===
Comparisons by FAMINC5:
  $150K or more vs Less than $20K: t = 2.109, p = 3.500e-02
  $150K or more vs $40K to less than $75K: t = 3.541, p = 4.026e-04
  $150K or more vs $20K to less than $40K: t = 5.172, p = 2.391e-07
  $150K or more vs $75K to less than $150K: t = -0.912, p = 3.618e-01
  Less than $20K vs $40K to less than $75K: t = 1.048, p = 2.945e-01
  Less than $20K vs $20K to less than $40K: t = 2.877, p = 4.028e-03
  Less than $20K vs $75K to less than $150K: t = -3.339, p = 8.469e-04
  $40K to less than $75K vs $20K to less than $40K: t = 2.303, p = 2.128e-02
  $40K to less than $75K vs $75K to less than $150K: t = -5.519, p = 3.485e-08
  $20K to less than $40K vs $75K to less than $150K: t = -7.126, p = 1.127e-12

=== T-test Results for Q4 ===
Comparisons by FAMINC5:
  $150K or more vs Less than $20K: t = 4.746, p = 2.127e-06
  $150K or more vs $40K to less than $75K: t = 0.836, p = 4.030e-01
  $150K or more vs $20K to less than $40K: t = 4.490, p = 7.265e-06
  $150K or more vs $75K to less than $150K: t = -5.198, p = 2.087e-07
  Less than $20K vs $40K to less than $75K: t = -4.515, p = 6.458e-06
  Less than $20K vs $20K to less than $40K: t = -0.426, p = 6.704e-01
  Less than $20K vs $75K to less than $150K: t = -10.521, p = 1.124e-25
  $40K to less than $75K vs $20K to less than $40K: t = 4.238, p = 2.282e-05
  $40K to less than $75K vs $75K to less than $150K: t = -7.044, p = 1.976e-12
  $20K to less than $40K vs $75K to less than $150K: t = -10.517, p = 1.072e-25

=== T-test Results for Q5 ===
Comparisons by FAMINC5:
  $150K or more vs Less than $20K: t = 1.090, p = 2.758e-01
  $150K or more vs $40K to less than $75K: t = 3.502, p = 4.651e-04
  $150K or more vs $20K to less than $40K: t = 2.781, p = 5.437e-03
  $150K or more vs $75K to less than $150K: t = -0.328, p = 7.427e-01
  Less than $20K vs $40K to less than $75K: t = 2.326, p = 2.004e-02
  Less than $20K vs $20K to less than $40K: t = 1.710, p = 8.735e-02
  Less than $20K vs $75K to less than $150K: t = -1.620, p = 1.052e-01
  $40K to less than $75K vs $20K to less than $40K: t = -0.421, p = 6.741e-01
  $40K to less than $75K vs $75K to less than $150K: t = -4.751, p = 2.046e-06
  $20K to less than $40K vs $75K to less than $150K: t = -3.629, p = 2.871e-04

=== T-test Results for Q6 ===
Comparisons by FAMINC5:
  $150K or more vs Less than $20K: t = 7.296, p = 3.381e-13
  $150K or more vs $40K to less than $75K: t = 1.267, p = 2.051e-01
  $150K or more vs $20K to less than $40K: t = 7.933, p = 2.546e-15
  $150K or more vs $75K to less than $150K: t = -2.626, p = 8.656e-03
  Less than $20K vs $40K to less than $75K: t = -7.016, p = 2.531e-12
  Less than $20K vs $20K to less than $40K: t = 0.339, p = 7.350e-01
  Less than $20K vs $75K to less than $150K: t = -11.143, p = 1.563e-28
  $40K to less than $75K vs $20K to less than $40K: t = 7.779, p = 8.298e-15
  $40K to less than $75K vs $75K to less than $150K: t = -4.748, p = 2.086e-06
  $20K to less than $40K vs $75K to less than $150K: t = -12.183, p = 8.516e-34

=== T-test Results for Q7 ===
Comparisons by FAMINC5:
  $150K or more vs Less than $20K: t = 7.919, p = 2.875e-15
  $150K or more vs $40K to less than $75K: t = 3.448, p = 5.685e-04
  $150K or more vs $20K to less than $40K: t = 0.657, p = 5.111e-01
  $150K or more vs $75K to less than $150K: t = -7.597, p = 3.608e-14
  Less than $20K vs $40K to less than $75K: t = -5.603, p = 2.201e-08
  Less than $20K vs $20K to less than $40K: t = -7.759, p = 9.979e-15
  Less than $20K vs $75K to less than $150K: t = -17.055, p = 1.266e-63
  $40K to less than $75K vs $20K to less than $40K: t = -2.971, p = 2.975e-03
  $40K to less than $75K vs $75K to less than $150K: t = -13.629, p = 6.067e-42
  $20K to less than $40K vs $75K to less than $150K: t = -9.081, p = 1.373e-19

=== T-test Results for Q8 ===
Comparisons by FAMINC5:
  $150K or more vs Less than $20K: t = -1.502, p = 1.331e-01
  $150K or more vs $40K to less than $75K: t = -3.706, p = 2.126e-04
  $150K or more vs $20K to less than $40K: t = -0.828, p = 4.075e-01
  $150K or more vs $75K to less than $150K: t = -9.260, p = 3.053e-20
  Less than $20K vs $40K to less than $75K: t = -2.044, p = 4.099e-02
  Less than $20K vs $20K to less than $40K: t = 0.778, p = 4.363e-01
  Less than $20K vs $75K to less than $150K: t = -7.771, p = 9.232e-15
  $40K to less than $75K vs $20K to less than $40K: t = 3.136, p = 1.720e-03
  $40K to less than $75K vs $75K to less than $150K: t = -7.172, p = 7.888e-13
  $20K to less than $40K vs $75K to less than $150K: t = -9.387, p = 8.156e-21

import seaborn as sns
import matplotlib.pyplot as plt

def plot_heatmap_per_question(df, questions, mask_zeros=True, save=False, outdir="."):
    """
    Generate heatmaps of mean values for each question (Q1..Q8)
    by Division × Family Income group.
    """
    canonical_levels = [
        "Less than $20K",
        "$20K to less than $40K",
        "$40K to less than $75K",
        "$75K to less than $150K",
        "$150K or more",
    ]

    for score in questions:
        df_copy = df.copy()

        # If 0 was just a placeholder, treat as missing
        if mask_zeros and score in df_copy:
            df_copy.loc[df_copy[score] == 0, score] = np.nan

        df_copy["FAMINC5"] = pd.Categorical(
            df_copy["FAMINC5"], categories=canonical_levels, ordered=True
        )

        # Compute group means
        means = (
            df_copy.groupby(["DIVISION","FAMINC5"])[score]
                   .mean()
                   .unstack()
                   .reindex(columns=canonical_levels)
        )

        # Plot heatmap
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.heatmap(means, annot=True, fmt=".2f", cmap="Greys", cbar=False, ax=ax)
        ax.set_title(f"{score}: Mean by Division × Family Income")
        ax.set_xlabel("Family Income")
        ax.set_ylabel("Division")
        plt.tight_layout()

        if save:
            pdf_path = f"{outdir}/Heatmap_{score}.pdf"
            png_path = f"{outdir}/Heatmap_{score}.png"
            plt.savefig(pdf_path, bbox_inches="tight")
            plt.savefig(png_path, bbox_inches="tight", dpi=300)
            print(f"Saved {pdf_path} and {png_path}")

        plt.show()

questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
plot_heatmap_per_question(df_final, questions, mask_zeros=True, save=True, outdir=".")

/tmp/ipython-input-3807495455.py:30: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_copy.groupby(["DIVISION","FAMINC5"])[score]

Saved ./Heatmap_Q1.pdf and ./Heatmap_Q1.png

/tmp/ipython-input-3807495455.py:30: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_copy.groupby(["DIVISION","FAMINC5"])[score]

Saved ./Heatmap_Q2.pdf and ./Heatmap_Q2.png

/tmp/ipython-input-3807495455.py:30: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_copy.groupby(["DIVISION","FAMINC5"])[score]

Saved ./Heatmap_Q3.pdf and ./Heatmap_Q3.png

/tmp/ipython-input-3807495455.py:30: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_copy.groupby(["DIVISION","FAMINC5"])[score]

Saved ./Heatmap_Q4.pdf and ./Heatmap_Q4.png

/tmp/ipython-input-3807495455.py:30: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_copy.groupby(["DIVISION","FAMINC5"])[score]

Saved ./Heatmap_Q5.pdf and ./Heatmap_Q5.png

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def plot_heatmap_per_question(
    df, questions, mask_zeros=True, save=False, outdir=".",
    fontsize=18, figsize=(8, 6), dpi=100, save_dpi=300
):
    """
    Heatmaps of mean values (Q1..Q8) by Division × Family Income.
    Shows at a normal size in Jupyter and saves with large fonts.
    """
    canonical_levels = [
        "Less than $20K",
        "$20K to less than $40K",
        "$40K to less than $75K",
        "$75K to less than $150K",
        "$150K or more",
    ]

    sns.set_theme(style="white")

    for score in questions:
        df_copy = df.copy()

        if mask_zeros and score in df_copy:
            df_copy.loc[df_copy[score] == 0, score] = np.nan

        df_copy["FAMINC5"] = pd.Categorical(
            df_copy["FAMINC5"], categories=canonical_levels, ordered=True
        )

        means = (
            df_copy.groupby(["DIVISION", "FAMINC5"])[score]
            .mean()
            .unstack()
            .reindex(columns=canonical_levels)
        )

        # normal-size preview
        fig, ax = plt.subplots(figsize=figsize, dpi=dpi)

        sns.heatmap(
            means,
            annot=True,
            fmt=".2f",
            cmap="Greys",
            cbar=False,
            ax=ax,
            annot_kws={"size": fontsize - 2},     # numbers inside cells
            linewidths=0.5,
            linecolor="lightgray"
        )

        # Explicitly set label and tick font sizes
        ax.set_title(f"{score}: Mean by Division × Family Income",
                     fontsize=fontsize + 2, pad=10)
        ax.set_xlabel("Family Income", fontsize=fontsize)
        ax.set_ylabel("Division", fontsize=fontsize)

        ax.tick_params(axis="x", labelsize=fontsize - 2, rotation=30)
        ax.tick_params(axis="y", labelsize=fontsize - 2)

        plt.tight_layout()

        if save:
            base = f"{outdir}/Heatmap_{score}"
            plt.savefig(f"{base}.pdf", bbox_inches="tight", dpi=save_dpi)
            plt.savefig(f"{base}.png", bbox_inches="tight", dpi=save_dpi)
            print(f"Saved {base}.pdf / .png")

        plt.show()

questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]

plot_heatmap_per_question(
    df_final,
    questions,
    fontsize=18,     # big labels
    figsize=(8,6),   # normal display
    dpi=100,         # notebook view
    save=True,
    save_dpi=300     # publication quality
)

/tmp/ipython-input-2717744192.py:35: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_copy.groupby(["DIVISION", "FAMINC5"])[score]

Saved ./Heatmap_Q1.pdf / .png

/tmp/ipython-input-2717744192.py:35: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_copy.groupby(["DIVISION", "FAMINC5"])[score]

Saved ./Heatmap_Q2.pdf / .png

/tmp/ipython-input-2717744192.py:35: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_copy.groupby(["DIVISION", "FAMINC5"])[score]

Saved ./Heatmap_Q3.pdf / .png

/tmp/ipython-input-2717744192.py:35: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_copy.groupby(["DIVISION", "FAMINC5"])[score]

Saved ./Heatmap_Q4.pdf / .png

/tmp/ipython-input-2717744192.py:35: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_copy.groupby(["DIVISION", "FAMINC5"])[score]

Saved ./Heatmap_Q5.pdf / .png

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

def plot_boxplot_per_question(df, questions, save=False, outdir=".", fontsize=12, figsize=(12, 6), dpi=100, save_dpi=300):
    pdf = None
    if save:
        pdf_path = f"{outdir}/Boxplots.pdf"
        pdf = PdfPages(pdf_path)

    for score in questions:
        fig = plt.figure(figsize=figsize, dpi=dpi)
        ax = sns.boxplot(x='DIVISION', y=score, hue='FAMINC5', data=df)
        ax.set_title(f'Distribution of {score} by Division and Family Income', fontsize=fontsize)
        ax.set_xlabel('Race/Ethnicity', fontsize=fontsize)
        ax.set_ylabel(score, fontsize=fontsize)
        ax.tick_params(axis='both', labelsize=fontsize)
        plt.xticks(rotation=45)
        leg = ax.legend(title='Division')
        if leg:
            leg.get_title().set_fontsize(fontsize)
            for t in leg.get_texts():
                t.set_fontsize(fontsize)
        plt.tight_layout()

        if save:
            pdf.savefig(fig, bbox_inches="tight", dpi=save_dpi)

        plt.show()

    if save:
        pdf.close()
        print(f"Saved {pdf_path}")

# --- Example usage ---
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
plot_boxplot_per_question(df_final, questions, save=True, outdir=".")

Saved ./Boxplots.pdf

import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# List of your harm questions
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]

# We'll loop over each question and fit a model like:
# QX ~ C(HIV_STAT)*C(GENDER) + C(HIV_STAT)*C(RACETHN) + C(HIV_STAT)*C(SEXUALITY)
for q in questions:
    formula = (
        f"{q} ~ C(HIV_STAT)*C(GENDER) "
        f"+ C(HIV_STAT)*C(RACETHN) "
        f"+ C(HIV_STAT)*C(SEXUALITY)"
    )
    model = ols(formula, data=df_final).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)

    print(f"ANOVA for {q} (HIV status interactions):")
    print(anova_table)
    print("\n")

/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1, but rank is 0
  warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1923: RuntimeWarning: invalid value encountered in divide
  F /= J
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
  warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
  warnings.warn('covariance of constraints does not have full '

ANOVA for Q1 (HIV status interactions):
                               sum_sq       df          F        PR(>F)
C(HIV_STAT)                       NaN      1.0        NaN           NaN
C(GENDER)                    1.031297      1.0   3.459178  6.291625e-02
C(RACETHN)                  32.247881      4.0  27.041477  2.061234e-22
C(SEXUALITY)                 7.345514      3.0   8.212780  2.720823e-04
C(HIV_STAT):C(GENDER)        1.084251      1.0   3.636796  5.652991e-02
C(HIV_STAT):C(RACETHN)       1.855918      4.0   1.556281  1.830005e-01
C(HIV_STAT):C(SEXUALITY)     0.074259      3.0   0.083027  9.203269e-01
Residual                  5957.601692  19983.0        NaN           NaN

/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1, but rank is 0
  warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1923: RuntimeWarning: invalid value encountered in divide
  F /= J
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
  warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
  warnings.warn('covariance of constraints does not have full '

ANOVA for Q2 (HIV status interactions):
                               sum_sq       df          F        PR(>F)
C(HIV_STAT)                       NaN      1.0        NaN           NaN
C(GENDER)                    7.351555      1.0  26.167008  3.160221e-07
C(RACETHN)                  11.284552      4.0  10.041513  4.072944e-08
C(SEXUALITY)                 4.480533      3.0   5.315980  4.919414e-03
C(HIV_STAT):C(GENDER)        1.152986      1.0   4.103919  4.279721e-02
C(HIV_STAT):C(RACETHN)       1.409422      4.0   1.254169  2.856295e-01
C(HIV_STAT):C(SEXUALITY)     0.355331      3.0   0.421587  6.560110e-01
Residual                  5614.173538  19983.0        NaN           NaN

/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1, but rank is 0
  warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1923: RuntimeWarning: invalid value encountered in divide
  F /= J
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
  warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
  warnings.warn('covariance of constraints does not have full '

ANOVA for Q3 (HIV status interactions):
                               sum_sq       df          F        PR(>F)
C(HIV_STAT)                       NaN      1.0        NaN           NaN
C(GENDER)                    7.399024      1.0  30.548239  3.297363e-08
C(RACETHN)                   9.770807      4.0  10.085145  3.748663e-08
C(SEXUALITY)                15.220055      3.0  20.946271  8.178470e-10
C(HIV_STAT):C(GENDER)        0.397629      1.0   1.641685  2.001094e-01
C(HIV_STAT):C(RACETHN)       2.161597      4.0   2.231138  6.305302e-02
C(HIV_STAT):C(SEXUALITY)     0.996982      3.0   1.372075  2.536041e-01
Residual                  4840.039940  19983.0        NaN           NaN

/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1, but rank is 0
  warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1923: RuntimeWarning: invalid value encountered in divide
  F /= J
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
  warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
  warnings.warn('covariance of constraints does not have full '

ANOVA for Q4 (HIV status interactions):
                               sum_sq       df          F        PR(>F)
C(HIV_STAT)                       NaN      1.0        NaN           NaN
C(GENDER)                   20.080138      1.0  75.534928  3.862163e-18
C(RACETHN)                  39.594250      4.0  37.235163  4.492368e-31
C(SEXUALITY)                18.490424      3.0  23.184981  8.761028e-11
C(HIV_STAT):C(GENDER)        0.193486      1.0   0.727831  3.935975e-01
C(HIV_STAT):C(RACETHN)       1.108683      4.0   1.042626  3.834517e-01
C(HIV_STAT):C(SEXUALITY)     0.115409      3.0   0.144710  8.652739e-01
Residual                  5312.262878  19983.0        NaN           NaN

/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1, but rank is 0
  warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1923: RuntimeWarning: invalid value encountered in divide
  F /= J
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
  warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
  warnings.warn('covariance of constraints does not have full '

ANOVA for Q5 (HIV status interactions):
                               sum_sq       df          F        PR(>F)
C(HIV_STAT)                       NaN      1.0        NaN           NaN
C(GENDER)                   16.643305      1.0  58.359518  2.281531e-14
C(RACETHN)                  42.171325      4.0  36.968292  7.577198e-31
C(SEXUALITY)                 8.424883      3.0   9.847245  5.314981e-05
C(HIV_STAT):C(GENDER)        0.040882      1.0   0.143352  7.049751e-01
C(HIV_STAT):C(RACETHN)       0.201964      4.0   0.177046  9.503129e-01
C(HIV_STAT):C(SEXUALITY)     0.496024      3.0   0.579767  5.600380e-01
Residual                  5698.867506  19983.0        NaN           NaN

import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# List of your harm questions
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]

# We'll loop over each question and fit a model like:
# QX ~ C(HIV_STAT)*C(GENDER) + C(HIV_STAT)*C(RACETHN) + C(HIV_STAT)*C(SEXUALITY)
for q in questions:
    formula = (
        f"{q} ~ C(PREG_STAT)*C(OWNGUN_GSS) "
        f"+ C(PREG_STAT)*C(EDUCCAT5) "
        f"+ C(PREG_STAT)*C(AGE_INT)"
    )
    model = ols(formula, data=df_final).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)

    print(f"ANOVA for {q} (Pregnancy):")
    print(anova_table)
    print("\n")

ANOVA for Q1 (Pregnancy):
                                 sum_sq       df           F         PR(>F)
C(PREG_STAT)                   4.095558      2.0    7.190990   7.552962e-04
C(OWNGUN_GSS)                 48.686100      1.0  170.966305   6.577359e-39
C(EDUCCAT5)                   16.195391      4.0   14.217951   1.366747e-11
C(AGE_INT)                   217.003435      7.0  108.861449  3.168010e-157
C(PREG_STAT):C(OWNGUN_GSS)     0.019176      2.0    0.033669   9.668912e-01
C(PREG_STAT):C(EDUCCAT5)      11.344730      8.0    4.979775   3.483861e-06
C(PREG_STAT):C(AGE_INT)       11.206338     14.0    2.810873   3.248167e-04
Residual                    5684.296889  19961.0         NaN            NaN


ANOVA for Q2 (Pregnancy):
                                 sum_sq       df          F        PR(>F)
C(PREG_STAT)                  24.899177      2.0  45.243136  2.486321e-20
C(OWNGUN_GSS)                  0.514457      1.0   1.869590  1.715374e-01
C(EDUCCAT5)                    1.714461      4.0   1.557634  1.826261e-01
C(AGE_INT)                    78.603977      7.0  40.807894  1.854515e-57
C(PREG_STAT):C(OWNGUN_GSS)     2.241790      2.0   4.073452  1.703269e-02
C(PREG_STAT):C(EDUCCAT5)      14.606735      8.0   6.635305  1.068917e-08
C(PREG_STAT):C(AGE_INT)       21.944858     14.0   5.696426  3.335045e-11
Residual                    5492.683668  19961.0        NaN           NaN


ANOVA for Q3 (Pregnancy):
                                 sum_sq       df          F        PR(>F)
C(PREG_STAT)                   5.711883      2.0  11.976923  6.332961e-06
C(OWNGUN_GSS)                  0.002498      1.0   0.010477  9.184737e-01
C(EDUCCAT5)                    9.624765      4.0  10.090813  3.708555e-08
C(AGE_INT)                    72.944647      7.0  43.701000  1.017678e-61
C(PREG_STAT):C(OWNGUN_GSS)     2.349021      2.0   4.925528  7.267716e-03
C(PREG_STAT):C(EDUCCAT5)       5.655299      8.0   2.964569  2.566736e-03
C(PREG_STAT):C(AGE_INT)       13.800257     14.0   4.133854  2.832743e-07
Residual                    4759.773744  19961.0        NaN           NaN


ANOVA for Q4 (Pregnancy):
                                 sum_sq       df          F        PR(>F)
C(PREG_STAT)                  26.198360      2.0  49.933199  2.335363e-22
C(OWNGUN_GSS)                 13.570312      1.0  51.729125  6.596279e-13
C(EDUCCAT5)                   25.658131      4.0  24.451771  3.231299e-20
C(AGE_INT)                    48.503461      7.0  26.413136  2.698495e-36
C(PREG_STAT):C(OWNGUN_GSS)     4.833991      2.0   9.213426  1.001165e-04
C(PREG_STAT):C(EDUCCAT5)      15.703446      8.0   7.482561  5.148189e-10
C(PREG_STAT):C(AGE_INT)       20.042889     14.0   5.457297  1.377490e-10
Residual                    5236.450585  19961.0        NaN           NaN


ANOVA for Q5 (Pregnancy):
                                 sum_sq       df          F        PR(>F)
C(PREG_STAT)                  15.439003      2.0  27.264287  1.497663e-12
C(OWNGUN_GSS)                 10.606511      1.0  37.460834  9.500684e-10
C(EDUCCAT5)                    0.264859      4.0   0.233862  9.194177e-01
C(AGE_INT)                    37.935763      7.0  19.140605  1.136515e-25
C(PREG_STAT):C(OWNGUN_GSS)     8.393320      2.0  14.822064  3.695171e-07
C(PREG_STAT):C(EDUCCAT5)      22.745501      8.0  10.041773  4.487110e-14
C(PREG_STAT):C(AGE_INT)       15.484201     14.0   3.906300  1.002161e-06
Residual                    5651.678040  19961.0        NaN           NaN


ANOVA for Q6 (Pregnancy):
                                 sum_sq       df          F        PR(>F)
C(PREG_STAT)                   0.029660      2.0   0.052414  9.489358e-01
C(OWNGUN_GSS)                  0.337160      1.0   1.191635  2.750132e-01
C(EDUCCAT5)                   45.369680      4.0  40.087821  1.679842e-33
C(AGE_INT)                    77.746520      7.0  39.254518  3.593111e-55
C(PREG_STAT):C(OWNGUN_GSS)     2.941062      2.0   5.197338  5.538758e-03
C(PREG_STAT):C(EDUCCAT5)       3.376778      8.0   1.491830  1.542246e-01
C(PREG_STAT):C(AGE_INT)       19.467936     14.0   4.914718  3.304355e-09
Residual                    5647.751318  19961.0        NaN           NaN


ANOVA for Q7 (Pregnancy):
                                 sum_sq       df           F        PR(>F)
C(PREG_STAT)                   0.280737      2.0    0.572010  5.643991e-01
C(OWNGUN_GSS)                 30.722838      1.0  125.197605  5.619283e-29
C(EDUCCAT5)                   16.087573      4.0   16.389482  2.064751e-13
C(AGE_INT)                    70.312616      7.0   40.932655  1.214827e-57
C(PREG_STAT):C(OWNGUN_GSS)     0.619657      2.0    1.262571  2.829482e-01
C(PREG_STAT):C(EDUCCAT5)       7.443571      8.0    3.791631  1.859487e-04
C(PREG_STAT):C(AGE_INT)       24.180292     14.0    7.038307  9.930476e-15
Residual                    4898.325020  19961.0         NaN           NaN


ANOVA for Q8 (Pregnancy):
                                 sum_sq       df          F        PR(>F)
C(PREG_STAT)                  18.975797      2.0  36.741649  1.181962e-16
C(OWNGUN_GSS)                 21.629464      1.0  83.759560  6.112023e-20
C(EDUCCAT5)                   36.710777      4.0  35.540392  1.242215e-29
C(AGE_INT)                     9.243550      7.0   5.113631  8.018321e-06
C(PREG_STAT):C(OWNGUN_GSS)     0.341448      2.0   0.661124  5.162820e-01
C(PREG_STAT):C(EDUCCAT5)      22.163422      8.0  10.728412  3.530147e-15
C(PREG_STAT):C(AGE_INT)        4.884761     14.0   1.351151  1.683065e-01
Residual                    5154.584575  19961.0        NaN           NaN

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

def plot_linear_per_question(df_final, score_types, score_type_to_question, save=False, outdir=".", fontsize=12, height=6, aspect=1.5, dpi=100, save_dpi=300, xlim=(16, 40)):
    pdf = None
    if save:
        pdf_path = f"{outdir}/LinearPlots.pdf"
        pdf = PdfPages(pdf_path)

    for score_type in score_types:
        grouped_scores = df_final.groupby(['AGE', 'PREG_STAT'])[score_type].median().reset_index()
        g = sns.lmplot(x='AGE', y=score_type, data=grouped_scores, hue='PREG_STAT', height=height, aspect=aspect, legend_out=True)
        g.set_axis_labels('Age', score_type)
        g.set(xlim=xlim)
        g.fig.set_dpi(dpi)
        g.fig.suptitle(f"Mean {score_type_to_question[score_type]} vs. Age colored by Pregnancy Status", fontsize=fontsize)
        g.fig.subplots_adjust(top=0.9)

        if save:
            pdf.savefig(g.fig, bbox_inches="tight", dpi=save_dpi)

        plt.show()

    if save:
        pdf.close()
        print(f"Saved {pdf_path}")

score_types = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]
score_type_to_question = {
    'Q1': 'Question 1',
    'Q2': 'Question 2',
    'Q3': 'Question 3',
    'Q4': 'Question 4',
    'Q5': 'Question 5',
    'Q6': 'Question 6',
    'Q7': 'Question 7',
    'Q8': 'Question 8'
}

plot_linear_per_question(df_final, score_types, score_type_to_question, save=True, outdir=".")

/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`.
  warnings.warn(msg, UserWarning)

/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`.
  warnings.warn(msg, UserWarning)

/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`.
  warnings.warn(msg, UserWarning)

/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`.
  warnings.warn(msg, UserWarning)

/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`.
  warnings.warn(msg, UserWarning)

/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`.
  warnings.warn(msg, UserWarning)

/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`.
  warnings.warn(msg, UserWarning)

/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`.
  warnings.warn(msg, UserWarning)

Saved ./LinearPlots.pdf

import itertools
import scipy.stats as stats

score_types = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]

# Loop over each harm question
for score in score_types:
    print(f"\n=== T-test Results for {score} ===")

    # 1) Pairwise t-tests by HIV_STAT
    print("Comparisons by PREG_STAT:")
    hiv_groups = df_final['PREG_STAT'].dropna().unique()
    for grp1, grp2 in itertools.combinations(hiv_groups, 2):
        data1 = df_final.loc[df_final['PREG_STAT'] == grp1, score].dropna()
        data2 = df_final.loc[df_final['PREG_STAT'] == grp2, score].dropna()

        # Ensure both groups have at least 2 values to run a t-test
        if len(data1) >= 2 and len(data2) >= 2:
            t_stat, p_val = stats.ttest_ind(data1, data2, equal_var=False)
            print(f"  {grp1} vs {grp2}: t = {t_stat:.3f}, p = {p_val:.3e}")
        else:
            print(f"  {grp1} vs {grp2}: Not enough data.")

=== T-test Results for Q1 ===
Comparisons by PREG_STAT:
  Not Applicable vs Negative: t = -1.945, p = 5.175e-02
  Not Applicable vs Positive: t = -0.950, p = 3.422e-01
  Negative vs Positive: t = -0.174, p = 8.622e-01

=== T-test Results for Q2 ===
Comparisons by PREG_STAT:
  Not Applicable vs Negative: t = -3.497, p = 4.712e-04
  Not Applicable vs Positive: t = -9.526, p = 1.230e-20
  Negative vs Positive: t = -8.049, p = 2.362e-15

=== T-test Results for Q3 ===
Comparisons by PREG_STAT:
  Not Applicable vs Negative: t = 5.203, p = 1.979e-07
  Not Applicable vs Positive: t = -0.162, p = 8.712e-01
  Negative vs Positive: t = -2.197, p = 2.829e-02

=== T-test Results for Q4 ===
Comparisons by PREG_STAT:
  Not Applicable vs Negative: t = -8.949, p = 3.901e-19
  Not Applicable vs Positive: t = -2.173, p = 3.005e-02
  Negative vs Positive: t = 1.632, p = 1.029e-01

=== T-test Results for Q5 ===
Comparisons by PREG_STAT:
  Not Applicable vs Negative: t = -6.842, p = 8.036e-12
  Not Applicable vs Positive: t = -3.514, p = 4.620e-04
  Negative vs Positive: t = -0.813, p = 4.167e-01

=== T-test Results for Q6 ===
Comparisons by PREG_STAT:
  Not Applicable vs Negative: t = 0.347, p = 7.283e-01
  Not Applicable vs Positive: t = 0.017, p = 9.864e-01
  Negative vs Positive: t = -0.127, p = 8.988e-01

=== T-test Results for Q7 ===
Comparisons by PREG_STAT:
  Not Applicable vs Negative: t = 1.422, p = 1.549e-01
  Not Applicable vs Positive: t = -0.314, p = 7.533e-01
  Negative vs Positive: t = -0.875, p = 3.819e-01

=== T-test Results for Q8 ===
Comparisons by PREG_STAT:
  Not Applicable vs Negative: t = -8.060, p = 8.099e-16
  Not Applicable vs Positive: t = -3.962, p = 7.981e-05
  Negative vs Positive: t = -0.708, p = 4.789e-01

Supervised ML¶

Graphs¶

		NAME	id	GENDER	AGE	RACETHN	EDUCCAT5	DIVISION	MARITAL_ACS	HHSIZECAT	...	CI_LABEL_OWNGUN_GSS	CI_LABEL_SEXUALITY	CI_LABEL_HIV_STAT	CI_LABEL_PREG_STAT	CI_LABEL_CC_NUM	CI_LABEL_cc_encoded	CI_LABEL_cc_disclosed	CI_LABEL_NumChronicIllness	AGE_INT	Cluster
0	0	Luke Walsh	1	Male	25	White non-Hispanic	Some college	Mountain	Never married	3+	...	is	is	is probably	is probably	is	is probably	is possibly	is probably	20-29	5
1	1	Matilde Izaguirre Checa	2	Female	70	Hispanic	HS Grad	West South Central	Divorced	1	...	is probably	is possibly	is	is	is	is	is	is possibly	70-79	11
2	2	Ryan Smith	3	Male	85	White non-Hispanic	Less than HS	Middle Atlantic	Now married	2	...	is possibly	is probably	is probably	is	is probably	is probably	is probably	is probably	80-89	2
3	3	Matthew Grimes	4	Male	59	White non-Hispanic	HS Grad	Mountain	Now married	2	...	is probably	is probably	is probably	is	is possibly	is probably	is	is probably	50-59	12
4	4	Miraan Rama	5	Female	19	Asian	Some college	Pacific	Never married	1	...	is probably	is	is probably	is	is probably	is probably	is probably	is	10-19	9