InĀ [1]:
from google.colab import auth
from google.oauth2 import service_account
from googleapiclient.discovery import build

auth.authenticate_user()
InĀ [2]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd

scope = ['https://www.googleapis.com/auth/spreadsheets']
creds = ServiceAccountCredentials.from_json_keyfile_name('credentials.json', scope)

client = gspread.authorize(creds)

spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1CatLlB4axdBW0uJRoioR-x00RsPzQ54CFYq1PqzgV7E/edit?usp=drive_web&ouid=107190373735789167396'
sheet = client.open_by_url(spreadsheet_url)

worksheet = sheet.get_worksheet(0)
data = worksheet.get_all_records()
df_synthetic = pd.DataFrame(data)

df_synthetic.head()
Out[2]:
NAME id GENDER AGE RACETHN EDUCCAT5 DIVISION MARITAL_ACS HHSIZECAT ... CI_LABEL_OWNGUN_GSS CI_LABEL_SEXUALITY CI_LABEL_HIV_STAT CI_LABEL_PREG_STAT CI_LABEL_CC_NUM CI_LABEL_cc_encoded CI_LABEL_cc_disclosed CI_LABEL_NumChronicIllness AGE_INT Cluster
0 0 Luke Walsh 1 Male 25 White non-Hispanic Some college Mountain Never married 3+ ... is is is probably is probably is is probably is possibly is probably 20-29 5
1 1 Matilde Izaguirre Checa 2 Female 70 Hispanic HS Grad West South Central Divorced 1 ... is probably is possibly is is is is is is possibly 70-79 11
2 2 Ryan Smith 3 Male 85 White non-Hispanic Less than HS Middle Atlantic Now married 2 ... is possibly is probably is probably is is probably is probably is probably is probably 80-89 2
3 3 Matthew Grimes 4 Male 59 White non-Hispanic HS Grad Mountain Now married 2 ... is probably is probably is probably is is possibly is probably is is probably 50-59 12
4 4 Miraan Rama 5 Female 19 Asian Some college Pacific Never married 1 ... is probably is is probably is is probably is probably is probably is 10-19 9

5 rows Ɨ 187 columns

InĀ [3]:
import pandas as pd

# Load the data file (adjust file name and method if it's an Excel file)
df = pd.read_csv('prolific_responses_survey2.csv')

# Define the mapping from Likert-scale text to numbers
likert_mapping = {
    "Not at all harmful": 1,
    "Slightly harmful": 2,
    "Moderately harmful": 3,
    "Very harmful": 4,
    "Extremely harmful": 5
}

# List the columns that contain Likert-scale responses.
# Update this list to include all columns that need conversion.
likert_columns = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]

# Replace the text values with numeric values in those columns
for col in likert_columns:
    if col in df.columns:  # only replace if column exists in the dataframe
        df[col] = df[col].replace(likert_mapping)
/tmp/ipython-input-1004496578.py:22: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df[col] = df[col].replace(likert_mapping)
InĀ [4]:
import pandas as pd

# Assuming the DataFrame is called df and has columns:
# ["ProlificID", "SyntheticPersonID", "AttnCheck", "Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8", "Condition"]

# 1) Group by SyntheticPersonID and compute the mean for Q1..Q8
df_avg = (
    df.groupby("SyntheticPersonID")[["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8", "Certainty"]]
      .mean()  # average across repeated rows for each SyntheticPersonID
      .reset_index()
)

# 2) (Optional) If expecting a single "avg_harm" column across Q1..Q8
df_avg["avg_harm"] = df_avg[["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]].mean(axis=1)

print(df_avg.head())
   SyntheticPersonID        Q1   Q2        Q3        Q4        Q5   Q6  \
0                 57  3.000000  3.0  3.000000  2.500000  2.000000  2.5   
1                 89  2.500000  2.5  2.500000  2.500000  2.500000  2.5   
2                141  3.250000  2.0  2.750000  2.000000  1.500000  2.0   
3                153  4.333333  4.0  3.666667  3.666667  1.666667  2.0   
4                166  2.500000  3.5  3.500000  2.000000  2.000000  2.5   

         Q7        Q8  Certainty  avg_harm  
0  2.000000  3.000000       50.0  2.625000  
1  2.500000  2.500000       75.0  2.500000  
2  2.750000  3.250000       50.0  2.437500  
3  3.666667  4.666667       50.0  3.458333  
4  3.000000  3.000000       50.0  2.750000  
InĀ [5]:
import pandas as pd
import itertools
import scipy.stats as stats
from statsmodels.stats.multitest import multipletests

# 1) Group by SyntheticPersonID and Condition, averaging Q1..Q8
#    for repeated rows of the same SyntheticPersonID.
grouped_df = (
    df.groupby(["SyntheticPersonID", "Condition"])[["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8", "Certainty"]]
      .mean()
      .reset_index()
)

# 2) Identify the questions you to test separately
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]

# 3) Get the unique conditions
conditions = grouped_df["Condition"].unique()

# 4) Prepare a list to store results from each test
results = []

# 5) For each question, do pairwise t-tests across all condition pairs
for question in questions:
    for cond1, cond2 in itertools.combinations(conditions, 2):
        # Extract data for this question, for each condition
        group1 = grouped_df.loc[grouped_df["Condition"] == cond1, question].dropna()
        group2 = grouped_df.loc[grouped_df["Condition"] == cond2, question].dropna()

        # Perform Welch's t-test (equal_var=False)
        t_stat, p_val = stats.ttest_ind(group1, group2, equal_var=False)

        # Store the results
        results.append((question, cond1, cond2, t_stat, p_val))

# 6) Convert results to a DataFrame
results_df = pd.DataFrame(
    results,
    columns=["Question", "Condition1", "Condition2", "t_stat", "p_value"]
)

# 7) Apply Bonferroni correction across all tests (all questions, all pairs)
adjusted = multipletests(results_df["p_value"], method="bonferroni")
results_df["p_adjusted"] = adjusted[1]

# 8) Print the final results
print("Separate t-test results for each question (with Bonferroni correction):")
print(results_df)
Separate t-test results for each question (with Bonferroni correction):
  Question Condition1 Condition2    t_stat       p_value    p_adjusted
0       Q1     Health    Control -3.604307  3.202110e-04  2.561688e-03
1       Q2     Health    Control -8.735927  4.844174e-18  3.875339e-17
2       Q3     Health    Control -3.798648  1.495865e-04  1.196692e-03
3       Q4     Health    Control -9.722139  6.884886e-22  5.507909e-21
4       Q5     Health    Control -5.296605  1.305157e-07  1.044125e-06
5       Q6     Health    Control -6.194854  7.033187e-10  5.626550e-09
6       Q7     Health    Control -5.402034  7.343484e-08  5.874787e-07
7       Q8     Health    Control -5.166240  2.614829e-07  2.091863e-06
InĀ [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st

# Dictionary mapping question codes to descriptive labels
custom_labels = {
    'Q1': 'Hackers/Cybercriminals',
    'Q2': 'Government',
    'Q3': 'Corporations',
    'Q4': 'Employer/Colleagues',
    'Q5': 'Family',
    'Q6': 'Close Friends',
    'Q7': 'Acquaintances',
    'Q8': 'Publicly Available'
}

# Assume `grouped_df` has one row per (SyntheticPersonID, Condition),
# with columns: ["SyntheticPersonID", "Condition", "Q1", "Q2", ..., "Q8"].

questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
conditions = grouped_df["Condition"].unique()

for cond in conditions:
    # Filter data for this condition
    cond_data = grouped_df[grouped_df["Condition"] == cond]

    # Compute means and standard deviations for each question
    mean_vals = cond_data[questions].mean(axis=0)
    std_vals  = cond_data[questions].std(axis=0)

    # Number of SyntheticPersonIDs in this condition
    n = cond_data.shape[0]

    # Standard error for each question
    se_vals = std_vals / np.sqrt(n)  # If n <= 1, be mindful of dividing by zero

    # 95% CI using the t-distribution (two-tailed)
    if n > 1:
        t_multiplier = st.t.ppf(1 - 0.025, df=n-1)
        ci_vals = t_multiplier * se_vals
    else:
        # If there's only one data point in this condition, CI is not meaningful
        ci_vals = np.zeros_like(se_vals)

    # X positions for plotting
    x_positions = np.arange(len(questions))

    # Create the plot
    plt.figure(figsize=(10,6))
    plt.errorbar(
        x_positions,
        mean_vals,
        yerr=ci_vals,
        fmt='o',
        capsize=5,
        color='blue',
        ecolor='black'
    )

    # Use custom labels for the questions
    question_labels = [custom_labels[q] for q in questions]

    plt.xticks(x_positions, question_labels, rotation=45, ha='right')
    plt.xlabel("Question")
    plt.ylabel("Mean Response")
    plt.title(f"Mean Response (95% CI) for Condition: {cond}")

    plt.tight_layout()
    plt.show()
No description has been provided for this image
No description has been provided for this image
InĀ [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st

# Dictionary mapping question codes to descriptive labels
custom_labels = {
    'Q1': 'Hackers/Cybercriminals',
    'Q2': 'Government',
    'Q3': 'Corporations',
    'Q4': 'Employer/Colleagues',
    'Q5': 'Family',
    'Q6': 'Close Friends',
    'Q7': 'Acquaintances',
    'Q8': 'Publicly Available'
}

# Assume `grouped_df` has one row per (SyntheticPersonID, Condition),
# with columns: ["SyntheticPersonID", "Condition", "Q1", "Q2", ..., "Q8"].

questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
conditions = grouped_df["Condition"].unique()

# Sort certainties if needed (e.g., 50, then 75)
certainties = grouped_df["Certainty"].unique()
certainties = sorted(certainties)

for cond in conditions:
    plt.figure(figsize=(10, 6))

    for i, cert in enumerate(certainties):
        subset = grouped_df[
            (grouped_df["Condition"] == cond) &
            (grouped_df["Certainty"] == cert)
        ]

        mean_vals = subset[questions].mean(axis=0)
        std_vals  = subset[questions].std(axis=0)
        n = subset.shape[0]
        se_vals = std_vals / np.sqrt(n) if n > 1 else np.zeros_like(std_vals)

        if n > 1:
            t_multiplier = st.t.ppf(1 - 0.025, df=n-1)
            ci_vals = t_multiplier * se_vals
        else:
            ci_vals = np.zeros_like(se_vals)

        x_positions = np.arange(len(questions))
        offset = i * 0.1
        shifted_positions = x_positions + offset

        plt.errorbar(
            shifted_positions,
            mean_vals,
            yerr=ci_vals,
            fmt='o',
            capsize=5,
            label=f"Certainty = {cert}%"
        )

    question_labels = [custom_labels[q] for q in questions]
    plt.xticks(np.arange(len(questions)), question_labels, rotation=45, ha='right', fontsize=18)
    plt.yticks(fontsize=18)

    plt.xlabel("Scenario", fontsize=18)
    plt.ylabel("Mean Response", fontsize=18)
    plt.title(f"Mean Response (95% CI) by Certainty for Condition: {cond}", fontsize=18)
    plt.legend(fontsize=14)
    plt.tight_layout()
    plt.savefig(f'mean_response_{cond}.pdf', bbox_inches='tight')
    plt.show()
No description has been provided for this image
No description has been provided for this image
InĀ [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st

# Assume grouped_df has one row per (SyntheticPersonID, Condition, Certainty) and columns for Q1..Q8, like:
# ["SyntheticPersonID", "Condition", "Certainty", "Q1", "Q2", ..., "Q8"]

questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
conditions = grouped_df["Condition"].unique()
certainties = sorted(grouped_df["Certainty"].unique())  # e.g., [50, 75]

for question in questions:
    # Prepare lists to store the mean, CI, and x-axis labels
    means = []
    ci_vals = []
    x_labels = []

    # Loop over each Condition and Certainty combination
    for cond in conditions:
        for cert in certainties:
            # Filter rows that match the current Condition and Certainty
            subset = grouped_df.loc[
                (grouped_df["Condition"] == cond) & (grouped_df["Certainty"] == cert),
                question
            ].dropna()

            mean_val = subset.mean()
            std_val = subset.std()
            n = subset.shape[0]

            # Compute 95% CI using the t-distribution
            if n > 1:
                se_val = std_val / np.sqrt(n)
                t_multiplier = st.t.ppf(1 - 0.025, df=n-1)  # for a 95% CI
                ci = t_multiplier * se_val
            else:
                # If there's only one data point, the CI isn't meaningful
                ci = 0.0

            means.append(mean_val)
            ci_vals.append(ci)
            x_labels.append(f"{cond}-{int(cert)}%")  # e.g. "Health-50%" or "Control-75%"

    # X positions for plotting
    x_positions = np.arange(len(means))

    plt.figure(figsize=(10, 6))
    plt.errorbar(
        x_positions,
        means,
        yerr=ci_vals,
        fmt='o',
        capsize=5,
        color='blue',
        ecolor='black'
    )

    plt.xticks(x_positions, x_labels, rotation=45, ha='right')
    plt.xlabel("Condition - Certainty")
    plt.ylabel("Mean Response")
    plt.title(f"Mean {question} by Condition and Certainty (95% CI)")
    plt.tight_layout()
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
InĀ [9]:
import scipy.stats as stats

score_types = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]
conditions_of_interest = ["Health", "Control"]

for cond in conditions_of_interest:
    print(f"\n=== Condition: {cond} ===")

    # Subset the data for just this condition
    subset_cond = grouped_df[grouped_df["Condition"] == cond]

    for score in score_types:
        # Separate data for 50% vs 75% certainty
        data_50 = subset_cond[subset_cond["Certainty"] == 50][score].dropna()
        data_75 = subset_cond[subset_cond["Certainty"] == 75][score].dropna()

        # Only run a t-test if both groups have sufficient data
        if len(data_50) >= 2 and len(data_75) >= 2:
            t_stat, p_val = stats.ttest_ind(data_50, data_75, equal_var=False)
            print(f"{score} | t = {t_stat:.3f}, p = {p_val:.3e}")
        else:
            print(f"{score} | Not enough data for t-test.")
=== Condition: Health ===
Q1 | t = -1.197, p = 2.316e-01
Q2 | t = -1.531, p = 1.262e-01
Q3 | t = 0.447, p = 6.551e-01
Q4 | t = 5.150, p = 3.133e-07
Q5 | t = 5.590, p = 2.904e-08
Q6 | t = 8.073, p = 1.914e-15
Q7 | t = 4.011, p = 6.482e-05
Q8 | t = -1.906, p = 5.687e-02

=== Condition: Control ===
Q1 | t = -4.298, p = 1.885e-05
Q2 | t = 4.054, p = 5.414e-05
Q3 | t = 4.196, p = 2.944e-05
Q4 | t = 4.388, p = 1.260e-05
Q5 | t = -3.446, p = 5.949e-04
Q6 | t = -3.640, p = 2.887e-04
Q7 | t = -2.112, p = 3.495e-02
Q8 | t = 0.032, p = 9.741e-01
InĀ [10]:
df_responses = pd.read_csv('prolific_responses_survey2.csv')  # Adjust filename/path as needed
df_prolific  = pd.read_csv('prolific_export_survey2.csv')  # Adjust filename/path as needed
df_responses = df_responses.rename(columns={"ProlificID": "Participant id"})

demo_cols = ["Age","Sex","Ethnicity simplified","Country of birth","Country of residence",
             "Nationality","Language","Student status","Employment status"]
keep_cols = ["Participant id"] + [c for c in demo_cols if c in df_prolific.columns]

df_prolific = df_prolific[keep_cols].copy()
df_prolific = df_prolific.replace({"DATA_EXPIRED": pd.NA, "Data expired": pd.NA})
for c in keep_cols[1:]:
    if c in df_prolific:
        df_prolific[c] = df_prolific[c].astype("string").str.strip()

df_prolific = df_prolific.applymap(
    lambda v: pd.NA if isinstance(v, str) and v.strip().lower().replace(" ", "_") == "data_expired" else v
)

# Define the mapping from Likert-scale text to numbers
likert_mapping = {
    "Not at all harmful": 1,
    "Slightly harmful": 2,
    "Moderately harmful": 3,
    "Very harmful": 4,
    "Extremely harmful": 5
}

# List the columns that contain Likert-scale responses.
# Update this list to include all columns that need conversion.
likert_columns = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]

# Replace the text values with numeric values in those columns
for col in likert_columns:
    if col in df.columns:  # only replace if column exists in the dataframe
        df_responses[col] = df_responses[col].replace(likert_mapping)

merged_df = df_responses.merge(df_prolific, on="Participant id", how="left")

# Now merged_df contains columns from both datasets.
print("Merged DataFrame:")
print(merged_df.head())

questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
Merged DataFrame:
             Participant id  SyntheticPersonID AttnCheck  Q1  Q2  Q3  Q4  Q5  \
0  6738e2f6443d73fad19d2753              14226       NaN   3   1   4   1   2   
1  6738e2f6443d73fad19d2753               1373       NaN   3   1   3   1   2   
2  6738e2f6443d73fad19d2753              10468       NaN   3   1   2   1   2   
3  6738e2f6443d73fad19d2753              13101       NaN   4   1   2   1   2   
4  6738e2f6443d73fad19d2753               3933       NaN   3   1   2   1   2   

   Q6  Q7  ...  Certainty Age   Sex Ethnicity simplified Country of birth  \
0   2   3  ...         75  20  Male                Asian    United States   
1   2   2  ...         75  20  Male                Asian    United States   
2   2   2  ...         75  20  Male                Asian    United States   
3   2   2  ...         75  20  Male                Asian    United States   
4   2   2  ...         75  20  Male                Asian    United States   

  Country of residence Nationality Language Student status Employment status  
0        United States     Vietnam  English             No         Part-Time  
1        United States     Vietnam  English             No         Part-Time  
2        United States     Vietnam  English             No         Part-Time  
3        United States     Vietnam  English             No         Part-Time  
4        United States     Vietnam  English             No         Part-Time  

[5 rows x 22 columns]
/tmp/ipython-input-2343357288.py:15: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
  df_prolific = df_prolific.applymap(
/tmp/ipython-input-2343357288.py:35: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df_responses[col] = df_responses[col].replace(likert_mapping)
InĀ [11]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import itertools

# Define the questions and the demographic columns to analyze
questions = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]

# List of demographic columns (categorical) from the Prolific export.
# Update this list based on the columns available in your prolific_export.
demographic_vars = ["Sex", "Ethnicity simplified", "Country of residence", "Language", "Employment status"]

# Prepare a list to store the t-test results
results = []

# Loop over each demographic variable
for demo in demographic_vars:
    # Get the unique groups in this demographic (drop missing values)
    groups = merged_df[demo].dropna().unique()

    # If there are less than 2 groups, skip this demographic.
    if len(groups) < 2:
        continue

    # For each question, perform pairwise comparisons between groups in this demographic.
    for question in questions:
        # Use itertools.combinations to get all unique pairwise comparisons.
        for group1, group2 in itertools.combinations(groups, 2):
            # Filter data for each group for the given question
            data1 = merged_df.loc[merged_df[demo] == group1, question].dropna()
            data2 = merged_df.loc[merged_df[demo] == group2, question].dropna()

            # Only perform the t-test if both groups have at least 2 observations
            if len(data1) >= 2 and len(data2) >= 2:
                t_stat, p_val = stats.ttest_ind(data1, data2, equal_var=False)
            else:
                t_stat, p_val = np.nan, np.nan  # Not enough data

            results.append((demo, question, group1, group2, t_stat, p_val))

# Convert the results list to a DataFrame for easier viewing
results_df = pd.DataFrame(results, columns=["Demographic", "Question", "Group1", "Group2", "t_stat", "p_value"])

# Optionally, adjust p-values for multiple comparisons using Bonferroni correction.
from statsmodels.stats.multitest import multipletests
adjusted = multipletests(results_df["p_value"].dropna(), method="bonferroni")
# Place adjusted p-values back into the results DataFrame.
# (needing to match the indices since multipletests was applied to non-NaN values.)
results_df.loc[results_df["p_value"].notna(), "p_adjusted"] = adjusted[1]

# Display the results
print("Pairwise t-test results for each question by demographic variable:")
print(results_df)
# get df to csv
results_df.to_csv('t-tests_survey2.csv', index=False)
/usr/local/lib/python3.12/dist-packages/scipy/stats/_axis_nan_policy.py:579: RuntimeWarning: Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.
  res = hypotest_fun_out(*samples, **kwds)
Pairwise t-test results for each question by demographic variable:
           Demographic Question  \
0                  Sex       Q1   
1                  Sex       Q1   
2                  Sex       Q1   
3                  Sex       Q2   
4                  Sex       Q2   
..                 ...      ...   
443  Employment status       Q8   
444  Employment status       Q8   
445  Employment status       Q8   
446  Employment status       Q8   
447  Employment status       Q8   

                                                Group1  \
0                                                 Male   
1                                                 Male   
2                                               Female   
3                                                 Male   
4                                                 Male   
..                                                 ...   
443  Not in paid work (e.g. homemaker', 'retired or...   
444  Not in paid work (e.g. homemaker', 'retired or...   
445                       Unemployed (and job seeking)   
446                       Unemployed (and job seeking)   
447                                              Other   

                                           Group2     t_stat       p_value  \
0                                          Female  -3.526875  4.238977e-04   
1                               Prefer not to say  -4.857806  4.175547e-05   
2                               Prefer not to say  -4.326655  1.793050e-04   
3                                          Female  -8.136630  4.988460e-16   
4                               Prefer not to say  -1.962172  5.978932e-02   
..                                            ...        ...           ...   
443                                         Other  -6.880348  1.479507e-11   
444  Due to start a new job within the next month   6.946660  8.331797e-11   
445                                         Other  -3.727910  2.064106e-04   
446  Due to start a new job within the next month  12.634814  1.689036e-22   
447  Due to start a new job within the next month  16.171640  8.553387e-30   

       p_adjusted  
0    1.899062e-01  
1    1.870645e-02  
2    8.032866e-02  
3    2.234830e-13  
4    1.000000e+00  
..            ...  
443  6.628192e-09  
444  3.732645e-08  
445  9.247194e-02  
446  7.566881e-20  
447  3.831918e-27  

[448 rows x 7 columns]

Supervised ML¶

InĀ [12]:
from sklearn.preprocessing import LabelEncoder

feature_list = ["GENDER", "RACETHN", "EDUCCAT5", "DIVISION", "MARITAL_ACS",
                "CHILDRENCAT", "CITIZEN_REC", "BORN_ACS", "AGE_INT",
                "HIV_STAT", "PREG_STAT", "NumChronicIllness",
                "FAMINC5", "CC_NUM", "FDSTMP_CPS",
                "SEXUALITY", "OWNGUN_GSS", "RELIGCAT", "Certainty"
]
InĀ [13]:
df_synthetic.head()
Out[13]:
NAME id GENDER AGE RACETHN EDUCCAT5 DIVISION MARITAL_ACS HHSIZECAT ... CI_LABEL_OWNGUN_GSS CI_LABEL_SEXUALITY CI_LABEL_HIV_STAT CI_LABEL_PREG_STAT CI_LABEL_CC_NUM CI_LABEL_cc_encoded CI_LABEL_cc_disclosed CI_LABEL_NumChronicIllness AGE_INT Cluster
0 0 Luke Walsh 1 Male 25 White non-Hispanic Some college Mountain Never married 3+ ... is is is probably is probably is is probably is possibly is probably 20-29 5
1 1 Matilde Izaguirre Checa 2 Female 70 Hispanic HS Grad West South Central Divorced 1 ... is probably is possibly is is is is is is possibly 70-79 11
2 2 Ryan Smith 3 Male 85 White non-Hispanic Less than HS Middle Atlantic Now married 2 ... is possibly is probably is probably is is probably is probably is probably is probably 80-89 2
3 3 Matthew Grimes 4 Male 59 White non-Hispanic HS Grad Mountain Now married 2 ... is probably is probably is probably is is possibly is probably is is probably 50-59 12
4 4 Miraan Rama 5 Female 19 Asian Some college Pacific Never married 1 ... is probably is is probably is is probably is probably is probably is 10-19 9

5 rows Ɨ 187 columns

InĀ [14]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
InĀ [15]:
import pandas as pd

df_merged = pd.merge(
    df_synthetic,
    df_avg,
    how="left",
    left_on="id",
    right_on="SyntheticPersonID",
    suffixes=("", "_drop")
)

cols_to_drop = [col for col in df_merged.columns if col.endswith("_drop")]
df_final = df_merged.drop(columns=cols_to_drop)
df_final = df_final.fillna(0)

print(df_final.head())
                         NAME  id  GENDER  AGE             RACETHN  \
0  0               Luke Walsh   1    Male   25  White non-Hispanic   
1  1  Matilde Izaguirre Checa   2  Female   70            Hispanic   
2  2               Ryan Smith   3    Male   85  White non-Hispanic   
3  3           Matthew Grimes   4    Male   59  White non-Hispanic   
4  4              Miraan Rama   5  Female   19               Asian   

       EDUCCAT5            DIVISION    MARITAL_ACS HHSIZECAT  ...   Q1   Q2  \
0  Some college            Mountain  Never married        3+  ...  0.0  0.0   
1       HS Grad  West South Central       Divorced         1  ...  0.0  0.0   
2  Less than HS     Middle Atlantic    Now married         2  ...  0.0  0.0   
3       HS Grad            Mountain    Now married         2  ...  0.0  0.0   
4  Some college             Pacific  Never married         1  ...  0.0  0.0   

    Q3   Q4   Q5   Q6   Q7   Q8 Certainty avg_harm  
0  0.0  0.0  0.0  0.0  0.0  0.0       0.0      0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0       0.0      0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0       0.0      0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0       0.0      0.0  
4  0.0  0.0  0.0  0.0  0.0  0.0       0.0      0.0  

[5 rows x 198 columns]
InĀ [16]:
# for the rows where Certainty is 0, randomly assign them a value of 50 or 75
df_final.loc[df_final['Certainty'] == 0, 'Certainty'] = np.random.choice([50, 75], size=df_final.loc[df_final['Certainty'] == 0].shape[0])
InĀ [17]:
train_data = df_final[df_final['Q1'] > 0]
predict_data = df_final[df_final['Q1'] == 0]

X_train = train_data[feature_list]
y_train = train_data['Q1']

onehot_encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
    drop='first'
)

X_train_encoded = onehot_encoder.fit_transform(X_train.astype(str))

scaler = StandardScaler()
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1)).flatten()

models = [
    ('RandomForest', RandomForestRegressor(n_estimators=100)),
    ('GradientBoosting', GradientBoostingRegressor()),
    ('LinearRegression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('DecisionTree', DecisionTreeRegressor()),
    ('SVR', SVR())
]

def evaluate_model(model_name, model, X_train, y_train):
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    mse = mean_squared_error(y_train, y_train_pred)
    r2 = r2_score(y_train, y_train_pred)
    print(f'Model: {model_name}')
    print(f'MSE on training data: {mse}')
    print(f'R^2 on training data: {r2}\n')

for name, model in models:
    evaluate_model(name, model, X_train_encoded, y_train_scaled)
Model: RandomForest
MSE on training data: 0.14264642089148283
R^2 on training data: 0.8573535791085172

Model: GradientBoosting
MSE on training data: 0.864406135039376
R^2 on training data: 0.13559386496062398

Model: LinearRegression
MSE on training data: 0.12443103062698671
R^2 on training data: 0.8755689693730133

Model: Ridge
MSE on training data: 0.3498706291084605
R^2 on training data: 0.6501293708915394

Model: Lasso
MSE on training data: 1.0
R^2 on training data: 0.0

Model: DecisionTree
MSE on training data: 6.965205287595943e-34
R^2 on training data: 1.0

Model: SVR
MSE on training data: 0.6157409795108677
R^2 on training data: 0.3842590204891323

InĀ [18]:
questions = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]
for q in questions:

  train_data = df_final[df_final[q] > 0]
  predict_data = df_final[df_final[q] == 0]

  X_train = train_data[feature_list]
  y_train = train_data[q]

  onehot_encoder = OneHotEncoder(
      sparse_output=False,
      handle_unknown='ignore',
      drop='first'
  )

  X_train_encoded = onehot_encoder.fit_transform(X_train.astype(str))

  scaler = StandardScaler()
  y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1)).flatten()

  models = [
      ('DecisionTree', DecisionTreeRegressor())
  ]

  def evaluate_model(model_name, model, X_train, y_train):
      model.fit(X_train, y_train)
      y_train_pred = model.predict(X_train)
      mse = mean_squared_error(y_train, y_train_pred)
      r2 = r2_score(y_train, y_train_pred)
      print(f'Model: {model_name}')
      print(f'MSE on training data: {mse}')
      print(f'R^2 on training data: {r2}\n')

  for name, model in models:
      evaluate_model(name, model, X_train_encoded, y_train_scaled)


  best_model = DecisionTreeRegressor(
      max_depth=30,
      max_features=None,
      min_samples_leaf=4,
      min_samples_split=5,
      random_state=42
  )
  best_model.fit(X_train_encoded, y_train_scaled)

  X_predict = predict_data[feature_list]
  X_predict_encoded = onehot_encoder.transform(X_predict.astype(str))
  predicted_scores_scaled = best_model.predict(X_predict_encoded)

  predicted_scores = scaler.inverse_transform(predicted_scores_scaled.reshape(-1, 1)).flatten()

  df_final.loc[df_final[q] == 0, q] = predicted_scores

  print(df_final.head())
Model: DecisionTree
MSE on training data: 5.966038624718355e-34
R^2 on training data: 1.0

/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
                         NAME  id  GENDER  AGE             RACETHN  \
0  0               Luke Walsh   1    Male   25  White non-Hispanic   
1  1  Matilde Izaguirre Checa   2  Female   70            Hispanic   
2  2               Ryan Smith   3    Male   85  White non-Hispanic   
3  3           Matthew Grimes   4    Male   59  White non-Hispanic   
4  4              Miraan Rama   5  Female   19               Asian   

       EDUCCAT5            DIVISION    MARITAL_ACS HHSIZECAT  ...        Q1  \
0  Some college            Mountain  Never married        3+  ...  3.666667   
1       HS Grad  West South Central       Divorced         1  ...  2.666667   
2  Less than HS     Middle Atlantic    Now married         2  ...  3.562500   
3       HS Grad            Mountain    Now married         2  ...  3.273810   
4  Some college             Pacific  Never married         1  ...  3.354167   

    Q2   Q3   Q4   Q5   Q6   Q7   Q8 Certainty avg_harm  
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0      50.0      0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0      75.0      0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0      75.0      0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0      50.0      0.0  
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0      50.0      0.0  

[5 rows x 198 columns]
Model: DecisionTree
MSE on training data: 3.217990910954824e-35
R^2 on training data: 1.0

/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
                         NAME  id  GENDER  AGE             RACETHN  \
0  0               Luke Walsh   1    Male   25  White non-Hispanic   
1  1  Matilde Izaguirre Checa   2  Female   70            Hispanic   
2  2               Ryan Smith   3    Male   85  White non-Hispanic   
3  3           Matthew Grimes   4    Male   59  White non-Hispanic   
4  4              Miraan Rama   5  Female   19               Asian   

       EDUCCAT5            DIVISION    MARITAL_ACS HHSIZECAT  ...        Q1  \
0  Some college            Mountain  Never married        3+  ...  3.666667   
1       HS Grad  West South Central       Divorced         1  ...  2.666667   
2  Less than HS     Middle Atlantic    Now married         2  ...  3.562500   
3       HS Grad            Mountain    Now married         2  ...  3.273810   
4  Some college             Pacific  Never married         1  ...  3.354167   

         Q2   Q3   Q4   Q5   Q6   Q7   Q8 Certainty avg_harm  
0  3.316667  0.0  0.0  0.0  0.0  0.0  0.0      50.0      0.0  
1  2.488095  0.0  0.0  0.0  0.0  0.0  0.0      75.0      0.0  
2  2.366667  0.0  0.0  0.0  0.0  0.0  0.0      75.0      0.0  
3  2.569444  0.0  0.0  0.0  0.0  0.0  0.0      50.0      0.0  
4  4.020833  0.0  0.0  0.0  0.0  0.0  0.0      50.0      0.0  

[5 rows x 198 columns]
Model: DecisionTree
MSE on training data: 1.2489583285969857e-34
R^2 on training data: 1.0

/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
                         NAME  id  GENDER  AGE             RACETHN  \
0  0               Luke Walsh   1    Male   25  White non-Hispanic   
1  1  Matilde Izaguirre Checa   2  Female   70            Hispanic   
2  2               Ryan Smith   3    Male   85  White non-Hispanic   
3  3           Matthew Grimes   4    Male   59  White non-Hispanic   
4  4              Miraan Rama   5  Female   19               Asian   

       EDUCCAT5            DIVISION    MARITAL_ACS HHSIZECAT  ...        Q1  \
0  Some college            Mountain  Never married        3+  ...  3.666667   
1       HS Grad  West South Central       Divorced         1  ...  2.666667   
2  Less than HS     Middle Atlantic    Now married         2  ...  3.562500   
3       HS Grad            Mountain    Now married         2  ...  3.273810   
4  Some college             Pacific  Never married         1  ...  3.354167   

         Q2        Q3   Q4   Q5   Q6   Q7   Q8 Certainty avg_harm  
0  3.316667  2.883333  0.0  0.0  0.0  0.0  0.0      50.0      0.0  
1  2.488095  3.222222  0.0  0.0  0.0  0.0  0.0      75.0      0.0  
2  2.366667  3.250000  0.0  0.0  0.0  0.0  0.0      75.0      0.0  
3  2.569444  2.458333  0.0  0.0  0.0  0.0  0.0      50.0      0.0  
4  4.020833  3.148148  0.0  0.0  0.0  0.0  0.0      50.0      0.0  

[5 rows x 198 columns]
Model: DecisionTree
MSE on training data: 1.0437399945322273e-34
R^2 on training data: 1.0

/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
                         NAME  id  GENDER  AGE             RACETHN  \
0  0               Luke Walsh   1    Male   25  White non-Hispanic   
1  1  Matilde Izaguirre Checa   2  Female   70            Hispanic   
2  2               Ryan Smith   3    Male   85  White non-Hispanic   
3  3           Matthew Grimes   4    Male   59  White non-Hispanic   
4  4              Miraan Rama   5  Female   19               Asian   

       EDUCCAT5            DIVISION    MARITAL_ACS HHSIZECAT  ...        Q1  \
0  Some college            Mountain  Never married        3+  ...  3.666667   
1       HS Grad  West South Central       Divorced         1  ...  2.666667   
2  Less than HS     Middle Atlantic    Now married         2  ...  3.562500   
3       HS Grad            Mountain    Now married         2  ...  3.273810   
4  Some college             Pacific  Never married         1  ...  3.354167   

         Q2        Q3        Q4   Q5   Q6   Q7   Q8 Certainty avg_harm  
0  3.316667  2.883333  2.266667  0.0  0.0  0.0  0.0      50.0      0.0  
1  2.488095  3.222222  2.520833  0.0  0.0  0.0  0.0      75.0      0.0  
2  2.366667  3.250000  3.150000  0.0  0.0  0.0  0.0      75.0      0.0  
3  2.569444  2.458333  3.616667  0.0  0.0  0.0  0.0      50.0      0.0  
4  4.020833  3.148148  3.687500  0.0  0.0  0.0  0.0      50.0      0.0  

[5 rows x 198 columns]
Model: DecisionTree
MSE on training data: 1.8426660558141034e-34
R^2 on training data: 1.0

/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
                         NAME  id  GENDER  AGE             RACETHN  \
0  0               Luke Walsh   1    Male   25  White non-Hispanic   
1  1  Matilde Izaguirre Checa   2  Female   70            Hispanic   
2  2               Ryan Smith   3    Male   85  White non-Hispanic   
3  3           Matthew Grimes   4    Male   59  White non-Hispanic   
4  4              Miraan Rama   5  Female   19               Asian   

       EDUCCAT5            DIVISION    MARITAL_ACS HHSIZECAT  ...        Q1  \
0  Some college            Mountain  Never married        3+  ...  3.666667   
1       HS Grad  West South Central       Divorced         1  ...  2.666667   
2  Less than HS     Middle Atlantic    Now married         2  ...  3.562500   
3       HS Grad            Mountain    Now married         2  ...  3.273810   
4  Some college             Pacific  Never married         1  ...  3.354167   

         Q2        Q3        Q4        Q5   Q6   Q7   Q8 Certainty avg_harm  
0  3.316667  2.883333  2.266667  1.983333  0.0  0.0  0.0      50.0      0.0  
1  2.488095  3.222222  2.520833  1.809524  0.0  0.0  0.0      75.0      0.0  
2  2.366667  3.250000  3.150000  1.611111  0.0  0.0  0.0      75.0      0.0  
3  2.569444  2.458333  3.616667  2.750000  0.0  0.0  0.0      50.0      0.0  
4  4.020833  3.148148  3.687500  2.833333  0.0  0.0  0.0      50.0      0.0  

[5 rows x 198 columns]
Model: DecisionTree
MSE on training data: 9.989404023108136e-35
R^2 on training data: 1.0

/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
                         NAME  id  GENDER  AGE             RACETHN  \
0  0               Luke Walsh   1    Male   25  White non-Hispanic   
1  1  Matilde Izaguirre Checa   2  Female   70            Hispanic   
2  2               Ryan Smith   3    Male   85  White non-Hispanic   
3  3           Matthew Grimes   4    Male   59  White non-Hispanic   
4  4              Miraan Rama   5  Female   19               Asian   

       EDUCCAT5            DIVISION    MARITAL_ACS HHSIZECAT  ...        Q1  \
0  Some college            Mountain  Never married        3+  ...  3.666667   
1       HS Grad  West South Central       Divorced         1  ...  2.666667   
2  Less than HS     Middle Atlantic    Now married         2  ...  3.562500   
3       HS Grad            Mountain    Now married         2  ...  3.273810   
4  Some college             Pacific  Never married         1  ...  3.354167   

         Q2        Q3        Q4        Q5        Q6   Q7   Q8 Certainty  \
0  3.316667  2.883333  2.266667  1.983333  3.766667  0.0  0.0      50.0   
1  2.488095  3.222222  2.520833  1.809524  2.736111  0.0  0.0      75.0   
2  2.366667  3.250000  3.150000  1.611111  2.716667  0.0  0.0      75.0   
3  2.569444  2.458333  3.616667  2.750000  2.595238  0.0  0.0      50.0   
4  4.020833  3.148148  3.687500  2.833333  2.523810  0.0  0.0      50.0   

  avg_harm  
0      0.0  
1      0.0  
2      0.0  
3      0.0  
4      0.0  

[5 rows x 198 columns]
Model: DecisionTree
MSE on training data: 1.950366085598916e-35
R^2 on training data: 1.0

/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
                         NAME  id  GENDER  AGE             RACETHN  \
0  0               Luke Walsh   1    Male   25  White non-Hispanic   
1  1  Matilde Izaguirre Checa   2  Female   70            Hispanic   
2  2               Ryan Smith   3    Male   85  White non-Hispanic   
3  3           Matthew Grimes   4    Male   59  White non-Hispanic   
4  4              Miraan Rama   5  Female   19               Asian   

       EDUCCAT5            DIVISION    MARITAL_ACS HHSIZECAT  ...        Q1  \
0  Some college            Mountain  Never married        3+  ...  3.666667   
1       HS Grad  West South Central       Divorced         1  ...  2.666667   
2  Less than HS     Middle Atlantic    Now married         2  ...  3.562500   
3       HS Grad            Mountain    Now married         2  ...  3.273810   
4  Some college             Pacific  Never married         1  ...  3.354167   

         Q2        Q3        Q4        Q5        Q6        Q7   Q8 Certainty  \
0  3.316667  2.883333  2.266667  1.983333  3.766667  3.300000  0.0      50.0   
1  2.488095  3.222222  2.520833  1.809524  2.736111  2.986111  0.0      75.0   
2  2.366667  3.250000  3.150000  1.611111  2.716667  2.312500  0.0      75.0   
3  2.569444  2.458333  3.616667  2.750000  2.595238  3.050000  0.0      50.0   
4  4.020833  3.148148  3.687500  2.833333  2.523810  3.933333  0.0      50.0   

  avg_harm  
0      0.0  
1      0.0  
2      0.0  
3      0.0  
4      0.0  

[5 rows x 198 columns]
Model: DecisionTree
MSE on training data: 3.4174396005668243e-34
R^2 on training data: 1.0

/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
                         NAME  id  GENDER  AGE             RACETHN  \
0  0               Luke Walsh   1    Male   25  White non-Hispanic   
1  1  Matilde Izaguirre Checa   2  Female   70            Hispanic   
2  2               Ryan Smith   3    Male   85  White non-Hispanic   
3  3           Matthew Grimes   4    Male   59  White non-Hispanic   
4  4              Miraan Rama   5  Female   19               Asian   

       EDUCCAT5            DIVISION    MARITAL_ACS HHSIZECAT  ...        Q1  \
0  Some college            Mountain  Never married        3+  ...  3.666667   
1       HS Grad  West South Central       Divorced         1  ...  2.666667   
2  Less than HS     Middle Atlantic    Now married         2  ...  3.562500   
3       HS Grad            Mountain    Now married         2  ...  3.273810   
4  Some college             Pacific  Never married         1  ...  3.354167   

         Q2        Q3        Q4        Q5        Q6        Q7        Q8  \
0  3.316667  2.883333  2.266667  1.983333  3.766667  3.300000  4.183333   
1  2.488095  3.222222  2.520833  1.809524  2.736111  2.986111  4.500000   
2  2.366667  3.250000  3.150000  1.611111  2.716667  2.312500  4.645833   
3  2.569444  2.458333  3.616667  2.750000  2.595238  3.050000  3.020833   
4  4.020833  3.148148  3.687500  2.833333  2.523810  3.933333  4.488095   

  Certainty avg_harm  
0      50.0      0.0  
1      75.0      0.0  
2      75.0      0.0  
3      50.0      0.0  
4      50.0      0.0  

[5 rows x 198 columns]
InĀ [19]:
# drop syntheticpersonid column
df_final = df_final.drop(columns=['SyntheticPersonID'])

Graphs¶

InĀ [20]:
import os
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

def plot_barplot_certainty_side_by_side(df, questions, save=False, outdir=".", fontsize=12, figsize=(16, 6), dpi=100, save_dpi=300):
    palette = sns.color_palette("Set2")
    pdf = None
    if save:
        os.makedirs(outdir, exist_ok=True)
        pdf_path = f"{outdir}/Barplots_Certainty.pdf"
        pdf = PdfPages(pdf_path)

    for score in questions:
        df_50 = df[df['Certainty'] == 50]
        df_75 = df[df['Certainty'] == 75]

        average_scores_50 = (
            df_50.groupby(['RACETHN', 'SEXUALITY'])[score]
            .median()
            .reset_index()
        )
        average_scores_75 = (
            df_75.groupby(['RACETHN', 'SEXUALITY'])[score]
            .median()
            .reset_index()
        )

        fig, axes = plt.subplots(1, 2, figsize=figsize, dpi=dpi, sharey=True)

        ax0 = sns.barplot(
            x='RACETHN', y=score, hue='SEXUALITY',
            data=average_scores_50, palette=palette, ax=axes[0]
        )
        ax0.set_title(f'{score} (Certainty = 50%)', fontsize=fontsize)
        ax0.set_xlabel('Race/Ethnicity', fontsize=fontsize)
        ax0.set_ylabel('Median Score', fontsize=fontsize)
        ax0.tick_params(axis='both', labelsize=fontsize)
        axes[0].tick_params(axis='x', rotation=45)
        leg0 = ax0.legend(title='Sexuality')
        if leg0:
            leg0.get_title().set_fontsize(fontsize)
            for t in leg0.get_texts():
                t.set_fontsize(fontsize)

        ax1 = sns.barplot(
            x='RACETHN', y=score, hue='SEXUALITY',
            data=average_scores_75, palette=palette, ax=axes[1]
        )
        ax1.set_title(f'{score} (Certainty = 75%)', fontsize=fontsize)
        ax1.set_xlabel('Race/Ethnicity', fontsize=fontsize)
        ax1.set_ylabel('Median Score', fontsize=fontsize)
        ax1.tick_params(axis='both', labelsize=fontsize)
        axes[1].tick_params(axis='x', rotation=45)
        leg1 = ax1.legend(title='Sexuality')
        if leg1:
            leg1.get_title().set_fontsize(fontsize)
            for t in leg1.get_texts():
                t.set_fontsize(fontsize)

        plt.suptitle(f'Comparison of Certainty Levels for {score}', fontsize=fontsize)
        plt.tight_layout(rect=[0, 0, 1, 0.95])

        if save:
            pdf.savefig(fig, bbox_inches="tight", dpi=save_dpi)

        plt.show()
        plt.close(fig)

    if save:
        pdf.close()
        print(f"Saved {pdf_path}")

# --- Example usage ---
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
plot_barplot_certainty_side_by_side(df_final, questions, save=True, outdir=".")
/tmp/ipython-input-4269703244.py:31: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax0 = sns.barplot(
/tmp/ipython-input-4269703244.py:46: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax1 = sns.barplot(
No description has been provided for this image
/tmp/ipython-input-4269703244.py:31: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax0 = sns.barplot(
/tmp/ipython-input-4269703244.py:46: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax1 = sns.barplot(
No description has been provided for this image
/tmp/ipython-input-4269703244.py:31: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax0 = sns.barplot(
/tmp/ipython-input-4269703244.py:46: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax1 = sns.barplot(
No description has been provided for this image
/tmp/ipython-input-4269703244.py:31: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax0 = sns.barplot(
/tmp/ipython-input-4269703244.py:46: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax1 = sns.barplot(
No description has been provided for this image
/tmp/ipython-input-4269703244.py:31: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax0 = sns.barplot(
/tmp/ipython-input-4269703244.py:46: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax1 = sns.barplot(
No description has been provided for this image
/tmp/ipython-input-4269703244.py:31: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax0 = sns.barplot(
/tmp/ipython-input-4269703244.py:46: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax1 = sns.barplot(
No description has been provided for this image
/tmp/ipython-input-4269703244.py:31: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax0 = sns.barplot(
/tmp/ipython-input-4269703244.py:46: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax1 = sns.barplot(
No description has been provided for this image
/tmp/ipython-input-4269703244.py:31: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax0 = sns.barplot(
/tmp/ipython-input-4269703244.py:46: UserWarning: The palette list has more values (8) than needed (4), which may not be intended.
  ax1 = sns.barplot(
No description has been provided for this image
Saved ./Barplots_Certainty.pdf
InĀ [21]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

def plot_heatmap_certainty_side_by_side(
    df,
    questions,
    mask_zeros=True,
    save=False,
    outdir=".",
    fontsize=12,
    figsize=(16, 6),
    dpi=100,
    save_dpi=300
):
    canonical_levels = [
        "Less than $20K",
        "$20K to less than $40K",
        "$40K to less than $75K",
        "$75K to less than $150K",
        "$150K or more",
    ]

    pdf = None
    if save:
        os.makedirs(outdir, exist_ok=True)
        pdf_path = f"{outdir}/Heatmaps_Certainty.pdf"
        pdf = PdfPages(pdf_path)

    for score in questions:
        df_copy = df.copy()
        if mask_zeros and score in df_copy:
            df_copy.loc[df_copy[score] == 0, score] = np.nan

        df_copy["FAMINC5"] = pd.Categorical(
            df_copy["FAMINC5"], categories=canonical_levels, ordered=True
        )

        df_50 = df_copy[df_copy["Certainty"] == 50]
        df_75 = df_copy[df_copy["Certainty"] == 75]

        means_50 = (
            df_50.groupby(["DIVISION", "FAMINC5"])[score]
                 .mean()
                 .unstack()
                 .reindex(columns=canonical_levels)
        )
        means_75 = (
            df_75.groupby(["DIVISION", "FAMINC5"])[score]
                 .mean()
                 .unstack()
                 .reindex(columns=canonical_levels)
        )

        fig, axes = plt.subplots(1, 2, figsize=figsize, dpi=dpi, sharey=True)

        ax0 = axes[0]
        sns.heatmap(means_50, annot=True, fmt=".2f", cmap="Greys", cbar=False, ax=ax0)
        ax0.set_title(f"{score}: Certainty = 50%", fontsize=fontsize)
        ax0.set_xlabel("Family Income", fontsize=fontsize)
        ax0.set_ylabel("Division", fontsize=fontsize)
        ax0.tick_params(axis='both', labelsize=fontsize)

        ax1 = axes[1]
        sns.heatmap(means_75, annot=True, fmt=".2f", cmap="Greys", cbar=False, ax=ax1)
        ax1.set_title(f"{score}: Certainty = 75%", fontsize=fontsize)
        ax1.set_xlabel("Family Income", fontsize=fontsize)
        ax1.set_ylabel("Division", fontsize=fontsize)
        ax1.tick_params(axis='both', labelsize=fontsize)

        plt.suptitle(f"{score}: Mean by Division Ɨ Family Income (50% or 75%)", fontsize=fontsize)
        plt.tight_layout(rect=[0, 0, 1, 0.95])

        if save:
            pdf.savefig(fig, bbox_inches="tight", dpi=save_dpi)

        plt.show()
        plt.close(fig)

    if save:
        pdf.close()
        print(f"Saved {pdf_path}")

# --- Example usage ---
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
plot_heatmap_certainty_side_by_side(df_final, questions, save=True, outdir=".")
/tmp/ipython-input-2878946505.py:46: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_50.groupby(["DIVISION", "FAMINC5"])[score]
/tmp/ipython-input-2878946505.py:52: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_75.groupby(["DIVISION", "FAMINC5"])[score]
No description has been provided for this image
/tmp/ipython-input-2878946505.py:46: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_50.groupby(["DIVISION", "FAMINC5"])[score]
/tmp/ipython-input-2878946505.py:52: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_75.groupby(["DIVISION", "FAMINC5"])[score]
No description has been provided for this image
/tmp/ipython-input-2878946505.py:46: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_50.groupby(["DIVISION", "FAMINC5"])[score]
/tmp/ipython-input-2878946505.py:52: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_75.groupby(["DIVISION", "FAMINC5"])[score]
No description has been provided for this image
/tmp/ipython-input-2878946505.py:46: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_50.groupby(["DIVISION", "FAMINC5"])[score]
/tmp/ipython-input-2878946505.py:52: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_75.groupby(["DIVISION", "FAMINC5"])[score]
No description has been provided for this image
/tmp/ipython-input-2878946505.py:46: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_50.groupby(["DIVISION", "FAMINC5"])[score]
/tmp/ipython-input-2878946505.py:52: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_75.groupby(["DIVISION", "FAMINC5"])[score]
No description has been provided for this image
/tmp/ipython-input-2878946505.py:46: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_50.groupby(["DIVISION", "FAMINC5"])[score]
/tmp/ipython-input-2878946505.py:52: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_75.groupby(["DIVISION", "FAMINC5"])[score]
No description has been provided for this image
/tmp/ipython-input-2878946505.py:46: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_50.groupby(["DIVISION", "FAMINC5"])[score]
/tmp/ipython-input-2878946505.py:52: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_75.groupby(["DIVISION", "FAMINC5"])[score]
No description has been provided for this image
/tmp/ipython-input-2878946505.py:46: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_50.groupby(["DIVISION", "FAMINC5"])[score]
/tmp/ipython-input-2878946505.py:52: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df_75.groupby(["DIVISION", "FAMINC5"])[score]
No description has been provided for this image
Saved ./Heatmaps_Certainty.pdf
InĀ [22]:
import os
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

def plot_boxplot_certainty_side_by_side(df, questions, save=False, outdir=".", fontsize=12, figsize=(16, 6), dpi=100, save_dpi=300):
    pdf = None
    if save:
        os.makedirs(outdir, exist_ok=True)
        pdf_path = f"{outdir}/Boxplots_Certainty.pdf"
        pdf = PdfPages(pdf_path)

    for score in questions:
        df_50 = df[df["Certainty"] == 50]
        df_75 = df[df["Certainty"] == 75]

        fig, axes = plt.subplots(1, 2, figsize=figsize, dpi=dpi, sharey=True)

        ax0 = sns.boxplot(x="DIVISION", y=score, hue="FAMINC5", data=df_50, ax=axes[0])
        ax0.set_title(f"{score} (Certainty = 50%)", fontsize=fontsize)
        ax0.set_xlabel("Race/Ethnicity", fontsize=fontsize)
        ax0.set_ylabel(score, fontsize=fontsize)
        ax0.tick_params(axis="both", labelsize=fontsize)
        axes[0].tick_params(axis="x", rotation=45)
        leg0 = ax0.legend(title="Family Income")
        if leg0:
            leg0.get_title().set_fontsize(fontsize)
            for t in leg0.get_texts():
                t.set_fontsize(fontsize)

        ax1 = sns.boxplot(x="DIVISION", y=score, hue="FAMINC5", data=df_75, ax=axes[1])
        ax1.set_title(f"{score} (Certainty = 75%)", fontsize=fontsize)
        ax1.set_xlabel("Race/Ethnicity", fontsize=fontsize)
        ax1.set_ylabel(score, fontsize=fontsize)
        ax1.tick_params(axis="both", labelsize=fontsize)
        axes[1].tick_params(axis="x", rotation=45)
        leg1 = ax1.legend(title="Division")
        if leg1:
            leg1.get_title().set_fontsize(fontsize)
            for t in leg1.get_texts():
                t.set_fontsize(fontsize)

        plt.suptitle(f"Distribution of {score} by Division and Family Income (50% or 75%)", fontsize=fontsize)
        plt.tight_layout(rect=[0, 0, 1, 0.95])

        if save:
            pdf.savefig(fig, bbox_inches="tight", dpi=save_dpi)

        plt.show()
        plt.close(fig)

    if save:
        pdf.close()
        print(f"Saved {pdf_path}")

# --- Example usage ---
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
plot_boxplot_certainty_side_by_side(df_final, questions, save=True, outdir=".")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Saved ./Boxplots_Certainty.pdf
InĀ [23]:
import os
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

def plot_linear_certainty_side_by_side(df, questions, save=False, outdir=".", fontsize=12, figsize=(16, 6), dpi=100, save_dpi=300):
    pdf = None
    if save:
        os.makedirs(outdir, exist_ok=True)
        pdf_path = f"{outdir}/LinearPlots_Certainty.pdf"
        pdf = PdfPages(pdf_path)

    for score in questions:
        sub = df[df["Certainty"].isin([50, 75])].copy()
        grouped = sub.groupby(["Certainty", "AGE", "PREG_STAT"])[score].median().reset_index()

        height = figsize[1]
        aspect = figsize[0] / (2 * height)

        g = sns.lmplot(
            x="AGE",
            y=score,
            data=grouped,
            hue="PREG_STAT",
            col="Certainty",
            col_order=[50, 75],
            height=height,
            aspect=aspect,
            legend_out=True
        )
        g.fig.set_dpi(dpi)

        g.set(xlim=(16, 40))
        g.set_xlabels("Age", fontsize=fontsize)
        g.set_ylabels(score, fontsize=fontsize)
        for ax in g.axes.flat:
            ax.tick_params(axis="both", labelsize=fontsize)

        if g._legend is not None:
            g._legend.set_title("PREG_STAT")
            g._legend.get_title().set_fontsize(fontsize)
            for t in g._legend.get_texts():
                t.set_fontsize(fontsize)

        plt.suptitle(f"{score}: Mean vs. Age by Pregnancy Status (50% or 75%)", fontsize=fontsize)
        plt.tight_layout(rect=[0, 0, 1, 0.95])

        if save:
            pdf.savefig(g.fig, bbox_inches="tight", dpi=save_dpi)

        plt.show()
        plt.close(g.fig)

    if save:
        pdf.close()
        print(f"Saved {pdf_path}")

# --- Example usage ---
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
plot_linear_certainty_side_by_side(df_final, questions, save=True, outdir=".")
/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`.
  warnings.warn(msg, UserWarning)
No description has been provided for this image
/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`.
  warnings.warn(msg, UserWarning)
No description has been provided for this image
/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`.
  warnings.warn(msg, UserWarning)
No description has been provided for this image
/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`.
  warnings.warn(msg, UserWarning)
No description has been provided for this image
/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`.
  warnings.warn(msg, UserWarning)
No description has been provided for this image
/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`.
  warnings.warn(msg, UserWarning)
No description has been provided for this image
/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`.
  warnings.warn(msg, UserWarning)
No description has been provided for this image
/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`.
  warnings.warn(msg, UserWarning)
No description has been provided for this image
Saved ./LinearPlots_Certainty.pdf

Does certainty of 50 vs 75 produce significantly diff harm scores? In almost all cases, it does.

InĀ [24]:
import itertools
from scipy import stats

score_types = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]

for score in score_types:
    # Separate data for 50% vs 75% certainty
    data_50 = df_final[df_final['Certainty'] == 50][score].dropna()
    data_75 = df_final[df_final['Certainty'] == 75][score].dropna()

    # Only run a t-test if both groups have sufficient data
    if len(data_50) >= 2 and len(data_75) >= 2:
        t_stat, p_val = stats.ttest_ind(data_50, data_75, equal_var=False)
        print(f"{score} | t = {t_stat:.3f}, p = {p_val:.3e}")
    else:
        print(f"{score} | Not enough data for t-test.")
Q1 | t = -19.139, p = 6.557e-81
Q2 | t = 5.167, p = 2.403e-07
Q3 | t = 14.536, p = 1.255e-47
Q4 | t = 31.130, p = 9.248e-208
Q5 | t = 0.604, p = 5.461e-01
Q6 | t = 12.017, p = 3.763e-33
Q7 | t = -0.239, p = 8.114e-01
Q8 | t = -5.546, p = 2.955e-08