InĀ [1]:
from google.colab import auth
from google.oauth2 import service_account
from googleapiclient.discovery import build
auth.authenticate_user()
InĀ [2]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
scope = ['https://www.googleapis.com/auth/spreadsheets']
creds = ServiceAccountCredentials.from_json_keyfile_name('credentials.json', scope)
client = gspread.authorize(creds)
spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1CatLlB4axdBW0uJRoioR-x00RsPzQ54CFYq1PqzgV7E/edit?usp=drive_web&ouid=107190373735789167396'
sheet = client.open_by_url(spreadsheet_url)
worksheet = sheet.get_worksheet(0)
data = worksheet.get_all_records()
df_synthetic = pd.DataFrame(data)
df_synthetic.head()
Out[2]:
| NAME | id | GENDER | AGE | RACETHN | EDUCCAT5 | DIVISION | MARITAL_ACS | HHSIZECAT | ... | CI_LABEL_OWNGUN_GSS | CI_LABEL_SEXUALITY | CI_LABEL_HIV_STAT | CI_LABEL_PREG_STAT | CI_LABEL_CC_NUM | CI_LABEL_cc_encoded | CI_LABEL_cc_disclosed | CI_LABEL_NumChronicIllness | AGE_INT | Cluster | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Luke Walsh | 1 | Male | 25 | White non-Hispanic | Some college | Mountain | Never married | 3+ | ... | is | is | is probably | is probably | is | is probably | is possibly | is probably | 20-29 | 5 |
| 1 | 1 | Matilde Izaguirre Checa | 2 | Female | 70 | Hispanic | HS Grad | West South Central | Divorced | 1 | ... | is probably | is possibly | is | is | is | is | is | is possibly | 70-79 | 11 |
| 2 | 2 | Ryan Smith | 3 | Male | 85 | White non-Hispanic | Less than HS | Middle Atlantic | Now married | 2 | ... | is possibly | is probably | is probably | is | is probably | is probably | is probably | is probably | 80-89 | 2 |
| 3 | 3 | Matthew Grimes | 4 | Male | 59 | White non-Hispanic | HS Grad | Mountain | Now married | 2 | ... | is probably | is probably | is probably | is | is possibly | is probably | is | is probably | 50-59 | 12 |
| 4 | 4 | Miraan Rama | 5 | Female | 19 | Asian | Some college | Pacific | Never married | 1 | ... | is probably | is | is probably | is | is probably | is probably | is probably | is | 10-19 | 9 |
5 rows Ć 187 columns
InĀ [3]:
import pandas as pd
# Load the data file (adjust file name and method if it's an Excel file)
df = pd.read_csv('prolific_responses_survey1.csv')
# Define the mapping from Likert-scale text to numbers
likert_mapping = {
"Not at all harmful": 1,
"Slightly harmful": 2,
"Moderately harmful": 3,
"Very harmful": 4,
"Extremely harmful": 5
}
# List the columns that contain Likert-scale responses.
# Update this list to include all columns that need conversion.
likert_columns = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]
# Replace the text values with numeric values in those columns
for col in likert_columns:
if col in df.columns: # only replace if column exists in the dataframe
df[col] = df[col].replace(likert_mapping)
/tmp/ipython-input-1113943154.py:22: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
df[col] = df[col].replace(likert_mapping)
InĀ [4]:
import pandas as pd
# Assuming the DataFrame is called df and has columns:
# ["ProlificID", "SyntheticPersonID", "AttnCheck", "Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8", "Condition"]
# 1) Group by SyntheticPersonID and compute the mean for Q1..Q8
df_avg = (
df.groupby("SyntheticPersonID")[["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]]
.mean() # average across repeated rows for each SyntheticPersonID
.reset_index()
)
# 2) (Optional) If expecting a single "avg_harm" column across Q1..Q8
df_avg["avg_harm"] = df_avg[["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]].mean(axis=1)
print(df_avg.head())
SyntheticPersonID Q1 Q2 Q3 Q4 Q5 \
0 57 4.000000 4.333333 3.333333 3.333333 2.666667
1 89 3.400000 3.400000 3.400000 3.600000 3.400000
2 141 3.750000 3.750000 3.750000 3.750000 3.750000
3 153 5.000000 2.000000 3.000000 2.000000 4.000000
4 166 4.333333 4.333333 4.000000 4.333333 4.333333
Q6 Q7 Q8 avg_harm
0 2.666667 3.666667 4.333333 3.541667
1 3.400000 3.600000 3.800000 3.500000
2 3.750000 4.250000 4.500000 3.906250
3 4.000000 3.000000 5.000000 3.500000
4 4.000000 4.000000 4.333333 4.208333
InĀ [5]:
import pandas as pd
import itertools
import scipy.stats as stats
from statsmodels.stats.multitest import multipletests
# 1) Group by SyntheticPersonID and Condition, averaging Q1..Q8
# for repeated rows of the same SyntheticPersonID.
grouped_df = (
df.groupby(["SyntheticPersonID", "Condition"])[["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]]
.mean()
.reset_index()
)
# 2) Identify the questions to test separately
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
# 3) Get the unique conditions
conditions = grouped_df["Condition"].unique()
# 4) Prepare a list to store results from each test
results = []
# 5) For each question, do pairwise t-tests across all condition pairs
for question in questions:
for cond1, cond2 in itertools.combinations(conditions, 2):
# Extract data for this question, for each condition
group1 = grouped_df.loc[grouped_df["Condition"] == cond1, question].dropna()
group2 = grouped_df.loc[grouped_df["Condition"] == cond2, question].dropna()
# Perform Welch's t-test (equal_var=False)
t_stat, p_val = stats.ttest_ind(group1, group2, equal_var=False)
# Store the results
results.append((question, cond1, cond2, t_stat, p_val))
# 6) Convert results to a DataFrame
results_df = pd.DataFrame(
results,
columns=["Question", "Condition1", "Condition2", "t_stat", "p_value"]
)
# 7) Apply Bonferroni correction across all tests (all questions, all pairs)
adjusted = multipletests(results_df["p_value"], method="bonferroni")
results_df["p_adjusted"] = adjusted[1]
# 8) Print the final results
print("Separate t-test results for each question (with Bonferroni correction):")
print(results_df)
Separate t-test results for each question (with Bonferroni correction): Question Condition1 Condition2 t_stat p_value p_adjusted 0 Q1 Health Financial -3.587197 3.495687e-04 1.677930e-02 1 Q1 Health Control -4.902863 1.092902e-06 5.245930e-05 2 Q1 Health Sensitive 2.168184 3.037270e-02 1.000000e+00 3 Q1 Financial Control -0.980510 3.270632e-01 1.000000e+00 4 Q1 Financial Sensitive 5.636677 2.235649e-08 1.073112e-06 5 Q1 Control Sensitive 7.097109 2.386361e-12 1.145453e-10 6 Q2 Health Financial 3.229347 1.278838e-03 6.138424e-02 7 Q2 Health Control 7.193386 1.209643e-12 5.806285e-11 8 Q2 Health Sensitive 6.622718 5.664077e-11 2.718757e-09 9 Q2 Financial Control 3.890444 1.063662e-04 5.105578e-03 10 Q2 Financial Sensitive 3.563337 3.830099e-04 1.838447e-02 11 Q2 Control Sensitive -0.010752 9.914238e-01 1.000000e+00 12 Q3 Health Financial 1.677820 9.367899e-02 1.000000e+00 13 Q3 Health Control 4.875548 1.252754e-06 6.013218e-05 14 Q3 Health Sensitive 7.272086 7.006189e-13 3.362971e-11 15 Q3 Financial Control 3.346283 8.477206e-04 4.069059e-02 16 Q3 Financial Sensitive 5.868525 5.916269e-09 2.839809e-07 17 Q3 Control Sensitive 2.556409 1.071798e-02 5.144631e-01 18 Q4 Health Financial 4.356913 1.447577e-05 6.948371e-04 19 Q4 Health Control 4.663416 3.513352e-06 1.686409e-04 20 Q4 Health Sensitive 5.180697 2.656909e-07 1.275316e-05 21 Q4 Financial Control 0.179708 8.574161e-01 1.000000e+00 22 Q4 Financial Sensitive 0.980023 3.273059e-01 1.000000e+00 23 Q4 Control Sensitive 0.836422 4.031156e-01 1.000000e+00 24 Q5 Health Financial 8.191967 7.446464e-16 3.574303e-14 25 Q5 Health Control 6.830895 1.458423e-11 7.000431e-10 26 Q5 Health Sensitive 8.988436 1.250018e-18 6.000086e-17 27 Q5 Financial Control -1.883475 5.991251e-02 1.000000e+00 28 Q5 Financial Sensitive 2.136642 3.288764e-02 1.000000e+00 29 Q5 Control Sensitive 3.755913 1.841998e-04 8.841592e-03 30 Q6 Health Financial 9.017429 9.001603e-19 4.320769e-17 31 Q6 Health Control 5.551441 3.585966e-08 1.721264e-06 32 Q6 Health Sensitive 7.846908 1.063415e-14 5.104394e-13 33 Q6 Financial Control -3.565948 3.787077e-04 1.817797e-02 34 Q6 Financial Sensitive -0.522283 6.015889e-01 1.000000e+00 35 Q6 Control Sensitive 2.756823 5.941880e-03 2.852102e-01 36 Q7 Health Financial 6.845701 1.286144e-11 6.173493e-10 37 Q7 Health Control 3.078666 2.132678e-03 1.023685e-01 38 Q7 Health Sensitive 7.967726 4.248489e-15 2.039275e-13 39 Q7 Financial Control -3.756255 1.818530e-04 8.728946e-03 40 Q7 Financial Sensitive 1.128150 2.595183e-01 1.000000e+00 41 Q7 Control Sensitive 4.879113 1.233358e-06 5.920117e-05 42 Q8 Health Financial -4.672662 3.356588e-06 1.611162e-04 43 Q8 Health Control -6.334550 3.510638e-10 1.685106e-08 44 Q8 Health Sensitive 0.923002 3.562217e-01 1.000000e+00 45 Q8 Financial Control -1.799504 7.222315e-02 1.000000e+00 46 Q8 Financial Sensitive 5.752353 1.159085e-08 5.563608e-07 47 Q8 Control Sensitive 7.436152 2.172397e-13 1.042750e-11
InĀ [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
# Dictionary mapping question codes to descriptive labels
custom_labels = {
'Q1': 'Hackers/Cybercriminals',
'Q2': 'Government',
'Q3': 'Corporations',
'Q4': 'Employer/Colleagues',
'Q5': 'Family',
'Q6': 'Close Friends',
'Q7': 'Acquaintances',
'Q8': 'Publicly Available'
}
# Assume `grouped_df` has one row per (SyntheticPersonID, Condition),
# with columns: ["SyntheticPersonID", "Condition", "Q1", "Q2", ..., "Q8"].
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
conditions = grouped_df["Condition"].unique()
for cond in conditions:
# Filter data for this condition
cond_data = grouped_df[grouped_df["Condition"] == cond]
# Compute means and standard deviations for each question
mean_vals = cond_data[questions].mean(axis=0)
std_vals = cond_data[questions].std(axis=0)
# Number of SyntheticPersonIDs in this condition
n = cond_data.shape[0]
# Standard error for each question
se_vals = std_vals / np.sqrt(n) # If n <= 1, be mindful of dividing by zero
# 95% CI using the t-distribution (two-tailed)
if n > 1:
t_multiplier = st.t.ppf(1 - 0.025, df=n-1)
ci_vals = t_multiplier * se_vals
else:
# If there's only one data point in this condition, CI is not meaningful
ci_vals = np.zeros_like(se_vals)
# X positions for plotting
x_positions = np.arange(len(questions))
# Create the plot
plt.figure(figsize=(10,6))
plt.errorbar(
x_positions,
mean_vals,
yerr=ci_vals,
fmt='o',
capsize=5,
color='blue',
ecolor='black'
)
# Use custom labels for the questions
question_labels = [custom_labels[q] for q in questions]
plt.xticks(x_positions, question_labels, rotation=45, ha='right')
plt.xlabel("Question")
plt.ylabel("Mean Response")
plt.title(f"Mean Response (95% CI) for Condition: {cond}")
plt.tight_layout()
plt.show()
In summary, based on your adjusted pāvalues, significant differences exist between:
- Financial vs. Health
- Financial vs. Sensitive
- Health vs. Sensitive
- Control vs. Sensitive.
Financial vs. Control and Health vs. Control do not show significant differences after adjustment.
InĀ [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
# Assume grouped_df has one row per (SyntheticPersonID, Condition),
# including averaged columns ["Q1", ..., "Q8"].
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
conditions = grouped_df["Condition"].unique()
for question in questions:
# Prepare lists to store the mean and CI for each condition
means = []
ci_vals = []
for cond in conditions:
# Extract data for this question in the current condition
cond_data = grouped_df.loc[grouped_df["Condition"] == cond, question].dropna()
mean_val = cond_data.mean()
std_val = cond_data.std()
n = cond_data.shape[0]
# Compute 95% CI using the t-distribution
if n > 1:
se_val = std_val / np.sqrt(n)
t_multiplier = st.t.ppf(1 - 0.025, df=n-1) # 95% CI
ci = t_multiplier * se_val
else:
# If there's only one data point, the CI can't be computed meaningfully
ci = 0.0
means.append(mean_val)
ci_vals.append(ci)
# X positions for plotting
x_positions = np.arange(len(conditions))
plt.figure(figsize=(8,6))
plt.errorbar(
x_positions,
means,
yerr=ci_vals,
fmt='o',
capsize=5,
color='blue',
ecolor='black'
)
plt.xticks(x_positions, conditions, rotation=45, ha='right')
plt.xlabel("Condition")
plt.ylabel("Mean Response")
plt.title(f"Mean {question} by Condition (95% CI)")
plt.tight_layout()
plt.show()
InĀ [8]:
score_types = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]
InĀ [9]:
import pandas as pd
# long-form summary: mean across participants Ć certainty
harm_summary = (
grouped_df
.groupby("Condition")[score_types]
.mean()
.T
.sort_values(by="Health", ascending=False)
)
print(harm_summary)
Condition Control Financial Health Sensitive Q8 4.145363 4.070019 3.870771 3.831052 Q1 3.728383 3.684524 3.518264 3.420635 Q7 3.315320 3.159649 3.445865 3.113724 Q2 3.122807 3.288252 3.436435 3.123280 Q3 3.133615 3.271147 3.344236 3.029200 Q4 3.141291 3.148747 3.342168 3.105423 Q6 2.917137 2.766071 3.168640 2.789848 Q5 2.749373 2.677694 3.029386 2.574603
InĀ [10]:
df_responses = pd.read_csv('prolific_responses_survey1.csv') # Adjust filename/path as needed
df_prolific = pd.read_csv('prolific_export.csv') # Adjust filename/path as needed
df_responses = df_responses.rename(columns={"ProlificID": "Participant id"})
demo_cols = ["Age","Sex","Ethnicity simplified","Country of birth","Country of residence",
"Nationality","Language","Student status","Employment status"]
keep_cols = ["Participant id"] + [c for c in demo_cols if c in df_prolific.columns]
df_prolific = df_prolific[keep_cols].copy()
df_prolific = df_prolific.replace({"DATA_EXPIRED": pd.NA, "Data expired": pd.NA})
for c in keep_cols[1:]:
if c in df_prolific:
df_prolific[c] = df_prolific[c].astype("string").str.strip()
df_prolific = df_prolific.applymap(
lambda v: pd.NA if isinstance(v, str) and v.strip().lower().replace(" ", "_") == "data_expired" else v
)
# Define the mapping from Likert-scale text to numbers
likert_mapping = {
"Not at all harmful": 1,
"Slightly harmful": 2,
"Moderately harmful": 3,
"Very harmful": 4,
"Extremely harmful": 5
}
# List the columns that contain Likert-scale responses.
# Update this list to include all columns that need conversion.
likert_columns = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]
# Replace the text values with numeric values in those columns
for col in likert_columns:
if col in df.columns: # only replace if column exists in the dataframe
df_responses[col] = df_responses[col].replace(likert_mapping)
merged_df = df_responses.merge(df_prolific, on="Participant id", how="left")
# Now merged_df contains columns from both datasets.
print("Merged DataFrame:")
print(merged_df.head())
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
Merged DataFrame:
Participant id SyntheticPersonID AttnCheck Q1 Q2 Q3 Q4 Q5 \
0 674bfd855d300ddd3a4628ab 1545 NaN 3 2 3 3 4
1 674bfd855d300ddd3a4628ab 8617 NaN 3 3 3 4 4
2 674bfd855d300ddd3a4628ab 14105 NaN 3 3 3 3 4
3 674bfd855d300ddd3a4628ab 12001 NaN 3 3 3 4 4
4 674bfd855d300ddd3a4628ab 2586 NaN 3 3 3 3 3
Q6 Q7 ... Condition Age Sex Ethnicity simplified Country of birth \
0 4 4 ... Financial 28 Female White United States
1 4 4 ... Financial 28 Female White United States
2 4 4 ... Financial 28 Female White United States
3 4 4 ... Financial 28 Female White United States
4 4 4 ... Financial 28 Female White United States
Country of residence Nationality Language Student status \
0 United States United States English Yes
1 United States United States English Yes
2 United States United States English Yes
3 United States United States English Yes
4 United States United States English Yes
Employment status
0 Full-Time
1 Full-Time
2 Full-Time
3 Full-Time
4 Full-Time
[5 rows x 21 columns]
/tmp/ipython-input-1346675010.py:15: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
df_prolific = df_prolific.applymap(
/tmp/ipython-input-1346675010.py:35: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
df_responses[col] = df_responses[col].replace(likert_mapping)
InĀ [11]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import itertools
# Define the questions and the demographic columns to analyze
questions = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]
# List of demographic columns (categorical) from the Prolific export.
# Update this list based on the columns available in your prolific_export.
demographic_vars = ["Sex", "Ethnicity simplified", "Country of residence", "Language", "Employment status"]
# Prepare a list to store the t-test results
results = []
# Loop over each demographic variable
for demo in demographic_vars:
# Get the unique groups in this demographic (drop missing values)
groups = merged_df[demo].dropna().unique()
# If there are less than 2 groups, skip this demographic.
if len(groups) < 2:
continue
# For each question, perform pairwise comparisons between groups in this demographic.
for question in questions:
# Use itertools.combinations to get all unique pairwise comparisons.
for group1, group2 in itertools.combinations(groups, 2):
# Filter data for each group for the given question
data1 = merged_df.loc[merged_df[demo] == group1, question].dropna()
data2 = merged_df.loc[merged_df[demo] == group2, question].dropna()
# Only perform the t-test if both groups have at least 2 observations
if len(data1) >= 2 and len(data2) >= 2:
t_stat, p_val = stats.ttest_ind(data1, data2, equal_var=False)
else:
t_stat, p_val = np.nan, np.nan # Not enough data
results.append((demo, question, group1, group2, t_stat, p_val))
# Convert the results list to a DataFrame for easier viewing
results_df = pd.DataFrame(results, columns=["Demographic", "Question", "Group1", "Group2", "t_stat", "p_value"])
# Optionally, adjust p-values for multiple comparisons using Bonferroni correction.
from statsmodels.stats.multitest import multipletests
adjusted = multipletests(results_df["p_value"].dropna(), method="bonferroni")
# Place adjusted p-values back into the results DataFrame.
# (needing to match the indices since multipletests was applied to non-NaN values.)
results_df.loc[results_df["p_value"].notna(), "p_adjusted"] = adjusted[1]
# Display the results
print("Pairwise t-test results for each question by demographic variable:")
print(results_df)
# get df to csv
results_df.to_csv('t-tests.csv', index=False)
/usr/local/lib/python3.12/dist-packages/scipy/stats/_axis_nan_policy.py:579: RuntimeWarning: Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable. res = hypotest_fun_out(*samples, **kwds)
Pairwise t-test results for each question by demographic variable:
Demographic Question \
0 Sex Q1
1 Sex Q1
2 Sex Q1
3 Sex Q2
4 Sex Q2
.. ... ...
387 Employment status Q8
388 Employment status Q8
389 Employment status Q8
390 Employment status Q8
391 Employment status Q8
Group1 \
0 Female
1 Female
2 Male
3 Female
4 Female
.. ...
387 Not in paid work (e.g. homemaker', 'retired or...
388 Not in paid work (e.g. homemaker', 'retired or...
389 Unemployed (and job seeking)
390 Unemployed (and job seeking)
391 Due to start a new job within the next month
Group2 t_stat p_value \
0 Male -1.797785 7.226345e-02
1 Prefer not to say -1.639231 1.120220e-01
2 Prefer not to say -1.159118 2.558171e-01
3 Male 6.125198 9.659961e-10
4 Prefer not to say -12.554503 3.037793e-14
.. ... ... ...
387 Due to start a new job within the next month 0.121551 9.033545e-01
388 Other -6.745376 5.085067e-11
389 Due to start a new job within the next month 4.802704 3.055997e-06
390 Other -0.982692 3.264334e-01
391 Other -5.108832 6.592380e-07
p_adjusted
0 1.000000e+00
1 1.000000e+00
2 1.000000e+00
3 3.786705e-07
4 1.190815e-11
.. ...
387 1.000000e+00
388 1.993346e-08
389 1.197951e-03
390 1.000000e+00
391 2.584213e-04
[392 rows x 7 columns]
Supervised ML¶
InĀ [12]:
from sklearn.preprocessing import LabelEncoder
feature_list = ["GENDER", "RACETHN", "EDUCCAT5", "DIVISION", "MARITAL_ACS",
"CHILDRENCAT", "CITIZEN_REC", "BORN_ACS", "AGE_INT",
"HIV_STAT", "PREG_STAT", "NumChronicIllness",
"FAMINC5", "CC_NUM", "FDSTMP_CPS",
"SEXUALITY", "OWNGUN_GSS", "RELIGCAT"
]
InĀ [13]:
df_synthetic.head()
Out[13]:
| NAME | id | GENDER | AGE | RACETHN | EDUCCAT5 | DIVISION | MARITAL_ACS | HHSIZECAT | ... | CI_LABEL_OWNGUN_GSS | CI_LABEL_SEXUALITY | CI_LABEL_HIV_STAT | CI_LABEL_PREG_STAT | CI_LABEL_CC_NUM | CI_LABEL_cc_encoded | CI_LABEL_cc_disclosed | CI_LABEL_NumChronicIllness | AGE_INT | Cluster | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Luke Walsh | 1 | Male | 25 | White non-Hispanic | Some college | Mountain | Never married | 3+ | ... | is | is | is probably | is probably | is | is probably | is possibly | is probably | 20-29 | 5 |
| 1 | 1 | Matilde Izaguirre Checa | 2 | Female | 70 | Hispanic | HS Grad | West South Central | Divorced | 1 | ... | is probably | is possibly | is | is | is | is | is | is possibly | 70-79 | 11 |
| 2 | 2 | Ryan Smith | 3 | Male | 85 | White non-Hispanic | Less than HS | Middle Atlantic | Now married | 2 | ... | is possibly | is probably | is probably | is | is probably | is probably | is probably | is probably | 80-89 | 2 |
| 3 | 3 | Matthew Grimes | 4 | Male | 59 | White non-Hispanic | HS Grad | Mountain | Now married | 2 | ... | is probably | is probably | is probably | is | is possibly | is probably | is | is probably | 50-59 | 12 |
| 4 | 4 | Miraan Rama | 5 | Female | 19 | Asian | Some college | Pacific | Never married | 1 | ... | is probably | is | is probably | is | is probably | is probably | is probably | is | 10-19 | 9 |
5 rows Ć 187 columns
InĀ [14]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
InĀ [15]:
import pandas as pd
likert_columns = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
df_merged = pd.merge(
df_synthetic,
df_avg,
how="left",
left_on="id",
right_on="SyntheticPersonID",
suffixes=("", "_drop")
)
cols_to_drop = [col for col in df_merged.columns if col.endswith("_drop")]
df_final = df_merged.drop(columns=cols_to_drop)
df_final[likert_columns] = df_final[likert_columns].fillna(0)
print(df_final.head())
NAME id GENDER AGE RACETHN \
0 0 Luke Walsh 1 Male 25 White non-Hispanic
1 1 Matilde Izaguirre Checa 2 Female 70 Hispanic
2 2 Ryan Smith 3 Male 85 White non-Hispanic
3 3 Matthew Grimes 4 Male 59 White non-Hispanic
4 4 Miraan Rama 5 Female 19 Asian
EDUCCAT5 DIVISION MARITAL_ACS HHSIZECAT ... \
0 Some college Mountain Never married 3+ ...
1 HS Grad West South Central Divorced 1 ...
2 Less than HS Middle Atlantic Now married 2 ...
3 HS Grad Mountain Now married 2 ...
4 Some college Pacific Never married 1 ...
SyntheticPersonID Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 avg_harm
0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN
1 NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN
2 NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN
3 NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN
4 NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN
[5 rows x 197 columns]
InĀ [16]:
train_data = df_final[df_final['Q1'] > 0]
predict_data = df_final[df_final['Q1'] == 0]
X_train = train_data[feature_list]
y_train = train_data['Q1']
onehot_encoder = OneHotEncoder(
sparse_output=False,
handle_unknown='ignore',
drop='first'
)
X_train_encoded = onehot_encoder.fit_transform(X_train.astype(str))
scaler = StandardScaler()
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1)).flatten()
models = [
('RandomForest', RandomForestRegressor(n_estimators=100)),
('GradientBoosting', GradientBoostingRegressor()),
('LinearRegression', LinearRegression()),
('Ridge', Ridge()),
('Lasso', Lasso()),
('DecisionTree', DecisionTreeRegressor()),
('SVR', SVR())
]
def evaluate_model(model_name, model, X_train, y_train):
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
mse = mean_squared_error(y_train, y_train_pred)
r2 = r2_score(y_train, y_train_pred)
print(f'Model: {model_name}')
print(f'MSE on training data: {mse}')
print(f'R^2 on training data: {r2}\n')
for name, model in models:
evaluate_model(name, model, X_train_encoded, y_train_scaled)
Model: RandomForest MSE on training data: 0.14696793433025557 R^2 on training data: 0.8530320656697444 Model: GradientBoosting MSE on training data: 0.8793177007084634 R^2 on training data: 0.12068229929153684 Model: LinearRegression MSE on training data: 0.12943632782427547 R^2 on training data: 0.8705636721757246 Model: Ridge MSE on training data: 0.3482815208841059 R^2 on training data: 0.6517184791158941 Model: Lasso MSE on training data: 1.0000000000000002 R^2 on training data: 0.0 Model: DecisionTree MSE on training data: 5.869500782894434e-35 R^2 on training data: 1.0 Model: SVR MSE on training data: 0.5913284111031373 R^2 on training data: 0.40867158889686284
InĀ [17]:
questions = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]
for q in questions:
train_data = df_final[df_final[q] > 0]
predict_data = df_final[df_final[q] == 0]
X_train = train_data[feature_list]
y_train = train_data[q]
onehot_encoder = OneHotEncoder(
sparse_output=False,
handle_unknown='ignore',
drop='first'
)
X_train_encoded = onehot_encoder.fit_transform(X_train.astype(str))
scaler = StandardScaler()
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1)).flatten()
models = [
('DecisionTree', DecisionTreeRegressor())
]
def evaluate_model(model_name, model, X_train, y_train):
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
mse = mean_squared_error(y_train, y_train_pred)
r2 = r2_score(y_train, y_train_pred)
print(f'Model: {model_name}')
print(f'MSE on training data: {mse}')
print(f'R^2 on training data: {r2}\n')
for name, model in models:
evaluate_model(name, model, X_train_encoded, y_train_scaled)
best_model = DecisionTreeRegressor(
max_depth=30,
max_features=None,
min_samples_leaf=4,
min_samples_split=5,
random_state=42
)
best_model.fit(X_train_encoded, y_train_scaled)
X_predict = predict_data[feature_list]
X_predict_encoded = onehot_encoder.transform(X_predict.astype(str))
predicted_scores_scaled = best_model.predict(X_predict_encoded)
predicted_scores = scaler.inverse_transform(predicted_scores_scaled.reshape(-1, 1)).flatten()
df_final.loc[df_final[q] == 0, q] = predicted_scores
print(df_final.head())
Model: DecisionTree MSE on training data: 2.934750391447217e-35 R^2 on training data: 1.0
/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros warnings.warn(
NAME id GENDER AGE RACETHN \
0 0 Luke Walsh 1 Male 25 White non-Hispanic
1 1 Matilde Izaguirre Checa 2 Female 70 Hispanic
2 2 Ryan Smith 3 Male 85 White non-Hispanic
3 3 Matthew Grimes 4 Male 59 White non-Hispanic
4 4 Miraan Rama 5 Female 19 Asian
EDUCCAT5 DIVISION MARITAL_ACS HHSIZECAT ... \
0 Some college Mountain Never married 3+ ...
1 HS Grad West South Central Divorced 1 ...
2 Less than HS Middle Atlantic Now married 2 ...
3 HS Grad Mountain Now married 2 ...
4 Some college Pacific Never married 1 ...
SyntheticPersonID Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 avg_harm
0 NaN 3.416667 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN
1 NaN 3.952381 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN
2 NaN 2.916667 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN
3 NaN 3.458333 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN
4 NaN 3.786667 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN
[5 rows x 197 columns]
Model: DecisionTree
MSE on training data: 1.0024006805786898e-34
R^2 on training data: 1.0
/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros warnings.warn(
NAME id GENDER AGE RACETHN \
0 0 Luke Walsh 1 Male 25 White non-Hispanic
1 1 Matilde Izaguirre Checa 2 Female 70 Hispanic
2 2 Ryan Smith 3 Male 85 White non-Hispanic
3 3 Matthew Grimes 4 Male 59 White non-Hispanic
4 4 Miraan Rama 5 Female 19 Asian
EDUCCAT5 DIVISION MARITAL_ACS HHSIZECAT ... \
0 Some college Mountain Never married 3+ ...
1 HS Grad West South Central Divorced 1 ...
2 Less than HS Middle Atlantic Now married 2 ...
3 HS Grad Mountain Now married 2 ...
4 Some college Pacific Never married 1 ...
SyntheticPersonID Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 avg_harm
0 NaN 3.416667 3.392857 0.0 0.0 0.0 0.0 0.0 0.0 NaN
1 NaN 3.952381 3.833333 0.0 0.0 0.0 0.0 0.0 0.0 NaN
2 NaN 2.916667 2.833333 0.0 0.0 0.0 0.0 0.0 0.0 NaN
3 NaN 3.458333 3.466667 0.0 0.0 0.0 0.0 0.0 0.0 NaN
4 NaN 3.786667 3.516667 0.0 0.0 0.0 0.0 0.0 0.0 NaN
[5 rows x 197 columns]
Model: DecisionTree
MSE on training data: 1.1555579666323415e-34
R^2 on training data: 1.0
/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros warnings.warn(
NAME id GENDER AGE RACETHN \
0 0 Luke Walsh 1 Male 25 White non-Hispanic
1 1 Matilde Izaguirre Checa 2 Female 70 Hispanic
2 2 Ryan Smith 3 Male 85 White non-Hispanic
3 3 Matthew Grimes 4 Male 59 White non-Hispanic
4 4 Miraan Rama 5 Female 19 Asian
EDUCCAT5 DIVISION MARITAL_ACS HHSIZECAT ... \
0 Some college Mountain Never married 3+ ...
1 HS Grad West South Central Divorced 1 ...
2 Less than HS Middle Atlantic Now married 2 ...
3 HS Grad Mountain Now married 2 ...
4 Some college Pacific Never married 1 ...
SyntheticPersonID Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 \
0 NaN 3.416667 3.392857 2.604167 0.0 0.0 0.0 0.0 0.0
1 NaN 3.952381 3.833333 3.426667 0.0 0.0 0.0 0.0 0.0
2 NaN 2.916667 2.833333 3.458333 0.0 0.0 0.0 0.0 0.0
3 NaN 3.458333 3.466667 3.660000 0.0 0.0 0.0 0.0 0.0
4 NaN 3.786667 3.516667 3.250000 0.0 0.0 0.0 0.0 0.0
avg_harm
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
[5 rows x 197 columns]
Model: DecisionTree
MSE on training data: 1.2619426683223032e-34
R^2 on training data: 1.0
/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros warnings.warn(
NAME id GENDER AGE RACETHN \
0 0 Luke Walsh 1 Male 25 White non-Hispanic
1 1 Matilde Izaguirre Checa 2 Female 70 Hispanic
2 2 Ryan Smith 3 Male 85 White non-Hispanic
3 3 Matthew Grimes 4 Male 59 White non-Hispanic
4 4 Miraan Rama 5 Female 19 Asian
EDUCCAT5 DIVISION MARITAL_ACS HHSIZECAT ... \
0 Some college Mountain Never married 3+ ...
1 HS Grad West South Central Divorced 1 ...
2 Less than HS Middle Atlantic Now married 2 ...
3 HS Grad Mountain Now married 2 ...
4 Some college Pacific Never married 1 ...
SyntheticPersonID Q1 Q2 Q3 Q4 Q5 Q6 Q7 \
0 NaN 3.416667 3.392857 2.604167 3.041667 0.0 0.0 0.0
1 NaN 3.952381 3.833333 3.426667 2.538095 0.0 0.0 0.0
2 NaN 2.916667 2.833333 3.458333 4.027778 0.0 0.0 0.0
3 NaN 3.458333 3.466667 3.660000 3.511905 0.0 0.0 0.0
4 NaN 3.786667 3.516667 3.250000 3.290476 0.0 0.0 0.0
Q8 avg_harm
0 0.0 NaN
1 0.0 NaN
2 0.0 NaN
3 0.0 NaN
4 0.0 NaN
[5 rows x 197 columns]
Model: DecisionTree
MSE on training data: 1.1739001565788867e-34
R^2 on training data: 1.0
/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros warnings.warn(
NAME id GENDER AGE RACETHN \
0 0 Luke Walsh 1 Male 25 White non-Hispanic
1 1 Matilde Izaguirre Checa 2 Female 70 Hispanic
2 2 Ryan Smith 3 Male 85 White non-Hispanic
3 3 Matthew Grimes 4 Male 59 White non-Hispanic
4 4 Miraan Rama 5 Female 19 Asian
EDUCCAT5 DIVISION MARITAL_ACS HHSIZECAT ... \
0 Some college Mountain Never married 3+ ...
1 HS Grad West South Central Divorced 1 ...
2 Less than HS Middle Atlantic Now married 2 ...
3 HS Grad Mountain Now married 2 ...
4 Some college Pacific Never married 1 ...
SyntheticPersonID Q1 Q2 Q3 Q4 Q5 Q6 \
0 NaN 3.416667 3.392857 2.604167 3.041667 2.375000 0.0
1 NaN 3.952381 3.833333 3.426667 2.538095 1.958333 0.0
2 NaN 2.916667 2.833333 3.458333 4.027778 3.683333 0.0
3 NaN 3.458333 3.466667 3.660000 3.511905 3.333333 0.0
4 NaN 3.786667 3.516667 3.250000 3.290476 2.628571 0.0
Q7 Q8 avg_harm
0 0.0 0.0 NaN
1 0.0 0.0 NaN
2 0.0 0.0 NaN
3 0.0 0.0 NaN
4 0.0 0.0 NaN
[5 rows x 197 columns]
Model: DecisionTree
MSE on training data: 6.419766481290787e-37
R^2 on training data: 1.0
/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros warnings.warn(
NAME id GENDER AGE RACETHN \
0 0 Luke Walsh 1 Male 25 White non-Hispanic
1 1 Matilde Izaguirre Checa 2 Female 70 Hispanic
2 2 Ryan Smith 3 Male 85 White non-Hispanic
3 3 Matthew Grimes 4 Male 59 White non-Hispanic
4 4 Miraan Rama 5 Female 19 Asian
EDUCCAT5 DIVISION MARITAL_ACS HHSIZECAT ... \
0 Some college Mountain Never married 3+ ...
1 HS Grad West South Central Divorced 1 ...
2 Less than HS Middle Atlantic Now married 2 ...
3 HS Grad Mountain Now married 2 ...
4 Some college Pacific Never married 1 ...
SyntheticPersonID Q1 Q2 Q3 Q4 Q5 \
0 NaN 3.416667 3.392857 2.604167 3.041667 2.375000
1 NaN 3.952381 3.833333 3.426667 2.538095 1.958333
2 NaN 2.916667 2.833333 3.458333 4.027778 3.683333
3 NaN 3.458333 3.466667 3.660000 3.511905 3.333333
4 NaN 3.786667 3.516667 3.250000 3.290476 2.628571
Q6 Q7 Q8 avg_harm
0 2.547619 0.0 0.0 NaN
1 2.263889 0.0 0.0 NaN
2 4.125000 0.0 0.0 NaN
3 3.160000 0.0 0.0 NaN
4 2.112500 0.0 0.0 NaN
[5 rows x 197 columns]
Model: DecisionTree
MSE on training data: 2.2010627935854123e-35
R^2 on training data: 1.0
/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros warnings.warn(
NAME id GENDER AGE RACETHN \
0 0 Luke Walsh 1 Male 25 White non-Hispanic
1 1 Matilde Izaguirre Checa 2 Female 70 Hispanic
2 2 Ryan Smith 3 Male 85 White non-Hispanic
3 3 Matthew Grimes 4 Male 59 White non-Hispanic
4 4 Miraan Rama 5 Female 19 Asian
EDUCCAT5 DIVISION MARITAL_ACS HHSIZECAT ... \
0 Some college Mountain Never married 3+ ...
1 HS Grad West South Central Divorced 1 ...
2 Less than HS Middle Atlantic Now married 2 ...
3 HS Grad Mountain Now married 2 ...
4 Some college Pacific Never married 1 ...
SyntheticPersonID Q1 Q2 Q3 Q4 Q5 \
0 NaN 3.416667 3.392857 2.604167 3.041667 2.375000
1 NaN 3.952381 3.833333 3.426667 2.538095 1.958333
2 NaN 2.916667 2.833333 3.458333 4.027778 3.683333
3 NaN 3.458333 3.466667 3.660000 3.511905 3.333333
4 NaN 3.786667 3.516667 3.250000 3.290476 2.628571
Q6 Q7 Q8 avg_harm
0 2.547619 3.522222 0.0 NaN
1 2.263889 3.872222 0.0 NaN
2 4.125000 3.770833 0.0 NaN
3 3.160000 3.522222 0.0 NaN
4 2.112500 3.537500 0.0 NaN
[5 rows x 197 columns]
Model: DecisionTree
MSE on training data: 1.7618246637092401e-34
R^2 on training data: 1.0
NAME id GENDER AGE RACETHN \
0 0 Luke Walsh 1 Male 25 White non-Hispanic
1 1 Matilde Izaguirre Checa 2 Female 70 Hispanic
2 2 Ryan Smith 3 Male 85 White non-Hispanic
3 3 Matthew Grimes 4 Male 59 White non-Hispanic
4 4 Miraan Rama 5 Female 19 Asian
EDUCCAT5 DIVISION MARITAL_ACS HHSIZECAT ... \
0 Some college Mountain Never married 3+ ...
1 HS Grad West South Central Divorced 1 ...
2 Less than HS Middle Atlantic Now married 2 ...
3 HS Grad Mountain Now married 2 ...
4 Some college Pacific Never married 1 ...
SyntheticPersonID Q1 Q2 Q3 Q4 Q5 \
0 NaN 3.416667 3.392857 2.604167 3.041667 2.375000
1 NaN 3.952381 3.833333 3.426667 2.538095 1.958333
2 NaN 2.916667 2.833333 3.458333 4.027778 3.683333
3 NaN 3.458333 3.466667 3.660000 3.511905 3.333333
4 NaN 3.786667 3.516667 3.250000 3.290476 2.628571
Q6 Q7 Q8 avg_harm
0 2.547619 3.522222 3.385417 NaN
1 2.263889 3.872222 3.907143 NaN
2 4.125000 3.770833 3.291667 NaN
3 3.160000 3.522222 4.770833 NaN
4 2.112500 3.537500 4.638889 NaN
[5 rows x 197 columns]
/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [13] during transform. These unknown categories will be encoded as all zeros warnings.warn(
InĀ [18]:
# drop syntheticpersonid column
df_final = df_final.drop(columns=['SyntheticPersonID'])
df_final = df_final.drop(columns=['avg_harm'])
Graphs¶
InĀ [19]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
def plot_barplot_per_question(df, questions, save=False, outdir=".", fontsize=12, figsize=(12, 8), dpi=100, save_dpi=300):
palette = sns.color_palette("Set2")
pdf = None
if save:
pdf_path = f"{outdir}/Barplots.pdf"
pdf = PdfPages(pdf_path)
q = 1
for score in questions:
fig = plt.figure(figsize=figsize, dpi=dpi)
ax = sns.barplot(x='RACETHN', y=score, hue='SEXUALITY', data=df.groupby(['RACETHN', 'SEXUALITY'])[score].median().reset_index(), palette=palette)
ax.set_title(f'Average Scores by Race/Ethnicity and Sexuality for Question {q}', fontsize=fontsize)
q += 1
ax.set_xlabel('Race/Ethnicity', fontsize=fontsize)
ax.set_ylabel('Average Score', fontsize=fontsize)
ax.tick_params(axis='both', labelsize=fontsize)
plt.xticks(rotation=45)
leg = ax.legend(title='Sexuality')
if leg:
leg.get_title().set_fontsize(fontsize)
for t in leg.get_texts():
t.set_fontsize(fontsize)
plt.tight_layout()
if save:
pdf.savefig(fig, bbox_inches="tight", dpi=save_dpi)
plt.show()
if save:
pdf.close()
print(f"Saved {pdf_path}")
# --- Example usage ---
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
plot_barplot_per_question(df_final, questions, save=True, outdir=".")
/tmp/ipython-input-2728256619.py:15: UserWarning: The palette list has more values (8) than needed (4), which may not be intended. ax = sns.barplot(x='RACETHN', y=score, hue='SEXUALITY', data=df.groupby(['RACETHN', 'SEXUALITY'])[score].median().reset_index(), palette=palette)
/tmp/ipython-input-2728256619.py:15: UserWarning: The palette list has more values (8) than needed (4), which may not be intended. ax = sns.barplot(x='RACETHN', y=score, hue='SEXUALITY', data=df.groupby(['RACETHN', 'SEXUALITY'])[score].median().reset_index(), palette=palette)
/tmp/ipython-input-2728256619.py:15: UserWarning: The palette list has more values (8) than needed (4), which may not be intended. ax = sns.barplot(x='RACETHN', y=score, hue='SEXUALITY', data=df.groupby(['RACETHN', 'SEXUALITY'])[score].median().reset_index(), palette=palette)
/tmp/ipython-input-2728256619.py:15: UserWarning: The palette list has more values (8) than needed (4), which may not be intended. ax = sns.barplot(x='RACETHN', y=score, hue='SEXUALITY', data=df.groupby(['RACETHN', 'SEXUALITY'])[score].median().reset_index(), palette=palette)
/tmp/ipython-input-2728256619.py:15: UserWarning: The palette list has more values (8) than needed (4), which may not be intended. ax = sns.barplot(x='RACETHN', y=score, hue='SEXUALITY', data=df.groupby(['RACETHN', 'SEXUALITY'])[score].median().reset_index(), palette=palette)
/tmp/ipython-input-2728256619.py:15: UserWarning: The palette list has more values (8) than needed (4), which may not be intended. ax = sns.barplot(x='RACETHN', y=score, hue='SEXUALITY', data=df.groupby(['RACETHN', 'SEXUALITY'])[score].median().reset_index(), palette=palette)
/tmp/ipython-input-2728256619.py:15: UserWarning: The palette list has more values (8) than needed (4), which may not be intended. ax = sns.barplot(x='RACETHN', y=score, hue='SEXUALITY', data=df.groupby(['RACETHN', 'SEXUALITY'])[score].median().reset_index(), palette=palette)
/tmp/ipython-input-2728256619.py:15: UserWarning: The palette list has more values (8) than needed (4), which may not be intended. ax = sns.barplot(x='RACETHN', y=score, hue='SEXUALITY', data=df.groupby(['RACETHN', 'SEXUALITY'])[score].median().reset_index(), palette=palette)
Saved ./Barplots.pdf
Does sexuality impact harm?
InĀ [20]:
import itertools
import scipy.stats as stats
score_types = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]
# Loop over each question
for score in score_types:
print(f"\nResults for {score}:")
# Pairwise t-tests for SEXUALITY
print("Comparisons by SEXUALITY:")
sexuality_groups = df_final['SEXUALITY'].dropna().unique()
for group1, group2 in itertools.combinations(sexuality_groups, 2):
data1 = df_final[df_final['SEXUALITY'] == group1][score].dropna()
data2 = df_final[df_final['SEXUALITY'] == group2][score].dropna()
if len(data1) >= 2 and len(data2) >= 2:
t_stat, p_val = stats.ttest_ind(data1, data2, equal_var=False)
print(f" {group1} vs {group2}: t = {t_stat:.3f}, p = {p_val:.3e}")
Results for Q1: Comparisons by SEXUALITY: Heterosexual vs Gay: t = 0.020, p = 9.841e-01 Heterosexual vs Bisexual: t = -4.631, p = 4.155e-06 Heterosexual vs Lesbian: t = 1.024, p = 3.075e-01 Gay vs Bisexual: t = -2.687, p = 7.365e-03 Gay vs Lesbian: t = 0.850, p = 3.962e-01 Bisexual vs Lesbian: t = 2.876, p = 4.462e-03 Results for Q2: Comparisons by SEXUALITY: Heterosexual vs Gay: t = -0.255, p = 7.985e-01 Heterosexual vs Bisexual: t = -3.453, p = 5.799e-04 Heterosexual vs Lesbian: t = -0.574, p = 5.671e-01 Gay vs Bisexual: t = -1.804, p = 7.161e-02 Gay vs Lesbian: t = -0.383, p = 7.021e-01 Bisexual vs Lesbian: t = 0.707, p = 4.806e-01 Results for Q3: Comparisons by SEXUALITY: Heterosexual vs Gay: t = 1.721, p = 8.594e-02 Heterosexual vs Bisexual: t = -5.635, p = 2.325e-08 Heterosexual vs Lesbian: t = -1.538, p = 1.262e-01 Gay vs Bisexual: t = -4.778, p = 2.100e-06 Gay vs Lesbian: t = -2.202, p = 2.862e-02 Bisexual vs Lesbian: t = 0.750, p = 4.540e-01 Results for Q4: Comparisons by SEXUALITY: Heterosexual vs Gay: t = -1.942, p = 5.287e-02 Heterosexual vs Bisexual: t = -6.565, p = 8.627e-11 Heterosexual vs Lesbian: t = -3.053, p = 2.706e-03 Gay vs Bisexual: t = -2.252, p = 2.462e-02 Gay vs Lesbian: t = -1.700, p = 9.045e-02 Bisexual vs Lesbian: t = -0.369, p = 7.128e-01 Results for Q5: Comparisons by SEXUALITY: Heterosexual vs Gay: t = 0.591, p = 5.548e-01 Heterosexual vs Bisexual: t = 3.610, p = 3.227e-04 Heterosexual vs Lesbian: t = 0.671, p = 5.033e-01 Gay vs Bisexual: t = 1.460, p = 1.447e-01 Gay vs Lesbian: t = 0.265, p = 7.910e-01 Bisexual vs Lesbian: t = -0.691, p = 4.905e-01 Results for Q6: Comparisons by SEXUALITY: Heterosexual vs Gay: t = -3.237, p = 1.308e-03 Heterosexual vs Bisexual: t = -5.511, p = 4.613e-08 Heterosexual vs Lesbian: t = -2.668, p = 8.519e-03 Gay vs Bisexual: t = -0.468, p = 6.397e-01 Gay vs Lesbian: t = -0.736, p = 4.627e-01 Bisexual vs Lesbian: t = -0.491, p = 6.240e-01 Results for Q7: Comparisons by SEXUALITY: Heterosexual vs Gay: t = -0.108, p = 9.143e-01 Heterosexual vs Bisexual: t = 0.563, p = 5.739e-01 Heterosexual vs Lesbian: t = -0.567, p = 5.716e-01 Gay vs Bisexual: t = 0.406, p = 6.847e-01 Gay vs Lesbian: t = -0.440, p = 6.604e-01 Bisexual vs Lesbian: t = -0.734, p = 4.637e-01 Results for Q8: Comparisons by SEXUALITY: Heterosexual vs Gay: t = -0.327, p = 7.441e-01 Heterosexual vs Bisexual: t = -4.607, p = 4.658e-06 Heterosexual vs Lesbian: t = -2.006, p = 4.672e-02 Gay vs Bisexual: t = -2.553, p = 1.086e-02 Gay vs Lesbian: t = -1.640, p = 1.024e-01 Bisexual vs Lesbian: t = -0.195, p = 8.458e-01
InĀ [21]:
import itertools
import scipy.stats as stats
score_types = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]
# Loop over each harm question
for score in score_types:
print(f"\n=== T-test Results for {score} ===")
# 1) Pairwise t-tests by HIV_STAT
print("Comparisons by HIV_STAT:")
hiv_groups = df_final['HIV_STAT'].dropna().unique()
for grp1, grp2 in itertools.combinations(hiv_groups, 2):
data1 = df_final.loc[df_final['HIV_STAT'] == grp1, score].dropna()
data2 = df_final.loc[df_final['HIV_STAT'] == grp2, score].dropna()
# Ensure both groups have at least 2 values to run a t-test
if len(data1) >= 2 and len(data2) >= 2:
t_stat, p_val = stats.ttest_ind(data1, data2, equal_var=False)
print(f" {grp1} vs {grp2}: t = {t_stat:.3f}, p = {p_val:.3e}")
else:
print(f" {grp1} vs {grp2}: Not enough data.")
=== T-test Results for Q1 === Comparisons by HIV_STAT: negative vs positive: t = -0.171, p = 8.642e-01 === T-test Results for Q2 === Comparisons by HIV_STAT: negative vs positive: t = 0.110, p = 9.130e-01 === T-test Results for Q3 === Comparisons by HIV_STAT: negative vs positive: t = -0.233, p = 8.158e-01 === T-test Results for Q4 === Comparisons by HIV_STAT: negative vs positive: t = 1.262, p = 2.094e-01 === T-test Results for Q5 === Comparisons by HIV_STAT: negative vs positive: t = 0.426, p = 6.709e-01 === T-test Results for Q6 === Comparisons by HIV_STAT: negative vs positive: t = -1.796, p = 7.498e-02 === T-test Results for Q7 === Comparisons by HIV_STAT: negative vs positive: t = 0.204, p = 8.385e-01 === T-test Results for Q8 === Comparisons by HIV_STAT: negative vs positive: t = 0.523, p = 6.018e-01
InĀ [22]:
import itertools
import scipy.stats as stats
score_types = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]
# Loop over each harm question
for score in score_types:
print(f"\n=== T-test Results for {score} ===")
# 1) Pairwise t-tests by HIV_STAT
print("Comparisons by FAMINC5:")
hiv_groups = df_final['FAMINC5'].dropna().unique()
for grp1, grp2 in itertools.combinations(hiv_groups, 2):
data1 = df_final.loc[df_final['FAMINC5'] == grp1, score].dropna()
data2 = df_final.loc[df_final['FAMINC5'] == grp2, score].dropna()
# Ensure both groups have at least 2 values to run a t-test
if len(data1) >= 2 and len(data2) >= 2:
t_stat, p_val = stats.ttest_ind(data1, data2, equal_var=False)
print(f" {grp1} vs {grp2}: t = {t_stat:.3f}, p = {p_val:.3e}")
else:
print(f" {grp1} vs {grp2}: Not enough data.")
=== T-test Results for Q1 === Comparisons by FAMINC5: $150K or more vs Less than $20K: t = 10.354, p = 6.751e-25 $150K or more vs $40K to less than $75K: t = 2.848, p = 4.419e-03 $150K or more vs $20K to less than $40K: t = 5.689, p = 1.340e-08 $150K or more vs $75K to less than $150K: t = -3.250, p = 1.162e-03 Less than $20K vs $40K to less than $75K: t = -9.091, p = 1.339e-19 Less than $20K vs $20K to less than $40K: t = -5.221, p = 1.838e-07 Less than $20K vs $75K to less than $150K: t = -15.307, p = 8.899e-52 $40K to less than $75K vs $20K to less than $40K: t = 3.646, p = 2.681e-04 $40K to less than $75K vs $75K to less than $150K: t = -7.657, p = 2.070e-14 $20K to less than $40K vs $75K to less than $150K: t = -10.312, p = 9.442e-25 === T-test Results for Q2 === Comparisons by FAMINC5: $150K or more vs Less than $20K: t = 1.803, p = 7.145e-02 $150K or more vs $40K to less than $75K: t = -0.908, p = 3.638e-01 $150K or more vs $20K to less than $40K: t = -2.290, p = 2.208e-02 $150K or more vs $75K to less than $150K: t = -2.442, p = 1.465e-02 Less than $20K vs $40K to less than $75K: t = -2.996, p = 2.746e-03 Less than $20K vs $20K to less than $40K: t = -4.251, p = 2.164e-05 Less than $20K vs $75K to less than $150K: t = -4.614, p = 4.033e-06 $40K to less than $75K vs $20K to less than $40K: t = -1.694, p = 9.033e-02 $40K to less than $75K vs $75K to less than $150K: t = -1.848, p = 6.465e-02 $20K to less than $40K vs $75K to less than $150K: t = 0.091, p = 9.278e-01 === T-test Results for Q3 === Comparisons by FAMINC5: $150K or more vs Less than $20K: t = 2.109, p = 3.500e-02 $150K or more vs $40K to less than $75K: t = 3.541, p = 4.026e-04 $150K or more vs $20K to less than $40K: t = 5.172, p = 2.391e-07 $150K or more vs $75K to less than $150K: t = -0.912, p = 3.618e-01 Less than $20K vs $40K to less than $75K: t = 1.048, p = 2.945e-01 Less than $20K vs $20K to less than $40K: t = 2.877, p = 4.028e-03 Less than $20K vs $75K to less than $150K: t = -3.339, p = 8.469e-04 $40K to less than $75K vs $20K to less than $40K: t = 2.303, p = 2.128e-02 $40K to less than $75K vs $75K to less than $150K: t = -5.519, p = 3.485e-08 $20K to less than $40K vs $75K to less than $150K: t = -7.126, p = 1.127e-12 === T-test Results for Q4 === Comparisons by FAMINC5: $150K or more vs Less than $20K: t = 4.746, p = 2.127e-06 $150K or more vs $40K to less than $75K: t = 0.836, p = 4.030e-01 $150K or more vs $20K to less than $40K: t = 4.490, p = 7.265e-06 $150K or more vs $75K to less than $150K: t = -5.198, p = 2.087e-07 Less than $20K vs $40K to less than $75K: t = -4.515, p = 6.458e-06 Less than $20K vs $20K to less than $40K: t = -0.426, p = 6.704e-01 Less than $20K vs $75K to less than $150K: t = -10.521, p = 1.124e-25 $40K to less than $75K vs $20K to less than $40K: t = 4.238, p = 2.282e-05 $40K to less than $75K vs $75K to less than $150K: t = -7.044, p = 1.976e-12 $20K to less than $40K vs $75K to less than $150K: t = -10.517, p = 1.072e-25 === T-test Results for Q5 === Comparisons by FAMINC5: $150K or more vs Less than $20K: t = 1.090, p = 2.758e-01 $150K or more vs $40K to less than $75K: t = 3.502, p = 4.651e-04 $150K or more vs $20K to less than $40K: t = 2.781, p = 5.437e-03 $150K or more vs $75K to less than $150K: t = -0.328, p = 7.427e-01 Less than $20K vs $40K to less than $75K: t = 2.326, p = 2.004e-02 Less than $20K vs $20K to less than $40K: t = 1.710, p = 8.735e-02 Less than $20K vs $75K to less than $150K: t = -1.620, p = 1.052e-01 $40K to less than $75K vs $20K to less than $40K: t = -0.421, p = 6.741e-01 $40K to less than $75K vs $75K to less than $150K: t = -4.751, p = 2.046e-06 $20K to less than $40K vs $75K to less than $150K: t = -3.629, p = 2.871e-04 === T-test Results for Q6 === Comparisons by FAMINC5: $150K or more vs Less than $20K: t = 7.296, p = 3.381e-13 $150K or more vs $40K to less than $75K: t = 1.267, p = 2.051e-01 $150K or more vs $20K to less than $40K: t = 7.933, p = 2.546e-15 $150K or more vs $75K to less than $150K: t = -2.626, p = 8.656e-03 Less than $20K vs $40K to less than $75K: t = -7.016, p = 2.531e-12 Less than $20K vs $20K to less than $40K: t = 0.339, p = 7.350e-01 Less than $20K vs $75K to less than $150K: t = -11.143, p = 1.563e-28 $40K to less than $75K vs $20K to less than $40K: t = 7.779, p = 8.298e-15 $40K to less than $75K vs $75K to less than $150K: t = -4.748, p = 2.086e-06 $20K to less than $40K vs $75K to less than $150K: t = -12.183, p = 8.516e-34 === T-test Results for Q7 === Comparisons by FAMINC5: $150K or more vs Less than $20K: t = 7.919, p = 2.875e-15 $150K or more vs $40K to less than $75K: t = 3.448, p = 5.685e-04 $150K or more vs $20K to less than $40K: t = 0.657, p = 5.111e-01 $150K or more vs $75K to less than $150K: t = -7.597, p = 3.608e-14 Less than $20K vs $40K to less than $75K: t = -5.603, p = 2.201e-08 Less than $20K vs $20K to less than $40K: t = -7.759, p = 9.979e-15 Less than $20K vs $75K to less than $150K: t = -17.055, p = 1.266e-63 $40K to less than $75K vs $20K to less than $40K: t = -2.971, p = 2.975e-03 $40K to less than $75K vs $75K to less than $150K: t = -13.629, p = 6.067e-42 $20K to less than $40K vs $75K to less than $150K: t = -9.081, p = 1.373e-19 === T-test Results for Q8 === Comparisons by FAMINC5: $150K or more vs Less than $20K: t = -1.502, p = 1.331e-01 $150K or more vs $40K to less than $75K: t = -3.706, p = 2.126e-04 $150K or more vs $20K to less than $40K: t = -0.828, p = 4.075e-01 $150K or more vs $75K to less than $150K: t = -9.260, p = 3.053e-20 Less than $20K vs $40K to less than $75K: t = -2.044, p = 4.099e-02 Less than $20K vs $20K to less than $40K: t = 0.778, p = 4.363e-01 Less than $20K vs $75K to less than $150K: t = -7.771, p = 9.232e-15 $40K to less than $75K vs $20K to less than $40K: t = 3.136, p = 1.720e-03 $40K to less than $75K vs $75K to less than $150K: t = -7.172, p = 7.888e-13 $20K to less than $40K vs $75K to less than $150K: t = -9.387, p = 8.156e-21
InĀ [23]:
import seaborn as sns
import matplotlib.pyplot as plt
def plot_heatmap_per_question(df, questions, mask_zeros=True, save=False, outdir="."):
"""
Generate heatmaps of mean values for each question (Q1..Q8)
by Division Ć Family Income group.
"""
canonical_levels = [
"Less than $20K",
"$20K to less than $40K",
"$40K to less than $75K",
"$75K to less than $150K",
"$150K or more",
]
for score in questions:
df_copy = df.copy()
# If 0 was just a placeholder, treat as missing
if mask_zeros and score in df_copy:
df_copy.loc[df_copy[score] == 0, score] = np.nan
df_copy["FAMINC5"] = pd.Categorical(
df_copy["FAMINC5"], categories=canonical_levels, ordered=True
)
# Compute group means
means = (
df_copy.groupby(["DIVISION","FAMINC5"])[score]
.mean()
.unstack()
.reindex(columns=canonical_levels)
)
# Plot heatmap
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(means, annot=True, fmt=".2f", cmap="Greys", cbar=False, ax=ax)
ax.set_title(f"{score}: Mean by Division Ć Family Income")
ax.set_xlabel("Family Income")
ax.set_ylabel("Division")
plt.tight_layout()
if save:
pdf_path = f"{outdir}/Heatmap_{score}.pdf"
png_path = f"{outdir}/Heatmap_{score}.png"
plt.savefig(pdf_path, bbox_inches="tight")
plt.savefig(png_path, bbox_inches="tight", dpi=300)
print(f"Saved {pdf_path} and {png_path}")
plt.show()
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
plot_heatmap_per_question(df_final, questions, mask_zeros=True, save=True, outdir=".")
/tmp/ipython-input-3807495455.py:30: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. df_copy.groupby(["DIVISION","FAMINC5"])[score]
Saved ./Heatmap_Q1.pdf and ./Heatmap_Q1.png
/tmp/ipython-input-3807495455.py:30: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. df_copy.groupby(["DIVISION","FAMINC5"])[score]
Saved ./Heatmap_Q2.pdf and ./Heatmap_Q2.png
/tmp/ipython-input-3807495455.py:30: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. df_copy.groupby(["DIVISION","FAMINC5"])[score]
Saved ./Heatmap_Q3.pdf and ./Heatmap_Q3.png
/tmp/ipython-input-3807495455.py:30: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. df_copy.groupby(["DIVISION","FAMINC5"])[score]
Saved ./Heatmap_Q4.pdf and ./Heatmap_Q4.png
/tmp/ipython-input-3807495455.py:30: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. df_copy.groupby(["DIVISION","FAMINC5"])[score]
Saved ./Heatmap_Q5.pdf and ./Heatmap_Q5.png
/tmp/ipython-input-3807495455.py:30: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. df_copy.groupby(["DIVISION","FAMINC5"])[score]
Saved ./Heatmap_Q6.pdf and ./Heatmap_Q6.png
/tmp/ipython-input-3807495455.py:30: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. df_copy.groupby(["DIVISION","FAMINC5"])[score]
Saved ./Heatmap_Q7.pdf and ./Heatmap_Q7.png
/tmp/ipython-input-3807495455.py:30: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. df_copy.groupby(["DIVISION","FAMINC5"])[score]
Saved ./Heatmap_Q8.pdf and ./Heatmap_Q8.png
InĀ [24]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def plot_heatmap_per_question(
df, questions, mask_zeros=True, save=False, outdir=".",
fontsize=18, figsize=(8, 6), dpi=100, save_dpi=300
):
"""
Heatmaps of mean values (Q1..Q8) by Division Ć Family Income.
Shows at a normal size in Jupyter and saves with large fonts.
"""
canonical_levels = [
"Less than $20K",
"$20K to less than $40K",
"$40K to less than $75K",
"$75K to less than $150K",
"$150K or more",
]
sns.set_theme(style="white")
for score in questions:
df_copy = df.copy()
if mask_zeros and score in df_copy:
df_copy.loc[df_copy[score] == 0, score] = np.nan
df_copy["FAMINC5"] = pd.Categorical(
df_copy["FAMINC5"], categories=canonical_levels, ordered=True
)
means = (
df_copy.groupby(["DIVISION", "FAMINC5"])[score]
.mean()
.unstack()
.reindex(columns=canonical_levels)
)
# normal-size preview
fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
sns.heatmap(
means,
annot=True,
fmt=".2f",
cmap="Greys",
cbar=False,
ax=ax,
annot_kws={"size": fontsize - 2}, # numbers inside cells
linewidths=0.5,
linecolor="lightgray"
)
# Explicitly set label and tick font sizes
ax.set_title(f"{score}: Mean by Division Ć Family Income",
fontsize=fontsize + 2, pad=10)
ax.set_xlabel("Family Income", fontsize=fontsize)
ax.set_ylabel("Division", fontsize=fontsize)
ax.tick_params(axis="x", labelsize=fontsize - 2, rotation=30)
ax.tick_params(axis="y", labelsize=fontsize - 2)
plt.tight_layout()
if save:
base = f"{outdir}/Heatmap_{score}"
plt.savefig(f"{base}.pdf", bbox_inches="tight", dpi=save_dpi)
plt.savefig(f"{base}.png", bbox_inches="tight", dpi=save_dpi)
print(f"Saved {base}.pdf / .png")
plt.show()
InĀ [25]:
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
plot_heatmap_per_question(
df_final,
questions,
fontsize=18, # big labels
figsize=(8,6), # normal display
dpi=100, # notebook view
save=True,
save_dpi=300 # publication quality
)
/tmp/ipython-input-2717744192.py:35: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. df_copy.groupby(["DIVISION", "FAMINC5"])[score]
Saved ./Heatmap_Q1.pdf / .png
/tmp/ipython-input-2717744192.py:35: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. df_copy.groupby(["DIVISION", "FAMINC5"])[score]
Saved ./Heatmap_Q2.pdf / .png
/tmp/ipython-input-2717744192.py:35: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. df_copy.groupby(["DIVISION", "FAMINC5"])[score]
Saved ./Heatmap_Q3.pdf / .png
/tmp/ipython-input-2717744192.py:35: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. df_copy.groupby(["DIVISION", "FAMINC5"])[score]
Saved ./Heatmap_Q4.pdf / .png
/tmp/ipython-input-2717744192.py:35: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. df_copy.groupby(["DIVISION", "FAMINC5"])[score]
Saved ./Heatmap_Q5.pdf / .png
/tmp/ipython-input-2717744192.py:35: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. df_copy.groupby(["DIVISION", "FAMINC5"])[score]
Saved ./Heatmap_Q6.pdf / .png
/tmp/ipython-input-2717744192.py:35: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. df_copy.groupby(["DIVISION", "FAMINC5"])[score]
Saved ./Heatmap_Q7.pdf / .png
/tmp/ipython-input-2717744192.py:35: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. df_copy.groupby(["DIVISION", "FAMINC5"])[score]
Saved ./Heatmap_Q8.pdf / .png
InĀ [26]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
def plot_boxplot_per_question(df, questions, save=False, outdir=".", fontsize=12, figsize=(12, 6), dpi=100, save_dpi=300):
pdf = None
if save:
pdf_path = f"{outdir}/Boxplots.pdf"
pdf = PdfPages(pdf_path)
for score in questions:
fig = plt.figure(figsize=figsize, dpi=dpi)
ax = sns.boxplot(x='DIVISION', y=score, hue='FAMINC5', data=df)
ax.set_title(f'Distribution of {score} by Division and Family Income', fontsize=fontsize)
ax.set_xlabel('Race/Ethnicity', fontsize=fontsize)
ax.set_ylabel(score, fontsize=fontsize)
ax.tick_params(axis='both', labelsize=fontsize)
plt.xticks(rotation=45)
leg = ax.legend(title='Division')
if leg:
leg.get_title().set_fontsize(fontsize)
for t in leg.get_texts():
t.set_fontsize(fontsize)
plt.tight_layout()
if save:
pdf.savefig(fig, bbox_inches="tight", dpi=save_dpi)
plt.show()
if save:
pdf.close()
print(f"Saved {pdf_path}")
# --- Example usage ---
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
plot_boxplot_per_question(df_final, questions, save=True, outdir=".")
Saved ./Boxplots.pdf
InĀ [27]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
# List of your harm questions
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
# We'll loop over each question and fit a model like:
# QX ~ C(HIV_STAT)*C(GENDER) + C(HIV_STAT)*C(RACETHN) + C(HIV_STAT)*C(SEXUALITY)
for q in questions:
formula = (
f"{q} ~ C(HIV_STAT)*C(GENDER) "
f"+ C(HIV_STAT)*C(RACETHN) "
f"+ C(HIV_STAT)*C(SEXUALITY)"
)
model = ols(formula, data=df_final).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(f"ANOVA for {q} (HIV status interactions):")
print(anova_table)
print("\n")
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1, but rank is 0
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1923: RuntimeWarning: invalid value encountered in divide
F /= J
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
warnings.warn('covariance of constraints does not have full '
ANOVA for Q1 (HIV status interactions):
sum_sq df F PR(>F)
C(HIV_STAT) NaN 1.0 NaN NaN
C(GENDER) 1.031297 1.0 3.459178 6.291625e-02
C(RACETHN) 32.247881 4.0 27.041477 2.061234e-22
C(SEXUALITY) 7.345514 3.0 8.212780 2.720823e-04
C(HIV_STAT):C(GENDER) 1.084251 1.0 3.636796 5.652991e-02
C(HIV_STAT):C(RACETHN) 1.855918 4.0 1.556281 1.830005e-01
C(HIV_STAT):C(SEXUALITY) 0.074259 3.0 0.083027 9.203269e-01
Residual 5957.601692 19983.0 NaN NaN
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1, but rank is 0
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1923: RuntimeWarning: invalid value encountered in divide
F /= J
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
warnings.warn('covariance of constraints does not have full '
ANOVA for Q2 (HIV status interactions):
sum_sq df F PR(>F)
C(HIV_STAT) NaN 1.0 NaN NaN
C(GENDER) 7.351555 1.0 26.167008 3.160221e-07
C(RACETHN) 11.284552 4.0 10.041513 4.072944e-08
C(SEXUALITY) 4.480533 3.0 5.315980 4.919414e-03
C(HIV_STAT):C(GENDER) 1.152986 1.0 4.103919 4.279721e-02
C(HIV_STAT):C(RACETHN) 1.409422 4.0 1.254169 2.856295e-01
C(HIV_STAT):C(SEXUALITY) 0.355331 3.0 0.421587 6.560110e-01
Residual 5614.173538 19983.0 NaN NaN
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1, but rank is 0
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1923: RuntimeWarning: invalid value encountered in divide
F /= J
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
warnings.warn('covariance of constraints does not have full '
ANOVA for Q3 (HIV status interactions):
sum_sq df F PR(>F)
C(HIV_STAT) NaN 1.0 NaN NaN
C(GENDER) 7.399024 1.0 30.548239 3.297363e-08
C(RACETHN) 9.770807 4.0 10.085145 3.748663e-08
C(SEXUALITY) 15.220055 3.0 20.946271 8.178470e-10
C(HIV_STAT):C(GENDER) 0.397629 1.0 1.641685 2.001094e-01
C(HIV_STAT):C(RACETHN) 2.161597 4.0 2.231138 6.305302e-02
C(HIV_STAT):C(SEXUALITY) 0.996982 3.0 1.372075 2.536041e-01
Residual 4840.039940 19983.0 NaN NaN
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1, but rank is 0
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1923: RuntimeWarning: invalid value encountered in divide
F /= J
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
warnings.warn('covariance of constraints does not have full '
ANOVA for Q4 (HIV status interactions):
sum_sq df F PR(>F)
C(HIV_STAT) NaN 1.0 NaN NaN
C(GENDER) 20.080138 1.0 75.534928 3.862163e-18
C(RACETHN) 39.594250 4.0 37.235163 4.492368e-31
C(SEXUALITY) 18.490424 3.0 23.184981 8.761028e-11
C(HIV_STAT):C(GENDER) 0.193486 1.0 0.727831 3.935975e-01
C(HIV_STAT):C(RACETHN) 1.108683 4.0 1.042626 3.834517e-01
C(HIV_STAT):C(SEXUALITY) 0.115409 3.0 0.144710 8.652739e-01
Residual 5312.262878 19983.0 NaN NaN
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1, but rank is 0
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1923: RuntimeWarning: invalid value encountered in divide
F /= J
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
warnings.warn('covariance of constraints does not have full '
ANOVA for Q5 (HIV status interactions):
sum_sq df F PR(>F)
C(HIV_STAT) NaN 1.0 NaN NaN
C(GENDER) 16.643305 1.0 58.359518 2.281531e-14
C(RACETHN) 42.171325 4.0 36.968292 7.577198e-31
C(SEXUALITY) 8.424883 3.0 9.847245 5.314981e-05
C(HIV_STAT):C(GENDER) 0.040882 1.0 0.143352 7.049751e-01
C(HIV_STAT):C(RACETHN) 0.201964 4.0 0.177046 9.503129e-01
C(HIV_STAT):C(SEXUALITY) 0.496024 3.0 0.579767 5.600380e-01
Residual 5698.867506 19983.0 NaN NaN
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1, but rank is 0
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1923: RuntimeWarning: invalid value encountered in divide
F /= J
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
warnings.warn('covariance of constraints does not have full '
ANOVA for Q6 (HIV status interactions):
sum_sq df F PR(>F)
C(HIV_STAT) NaN 1.0 NaN NaN
C(GENDER) 0.076645 1.0 0.266522 6.056804e-01
C(RACETHN) 33.771216 4.0 29.358518 2.227144e-24
C(SEXUALITY) 17.352454 3.0 20.113473 1.877630e-09
C(HIV_STAT):C(GENDER) 0.807353 1.0 2.807444 9.384342e-02
C(HIV_STAT):C(RACETHN) 1.922260 4.0 1.671089 1.535850e-01
C(HIV_STAT):C(SEXUALITY) 1.066029 3.0 1.235649 2.906682e-01
Residual 5746.630356 19983.0 NaN NaN
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1, but rank is 0
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1923: RuntimeWarning: invalid value encountered in divide
F /= J
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
warnings.warn('covariance of constraints does not have full '
ANOVA for Q7 (HIV status interactions):
sum_sq df F PR(>F)
C(HIV_STAT) NaN 1.0 NaN NaN
C(GENDER) 0.406088 1.0 1.617649 2.034348e-01
C(RACETHN) 34.499272 4.0 34.356930 1.260325e-28
C(SEXUALITY) 0.115167 3.0 0.152922 8.581976e-01
C(HIV_STAT):C(GENDER) 0.869957 1.0 3.465470 6.267736e-02
C(HIV_STAT):C(RACETHN) 1.962703 4.0 1.954605 9.850534e-02
C(HIV_STAT):C(SEXUALITY) 0.208720 3.0 0.277146 7.579471e-01
Residual 5016.447647 19983.0 NaN NaN
ANOVA for Q8 (HIV status interactions):
sum_sq df F PR(>F)
C(HIV_STAT) NaN 1.0 NaN NaN
C(GENDER) 18.214321 1.0 69.728470 7.243990e-17
C(RACETHN) 30.396160 4.0 29.090815 3.758651e-24
C(SEXUALITY) 7.216094 3.0 9.208271 1.006330e-04
C(HIV_STAT):C(GENDER) 0.004230 1.0 0.016195 8.987378e-01
C(HIV_STAT):C(RACETHN) 1.005566 4.0 0.962382 4.267776e-01
C(HIV_STAT):C(SEXUALITY) 0.215740 3.0 0.275300 7.593473e-01
Residual 5219.916239 19983.0 NaN NaN
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1, but rank is 0
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1923: RuntimeWarning: invalid value encountered in divide
F /= J
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.12/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 3, but rank is 2
warnings.warn('covariance of constraints does not have full '
InĀ [28]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
# List of your harm questions
questions = ["Q1","Q2","Q3","Q4","Q5","Q6","Q7","Q8"]
# We'll loop over each question and fit a model like:
# QX ~ C(HIV_STAT)*C(GENDER) + C(HIV_STAT)*C(RACETHN) + C(HIV_STAT)*C(SEXUALITY)
for q in questions:
formula = (
f"{q} ~ C(PREG_STAT)*C(OWNGUN_GSS) "
f"+ C(PREG_STAT)*C(EDUCCAT5) "
f"+ C(PREG_STAT)*C(AGE_INT)"
)
model = ols(formula, data=df_final).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(f"ANOVA for {q} (Pregnancy):")
print(anova_table)
print("\n")
ANOVA for Q1 (Pregnancy):
sum_sq df F PR(>F)
C(PREG_STAT) 4.095558 2.0 7.190990 7.552962e-04
C(OWNGUN_GSS) 48.686100 1.0 170.966305 6.577359e-39
C(EDUCCAT5) 16.195391 4.0 14.217951 1.366747e-11
C(AGE_INT) 217.003435 7.0 108.861449 3.168010e-157
C(PREG_STAT):C(OWNGUN_GSS) 0.019176 2.0 0.033669 9.668912e-01
C(PREG_STAT):C(EDUCCAT5) 11.344730 8.0 4.979775 3.483861e-06
C(PREG_STAT):C(AGE_INT) 11.206338 14.0 2.810873 3.248167e-04
Residual 5684.296889 19961.0 NaN NaN
ANOVA for Q2 (Pregnancy):
sum_sq df F PR(>F)
C(PREG_STAT) 24.899177 2.0 45.243136 2.486321e-20
C(OWNGUN_GSS) 0.514457 1.0 1.869590 1.715374e-01
C(EDUCCAT5) 1.714461 4.0 1.557634 1.826261e-01
C(AGE_INT) 78.603977 7.0 40.807894 1.854515e-57
C(PREG_STAT):C(OWNGUN_GSS) 2.241790 2.0 4.073452 1.703269e-02
C(PREG_STAT):C(EDUCCAT5) 14.606735 8.0 6.635305 1.068917e-08
C(PREG_STAT):C(AGE_INT) 21.944858 14.0 5.696426 3.335045e-11
Residual 5492.683668 19961.0 NaN NaN
ANOVA for Q3 (Pregnancy):
sum_sq df F PR(>F)
C(PREG_STAT) 5.711883 2.0 11.976923 6.332961e-06
C(OWNGUN_GSS) 0.002498 1.0 0.010477 9.184737e-01
C(EDUCCAT5) 9.624765 4.0 10.090813 3.708555e-08
C(AGE_INT) 72.944647 7.0 43.701000 1.017678e-61
C(PREG_STAT):C(OWNGUN_GSS) 2.349021 2.0 4.925528 7.267716e-03
C(PREG_STAT):C(EDUCCAT5) 5.655299 8.0 2.964569 2.566736e-03
C(PREG_STAT):C(AGE_INT) 13.800257 14.0 4.133854 2.832743e-07
Residual 4759.773744 19961.0 NaN NaN
ANOVA for Q4 (Pregnancy):
sum_sq df F PR(>F)
C(PREG_STAT) 26.198360 2.0 49.933199 2.335363e-22
C(OWNGUN_GSS) 13.570312 1.0 51.729125 6.596279e-13
C(EDUCCAT5) 25.658131 4.0 24.451771 3.231299e-20
C(AGE_INT) 48.503461 7.0 26.413136 2.698495e-36
C(PREG_STAT):C(OWNGUN_GSS) 4.833991 2.0 9.213426 1.001165e-04
C(PREG_STAT):C(EDUCCAT5) 15.703446 8.0 7.482561 5.148189e-10
C(PREG_STAT):C(AGE_INT) 20.042889 14.0 5.457297 1.377490e-10
Residual 5236.450585 19961.0 NaN NaN
ANOVA for Q5 (Pregnancy):
sum_sq df F PR(>F)
C(PREG_STAT) 15.439003 2.0 27.264287 1.497663e-12
C(OWNGUN_GSS) 10.606511 1.0 37.460834 9.500684e-10
C(EDUCCAT5) 0.264859 4.0 0.233862 9.194177e-01
C(AGE_INT) 37.935763 7.0 19.140605 1.136515e-25
C(PREG_STAT):C(OWNGUN_GSS) 8.393320 2.0 14.822064 3.695171e-07
C(PREG_STAT):C(EDUCCAT5) 22.745501 8.0 10.041773 4.487110e-14
C(PREG_STAT):C(AGE_INT) 15.484201 14.0 3.906300 1.002161e-06
Residual 5651.678040 19961.0 NaN NaN
ANOVA for Q6 (Pregnancy):
sum_sq df F PR(>F)
C(PREG_STAT) 0.029660 2.0 0.052414 9.489358e-01
C(OWNGUN_GSS) 0.337160 1.0 1.191635 2.750132e-01
C(EDUCCAT5) 45.369680 4.0 40.087821 1.679842e-33
C(AGE_INT) 77.746520 7.0 39.254518 3.593111e-55
C(PREG_STAT):C(OWNGUN_GSS) 2.941062 2.0 5.197338 5.538758e-03
C(PREG_STAT):C(EDUCCAT5) 3.376778 8.0 1.491830 1.542246e-01
C(PREG_STAT):C(AGE_INT) 19.467936 14.0 4.914718 3.304355e-09
Residual 5647.751318 19961.0 NaN NaN
ANOVA for Q7 (Pregnancy):
sum_sq df F PR(>F)
C(PREG_STAT) 0.280737 2.0 0.572010 5.643991e-01
C(OWNGUN_GSS) 30.722838 1.0 125.197605 5.619283e-29
C(EDUCCAT5) 16.087573 4.0 16.389482 2.064751e-13
C(AGE_INT) 70.312616 7.0 40.932655 1.214827e-57
C(PREG_STAT):C(OWNGUN_GSS) 0.619657 2.0 1.262571 2.829482e-01
C(PREG_STAT):C(EDUCCAT5) 7.443571 8.0 3.791631 1.859487e-04
C(PREG_STAT):C(AGE_INT) 24.180292 14.0 7.038307 9.930476e-15
Residual 4898.325020 19961.0 NaN NaN
ANOVA for Q8 (Pregnancy):
sum_sq df F PR(>F)
C(PREG_STAT) 18.975797 2.0 36.741649 1.181962e-16
C(OWNGUN_GSS) 21.629464 1.0 83.759560 6.112023e-20
C(EDUCCAT5) 36.710777 4.0 35.540392 1.242215e-29
C(AGE_INT) 9.243550 7.0 5.113631 8.018321e-06
C(PREG_STAT):C(OWNGUN_GSS) 0.341448 2.0 0.661124 5.162820e-01
C(PREG_STAT):C(EDUCCAT5) 22.163422 8.0 10.728412 3.530147e-15
C(PREG_STAT):C(AGE_INT) 4.884761 14.0 1.351151 1.683065e-01
Residual 5154.584575 19961.0 NaN NaN
InĀ [29]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
def plot_linear_per_question(df_final, score_types, score_type_to_question, save=False, outdir=".", fontsize=12, height=6, aspect=1.5, dpi=100, save_dpi=300, xlim=(16, 40)):
pdf = None
if save:
pdf_path = f"{outdir}/LinearPlots.pdf"
pdf = PdfPages(pdf_path)
for score_type in score_types:
grouped_scores = df_final.groupby(['AGE', 'PREG_STAT'])[score_type].median().reset_index()
g = sns.lmplot(x='AGE', y=score_type, data=grouped_scores, hue='PREG_STAT', height=height, aspect=aspect, legend_out=True)
g.set_axis_labels('Age', score_type)
g.set(xlim=xlim)
g.fig.set_dpi(dpi)
g.fig.suptitle(f"Mean {score_type_to_question[score_type]} vs. Age colored by Pregnancy Status", fontsize=fontsize)
g.fig.subplots_adjust(top=0.9)
if save:
pdf.savefig(g.fig, bbox_inches="tight", dpi=save_dpi)
plt.show()
if save:
pdf.close()
print(f"Saved {pdf_path}")
score_types = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]
score_type_to_question = {
'Q1': 'Question 1',
'Q2': 'Question 2',
'Q3': 'Question 3',
'Q4': 'Question 4',
'Q5': 'Question 5',
'Q6': 'Question 6',
'Q7': 'Question 7',
'Q8': 'Question 8'
}
plot_linear_per_question(df_final, score_types, score_type_to_question, save=True, outdir=".")
/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`. warnings.warn(msg, UserWarning)
/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`. warnings.warn(msg, UserWarning)
/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`. warnings.warn(msg, UserWarning)
/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`. warnings.warn(msg, UserWarning)
/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`. warnings.warn(msg, UserWarning)
/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`. warnings.warn(msg, UserWarning)
/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`. warnings.warn(msg, UserWarning)
/usr/local/lib/python3.12/dist-packages/seaborn/regression.py:598: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`. warnings.warn(msg, UserWarning)
Saved ./LinearPlots.pdf
InĀ [30]:
import itertools
import scipy.stats as stats
score_types = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]
# Loop over each harm question
for score in score_types:
print(f"\n=== T-test Results for {score} ===")
# 1) Pairwise t-tests by HIV_STAT
print("Comparisons by PREG_STAT:")
hiv_groups = df_final['PREG_STAT'].dropna().unique()
for grp1, grp2 in itertools.combinations(hiv_groups, 2):
data1 = df_final.loc[df_final['PREG_STAT'] == grp1, score].dropna()
data2 = df_final.loc[df_final['PREG_STAT'] == grp2, score].dropna()
# Ensure both groups have at least 2 values to run a t-test
if len(data1) >= 2 and len(data2) >= 2:
t_stat, p_val = stats.ttest_ind(data1, data2, equal_var=False)
print(f" {grp1} vs {grp2}: t = {t_stat:.3f}, p = {p_val:.3e}")
else:
print(f" {grp1} vs {grp2}: Not enough data.")
=== T-test Results for Q1 === Comparisons by PREG_STAT: Not Applicable vs Negative: t = -1.945, p = 5.175e-02 Not Applicable vs Positive: t = -0.950, p = 3.422e-01 Negative vs Positive: t = -0.174, p = 8.622e-01 === T-test Results for Q2 === Comparisons by PREG_STAT: Not Applicable vs Negative: t = -3.497, p = 4.712e-04 Not Applicable vs Positive: t = -9.526, p = 1.230e-20 Negative vs Positive: t = -8.049, p = 2.362e-15 === T-test Results for Q3 === Comparisons by PREG_STAT: Not Applicable vs Negative: t = 5.203, p = 1.979e-07 Not Applicable vs Positive: t = -0.162, p = 8.712e-01 Negative vs Positive: t = -2.197, p = 2.829e-02 === T-test Results for Q4 === Comparisons by PREG_STAT: Not Applicable vs Negative: t = -8.949, p = 3.901e-19 Not Applicable vs Positive: t = -2.173, p = 3.005e-02 Negative vs Positive: t = 1.632, p = 1.029e-01 === T-test Results for Q5 === Comparisons by PREG_STAT: Not Applicable vs Negative: t = -6.842, p = 8.036e-12 Not Applicable vs Positive: t = -3.514, p = 4.620e-04 Negative vs Positive: t = -0.813, p = 4.167e-01 === T-test Results for Q6 === Comparisons by PREG_STAT: Not Applicable vs Negative: t = 0.347, p = 7.283e-01 Not Applicable vs Positive: t = 0.017, p = 9.864e-01 Negative vs Positive: t = -0.127, p = 8.988e-01 === T-test Results for Q7 === Comparisons by PREG_STAT: Not Applicable vs Negative: t = 1.422, p = 1.549e-01 Not Applicable vs Positive: t = -0.314, p = 7.533e-01 Negative vs Positive: t = -0.875, p = 3.819e-01 === T-test Results for Q8 === Comparisons by PREG_STAT: Not Applicable vs Negative: t = -8.060, p = 8.099e-16 Not Applicable vs Positive: t = -3.962, p = 7.981e-05 Negative vs Positive: t = -0.708, p = 4.789e-01