Deployment id: AKfycbwcTkewNGFh14BRe0JGe7zlQNexU1fnZnageLzR3BRCFb6ma8U5_IR8LyrCLCApB8Hs_g

In [ ]:
# Frequently used constants
cluster_total = 14
survey_total = 76
In [ ]:
from google.colab import auth
from google.oauth2 import service_account
from googleapiclient.discovery import build

# Authenticate and authorize using a service account
auth.authenticate_user()
In [ ]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd

scope = ['https://www.googleapis.com/auth/spreadsheets', 'https://www.googleapis.com/auth/drive.file', 'https://www.googleapis.com/auth/drive']
creds = ServiceAccountCredentials.from_json_keyfile_name('credentials.json', scope)
client = gspread.authorize(creds)
In [ ]:
spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1CatLlB4axdBW0uJRoioR-x00RsPzQ54CFYq1PqzgV7E/edit?usp=drive_web&ouid=107190373735789167396'
sheet = client.open_by_url(spreadsheet_url)
worksheet = sheet.get_worksheet(0)
data = worksheet.get_all_records()
df = pd.DataFrame(data)

df.head()
Out[ ]:
NAME id GENDER AGE RACETHN EDUCCAT5 DIVISION MARITAL_ACS HHSIZECAT ... CI_LABEL_OWNGUN_GSS CI_LABEL_SEXUALITY CI_LABEL_HIV_STAT CI_LABEL_PREG_STAT CI_LABEL_CC_NUM CI_LABEL_cc_encoded CI_LABEL_cc_disclosed CI_LABEL_NumChronicIllness AGE_INT Cluster
0 0 Luke Walsh 1 Male 25 White non-Hispanic Some college Mountain Never married 3+ ... is is is probably is probably is is probably is possibly is probably 20-29 5
1 1 Matilde Izaguirre Checa 2 Female 70 Hispanic HS Grad West South Central Divorced 1 ... is probably is possibly is is is is is is possibly 70-79 11
2 2 Ryan Smith 3 Male 85 White non-Hispanic Less than HS Middle Atlantic Now married 2 ... is possibly is probably is probably is is probably is probably is probably is probably 80-89 2
3 3 Matthew Grimes 4 Male 59 White non-Hispanic HS Grad Mountain Now married 2 ... is probably is probably is probably is is possibly is probably is is probably 50-59 12
4 4 Miraan Rama 5 Female 19 Asian Some college Pacific Never married 1 ... is probably is is probably is is probably is probably is probably is 10-19 9

5 rows × 187 columns

In [ ]:
# Mute it after creating the representative data set
'''
# Select 152 profiles from each cluster evenly
import pandas as pd
import itertools
from googleapiclient.discovery import build

selected_data = []

for cluster_id in range(cluster_total):
    cluster_data = df[df['Cluster'] == cluster_id]
    selected_cluster_data = []

    if len(selected_cluster_data) < 152:
        selected_cluster_data += cluster_data.sample(n=152 - len(selected_cluster_data), random_state=42).to_dict('records')

    selected_data.extend(selected_cluster_data)

selected_df = pd.DataFrame(selected_data)
'''
In [ ]:
for cluster_id in range(cluster_total):
    cluster_data = df[df['Cluster'] == cluster_id]

    # Separate out subsets of interest
    hiv_positive_data = cluster_data[cluster_data['HIV_STAT'] == 'positive']
    pregnant_data = cluster_data[cluster_data['PREG_STAT'] == 'Pregnant']

    # Start building our selection for this cluster in a list
    selection_list = []

    # 1) If HIV+ data exists, pick at least 1 row for sure
    hiv_count = min(4, len(hiv_positive_data))  # up to 4
    if hiv_count > 0:
        selected_hiv = hiv_positive_data.sample(n=hiv_count, random_state=42)
        selection_list.append(selected_hiv)

    # 2) If pregnant data exists, pick at least 1 row for sure
    pregnant_count = min(4, len(pregnant_data))
    if pregnant_count > 0:
        selected_pregnant = pregnant_data.sample(n=pregnant_count, random_state=43)
        # Only add if not the same row (in case one row is both HIV+ & Pregnant):
        if not selected_pregnant.index.isin(selected_hiv.index).all():
            selection_list.append(selected_pregnant)

        # Only add if it's not the exact same row as the HIV+ one selected
        # (In case the same participant is both HIV+ and Pregnant)
        if not selected_pregnant.index.isin(selected_hiv.index).all():
            selection_list.append(selected_pregnant)

    # Convert the forced picks into one DataFrame
    forced_picks = pd.concat(selection_list).drop_duplicates()

    remaining_needed = 152 - forced_picks.shape[0]
    if remaining_needed < 0:
        remaining_needed = 0

    # Exclude forced_picks from the random pool
    leftover_pool = cluster_data.drop(forced_picks.index)

    if leftover_pool.shape[0] < remaining_needed:
        print(f"Not enough rows in cluster {cluster_id} to meet the 152-row requirement.")
        selected_cluster_data = pd.concat([forced_picks, leftover_pool])
    else:
        selected_cluster_data = pd.concat([
            forced_picks,
            leftover_pool.sample(n=remaining_needed, random_state=44)
        ])

    # Finally extend your overall selection
    selected_data.extend(selected_cluster_data.to_dict('records'))
In [ ]:
#see all HIV positive people
selected_df[selected_df['PREG_STAT'] == 'Negative']
Out[ ]:
NAME id GENDER AGE RACETHN EDUCCAT5 DIVISION MARITAL_ACS HHSIZECAT ... CI_LABEL_OWNGUN_GSS CI_LABEL_SEXUALITY CI_LABEL_HIV_STAT CI_LABEL_PREG_STAT CI_LABEL_CC_NUM CI_LABEL_cc_encoded CI_LABEL_cc_disclosed CI_LABEL_NumChronicIllness AGE_INT Cluster
0 3413 Karen Campbell 3414 Female 85 White non-Hispanic HS Grad East South Central Widowed 1 ... is probably is probably is probably is probably is probably is probably is probably is possibly 80-89 0
1 10657 Lori Delacruz 10658 Female 80 White non-Hispanic HS Grad South Atlantic Widowed 1 ... is possibly is is probably is is probably is probably is probably is probably 80-89 0
2 1669 Shannon Mcdonald 1670 Female 80 White non-Hispanic Some college South Atlantic Widowed 2 ... is probably is is possibly is probably is probably is is probably is 80-89 0
4 2285 Jennifer Phillips 2286 Female 80 White non-Hispanic HS Grad South Atlantic Divorced 2 ... is probably is possibly is probably is probably is possibly is probably is possibly is probably 80-89 0
6 7536 Alondra Benavente Sales 7537 Female 42 Hispanic Some college South Atlantic Now married 3+ ... is probably is probably is is probably is possibly is is is possibly 40-49 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2122 5102 Kathleen Chaney 5103 Female 20 White non-Hispanic Some college New England Never married 3+ ... is is probably is probably is is is is probably is probably 20-29 13
2123 12570 Heather Anderson 12571 Female 26 White non-Hispanic College grad Middle Atlantic Never married 1 ... is probably is probably is is probably is possibly is is probably is probably 20-29 13
2125 4852 Lisa Lang 4853 Female 24 White non-Hispanic HS Grad West North Central Never married 3+ ... is probably is unlikely but might be is probably is possibly is probably is is probably is 20-29 13
2126 13247 Kayla Mcdonald 13248 Female 21 White non-Hispanic Some college South Atlantic Never married 3+ ... is is probably is unlikely but might be is possibly is is probably is probably is possibly 20-29 13
2127 14580 Cindy Hodges 14581 Female 24 White non-Hispanic Some college Pacific Never married 3+ ... is possibly is probably is is unlikely but might be is possibly is is is probably 20-29 13

1004 rows × 187 columns

In [ ]:
# Mute it after creating the representative sheet
'''
new_sheet_title = 'representative_set'
new_spreadsheet = client.create(new_sheet_title)
new_worksheet = new_spreadsheet.get_worksheet(0)
new_worksheet.update_title('selected_data')
new_worksheet.update([selected_df.columns.values.tolist()] + selected_df.values.tolist())
spreadsheet_id = new_spreadsheet.id
drive_service = build('drive', 'v3', credentials=creds)

# Move the spreadsheet to the folder
try:
    drive_service.files().update(
        fileId=spreadsheet_id,
        addParents='1uJa_ZyNHqZn-YBju4kRre0HBfbcUWCSQ',
        removeParents='root',
        fields='id, parents'
    ).execute()
    print(f"New sheet '{new_sheet_title}' created and moved to the folder.")
except Exception as e:
    print(f"An error occurred while moving the file: {e}")
'''
New sheet 'representative_set' created and moved to the folder.
In [ ]:
# Added: representative sheet access
rpt_sheet_url = 'https://docs.google.com/spreadsheets/d/1OGcWVmFb5ulAp9UEi8BV5zShPU3SP-gSEjhSllZQqb0/edit?gid=0#gid=0'
rpt_sheet = client.open_by_url(rpt_sheet_url)

worksheet = rpt_sheet.get_worksheet(0)
data = worksheet.get_all_records()
df = pd.DataFrame(data)

df.head()
Out[ ]:
NAME id GENDER AGE RACETHN EDUCCAT5 DIVISION MARITAL_ACS HHSIZECAT ... CI_LABEL_OWNGUN_GSS CI_LABEL_SEXUALITY CI_LABEL_HIV_STAT CI_LABEL_PREG_STAT CI_LABEL_CC_NUM CI_LABEL_cc_encoded CI_LABEL_cc_disclosed CI_LABEL_NumChronicIllness AGE_INT Cluster
0 3413 Karen Campbell 3414 Female 85 White non-Hispanic HS Grad East South Central Widowed 1 ... is probably is probably is probably is probably is probably is probably is probably is possibly 80-89 0
1 10657 Lori Delacruz 10658 Female 80 White non-Hispanic HS Grad South Atlantic Widowed 1 ... is possibly is is probably is is probably is probably is probably is probably 80-89 0
2 1669 Shannon Mcdonald 1670 Female 80 White non-Hispanic Some college South Atlantic Widowed 2 ... is probably is is possibly is probably is probably is is probably is 80-89 0
3 15793 Albina Cobo Manzano 15794 Female 36 Hispanic College grad Pacific Now married 2 ... is is probably is probably is probably is is probably is probably is 30-39 0
4 2285 Jennifer Phillips 2286 Female 80 White non-Hispanic HS Grad South Atlantic Divorced 2 ... is probably is possibly is probably is probably is possibly is probably is possibly is probably 80-89 0

5 rows × 187 columns

In [ ]:
from google.colab import drive
drive.mount('/content/drive')

# Initialize a set to store unique IDs
unique_ids = set()
Mounted at /content/drive
In [ ]:
import pandas as pd
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import random

selected_ids = []
In [ ]:
# Mute it after creating worksheets
'''
# Added: Create 76 surveys, with each containing profiles of different clusters
import time

main_worksheet = rpt_sheet.worksheet("selected_data")
all_data = main_worksheet.get_all_records()

df = pd.DataFrame(all_data)

selected_indices = set()

for survey_num in range(survey_total):
    survey_sheet_name = f"Quiz_{survey_num + 1}"
    survey_data = []

    try:
        try:
            survey_worksheet = rpt_sheet.worksheet(survey_sheet_name)
            survey_worksheet.clear()
        except gspread.exceptions.WorksheetNotFound:
            rpt_sheet.add_worksheet(title=survey_sheet_name, rows="15", cols="20")
            survey_worksheet = rpt_sheet.worksheet(survey_sheet_name)

        survey_data = []
        for cluster_id in range(cluster_total):
            cluster_data = df[df['Cluster'] == cluster_id]
            available_rows = cluster_data[~cluster_data.index.isin(selected_indices)]
            if len(available_rows) >= 2:
                selected_rows = available_rows.sample(n=2)
                survey_data.append(selected_rows)
                selected_indices.update(selected_rows.index)
            elif not available_rows.empty:
                selected_row = available_rows.sample(n=1)
                survey_data.append(selected_row)
                selected_indices.add(selected_row.index[0])

        header_row = df.columns.tolist()
        all_sheet_data = [header_row] + [list(row) for row in pd.concat(survey_data).itertuples(index=False, name=None)]

        survey_worksheet.update('A1', all_sheet_data)

    except gspread.exceptions.APIError as e:
        print(f"API Error for sheet {survey_sheet_name}: {e}")
        # Add program delay to avoid exceeding API limit
        time.sleep(5)
        continue

    except Exception as e:
        print(f"Unexpected error for sheet {survey_sheet_name}: {e}")
        break
    # Program delay
    time.sleep(2)

print("Worksheet created.")
'''
<ipython-input-23-6fb946d17020>:41: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  survey_worksheet.update('A1', all_sheet_data)
Worksheet created.
In [ ]:
import time
import gspread
import pandas as pd

initial_information = ["NAME", "id"]
base_features = [
    "GENDER", "RACETHN", "EDUCCAT5", "DIVISION", "MARITAL_ACS",
    "CHILDRENCAT", "CITIZEN_REC", "BORN_ACS", "AGE_INT"
]
health_features = ["HIV_STAT", "PREG_STAT", "NumChronicIllness"]
finance_features = ["FAMINC5", "CC_NUM", "FDSTMP_CPS"]
sensitive_features = ["SEXUALITY", "OWNGUN_GSS", "RELIGCAT"]

def delete_unnecessary_columns(sheet_name, visible_columns, condition_label):
    worksheet = rpt_sheet.worksheet(sheet_name)
    data = worksheet.get_all_records()
    df = pd.DataFrame(data)

    df['Condition'] = condition_label

    if 'Condition' not in visible_columns:
        visible_columns.append('Condition')

    columns_to_keep = [col for col in visible_columns if col in df.columns]

    if columns_to_keep:
        print(f"Sheet '{sheet_name}' - Unnecessary columns discovered and will be deleted.")
        df = df[columns_to_keep]
        worksheet.clear()
        worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
    else:
        print(f"Sheet '{sheet_name}' - Unnecessary columns may have already been deleted.")

for survey_num in range(survey_total):
    survey_sheet_name = f"Quiz_{survey_num + 1}"

    if survey_num <= 18:
        condition_label = "Control"
        visible_columns = initial_information + base_features
    elif survey_num <= 37:
        condition_label = "Health"
        visible_columns = initial_information + base_features + health_features
    elif survey_num <= 56:
        condition_label = "Finance"
        visible_columns = initial_information + base_features + finance_features
    else:
        condition_label = "Sensitive"
        visible_columns = initial_information + base_features + sensitive_features

    try:
        delete_unnecessary_columns(survey_sheet_name, visible_columns, condition_label)

        time.sleep(2)
    except gspread.exceptions.APIError as e:
        if 'RATE_LIMIT_EXCEEDED' in str(e):
            print("Rate limit exceeded. Waiting for 5 seconds before retrying...")
            time.sleep(5)
            delete_unnecessary_columns(survey_sheet_name, visible_columns, condition_label)
        else:
            print(f"Unexpected error while processing {survey_sheet_name}: {e}")
            break

print("Necessary columns have been selected.")
Sheet 'Quiz_1' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_2' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_3' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_4' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_5' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_6' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_7' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_8' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_9' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_10' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_11' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_12' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_13' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_14' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_15' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_16' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_17' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_18' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_19' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_20' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_21' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_22' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_23' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_24' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_25' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_26' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_27' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_28' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_29' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_30' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_31' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_32' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_33' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_34' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_35' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_36' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_37' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_38' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_39' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_40' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_41' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_42' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_43' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_44' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_45' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_46' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_47' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_48' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_49' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_50' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_51' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_52' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_53' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_54' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_55' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_56' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_57' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_58' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_59' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_60' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_61' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_62' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_63' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_64' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_65' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_66' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_67' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_68' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_69' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_70' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_71' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_72' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_73' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_74' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_75' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Sheet 'Quiz_76' - Unnecessary columns discovered and will be deleted.
<ipython-input-25-b00a5a2258ad>:30: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  worksheet.update('A1', [df.columns.tolist()] + df.values.tolist())
Necessary columns have been selected.
In [ ]:
# Add corresponding conditions of each profile in every survey within representative data set to the last column
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
import time

scope = ['https://www.googleapis.com/auth/spreadsheets',
         'https://www.googleapis.com/auth/drive.file',
         'https://www.googleapis.com/auth/drive']
creds = ServiceAccountCredentials.from_json_keyfile_name('credentials.json', scope)
client = gspread.authorize(creds)

rpt_sheet_url = 'https://docs.google.com/spreadsheets/d/1OGcWVmFb5ulAp9UEi8BV5zShPU3SP-gSEjhSllZQqb0/edit?gid=352691556#gid=352691556'
sheet = client.open_by_url(rpt_sheet_url)

sheet_names = [f"Quiz_{i}" for i in range(1, 41)]

for sheet_name in sheet_names:
    worksheet = sheet.worksheet(sheet_name)

    data = worksheet.get_all_records()

    df = pd.DataFrame(data)

    if 'Condition' not in df.columns:
        if 1 <= int(sheet_name.split('_')[1]) <= 10:
            df['Condition'] = 'Base'
        elif 11 <= int(sheet_name.split('_')[1]) <= 20:
            df['Condition'] = 'Base + Health'
        elif 21 <= int(sheet_name.split('_')[1]) <= 30:
            df['Condition'] = 'Base + Finance'
        elif 31 <= int(sheet_name.split('_')[1]) <= 40:
            df['Condition'] = 'Base + Sensitive'

        header = df.columns.tolist()
        worksheet.update([header] + df.values.tolist())
        print(f"Updated {sheet_name} with 'Condition' column.")

    time.sleep(2)

print("Conditions have been added to each profile in every survey.")
In [ ]:
# Integrating each profile and condition to another sheet called integration.
# It moves values intermittently to minimize the probability of API runtime error occurrence.
import time
import gspread
from oauth2client.service_account import ServiceAccountCredentials

scope = ['https://www.googleapis.com/auth/spreadsheets',
         'https://www.googleapis.com/auth/drive.file',
         'https://www.googleapis.com/auth/drive']
creds = ServiceAccountCredentials.from_json_keyfile_name('credentials.json', scope)
client = gspread.authorize(creds)

rpt_sheet_url = 'https://docs.google.com/spreadsheets/d/1OGcWVmFb5ulAp9UEi8BV5zShPU3SP-gSEjhSllZQqb0/edit?gid=352691556#gid=352691556'
spreadsheet = client.open_by_url(rpt_sheet_url)

try:
    integration_sheet = spreadsheet.add_worksheet(title="integration", rows="1000", cols="26")  # Default 26 columns for flexibility
except Exception as e:
    integration_sheet = spreadsheet.worksheet("integration")

integration_sheet.update("A1:B1000", [[""]] * 1000)
integration_sheet.update('A1', [["id", "Condition"]])

row_index = 2

for i in range(66, 77): # If API runtime error occurs, delete the incomplete area and adjust the range
    sheet_name = f"Quiz_{i}"
    quiz_sheet = spreadsheet.worksheet(sheet_name)
    data = quiz_sheet.get_all_values()[1:]
    rows_to_add = []
    for row in data:
        rows_to_add.append([row[1], row[-1]])
    cell_range = f"A{row_index}:B{row_index + len(rows_to_add) - 1}"
    try:
        integration_sheet.update(cell_range, rows_to_add)
    except Exception as e:
        time.sleep(5)
        integration_sheet.update(cell_range, rows_to_add)
    row_index += len(rows_to_add)
    time.sleep(1)
<ipython-input-31-4840b0e9d117>:21: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  integration_sheet.update("A1:B1000", [[""]] * 1000)
<ipython-input-31-4840b0e9d117>:22: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  integration_sheet.update('A1', [["id", "Condition"]])
<ipython-input-31-4840b0e9d117>:35: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  integration_sheet.update(cell_range, rows_to_add)
In [ ]:
# Add corresponding conditions of each profile in Sheet 3 of update counter file to the last column
# Changing less than 5 surveys each time running can minimize API error probability
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import time

scope = ['https://www.googleapis.com/auth/spreadsheets', 'https://www.googleapis.com/auth/drive.file', 'https://www.googleapis.com/auth/drive']
creds = ServiceAccountCredentials.from_json_keyfile_name('credentials.json', scope)
client = gspread.authorize(creds)

rpt_sheet_url = 'https://docs.google.com/spreadsheets/d/1OGcWVmFb5ulAp9UEi8BV5zShPU3SP-gSEjhSllZQqb0/edit?gid=0#gid=0'
update_sheet_url = 'https://docs.google.com/spreadsheets/d/1OSoeG07uhYDsYEUP38Rc6bMwJySvflqGWfryTjx1gIo/edit?gid=2023416044#gid=2023416044'

rpt_sheet = client.open_by_url(rpt_sheet_url)
update_sheet = client.open_by_url(update_sheet_url).worksheet("Sheet4")

integration_sheet = rpt_sheet.worksheet("integration")

integration_data = integration_sheet.get_all_values()[1:]
integration_dict = {str(row[0]): row[1] for row in integration_data}

sheet3_data = update_sheet.get_all_values()
header = sheet3_data[0]
rows = sheet3_data[1:]

if 'Condition' not in header:
    header.append('Condition')
    update_sheet.update('A1', [header])

updated_rows = []
for row in rows:
    person_id = str(row[0])
    condition = integration_dict.get(person_id, 'Unknown')
    if len(row) < len(header):
        row.append(condition)
    else:
        row[-1] = condition
    updated_rows.append(row)

update_sheet.update(f"A2:{chr(64 + len(header))}{len(rows) + 1}", updated_rows)

print("Conditions have been added to the Sheet4 of update counter table.")
<ipython-input-4-9c58a3e09918>:28: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  update_sheet.update('A1', [header])
<ipython-input-4-9c58a3e09918>:40: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  update_sheet.update(f"A2:{chr(64 + len(header))}{len(rows) + 1}", updated_rows)
Conditions have been added to the Sheet4 of update counter table.
In [ ]:
def create_description(data_row):
    # Basic features
    name = data_row['NAME']
    id = data_row['id']

    gender = data_row['GENDER'].lower()
    if gender == 'male':
        pronoun = 'He'
        pronoun2 = 'His'
    else:
        pronoun = 'She'
        pronoun2 = 'Her'

    racethn = data_row['RACETHN']
    if racethn == "Other race":
        racethn = "unidentified"

    educcat5 = data_row['EDUCCAT5'].lower()
    if educcat5 == 'less than hs':
        educcat5 = 'a less than high school'
    elif educcat5 == 'hs grad':
        educcat5 = 'a high school graduate'
    elif educcat5 == 'some college':
        educcat5 = 'some college'
    elif educcat5 == 'College grad':
        educcat5 = 'a college graduate'
    else:
        educcat5 = 'a postgraduate'

    marital_acs = data_row['MARITAL_ACS'].lower()
    if marital_acs == 'never married':
        marital_acs = 'has never married'
    elif marital_acs == 'now married':
        marital_acs = 'is now married'
    else:
        marital_acs = 'is divorced'

    childrencat = data_row['CHILDRENCAT'].lower()
    if childrencat == "no children":
        childrencat = "doesn't have"
    else:
        childrencat = "has"

    born_acs = data_row['BORN_ACS']
    if born_acs == 'Inside the United States':
      born_acs = 'inside the U.S.'
    else:
      born_acs = 'outside the U.S.'

    age_int = data_row['AGE_INT']

    citizen_rec = data_row['CITIZEN_REC'].lower()
    if citizen_rec == 'Yes, a U.S. citizen':
        citizen_rec = "US citizen"
    else:
        citizen_rec = "non-US citizen"

    division = data_row['DIVISION']

    division_to_common_name = {
    'Pacific': 'the West Coast',
    'Mountain': 'the Mountain States',
    'West North Central': 'the Upper Midwest',
    'West South Central': 'the South Central',
    'East North Central': 'the Great Lakes region',
    'East South Central': 'the Deep South',
    'South Atlantic': 'the South',
    'Middle Atlantic': 'the Northeast',
    'New England': 'New England'
}

    if division in division_to_common_name:
        division = division_to_common_name[division]
    else:
        division = ""

    base_description = (
        f"{name} has the following attributes:\r"
        f"• {pronoun2} age is between {age_int} years.\r"
        f"• {pronoun} is of {racethn} descent.\r"
        f"• {pronoun2} gender is {gender}.\r"
        f"• {pronoun} has {educcat5} level education.\r"
        f"• {pronoun} {marital_acs}.\r"
        f"• {pronoun} {childrencat} children.\r"
        f"• {pronoun} is a {citizen_rec}.\r"
        f"• {pronoun} was born {born_acs}\r"
        f"• {pronoun} resides in {division} region of the U.S.\r"
    )

    # Health features
    hiv = data_row.get('HIV_STAT', None)
    if hiv:
        hiv = hiv.lower()

    pregnancy = data_row.get('PREG_STAT', None)
    if pregnancy:
        if gender == 'male':
          pregnancy = ""
        else:
            pregnancy = data_row.get('PREG_STAT', None)
            if pregnancy == 'Positive':
                pregnancy = "• She is pregnant.\r"
            else:
                pregnancy = "• She is not pregnant.\r"

    num_chronic_illness = data_row.get('NumChronicIllness', None)
    if isinstance(num_chronic_illness, (int, float)):
        num_chronic_illness = str(num_chronic_illness)
        num_chronic_illness = num_chronic_illness.lower()
        if num_chronic_illness == '0':
            chronill = 'not chronically ill'
        else:
            chronill = 'chronically ill'

    if hiv and num_chronic_illness:
        health_description = (
            f"• {pronoun2} HIV status is {hiv}.\r"
            f"{pregnancy}"
            f"• {pronoun} is {chronill}.\r"
        )
    else:
        health_description = ""

    # finance features
    faminc5 = data_row.get('FAMINC5', None)
    if faminc5:
        faminc5.lower()
        if faminc5 == '$20K to less than $40K':
            faminc5 = 'between $20K to less than $40K'
        elif faminc5 == '$40K to less than $75K':
            faminc5 = 'between $40K to less than $75K'
        elif faminc5 == '$75K to less than $150K':
            faminc5 = 'between $75K to less than $150K'
        else:
            None

    cred = data_row.get('CC_NUM', None)
    if cred:
        if isinstance(cred, (int, float)):
            cred = str(cred)
        cred = cred.lower()
        if cred != 0:
            cred = "has"
        else:
            cred = "does not have"

    fdstmp_cps =  data_row.get('FDSTMP_CPS', None)
    if fdstmp_cps == 'yes':
        fdstmp_cps = "currently receives foodstamps"
    else:
        fdstmp_cps = "does not currently receive foodstamps"

    if faminc5 and cred and fdstmp_cps:
        finance_description = (
            f"• {pronoun2} family's annual income is {faminc5}.\r"
            f"• {pronoun} {cred} a credit card.\r"
            f"• {pronoun} {fdstmp_cps}.\r"
        )
    else:
        finance_description = ""

    # sensitive features
    sexuality = data_row.get('SEXUALITY', None)
    if sexuality:
        sexuality = sexuality.lower()

    owngun_gss = data_row.get('OWNGUN_GSS', None)
    if owngun_gss == 'Yes':
        owngun_gss = "owns a gun"
    else:
        owngun_gss = "doesn't own a gun"

    religcat = data_row.get('RELIGCAT', None)

    if sexuality and religcat and owngun_gss:
        sensitive_description = (
            f"• {pronoun} is {sexuality}.\r"
            f"• {pronoun} {owngun_gss}.\r"
            f"• {pronoun} is {religcat}.\r"
        )
    else:
      sensitive_description = ""

    description = base_description + health_description + finance_description + sensitive_description

    return description
In [ ]:
# Mute it after creating forms and URLs
'''
from google.oauth2 import service_account
from googleapiclient.discovery import build
import gspread

# Authentication and service setup
service_account_key_path = 'credentials.json'
credentials = service_account.Credentials.from_service_account_file(
    service_account_key_path,
    scopes=['https://www.googleapis.com/auth/drive', 'https://www.googleapis.com/auth/spreadsheets']
)
client = gspread.authorize(credentials)
forms_service = build("forms", "v1", credentials=credentials)

# Open the source and destination spreadsheets
source_spreadsheet = client.open_by_url('https://docs.google.com/spreadsheets/d/1OGcWVmFb5ulAp9UEi8BV5zShPU3SP-gSEjhSllZQqb0/edit?gid=0#gid=0')
destination_spreadsheet = client.open_by_url('https://docs.google.com/spreadsheets/d/12fMUJYC8ZpayamsyzzpcRK8XJLE2WU-spBYLOTr-jBs/edit?gid=0#gid=0')
summary_worksheet = destination_spreadsheet.sheet1

# Prepare the destination worksheet
summary_worksheet.update('A1:B1', [["Quiz Name", "Form URL"]])

# Iterate through each survey, create forms, and log URLs
for i, worksheet in enumerate(source_spreadsheet.worksheets()[1:]):
    survey_name = worksheet.title
    form_body = {"info": {"title": f"Harm Scores Survey: {survey_name}"}}
    result = forms_service.forms().create(body=form_body).execute()
    form_id = result["formId"]
    form_url = f"https://docs.google.com/forms/d/{form_id}/edit"
    summary_worksheet.append_row([survey_name, form_url])

print("All surveys and form URLs have been logged.")
'''
<ipython-input-36-32025a29155f>:22: DeprecationWarning: The order of arguments in worksheet.update() has changed. Please pass values first and range_name secondor used named arguments (range_name=, values=)
  summary_worksheet.update('A1:B1', [["Quiz Name", "Form URL"]])
All surveys and form URLs have been logged.
In [ ]:
import re
import pandas as pd
import time

summary_worksheet = destination_spreadsheet.worksheet('Sheet1')
form_urls_records = summary_worksheet.get_all_records()

for record in form_urls_records:
    quiz_name = record['Quiz Name']
    form_url = record['Form URL']

    form_id_match = re.search(r'/d/(.*?)/edit', form_url)
    if form_id_match:
        form_id = form_id_match.group(1)
        matching_worksheet = source_spreadsheet.worksheet(quiz_name)
        # Assuming each row in the worksheet is a person's information
        person_records = matching_worksheet.get_all_records()
        if not person_records:
            print(f"No data found in the worksheet for quiz: {quiz_name}")
            continue

        df_worksheet = pd.DataFrame(person_records)

        df_worksheet['Description'] = ''

        for index, row in df_worksheet.iterrows():
            full_description = create_description(row)
            df_worksheet.at[index, 'Description'] = full_description

        update_requests = []

        first_request = {
            "createItem": {
                "item": {
                    "textItem": {},
                    "title": f"We are trying to quantify harm from the release of personal information. Please answer each question to the best of your abilities. For each subject, read the entire profile—sometimes, the harm might (or might not) be context-dependent.\
                              \r\rAll of the people included in this survey are fake (synthetic)—we are not showing you real data about real people. This study is IRB approved."
                },
                "location": {"index": 0},
            },
        }
        update_requests.append(first_request)

        break_request = {
            "createItem": {
                "item": {
                    "pageBreakItem": {},
                  },
                  "location": {"index": 1},
            },
        }

        update_requests.append(break_request)

        prolific_id =  {
             "createItem": {
                 "item": {
                    "title": "What is your Prolific ID?",
                       "questionItem": {
                            "question": {
                                "required": True,
                                "textQuestion": {
                                },

                            },
                        },
                    },
                    "location": {"index": 2},
                },
            }

        update_requests.append(prolific_id)

        break_request = {
            "createItem": {
                "item": {
                    "pageBreakItem": {},
                  },
                  "location": {"index": 3},
            },
        }

        update_requests.append(break_request)

        question_number = 1
        location_index = 4
        row_count = 0

        for index, row in df_worksheet.iterrows():
            row_count += 1
            description_value = row['Description']
            id = row['id']
            description_request = {
                "createItem": {
                    "item": {
                        "textItem": {},
                        "title": f"Sample person {question_number}\r\r{description_value}"
                    },
                    "location": {"index": location_index},
                },
            }

            update_requests.append(description_request)
            question_number += 1
            location_index += 1

            attention_check =  {
                "createItem": {
                    "item": {
                        "title": "What is this person's ID number? (Please select the given answer; this question is purely for data processing purposes.)",
                        "questionItem": {
                            "question": {
                                "required": True,
                                "choiceQuestion": {
                                    "type": "RADIO",
                                    "options": [
                                        {"value": f"{id}"},
                                    ],
                                },
                            },
                        },
                    },
                    "location": {"index": location_index},
                },
            }

            update_requests.append(attention_check)
            location_index += 1

            if row_count == 10:
                attention_check2 = {
                "createItem": {
                    "item": {
                        "title": "What do you hope elected leaders in Washington do in the next year? This is an attention check question.",
                          "questionItem": {
                                "question": {
                                    "required": True,
                                    "textQuestion": {
                                    },

                                },
                            },
                        },
                        "location": {"index": location_index},
                    },
                }
                update_requests.append(attention_check2)
                location_index += 1

            else:
                attention_check2 = {}

            if row_count == 20:
                attention_check3 =  {
                "createItem": {
                    "item": {
                        "title": "To ensure that you have read this question carefully, please type “Green” as your answer. This is an attention check question.",
                          "questionItem": {
                                "question": {
                                    "required": True,
                                    "textQuestion": {
                                    },

                                },
                            },
                        },
                        "location": {"index": location_index},
                    },
                }
                update_requests.append(attention_check3)
                location_index += 1

            else:
                attention_check3 = {}

            name = row['NAME']
            gender = row['GENDER'].lower()
            if gender == 'male':
                pronoun2 = 'his'
            else:
                pronoun2 = 'her'

            q1_request = {
                "createItem": {
                    "item": {
                        "title": f"How harmful do you think most people in the U.S. would consider it if hackers or cybercriminals stole {name}'s personal data in a data breach?",
                        "questionItem": {
                            "question": {
                                "required": True,
                                "choiceQuestion": {
                                    "type": "RADIO",
                                    "options": [
                                        {"value": 'Not at all harmful'},
                                        {"value": 'Slightly harmful'},
                                        {"value": 'Moderately harmful'},
                                        {"value": 'Very harmful'},
                                        {"value": 'Extremely harmful'},
                                    ],
                                },
                            },
                        },
                    },
                    "location": {"index": location_index},
                },
            }
            update_requests.append(q1_request)
            location_index += 1

            q2_request = {
                "createItem": {
                    "item": {
                        "title": f"How harmful do you think most people in the U.S. would consider it if government agencies accessed {name}'s personal information without {pronoun2} knowledge or consent?",
                        "questionItem": {
                            "question": {
                                "required": True,
                                "choiceQuestion": {
                                    "type": "RADIO",
                                    "options": [
                                        {"value": 'Not at all harmful'},
                                        {"value": 'Slightly harmful'},
                                        {"value": 'Moderately harmful'},
                                        {"value": 'Very harmful'},
                                        {"value": 'Extremely harmful'},
                                    ],
                                },
                            },
                        },
                    },
                    "location": {"index": location_index},
                },
            }
            update_requests.append(q2_request)
            location_index += 1

            q3_request = {
                "createItem": {
                    "item": {
                        "title": f"How harmful do you think most people in the U.S. would consider it if a corporation collected and used {name}'s personal data for {pronoun2} own purposes without {pronoun2} consent?",
                        "questionItem": {
                            "question": {
                                "required": True,
                                "choiceQuestion": {
                                    "type": "RADIO",
                                    "options": [
                                        {"value": 'Not at all harmful'},
                                        {"value": 'Slightly harmful'},
                                        {"value": 'Moderately harmful'},
                                        {"value": 'Very harmful'},
                                        {"value": 'Extremely harmful'},
                                    ],
                                },
                            },
                        },
                    },
                    "location": {"index": location_index},
                },
            }
            update_requests.append(q3_request)
            location_index += 1

            q4_request = {
                "createItem": {
                    "item": {
                        "title": f"How harmful do you think most people in the U.S. would consider it if {name}'s employer or colleagues viewed {pronoun2} personal data without {pronoun2} permission?",
                        "questionItem": {
                            "question": {
                                "required": True,
                                "choiceQuestion": {
                                    "type": "RADIO",
                                    "options": [
                                        {"value": 'Not at all harmful'},
                                        {"value": 'Slightly harmful'},
                                        {"value": 'Moderately harmful'},
                                        {"value": 'Very harmful'},
                                        {"value": 'Extremely harmful'},
                                    ],
                                },
                            },
                        },
                    },
                    "location": {"index": location_index},
                },
            }
            update_requests.append(q4_request)
            location_index += 1

            q5_request = {
                "createItem": {
                    "item": {
                        "title": f"How harmful do you think most people in the U.S. would consider it if {name}'s personal information was inadvertently shared with {pronoun2} family members without {pronoun2} consent?",
                        "questionItem": {
                            "question": {
                                "required": True,
                                "choiceQuestion": {
                                    "type": "RADIO",
                                    "options": [
                                        {"value": 'Not at all harmful'},
                                        {"value": 'Slightly harmful'},
                                        {"value": 'Moderately harmful'},
                                        {"value": 'Very harmful'},
                                        {"value": 'Extremely harmful'},
                                    ],
                                },
                            },
                        },
                    },
                    "location": {"index": location_index},
                },
            }
            update_requests.append(q5_request)
            location_index += 1

            q6_request = {
                "createItem": {
                    "item": {
                        "title": f"How harmful do you think most people in the U.S. would consider it if personal details about {name} were disclosed to {pronoun2} close friends without {pronoun2} knowledge?",
                        "questionItem": {
                            "question": {
                                "required": True,
                                "choiceQuestion": {
                                    "type": "RADIO",
                                    "options": [
                                        {"value": 'Not at all harmful'},
                                        {"value": 'Slightly harmful'},
                                        {"value": 'Moderately harmful'},
                                        {"value": 'Very harmful'},
                                        {"value": 'Extremely harmful'},
                                    ],
                                },
                            },
                        },
                    },
                    "location": {"index": location_index},
                },
            }
            update_requests.append(q6_request)
            location_index += 1

            q7_request = {
                "createItem": {
                    "item": {
                        "title": f"How harmful do you think most people in the U.S. would consider it if {name}'s personal data was accessed by acquaintances who know {name} by face and name but don't interact with {name} regularly?",
                        "questionItem": {
                            "question": {
                                "required": True,
                                "choiceQuestion": {
                                    "type": "RADIO",
                                    "options": [
                                        {"value": 'Not at all harmful'},
                                        {"value": 'Slightly harmful'},
                                        {"value": 'Moderately harmful'},
                                        {"value": 'Very harmful'},
                                        {"value": 'Extremely harmful'},
                                    ],
                                },
                            },
                        },
                    },
                    "location": {"index": location_index},
                },
            }
            update_requests.append(q7_request)
            location_index += 1

            q8_request = {
                "createItem": {
                    "item": {
                        "title": f"How harmful do you think most people in the U.S. would consider it if {name}'s personal information was made publicly available on the internet for anyone to access without {pronoun2} consent?",
                        "questionItem": {
                            "question": {
                                "required": True,
                                "choiceQuestion": {
                                    "type": "RADIO",
                                    "options": [
                                        {"value": 'Not at all harmful'},
                                        {"value": 'Slightly harmful'},
                                        {"value": 'Moderately harmful'},
                                        {"value": 'Very harmful'},
                                        {"value": 'Extremely harmful'},
                                    ],
                                },
                            },
                        },
                    },
                    "location": {"index": location_index},
                },
            }
            update_requests.append(q8_request)
            location_index += 1

            break_request = {
                "createItem": {
                    "item": {
                        "pageBreakItem": {},
                    },
                    "location": {"index": location_index},
                },
            }
            update_requests.append(break_request)
            location_index += 1

            update = {"requests": update_requests}

        last_request = {
            "createItem": {
                "item": {
                    "textItem": {},
                    "title": f"Prolific code: COLYGFMB"
                },
                "location": {"index": location_index},
            },
        }
        update_requests.append(last_request)
        location_index += 1

        try:
          t = "Harm Scores Survey"
          forms_service.forms().batchUpdate(formId=form_id, body={"requests": update_requests}).execute()

          drive_service = build('drive', 'v3', credentials=credentials)
          # Temporarily muted

          drive_service.permissions().create(
              fileId=form_id,
              body={
                  'role': 'writer',
                  'type': 'user',
                  'emailAddress': 'shreya.kochar1@gmail.com'
              }
          ).execute()

          print(f"Form updated successfully: {form_id}")
        except Exception as e:
          print(f"Unable to update form {form_id}: {e}")
    else:
        print(f"Unable to extract form ID from URL: {form_url}")
    # Program delay
    time.sleep(2)
Form updated successfully: 1Wrfzzm8GeDNKBQsEH4h2UUtzRzhSmj1c02aQaJkBI6s
Form updated successfully: 17seIXv0sOCcGT2s4Ah2mkkL086vRWVaf6kaXqXl-Dzw
Form updated successfully: 1JHYAiMRe_gLQyNFn4U22cA0WuuBTFLfsvAKMgSmoLXg
Form updated successfully: 1hTA4DRe7s89xwvADnZLNti814zsC0K-PgLB4n8GoUpI
Form updated successfully: 1EGnzEvl-Y6oDOUlX1sLYjLjK3QB6DVM5NrlwfAD9l2w
Form updated successfully: 1Tvj4A_kbEieqUSaLEAPSXyijQKp-RyDQFqDIiaya1OE
Form updated successfully: 1nq3rp6okjDp6-ykr3Cb4Sr-eAzlXJ2bZUMJJrpULUbg
Form updated successfully: 1KWFeBj4yxaSSXhnUsoj38UJMx-4qgHclvhNn2JmdaaE
Form updated successfully: 1aJ2IgCSh3pajXpxPOeXSfpJ-gnYcH2RAZxT9XskRC54
Form updated successfully: 1Nipu71J8H9dKOfFAnGmT18MFV_hQBxXMjkbD_PgJY3Y
Form updated successfully: 113ApJyceab8b6rGq0aoQXDeF7zWUHKMT_J41PFGlfvs
Form updated successfully: 1Pp6X8sQgno5JAlboTJcim7qzgQ7CuK5K2STIQuuf2-s
Form updated successfully: 16dzWs1GBFdESLi9dI2yx6siITik0JbrRKwdD9LIfMbs
Form updated successfully: 1Y6mKTuHwq436qryhsQzhJTErOclcRpP2eFsaGA8c6e4
Form updated successfully: 17VT1pAC2suPtlLm0KzxUI9W_Re8nX4AzMKuakYLGAR0
Form updated successfully: 1jBINeal587U1WvF7McXOU6DE0SFEcLfQmql5okX3KRA
Form updated successfully: 1S5DBWZIsenzzlR8oCmCXI7Z4ewWafCgfA6rwllsb_24
Form updated successfully: 1bIBjipXlppgfcqJ085Fz1zoBANJh_50uw_G2FRHj5AM
Form updated successfully: 1CRLdS9tFu1qRWhJORYRlq9_rygBlqAvpKFSaXnQMMyY
Form updated successfully: 168x9HBGV_n7dy6WFdJxv4fYtv5NHkMG7VXJZmCdQQW0
Form updated successfully: 1ataLuMJynbE0rCed_ozMFwX-mF4MPjpBr0EUyhi_WGU
Form updated successfully: 1utzIm5L1MSH8S98y7xjZffgFk0-qxUJEfeBEqUsVDpU
Form updated successfully: 1xZVOjUN87a4grVetzXs61Glg0PhCbY1hDae_GjnIFeI
Form updated successfully: 1pocKFVmAHDqYVvpww3Xan8eK0Af087AQVTX4Gcdf2P0
Form updated successfully: 1gcx-4wyNLQr8samszcZwcG8zqwHGgv3oZAYJ6Ewzz3A
Form updated successfully: 1DB13Q2tOuVrA2Bd7tbN53wLAKyw8BqqDCD4ldQdmzK4
Form updated successfully: 1l-iwkMK9nZ77ptnELTuxFp6EnmvJTPlYwdjnzLTXV8o
Form updated successfully: 1oz9Ubo2ni1E_I58c0gZyQuDRS5-Q3t5T5s7wLpoY3FA
Form updated successfully: 1c7lfXlyC7QUyWLH9NlPSqYb25b0u0ajjYVB8ckwjqg8
Form updated successfully: 1UIyYQ75V0yQhF8pcMUq7CDsl7dFFh8mm7VkrXHvGg5E
Form updated successfully: 1qngG5Z9SSGx-4adpb1mTiSrPr9leIJLooSSWrYm1LT8
Form updated successfully: 1I7TJtrzUJ-QI7EreJb8uQbLPnG3Q8vyDxz6kGk8GFuk
Form updated successfully: 1yz-Xh6qEApabcSxCOYEwNNb1I1PbU2uLw_EM9xC3CqE
Form updated successfully: 1RKqnwF4TFWBA4INU5GAvo04bGd6Fpb_mxd2cgoBOY6M
Form updated successfully: 1moV1l7Jp6-nW-NpGKxiFGhPtTUetnahVwzMdrzOLRIo
Form updated successfully: 1EcVcMNdgHkyxNKxGxHeTUqrJf6ZvSnBkenxoHJuC0Rs
Form updated successfully: 1Il3UiaW42jLiSkQn9Lkimg9u95CDg4o8UfK9pQt7Gs0
Form updated successfully: 1aG4BcuYqWaM_vv_f_hg-hb8qk5i1Ni2HMlZmaOvV2d4
Form updated successfully: 1ZP3ivl1Ia_RrRlrTZKal4Mi93Op3YeweNJIEQMPYxao
Form updated successfully: 11mjX9JP8sdhSuZPfUD4NTJbSSwBQr8-GxoG7rjMY3-E
Form updated successfully: 1-tmZrTf5ZrGhqE1QAOcNpk42-NBsa-h7wOL_2pd58Ok
In [ ]:
# Muted after creating a sheet with certainty
'''
from google.colab import auth
from google.oauth2 import service_account
from googleapiclient.discovery import build
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
from gspread_dataframe import set_with_dataframe

# Added: representative sheet with certainty access
auth.authenticate_user()
scope = ['https://www.googleapis.com/auth/spreadsheets', 'https://www.googleapis.com/auth/drive.file', 'https://www.googleapis.com/auth/drive']
creds = ServiceAccountCredentials.from_json_keyfile_name('credentials.json', scope)
client = gspread.authorize(creds)
rpt_sheet_cty_url = 'https://docs.google.com/spreadsheets/d/1vzWPqmy_HcM_Ntowg4V5HD307ZGFS0XXBInrXmN-xYo/edit?gid=0#gid=0'
rpt_sheet_cty = client.open_by_url(rpt_sheet_cty_url)
worksheet = rpt_sheet_cty.worksheet("selected_data")
data = worksheet.get_all_records()
df = pd.DataFrame(data)

def replicate_rows_with_certainty(row):
    replicated_rows = []
    for certainty in [50, 75, 100]:
        new_row = row.copy()
        new_row['certainty'] = certainty
        replicated_rows.append(new_row)
    return replicated_rows

transformed_rows = []
for _, row in df.iterrows():
    transformed_rows.extend(replicate_rows_with_certainty(row))

transformed_df = pd.DataFrame(transformed_rows)

worksheet.clear()
columns = transformed_df.columns.values.tolist()
worksheet.update([columns] + transformed_df.values.tolist())

transformed_df.head()
'''
Out[ ]:
NAME id GENDER AGE RACETHN EDUCCAT5 DIVISION MARITAL_ACS HHSIZECAT ... SEXUALITY HIV_STAT PREG_STAT CC_NUM cc_encoded cc_disclosed NumChronicIllness AGE_INT Cluster certainty
0 4775 Joseph Burton 4776 Male 36 White non-Hispanic College grad West South Central Now married 3+ ... Heterosexual negative Not Applicable 2.237470e+15 1 1 2 30-39 0 50
0 4775 Joseph Burton 4776 Male 36 White non-Hispanic College grad West South Central Now married 3+ ... Heterosexual negative Not Applicable 2.237470e+15 1 1 2 30-39 0 75
0 4775 Joseph Burton 4776 Male 36 White non-Hispanic College grad West South Central Now married 3+ ... Heterosexual negative Not Applicable 2.237470e+15 1 1 2 30-39 0 100
1 3539 Joseph Payne 3540 Male 50 White non-Hispanic HS Grad Middle Atlantic Now married 3+ ... Heterosexual negative Not Applicable 5.273270e+15 1 0 1 50-59 0 50
1 3539 Joseph Payne 3540 Male 50 White non-Hispanic HS Grad Middle Atlantic Now married 3+ ... Heterosexual negative Not Applicable 5.273270e+15 1 0 1 50-59 0 75

5 rows × 50 columns

After experiment

In [ ]:
# Temporarily muted
'''
import scipy.stats as stats

anova_df = df_updated[['RACETHN', 'AGE_INT', 'SEXUALITY', 'EDUCCAT5', 'scores']].dropna()
groups = [group['scores'].values for name, group in anova_df.groupby(['EDUCCAT5'])]

# Perform ANOVA
f_stat, p_value = stats.f_oneway(*groups)
print('ANOVA result: F-statistic = {:.2f}, p-value = {:.3f}'.format(f_stat, p_value))
'''
In [ ]:
# df_updated.to_csv('harm3.csv', index=False)