pip install pyreadstat

Requirement already satisfied: pyreadstat in c:\users\86177\anaconda3\lib\site-packages (1.2.7)
Requirement already satisfied: pandas>=1.2.0 in c:\users\86177\anaconda3\lib\site-packages (from pyreadstat) (2.1.4)
Requirement already satisfied: numpy<2,>=1.23.2 in c:\users\86177\anaconda3\lib\site-packages (from pandas>=1.2.0->pyreadstat) (1.26.4)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\86177\anaconda3\lib\site-packages (from pandas>=1.2.0->pyreadstat) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\86177\anaconda3\lib\site-packages (from pandas>=1.2.0->pyreadstat) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\users\86177\anaconda3\lib\site-packages (from pandas>=1.2.0->pyreadstat) (2023.3)
Requirement already satisfied: six>=1.5 in c:\users\86177\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas>=1.2.0->pyreadstat) (1.16.0)
Note: you may need to restart the kernel to use updated packages.

import pandas as pd
import pyreadstat

# Load the synthetic population from its SPSS (.sav) export, then peek at the
# first and last rows.
sav_path = 'synthetic_population_dataset.sav'
pop_df = pd.read_spss(sav_path)
pop_df.head()

pop_df.tail()

import pandas as pd
import numpy as np

# Give every row of pop_df a SEXUALITY label.
# Everyone starts as 'Heterosexual'; each row then has a 7% chance of being
# moved into the combined LGB group.
pop_df['SEXUALITY'] = 'Heterosexual'

prob_lgb = np.random.uniform(0, 1, len(pop_df)) <= 0.07
pop_df.loc[prob_lgb, 'SEXUALITY'] = 'Lesbian, Gay, or Bisexual'

# Refine the combined label by gender, using the split quoted from Pew Research Center:
# "Among Americans who are lesbian, gay or bisexual, the vast majority of women say they are bisexual (79%) while the majority of men say they are gay (57%)."
# Rows whose GENDER is neither 'Female' nor 'Male' keep the combined label.
lgb_women = prob_lgb & (pop_df['GENDER'] == 'Female')
women_are_bisexual = np.random.uniform(0, 1, lgb_women.sum()) <= 0.79
pop_df.loc[lgb_women, 'SEXUALITY'] = np.where(women_are_bisexual, 'Bisexual', 'Lesbian')

lgb_men = prob_lgb & (pop_df['GENDER'] == 'Male')
men_are_gay = np.random.uniform(0, 1, lgb_men.sum()) <= 0.57
pop_df.loc[lgb_men, 'SEXUALITY'] = np.where(men_are_gay, 'Gay', 'Bisexual')

# Ensure at least one gay and one lesbian person in each racial group and religious group.
#
# For every (RACETHN, RELIGCAT) cell that lacks one of the two labels, one
# randomly sampled member of the cell is relabelled. 'Gay' is given to a male
# when the cell has one and to a female otherwise; 'Lesbian' works the other
# way round. Cells with nobody eligible are left unchanged.
racial_groups = pop_df['RACETHN'].unique()
religious_groups = pop_df['RELIGCAT'].unique()

# (label to guarantee, gender tried first, gender used only as a fallback)
required_labels = (
    ('Gay', 'Male', 'Female'),
    ('Lesbian', 'Female', 'Male'),
)

for race in racial_groups:
    for religion in religious_groups:
        in_cell = (pop_df['RACETHN'] == race) & (pop_df['RELIGCAT'] == religion)

        for label, first_choice, fallback in required_labels:
            # Re-read the cell on every pass so a label forced by the previous
            # pass is visible here (a single snapshot would be stale).
            cell = pop_df[in_cell]
            if (cell['SEXUALITY'] == label).any():
                continue

            # Never relabel the only holder of the other guaranteed label:
            # doing so would satisfy this label by breaking that one.
            protected = [
                other for other, _, _ in required_labels
                if other != label and (cell['SEXUALITY'] == other).sum() <= 1
            ]
            candidates = cell[~cell['SEXUALITY'].isin(protected)]

            for gender in (first_choice, fallback):
                pool = candidates[candidates['GENDER'] == gender]
                if not pool.empty:
                    pop_df.loc[pool.sample(1).index, 'SEXUALITY'] = label
                    break

# Verify the adjustments: every non-empty (race, religion) cell must contain
# at least one 'Gay' and one 'Lesbian' person.
# Empty cells are skipped because there is nobody in them to label. All
# failures are collected and reported together, and the error is raised
# explicitly so the check still runs when Python is started with -O.
missing = []
for race in racial_groups:
    for religion in religious_groups:
        subset = pop_df[(pop_df['RACETHN'] == race) & (pop_df['RELIGCAT'] == religion)]
        if subset.empty:
            continue
        labels_present = set(subset['SEXUALITY'])
        for label in ('Gay', 'Lesbian'):
            if label not in labels_present:
                missing.append(f"Missing '{label}' person in {race} and {religion}")

if missing:
    raise AssertionError("\n".join(missing))

# Group by 'GENDER' and 'SEXUALITY' and calculate the size (counts) of each group
gender_sxly_counts = pop_df.groupby(['GENDER', 'SEXUALITY'], observed=True).size()

# Calculate the percentage distribution within each gender.
# transform('sum') returns each gender's total aligned to the original index,
# so the result keeps exactly two index levels (GENDER, SEXUALITY); the earlier
# groupby(...).apply(...) form prepended the group key and duplicated GENDER.
gender_totals = gender_sxly_counts.groupby(level=0, observed=True).transform('sum')
distribution_gender_sxly = 100 * gender_sxly_counts / gender_totals

# Print the result
print(distribution_gender_sxly)

GENDER  GENDER  SEXUALITY   
Female  Female  Bisexual         5.427552
                Heterosexual    93.213150
                Lesbian          1.359298
Male    Male    Bisexual         2.991586
                Gay              4.103044
                Heterosexual    92.905370
dtype: float64

# Read the per-geography rate table and keep only the two columns used below.
file_path = 'GeographyChartData.csv'
df = pd.read_csv(file_path)[['Geography', 'Rate per 100000']]

# Named divisions and the geographies that belong to each one.
# NOTE(review): the spellings must match the 'Geography' column exactly
# (e.g. 'Washington, D.C.'); unmatched names are silently left out of the mean.
divisions = {
    'New England': ['Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'Rhode Island', 'Vermont'],
    'Middle Atlantic': ['New Jersey', 'New York', 'Pennsylvania'],
    'East North Central': ['Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin'],
    'West North Central': ['Iowa', 'Kansas', 'Minnesota', 'Missouri', 'Nebraska', 'North Dakota', 'South Dakota'],
    'South Atlantic': ['Delaware', 'Florida', 'Georgia', 'Maryland', 'North Carolina', 'South Carolina', 'Virginia', 'Washington, D.C.', 'West Virginia'],
    'East South Central': ['Alabama', 'Kentucky', 'Mississippi', 'Tennessee'],
    'West South Central': ['Arkansas', 'Louisiana', 'Oklahoma', 'Texas'],
    'Mountain': ['Arizona', 'Colorado', 'Idaho', 'Montana', 'Nevada', 'New Mexico', 'Utah', 'Wyoming'],
    'Pacific': ['Alaska', 'California', 'Hawaii', 'Oregon', 'Washington']
}

# Unweighted mean of 'Rate per 100000' over the member geographies of each division
division_rates = {
    division: df.loc[df['Geography'].isin(states), 'Rate per 100000'].mean()
    for division, states in divisions.items()
}

division_rates

{'New England': 225.6,
 'Middle Atlantic': 501.5333333333333,
 'East North Central': 225.06000000000003,
 'West North Central': 146.18571428571428,
 'South Atlantic': 491.7428571428572,
 'East South Central': 288.6,
 'West South Central': 355.35,
 'Mountain': 197.375,
 'Pacific': 231.08}

import pandas as pd
import numpy as np

# Assuming you have a DataFrame named pop_df with columns ['DIVISION', 'RACETHN', 'AGE']

# Lookup tables for generate_HIV_status. They live at module level so the five
# dicts are built once instead of on every row passed through DataFrame.apply.

# Base probability per DIVISION: the division_rates averages computed earlier
# in this file, divided by 1000.
_HIV_DIVISION_PROBABILITIES = {
    'New England': 0.2256,
    'Middle Atlantic': 0.5015,
    'East North Central': 0.2251,
    'West North Central': 0.1462,
    'South Atlantic': 0.4917,
    'East South Central': 0.2886,
    'West South Central': 0.3554,
    'Mountain': 0.1974,
    'Pacific': 0.2311,
}

# Multipliers by RACETHN (from Atlas db)
_HIV_RACETHN_ADJUSTMENTS = {
    'Black non-Hispanic': 1.23,   # Rate for African Americans
    'White non-Hispanic': 0.176,
    'Asian': 0.097,
    'Hispanic': 0.520,            # Rate for Hispanic/Latino persons
    'Other race': 0.482,          # avg between indig, native american, multi race
}

# Multipliers by age bracket, keyed by (youngest, oldest) whole-year age
# (brackets are from Atlas db)
_HIV_AGE_ADJUSTMENTS = {
    (13, 24): 0.053,
    (25, 34): 0.340,
    (35, 44): 0.470,
    (45, 54): 0.597,
    (55, 64): 0.677,
    (65, 100): 0.255,
}

# Multipliers by SEXUALITY
_HIV_SEXUALITY_ADJUSTMENTS = {
    'Heterosexual': 0.2,  # 333.3 mil * 85.6% of the pop is het = 285.3 tot het people; tweaked within range of error for some more samples
    'Lesbian': 0.067,
    'Bisexual': 1.28,     # MSM + HET + Other / 3 / tot LGBTQ pop
    'Gay': 2.63,          # MSM num from Atlas db / half of LGBTQ pop (since only men)
}

# Multipliers by GENDER
# NOTE(review): the original notes cite 173 and 598 per 100000, which do not
# exactly match the values below — TODO confirm which figures are intended.
_HIV_GENDER_ADJUSTMENTS = {
    'Female': 0.172,
    'Male': 0.594,
}


def generate_HIV_status(row):
    """Randomly label one person as HIV 'positive' or 'negative'.

    The chance of 'positive' is the division's base probability multiplied by
    the race/ethnicity, age, sexuality and gender multipliers. Because some
    multipliers are greater than 1 the product can exceed 1, in which case the
    result is always 'positive'.

    Args:
        row: Mapping (e.g. a DataFrame row) with the keys 'DIVISION',
            'RACETHN', 'AGE', 'SEXUALITY' and 'GENDER'.

    Returns:
        'positive' or 'negative'.

    Notes:
        An unknown DIVISION gives a base probability of 0 (always 'negative');
        an unknown value for any other field contributes a neutral multiplier
        of 1. One draw is taken from NumPy's global random state per call.
    """
    division_prob = _HIV_DIVISION_PROBABILITIES.get(row['DIVISION'], 0)
    racethn_adjustment = _HIV_RACETHN_ADJUSTMENTS.get(row['RACETHN'], 1)

    # Each bracket covers [youngest, oldest + 1) so that non-integer ages such
    # as 24.5 land in a bracket instead of falling through to the neutral 1.
    age = row['AGE']
    age_adjustment = next(
        (
            adjust
            for (age_min, age_max), adjust in _HIV_AGE_ADJUSTMENTS.items()
            if age_min <= age < age_max + 1
        ),
        1,
    )

    sexuality_adjustment = _HIV_SEXUALITY_ADJUSTMENTS.get(row['SEXUALITY'], 1)
    gender_adjustment = _HIV_GENDER_ADJUSTMENTS.get(row['GENDER'], 1)

    combined_prob = (
        division_prob
        * racethn_adjustment
        * age_adjustment
        * sexuality_adjustment
        * gender_adjustment
    )

    return 'positive' if np.random.rand() <= combined_prob else 'negative'

# Apply the function to create the new column 'HIV_STAT'
# generate_HIV_status is called once per row and returns 'positive' or
# 'negative'. Each call draws from NumPy's global random state; no seed is set
# earlier in the visible file, so the column differs from run to run.
pop_df['HIV_STAT'] = pop_df.apply(generate_HIV_status, axis=1)

pop_df.head()

pop_df.tail()

# Percentage of each HIV_STAT value within every DIVISION.
# observed=False is passed explicitly to pin the behaviour this cell has had so
# far and avoid the pandas deprecation warning about the changing default.
# transform('sum') keeps the (DIVISION, HIV_STAT) index as-is, so DIVISION is
# no longer duplicated as an extra index level.
division_hiv_counts = pop_df.groupby(['DIVISION', 'HIV_STAT'], observed=False).size()
division_totals = division_hiv_counts.groupby(level=0, observed=False).transform('sum')
distribution = 100 * division_hiv_counts / division_totals

# Print the distribution
print(distribution)

DIVISION            DIVISION            HIV_STAT
East North Central  East North Central  negative    99.593496
                                        positive     0.406504
East South Central  East South Central  negative    99.737762
                                        positive     0.262238
Middle Atlantic     Middle Atlantic     negative    99.308490
                                        positive     0.691510
Mountain            Mountain            negative    99.795082
                                        positive     0.204918
New England         New England         negative    99.688474
                                        positive     0.311526
Pacific             Pacific             negative    99.350248
                                        positive     0.649752
South Atlantic      South Atlantic      negative    98.867553
                                        positive     1.132447
West North Central  West North Central  negative    99.922300
                                        positive     0.077700
West South Central  West South Central  negative    99.258613
                                        positive     0.741387
dtype: float64

C:\Users\86177\AppData\Local\Temp\ipykernel_18680\20797225.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  distribution = pop_df.groupby(['DIVISION', 'HIV_STAT']).size().groupby(level=0).apply(lambda x: 100 * x / x.sum())
C:\Users\86177\AppData\Local\Temp\ipykernel_18680\20797225.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  distribution = pop_df.groupby(['DIVISION', 'HIV_STAT']).size().groupby(level=0).apply(lambda x: 100 * x / x.sum())

# Percentage of each HIV_STAT value within every RACETHN group.
# observed=False is explicit to keep the current grouping behaviour without the
# pandas deprecation warning; transform('sum') avoids the duplicated RACETHN
# index level that groupby(...).apply(...) produced.
racethn_hiv_counts = pop_df.groupby(['RACETHN', 'HIV_STAT'], observed=False).size()
racethn_totals = racethn_hiv_counts.groupby(level=0, observed=False).transform('sum')
distribution = 100 * racethn_hiv_counts / racethn_totals
print(distribution)

RACETHN             RACETHN             HIV_STAT
Asian               Asian               negative    99.637353
                                        positive     0.362647
Black non-Hispanic  Black non-Hispanic  negative    98.111624
                                        positive     1.888376
Hispanic            Hispanic            negative    99.094144
                                        positive     0.905856
Other race          Other race          negative    99.607843
                                        positive     0.392157
White non-Hispanic  White non-Hispanic  negative    99.651514
                                        positive     0.348486
dtype: float64

C:\Users\86177\AppData\Local\Temp\ipykernel_18680\878391969.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  distribution = pop_df.groupby(['RACETHN', 'HIV_STAT']).size().groupby(level=0).apply(lambda x: 100 * x / x.sum())
C:\Users\86177\AppData\Local\Temp\ipykernel_18680\878391969.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  distribution = pop_df.groupby(['RACETHN', 'HIV_STAT']).size().groupby(level=0).apply(lambda x: 100 * x / x.sum())

# Percentage of each HIV_STAT value within every AGE value.
# transform('sum') keeps the (AGE, HIV_STAT) index unchanged, so AGE is not
# duplicated as an extra index level; observed is spelled out for consistency
# with the other distribution cells.
age_hiv_counts = pop_df.groupby(['AGE', 'HIV_STAT'], observed=False).size()
age_totals = age_hiv_counts.groupby(level=0, observed=False).transform('sum')
distribution = 100 * age_hiv_counts / age_totals
print(distribution)

AGE   AGE   HIV_STAT
18.0  18.0  negative    100.000000
19.0  19.0  negative     99.722222
            positive      0.277778
20.0  20.0  negative    100.000000
21.0  21.0  negative     99.746835
                           ...    
78.0  78.0  negative     99.107143
            positive      0.892857
79.0  79.0  negative    100.000000
80.0  80.0  negative    100.000000
85.0  85.0  negative    100.000000
Length: 112, dtype: float64

# Percentage of each HIV_STAT value within every SEXUALITY label.
# transform('sum') keeps the (SEXUALITY, HIV_STAT) index unchanged, so
# SEXUALITY is not duplicated as an extra index level; observed is spelled out
# for consistency with the other distribution cells.
sexuality_hiv_counts = pop_df.groupby(['SEXUALITY', 'HIV_STAT'], observed=False).size()
sexuality_totals = sexuality_hiv_counts.groupby(level=0, observed=False).transform('sum')
distribution = 100 * sexuality_hiv_counts / sexuality_totals
print(distribution)

SEXUALITY     SEXUALITY     HIV_STAT
Bisexual      Bisexual      negative     98.589894
                            positive      1.410106
Gay           Gay           negative     90.886076
                            positive      9.113924
Heterosexual  Heterosexual  negative     99.591683
                            positive      0.408317
Lesbian       Lesbian       negative    100.000000
dtype: float64

# Overall number of rows per HIV_STAT value
pop_df['HIV_STAT'].value_counts()

HIV_STAT
negative    19876
positive      124
Name: count, dtype: int64

# Collect the distinct MARITAL_ACS categories.
# (The variable names say "gender" but the values are marital statuses; the
# names are kept as they are.)
gender = pop_df['MARITAL_ACS'].unique()

# Gather them into a set
gender_set = set()
gender_set.update(gender)

print(gender_set)

{'Divorced', 'Now married', 'Separated', 'Widowed', 'Never married'}

import pandas as pd
import numpy as np

# Data provided in the problem, stored as absolute counts
# (the figure in millions multiplied by one million)
total_female_population = 166.58 * 1_000_000
total_pregnancies_2019 = 5.507 * 1_000_000

# 2019 pregnancy rates per 1,000 females, keyed by demographic group
pregnancy_rates_2019 = dict(
    total=85.6,
    # by age bracket
    age_15_19=29.4,
    age_20_24=98.8,
    age_25_29=132.6,
    age_30_34=139.7,
    age_35_39=77.0,
    age_40_plus=24.7,
    # by race / ethnicity
    hispanic=85.5,
    non_hispanic_black=109.8,
    non_hispanic_white=82.6,
    non_hispanic_other=68.7,
    # by marital status
    unmarried=66.4,
    married=115.7,
)

# Assuming pop_df exists, categorize the age and calculate weighted pregnancy probabilities
# (exclusive upper age bound, bracket key) pairs, youngest bracket first
_AGE_GROUP_UPPER_BOUNDS = (
    (20, 'age_15_19'),
    (25, 'age_20_24'),
    (30, 'age_25_29'),
    (35, 'age_30_34'),
    (40, 'age_35_39'),
)


def calculate_age_group(age):
    """Return the pregnancy-rate bracket key for an age in years.

    Brackets are half-open (15 <= age < 20, 20 <= age < 25, ...) so that
    non-integer ages such as 19.5 fall inside a bracket rather than between
    two of them.

    Args:
        age: Age in years; may be an int or a float.

    Returns:
        One of 'age_15_19', 'age_20_24', 'age_25_29', 'age_30_34',
        'age_35_39' or 'age_40_plus'. Ages of 40 and above, ages below 15 and
        NaN all map to 'age_40_plus', as no separate bracket exists for them.
    """
    if age >= 15:
        for upper_bound, label in _AGE_GROUP_UPPER_BOUNDS:
            if age < upper_bound:
                return label
    return 'age_40_plus'

def calculate_pregnancy_probability(row):
    """Randomly assign a pregnancy status to one person.

    Args:
        row: Mapping (e.g. a DataFrame row) with the keys 'GENDER', 'AGE',
            'RACETHN' and 'MARITAL_ACS'.

    Returns:
        'Not Applicable' when GENDER is 'Male'; otherwise 'Positive' or
        'Negative', drawn with a probability equal to the plain average of the
        age-, race- and marital-status-based rates in pregnancy_rates_2019
        (each rate is per 1,000 and is divided by 1000 first).
    """
    # Males are never assigned a pregnancy status
    if row['GENDER'] == 'Male':
        return 'Not Applicable'

    # Pick the rate-table key for each of the three attributes
    age_key = calculate_age_group(row['AGE'])

    race_keys = {
        'Hispanic': 'hispanic',
        'Black non-Hispanic': 'non_hispanic_black',
        'White non-Hispanic': 'non_hispanic_white',
    }
    # Every other race/ethnicity uses the "other non-Hispanic" rate
    race_key = race_keys.get(row['RACETHN'], 'non_hispanic_other')

    marital_key = 'married' if row['MARITAL_ACS'] == 'Now married' else 'unmarried'

    age_based_prob = pregnancy_rates_2019[age_key] / 1000
    race_based_prob = pregnancy_rates_2019[race_key] / 1000
    marital_based_prob = pregnancy_rates_2019[marital_key] / 1000

    # Simple average of the three probabilities
    combined_prob = (age_based_prob + race_based_prob + marital_based_prob) / 3

    outcomes = ['Positive', 'Negative']
    return np.random.choice(outcomes, p=[combined_prob, 1 - combined_prob])

# pop_df must already exist with the columns GENDER, AGE, RACETHN and MARITAL_ACS
# (no mock frame is created here; the real pop_df loaded earlier is used).
# Seed NumPy's global generator so the draws below are repeatable. The seed is
# set only at this point, so NumPy draws made earlier in the file are
# unaffected, and it does not seed Python's `random` module.
np.random.seed(0)  # For reproducibility

# Apply the function to calculate pregnancy status
pop_df['PREG_STAT'] = pop_df.apply(calculate_pregnancy_probability, axis=1)

pop_df.head()

pop_df.tail()

# Count of each PREG_STAT value per RACETHN group, one column per status.
# observed=False is passed explicitly to keep the current grouping behaviour
# and avoid the pandas deprecation warning about the changing default.
preg_stat_by_ethnicity = (
    pop_df.groupby(['RACETHN', 'PREG_STAT'], observed=False)
    .size()
    .unstack(fill_value=0)
)
print(preg_stat_by_ethnicity)

PREG_STAT           Negative  Not Applicable  Positive
RACETHN                                               
Asian                    545             513        45
Black non-Hispanic      1185            1065       133
Hispanic                1401            1545       145
Other race               259             229        22
White non-Hispanic      6156            6275       482

C:\Users\86177\AppData\Local\Temp\ipykernel_18680\978890423.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  preg_stat_by_ethnicity = pop_df.groupby(['RACETHN', 'PREG_STAT']).size().unstack(fill_value=0)

#pop_df.to_csv('synthetic_population_dataset.csv')

# Collect the distinct RELIGCAT categories currently present in pop_df
religion = pop_df['RELIGCAT'].unique()

# Gather them into a set
religion_set = set()
religion_set.update(religion)

print(religion_set)

{'Other', 'Unaffiliated', 'Catholic', 'Mainline Protestant', 'Evangelical Protestant'}

import random

# Define distributions by demographic groups
# religion_race_stats[religion][RACETHN value] is a percentage.
# calculate_combined_probability divides it by 100 and uses it as one factor of
# that religion's sampling weight. The keys of this table are also the full set
# of religions that can be assigned (all_religions is built from them);
# 'Catholic' and 'Other', which occur in the dataset's original RELIGCAT, have
# no entry here. A 0 (Jehovah's Witness / Asian) means that pairing is never drawn.
religion_race_stats = {
    'Buddhist': {'White non-Hispanic': 44, 'Black non-Hispanic': 3, 'Asian': 33, 'Hispanic': 12, 'Other race': 8},
    'Jehovah\'s Witness': {'White non-Hispanic': 36, 'Black non-Hispanic': 27, 'Asian': 0, 'Hispanic': 32, 'Other race': 6},
    'Jewish': {'White non-Hispanic': 90, 'Black non-Hispanic': 2, 'Asian': 2, 'Hispanic': 4, 'Other race': 2},
    'Mormon': {'White non-Hispanic': 85, 'Black non-Hispanic': 1, 'Asian': 1, 'Hispanic': 8, 'Other race': 5},
    'Muslim': {'White non-Hispanic': 38, 'Black non-Hispanic': 28, 'Asian': 28, 'Hispanic': 4, 'Other race': 3},
    'Evangelical Protestant': {'White non-Hispanic': 76, 'Black non-Hispanic': 6, 'Asian': 2, 'Hispanic': 11, 'Other race': 5},
    'Mainline Protestant': {'White non-Hispanic': 86, 'Black non-Hispanic': 3, 'Asian': 1, 'Hispanic': 6, 'Other race': 3},
    'Unaffiliated': {'White non-Hispanic': 68, 'Black non-Hispanic': 9, 'Asian': 5, 'Hispanic': 13, 'Other race': 4},
    'Hindu': {'White non-Hispanic': 4, 'Black non-Hispanic': 2, 'Asian': 91, 'Hispanic': 1, 'Other race': 2}, # modified since most hindus are asian, but most asians aren't necessarily hindu
    'Orthodox Christian': {'White non-Hispanic': 81, 'Black non-Hispanic': 8, 'Asian': 3, 'Hispanic': 6, 'Other race': 2}
}

# religion_age_stats[religion][age bracket] is a percentage.
# The bracket labels ('18-29', '30-49', '50-64', '65-100') must match the
# strings produced by calculate_combined_probability. Rows are whole-number
# percentages and do not all sum to exactly 100 (e.g. Buddhist sums to 101).
religion_age_stats = {
    'Buddhist': {'18-29': 34, '30-49': 30, '50-64': 23, '65-100': 14},
    'Jehovah\'s Witness': {'18-29': 15, '30-49': 34, '50-64': 29, '65-100': 23},
    'Jewish': {'18-29': 22, '30-49': 27, '50-64': 26, '65-100': 26},
    'Mormon': {'18-29': 22, '30-49': 40, '50-64': 22, '65-100': 16},
    'Muslim': {'18-29': 44, '30-49': 37, '50-64': 13, '65-100': 5},
    'Evangelical Protestant': {'18-29': 17, '30-49': 33, '50-64': 29, '65-100': 20},
    'Mainline Protestant': {'18-29': 16, '30-49': 29, '50-64': 29, '65-100': 26},
    'Unaffiliated': {'18-29': 35, '30-49': 37, '50-64': 19, '65-100': 9},
    'Hindu': {'18-29': 34, '30-49': 56, '50-64': 6, '65-100': 4},
    'Orthodox Christian': {'18-29': 26, '30-49': 40, '50-64': 21, '65-100': 13}
}

# religion_gender_stats[religion][GENDER value] is a percentage; each row's
# 'Female' and 'Male' entries sum to 100. calculate_combined_probability looks
# the row's GENDER up here and falls back to 1 (i.e. 0.01 after the division by
# 100) for any value other than these two keys.
religion_gender_stats = {
    'Buddhist': {'Female': 49, 'Male': 51},
    'Jehovah\'s Witness': {'Female': 65, 'Male': 35},
    'Jewish': {'Female': 48, 'Male': 52},
    'Mormon': {'Female': 54, 'Male': 46},
    'Muslim': {'Female': 35, 'Male': 65},
    'Evangelical Protestant': {'Female': 55, 'Male': 45},
    'Mainline Protestant': {'Female': 55, 'Male': 45},
    'Unaffiliated': {'Female': 43, 'Male': 57},
    'Hindu': {'Female': 38, 'Male': 62},
    'Orthodox Christian': {'Female': 44, 'Male': 56}
}

# religion_marital_stats[religion][MARITAL_ACS value] is a percentage.
# NOTE(review): the keys do not line up with the MARITAL_ACS categories printed
# earlier in this file: 'Living with a partner' is not one of them, and
# 'Separated' has no entry here, so separated people get the fallback 1 (0.01
# after the division by 100) for every religion.
# NOTE(review): some rows do not sum to 100 (e.g. Mormon sums to 109) —
# confirm against the source figures.
religion_marital_stats = {
    'Buddhist': {'Now married': 39, 'Living with a partner': 11, 'Divorced': 10, 'Widowed': 3, 'Never married': 37},
    'Jehovah\'s Witness': {'Now married': 53, 'Living with a partner': 5, 'Divorced': 12, 'Widowed': 8, 'Never married': 21},
    'Jewish': {'Now married': 56, 'Living with a partner': 6, 'Divorced': 6, 'Widowed': 9, 'Never married': 23},
    'Mormon': {'Now married': 66, 'Living with a partner': 7, 'Divorced': 12, 'Widowed': 5, 'Never married': 19},
    'Muslim': {'Now married': 41, 'Living with a partner': 8, 'Divorced': 9, 'Widowed': 6, 'Never married': 36},
    'Evangelical Protestant': {'Now married': 56, 'Living with a partner': 14, 'Divorced': 8, 'Widowed': 8, 'Never married': 18},
    'Mainline Protestant': {'Now married': 55, 'Living with a partner': 6, 'Divorced': 12, 'Widowed': 9, 'Never married': 18},
    'Unaffiliated': {'Now married': 37, 'Living with a partner': 11, 'Divorced': 11, 'Widowed': 7, 'Never married': 37},
    'Hindu': {'Now married': 60, 'Living with a partner': 0, 'Divorced': 2, 'Widowed': 1, 'Never married': 37},
    'Orthodox Christian': {'Now married': 48, 'Living with a partner': 5, 'Divorced': 9, 'Widowed': 6, 'Never married': 31}
}

# religion_edu_stats[religion][EDUCCAT5 value] is a percentage.
# NOTE(review): 'Postgraduate' repeats the 'College grad' value in every row
# except Jehovah's Witness, and the rows sum to more than 100 (e.g. Buddhist
# sums to 121) — confirm against the source figures.
# NOTE(review): the keys are assumed to match the dataset's EDUCCAT5 labels;
# any label that does not match gets the fallback 1 (0.01 after the division
# by 100) in calculate_combined_probability — TODO verify.
religion_edu_stats = {
    'Buddhist': {'Less than HS': 20, 'HS Grad': 33, 'Some college': 28, 'College grad': 20, 'Postgraduate': 20},
    'Jehovah\'s Witness': {'Less than HS': 63, 'HS Grad': 25, 'Some college': 9, 'College grad': 3, 'Postgraduate': 9},
    'Jewish': {'Less than HS': 19, 'HS Grad': 22, 'Some college': 29, 'College grad': 31, 'Postgraduate': 31},
    'Mormon': {'Less than HS': 27, 'HS Grad': 40, 'Some college': 23, 'College grad': 10, 'Postgraduate': 10},
    'Muslim': {'Less than HS': 36, 'HS Grad': 25, 'Some college': 23, 'College grad': 17, 'Postgraduate': 17},
    'Evangelical Protestant': {'Less than HS': 43, 'HS Grad': 35, 'Some college': 14, 'College grad': 7, 'Postgraduate': 7},
    'Mainline Protestant': {'Less than HS': 37, 'HS Grad': 30, 'Some college': 19, 'College grad': 14, 'Postgraduate': 14},
    'Unaffiliated': {'Less than HS': 38, 'HS Grad': 32, 'Some college': 18, 'College grad': 11, 'Postgraduate': 11},
    'Hindu': {'Less than HS': 12, 'HS Grad': 11, 'Some college': 29, 'College grad': 48, 'Postgraduate': 48},
    'Orthodox Christian': {'Less than HS': 27, 'HS Grad': 34, 'Some college': 21, 'College grad': 18, 'Postgraduate': 18}
}

# Every religion that can be assigned: the keys of religion_race_stats, in
# their definition order
all_religions = list(religion_race_stats)

def calculate_combined_probability(row, religion):
    """Return the unnormalised sampling weight of `religion` for one person.

    The weight is the product of five percentages (each divided by 100) looked
    up in the religion_*_stats tables for the person's race, age bracket,
    gender, marital status and education.

    Args:
        row: Mapping (e.g. a DataFrame row) with the keys 'RACETHN', 'AGE',
            'GENDER', 'MARITAL_ACS' and 'EDUCCAT5'.
        religion: Religion name used as the outer key of each table.

    Returns:
        The product of the five factors as a float. A religion or attribute
        value missing from a table contributes 1 / 100 = 0.01.
    """
    race = row['RACETHN']
    age = row['AGE']
    gender = row['GENDER']
    marital_status = row['MARITAL_ACS']
    education = row['EDUCCAT5']

    # Inclusive age brackets; anything outside all three is treated as '65-100'
    age_group = '65-100'
    for youngest, oldest, label in ((18, 29, '18-29'), (30, 49, '30-49'), (50, 64, '50-64')):
        if youngest <= age <= oldest:
            age_group = label
            break

    # (table, key to look up) pairs, multiplied together in this order
    lookups = (
        (religion_race_stats, race),
        (religion_age_stats, age_group),
        (religion_gender_stats, gender),
        (religion_marital_stats, marital_status),
        (religion_edu_stats, education),
    )

    combined = None
    for table, key in lookups:
        share = table.get(religion, {}).get(key, 1) / 100
        combined = share if combined is None else combined * share
    return combined

# Assign religions based on combined probabilities
def assign_religion(row):
    """Draw one religion for `row`, weighted by calculate_combined_probability.

    Args:
        row: Mapping with the keys read by calculate_combined_probability.

    Returns:
        One entry of all_religions, chosen with Python's `random` module using
        the per-religion weights (they need not sum to 1).
    """
    weights = [calculate_combined_probability(row, candidate) for candidate in all_religions]
    (chosen,) = random.choices(all_religions, weights=weights, k=1)
    return chosen

# Overwrite RELIGCAT with a freshly sampled religion for every row. The values
# the dataset shipped with are replaced, so only religions listed in
# all_religions can appear afterwards.
# NOTE(review): the "one gay and one lesbian person per race/religion" step
# earlier in this file ran against the original RELIGCAT values, so it does not
# guarantee anything about the religions assigned here.
pop_df['RELIGCAT'] = pop_df.apply(assign_religion, axis=1)

# Display the final religion distribution
pop_df['RELIGCAT'].value_counts()

RELIGCAT
Jewish                    2834
Mormon                    2340
Orthodox Christian        2289
Mainline Protestant       2127
Unaffiliated              2027
Jehovah's Witness         2009
Muslim                    1891
Buddhist                  1879
Evangelical Protestant    1673
Hindu                      931
Name: count, dtype: int64

# Number of people per (RACETHN, RELIGCAT) pair, as a flat table with a 'Count' column.
# observed=False is passed explicitly: it keeps the zero-count rows for pairs
# that never occur and avoids the pandas deprecation warning about the
# changing default.
religion_count_by_race = (
    pop_df.groupby(['RACETHN', 'RELIGCAT'], observed=False)
    .size()
    .reset_index(name='Count')
)

print(religion_count_by_race)

               RACETHN                RELIGCAT  Count
0                Asian                Buddhist    207
1                Asian  Evangelical Protestant     21
2                Asian                   Hindu    640
3                Asian       Jehovah's Witness      0
4                Asian                  Jewish     19
5                Asian     Mainline Protestant      7
6                Asian                  Mormon     12
7                Asian                  Muslim    139
8                Asian      Orthodox Christian     26
9                Asian            Unaffiliated     32
10  Black non-Hispanic                Buddhist     85
11  Black non-Hispanic  Evangelical Protestant    117
12  Black non-Hispanic                   Hindu     71
13  Black non-Hispanic       Jehovah's Witness    553
14  Black non-Hispanic                  Jewish     89
15  Black non-Hispanic     Mainline Protestant     92
16  Black non-Hispanic                  Mormon     23
17  Black non-Hispanic                  Muslim    811
18  Black non-Hispanic      Orthodox Christian    268
19  Black non-Hispanic            Unaffiliated    274
20            Hispanic                Buddhist    485
21            Hispanic  Evangelical Protestant    286
22            Hispanic                   Hindu     37
23            Hispanic       Jehovah's Witness    831
24            Hispanic                  Jewish    141
25            Hispanic     Mainline Protestant    156
26            Hispanic                  Mormon    289
27            Hispanic                  Muslim    173
28            Hispanic      Orthodox Christian    211
29            Hispanic            Unaffiliated    482
30          Other race                Buddhist    118
31          Other race  Evangelical Protestant     57
32          Other race                   Hindu     45
33          Other race       Jehovah's Witness     60
34          Other race                  Jewish     37
35          Other race     Mainline Protestant     25
36          Other race                  Mormon     57
37          Other race                  Muslim     35
38          Other race      Orthodox Christian     18
39          Other race            Unaffiliated     58
40  White non-Hispanic                Buddhist    984
41  White non-Hispanic  Evangelical Protestant   1192
42  White non-Hispanic                   Hindu    138
43  White non-Hispanic       Jehovah's Witness    565
44  White non-Hispanic                  Jewish   2548
45  White non-Hispanic     Mainline Protestant   1847
46  White non-Hispanic                  Mormon   1959
47  White non-Hispanic                  Muslim    733
48  White non-Hispanic      Orthodox Christian   1766
49  White non-Hispanic            Unaffiliated   1181

C:\Users\86177\AppData\Local\Temp\ipykernel_18680\116033954.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  religion_count_by_race = pop_df.groupby(['RACETHN', 'RELIGCAT']).size().reset_index(name='Count')

# Inspect both ends of the frame after the new columns were added
pop_df.tail()

pop_df.head()

# Number of missing values in each column
pop_df.isnull().sum()

id                    0
GENDER                0
AGE                   0
RACETHN               0
EDUCCAT5              0
DIVISION              0
MARITAL_ACS           0
HHSIZECAT             0
CHILDRENCAT           0
CITIZEN_REC           0
BORN_ACS              0
FAMINC5               0
EMPLOYED              0
worker_class          0
usual_hrs_per_week    0
hours_vary            0
MIL_ACS_REC           0
HOME_ACS_REC          0
metropolitan          0
internet_access       0
FDSTMP_CPS            0
TENURE_ACS            0
PUB_OFF_CPS           0
boycott               0
COMGRP_CPS            0
TALK_CPS              0
TRUST_CPS             0
TABLET_CPS            0
TEXTIM_CPS            0
SOCIAL_CPS            0
VOLSUM                0
REGISTERED            0
VOTE14                0
PARTYSCALE5           0
RELIGCAT              0
IDEO3                 0
FOLGOV                0
OWNGUN_GSS            0
SEXUALITY             0
HIV_STAT              0
PREG_STAT             0
dtype: int64

pip install Faker

Requirement already satisfied: Faker in c:\users\86177\anaconda3\lib\site-packages (26.1.0)
Requirement already satisfied: python-dateutil>=2.4 in c:\users\86177\anaconda3\lib\site-packages (from Faker) (2.8.2)
Requirement already satisfied: six>=1.5 in c:\users\86177\anaconda3\lib\site-packages (from python-dateutil>=2.4->Faker) (1.16.0)
Note: you may need to restart the kernel to use updated packages.

# List the distinct RACETHN values; they are the keys used by credit_card_percentages
pop_df['RACETHN'].unique()

['White non-Hispanic', 'Hispanic', 'Asian', 'Black non-Hispanic', 'Other race']
Categories (5, object): ['Asian', 'Black non-Hispanic', 'Hispanic', 'Other race', 'White non-Hispanic']

from faker import Faker
import pandas as pd

fake = Faker()

# Define the percentages of individuals having credit cards for each racial group
# (keyed by RACETHN value; generate_credit_card_number looks the race up here)
credit_card_percentages = {
    'White non-Hispanic': 88,
    'Black non-Hispanic': 72,
    'Hispanic': 77,
    'Asian': 93,
    'Other race': 93
}

# Convert RACETHN from its categorical dtype to plain strings
# (presumably so the values can be used directly as dictionary keys — TODO confirm)
pop_df['RACETHN'] = pop_df['RACETHN'].astype(str)

def generate_credit_card_number(race):
    """Return a fake Mastercard number, or 0 for a person without a card.

    Args:
        race: RACETHN value; must be a key of credit_card_percentages.

    Returns:
        A card number string from Faker with probability
        credit_card_percentages[race] / 100, otherwise the integer 0.

    Raises:
        KeyError: If `race` is not a key of credit_card_percentages.
    """
    selected_percentage = credit_card_percentages[race]

    # randint(1, 100) has exactly 100 equally likely outcomes, so this test
    # succeeds with probability selected_percentage / 100. The earlier
    # randint(0, 100) had 101 outcomes and gave (selected_percentage + 1) / 101.
    if random.randint(1, 100) <= selected_percentage:
        return fake.credit_card_number(card_type='mastercard')
    return 0  # For individuals without credit cards

# Generate credit card numbers based on racial groups and add them as a new column 'CC_NUM' in pop_df
# (CC_NUM holds whatever generate_credit_card_number returns for the row's
# RACETHN; people without a card get the integer 0, not a missing value)
pop_df['CC_NUM'] = pop_df['RACETHN'].apply(generate_credit_card_number)

pop_df.head()

# Grouping the DataFrame by 'RACETHN' and counting the people who hold a card.
# "No card" is stored in CC_NUM as the integer 0, never as a null, so the count
# must test for != 0; notnull() would count every row in the group.
cc_num_count_per_race = (
    pop_df.groupby('RACETHN')['CC_NUM']
    .apply(lambda x: (x != 0).sum())
    .reset_index(name='CreditCardCount')
)

# Displaying the count of credit card numbers per racial group
print(cc_num_count_per_race)

              RACETHN  CreditCardCount
0               Asian             1103
1  Black non-Hispanic             2383
2            Hispanic             3091
3          Other race              510
4  White non-Hispanic            12913

# 1 when the person has a card number, 0 when CC_NUM is the no-card marker 0
pop_df['cc_encoded'] = (pop_df['CC_NUM'] != 0).astype(int)

import numpy as np

# Add 'cc_disclosed': only people with a credit card (cc_encoded == 1) can
# disclose it, and each of them does so with a 50% chance. Everyone else gets 0.
has_card = pop_df['cc_encoded'] == 1

# One fair 0/1 draw per row; the draw is ignored for rows without a card
disclosure_draw = np.random.choice([0, 1], size=len(pop_df), p=[0.5, 0.5])

pop_df['cc_disclosed'] = np.where(has_card, disclosure_draw, 0)

# Show the first few rows to check the result
print(pop_df[['RACETHN', 'CC_NUM', 'cc_encoded', 'cc_disclosed']].head())

              RACETHN            CC_NUM  cc_encoded  cc_disclosed
0  White non-Hispanic  2248213226375654           1             0
1            Hispanic  2254119539211802           1             0
2  White non-Hispanic                 0           0             0
3  White non-Hispanic  2397750752924503           1             1
4               Asian                 0           0             0

pip install ArabicNames

Requirement already satisfied: ArabicNames in c:\users\86177\anaconda3\lib\site-packages (0.1.2)
Requirement already satisfied: pandas in c:\users\86177\anaconda3\lib\site-packages (from ArabicNames) (2.1.4)
Requirement already satisfied: numpy<2,>=1.23.2 in c:\users\86177\anaconda3\lib\site-packages (from pandas->ArabicNames) (1.26.4)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\86177\anaconda3\lib\site-packages (from pandas->ArabicNames) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\86177\anaconda3\lib\site-packages (from pandas->ArabicNames) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\users\86177\anaconda3\lib\site-packages (from pandas->ArabicNames) (2023.3)
Requirement already satisfied: six>=1.5 in c:\users\86177\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas->ArabicNames) (1.16.0)
Note: you may need to restart the kernel to use updated packages.

from faker import Faker
import pandas as pd
import ArabicNames

# Initialize one Faker instance per locale; generate_name() and its helpers pick among them
us = Faker('en_US')  # used for 'White non-Hispanic' and 'Black non-Hispanic' names
es = Faker('es_ES')  # used for 'Hispanic' names
ind = Faker('en_IN')  # used by the generate_indian_name_* helpers
ch = Faker('zh_CN')  # used by the generate_chinese_name_* helpers
fake = Faker()  # no locale argument; used for every remaining case

# Small name helpers, grouped by suffix: _w (female), _m (male), _n (non-binary).
# NOTE(review): the Indian and Chinese helpers call seed_locale(<locale>) without a seed
# value. Faker documents the signature as seed_locale(locale, seed=None), i.e. a seeding
# call rather than a locale switch -- confirm these calls are actually needed.


def generate_indian_name_w():
    """Return a female name from the en_IN Faker instance `ind`."""
    ind.seed_locale('en_IN')
    female_name = ind.name_female()
    return female_name


def generate_chinese_name_w():
    """Return a romanized name from the zh_CN Faker instance `ch`."""
    ch.seed_locale('zh_CN')
    romanized = ch.romanized_name()
    return romanized


def generate_random_name_w():
    """Return a female name from the default-locale Faker instance `fake`."""
    female_name = fake.name_female()
    return female_name


def generate_indian_name_m():
    """Return a male name from the en_IN Faker instance `ind`."""
    ind.seed_locale('en_IN')
    male_name = ind.name_male()
    return male_name


def generate_chinese_name_m():
    """Return a romanized name from the zh_CN Faker instance `ch` (same call as the _w helper)."""
    ch.seed_locale('zh_CN')
    romanized = ch.romanized_name()
    return romanized


def generate_random_name_m():
    """Return a male name from the default-locale Faker instance `fake`."""
    male_name = fake.name_male()
    return male_name


def generate_indian_name_n():
    """Return a non-binary name from the en_IN Faker instance `ind`."""
    ind.seed_locale('en_IN')
    nonbinary_name = ind.name_nonbinary()
    return nonbinary_name


def generate_chinese_name_n():
    """Return a romanized name from the zh_CN Faker instance `ch` (same call as the _w helper)."""
    ch.seed_locale('zh_CN')
    romanized = ch.romanized_name()
    return romanized


def generate_random_name_n():
    """Return a non-binary name from the default-locale Faker instance `fake`."""
    nonbinary_name = fake.name_nonbinary()
    return nonbinary_name

# Generate a full name matched to a person's gender, race/ethnicity and religion
def _pick_asian_name(indian_name, chinese_name, fallback_name, force_indian=False):
    """Pick one of three name generators for an 'Asian' row and return its result.

    A number from 1 to 100 is always drawn first. An Indian name is returned when
    `force_indian` is true or the draw is at most 21; a Chinese name when the draw
    is at most 45; otherwise the fallback generator is used.
    """
    indian_prob = 21  # Probability percentage for Indian names
    chinese_prob = 24  # Probability percentage for Chinese names

    rand_num = random.randint(1, 100)
    if force_indian or rand_num <= indian_prob:
        return indian_name()
    if rand_num <= indian_prob + chinese_prob:
        return chinese_name()
    return fallback_name()


def generate_name(gender, race, religion):
    """Return a synthetic full name for one person.

    Args:
        gender: 'Male' or 'Female'; any other value uses the non-binary generators.
        race: RACETHN category ('White non-Hispanic', 'Black non-Hispanic', 'Asian',
            'Hispanic'); any other value falls back to the default-locale Faker.
        religion: RELIGCAT category; only 'Muslim' and 'Hindu' affect the result.

    Returns:
        The generated name as a string.
    """
    us_locale_races = ('White non-Hispanic', 'Black non-Hispanic')

    if gender == 'Male':
        if race in us_locale_races:
            if religion == 'Muslim':
                return ArabicNames.get_full_name()
            us.seed_locale('en_US')
            return us.name_male()
        if race == 'Asian':
            return _pick_asian_name(
                generate_indian_name_m,
                generate_chinese_name_m,
                generate_random_name_m,
                force_indian=(religion == 'Hindu'),
            )
        if race == 'Hispanic':
            es.seed_locale('es_ES')  # For Spanish names
            return es.name_male()
        return fake.name_male()  # Any other race

    if gender == 'Female':
        # NOTE(review): unlike the male and non-binary branches, there is no 'Muslim'
        # special case here; confirm whether that is intentional before adding one.
        if race in us_locale_races:
            us.seed_locale('en_US')
            return us.name_female()
        if race == 'Asian':
            # Use the female (_w) helpers so women are not given male Indian names.
            return _pick_asian_name(
                generate_indian_name_w,
                generate_chinese_name_w,
                generate_random_name_w,
                force_indian=(religion == 'Hindu'),
            )
        if race == 'Hispanic':
            es.seed_locale('es_ES')  # For Spanish names
            return es.name_female()
        return fake.name_female()  # Any other race

    # Any other gender value: use the non-binary generators
    if race in us_locale_races:
        if religion == 'Muslim':
            return ArabicNames.get_full_name()
        us.seed_locale('en_US')
        return us.name_nonbinary()
    if race == 'Asian':
        # NOTE(review): this branch has no 'Hindu' override, unlike the other two.
        return _pick_asian_name(
            generate_indian_name_n,
            generate_chinese_name_n,
            generate_random_name_n,
        )
    if race == 'Hispanic':
        es.seed_locale('es_ES')  # For Spanish names
        return es.name_nonbinary()  # call the method so a string is returned
    return fake.name_nonbinary()  # call the method so a string is returned

# Build one name per row from its gender, race/ethnicity and religion
pop_df['NAME'] = list(
    map(generate_name, pop_df['GENDER'], pop_df['RACETHN'], pop_df['RELIGCAT'])
)

# Move NAME to the front; the remaining columns keep their existing order
pop_df = pop_df[['NAME'] + [col for col in pop_df.columns if col != 'NAME']]

pop_df.head()

pop_df.tail()

# Percentage weights for having 0, 1 or 2+ chronic conditions, keyed by RACETHN category.
# Each inner dict sums to 100; 'Asian' carries the same values as 'Other race'.
race_prob = {
    'Black non-Hispanic': {'0': 47.6, '1': 25.4, '2+': 27.0},
    'Hispanic': {'0': 61.5, '1': 20.8, '2+': 17.7},
    'Other race': {'0': 62.0, '1': 21.6, '2+': 16.4},
    'White non-Hispanic': {'0': 43.8, '1': 25.6, '2+': 30.6},
    'Asian': {'0': 62.0, '1': 21.6, '2+': 16.4}
}

# Per-gender percentage factor read by assign_chronic_conditions (only 'Female' and 'Male' are listed).
gender_prob = {'Female': 46.7, 'Male': 49.8}

# Note: the original data from the website gives the last interval as "65 and over" with no
# upper limit. An upper bound of 100 is used here so the interval fits this dataset.
# Per-age-group percentage factor; the keys match the labels returned by get_age_group.
age_prob = {'18-44': 72.6, '45-64': 36.6, '65-100': 12.4}

def get_age_group(age):
    """Map an age to its bracket label, or return None when it falls in no bracket.

    Brackets are inclusive at both ends: 18-44, 45-64 and 65-100.
    """
    brackets = (
        (18, 44, '18-44'),
        (45, 64, '45-64'),
        (65, 100, '65-100'),
    )
    for lower, upper, label in brackets:
        if lower <= age <= upper:
            return label
    return None

def assign_chronic_conditions(row):
    """Randomly assign a chronic-condition count (0, 1 or 2) to one row.

    Reads 'RACETHN', 'GENDER' and 'AGE' from `row`, looks up the matching entries in
    race_prob, gender_prob and age_prob, rescales the three weights to sum to 100 and
    draws a uniform number in [0, 100] to pick the bucket. A return value of 2 stands
    for the '2+' bucket.

    Raises:
        KeyError: if the race, gender, or age group (including None for an age
            outside every bracket) is missing from the lookup tables.

    NOTE(review): the same gender and age factors multiply all three weights before
    they are renormalised, so they cancel and the split is driven by race_prob alone
    (up to floating-point rounding). Confirm whether that is the intended model.
    """
    race_ethn, gender, age = row['RACETHN'], row['GENDER'], row['AGE']
    age_group = get_age_group(age)

    # Unnormalised weight of each bucket, in the order 0, 1, 2+
    weights = [
        race_prob[race_ethn][bucket] * gender_prob[gender] * age_prob[age_group] / 100**2
        for bucket in ('0', '1', '2+')
    ]

    total_prob = weights[0] + weights[1] + weights[2]
    share_zero = (weights[0] / total_prob) * 100
    share_one = (weights[1] / total_prob) * 100

    random_value = random.uniform(0, 100)
    if random_value < share_zero:
        return 0
    if random_value < (share_zero + share_one):
        return 1
    return 2

# Apply the function to create the new column 'NumChronicIllness' based on race, gender, and age probabilities
pop_df['NumChronicIllness'] = pop_df.apply(assign_chronic_conditions, axis=1)

pop_df.tail()

pop_df.head()

chronic_illness_counts = pop_df['NumChronicIllness'].value_counts()

print(chronic_illness_counts)

NumChronicIllness
0    9896
2    5279
1    4825
Name: count, dtype: int64

import pandas as pd
import numpy as np
# Assuming pop_df is your DataFrame containing the AGE column

# Calculate the percentage of missing values in the AGE column

missing_percentage = 5.95

# Draw a confidence level whose range depends on the imputation flag
def generate_confidence(is_imputed):
    """Return a uniform random confidence level.

    The draw is over [0, 100) when `is_imputed` equals 1 and over [70, 100) for any
    other value.
    """
    lower_bound = 0 if is_imputed == 1 else 70
    return np.random.uniform(lower_bound, 100)

def generate_imputation(df, column):
    """Flag a random share of the non-null values of `column` as imputed, in place.

    Adds 'IMPUTED_<column>' (1 for the sampled rows, 0 elsewhere) and
    'CONFIDENCE_LEVEL_<column>' (one generate_confidence draw per row) to `df`.
    The share is the module-level `missing_percentage`, taken as a percentage of
    the non-null values. Returns None.
    """
    flag_col = 'IMPUTED_' + column
    values = df[column]

    eligible_rows = values.dropna().index
    n_flagged = int(values.notnull().sum() * (missing_percentage / 100))
    imputed_values = np.random.choice(eligible_rows, size=n_flagged, replace=False)

    df[flag_col] = 0
    df.loc[imputed_values, flag_col] = 1
    df['CONFIDENCE_LEVEL_' + column] = df[flag_col].apply(generate_confidence)

# Call function for 'AGE' column
generate_imputation(pop_df, 'AGE')

# Calculate the percentage of missing values in the RACETHN column
missing_percentage_racethn = 5.77  # Given percentage of imputed data for RACETHN (5.77%)

# Confidence level for one value, depending on whether it was flagged as imputed
def generate_confidence(is_imputed):
    """Return a uniform random confidence level for one value.

    Imputed values (`is_imputed == 1`) draw from [0, 100); everything else draws
    from [70, 100).
    """
    if is_imputed == 1:
        low = 0
    else:
        low = 70
    return np.random.uniform(low, 100)

def generate_imputation(df, column):
    """Flag a random share of the non-null values of `column` as imputed, in place.

    Adds 'IMPUTED_<column>' (1 for the sampled rows, 0 elsewhere) and
    'CONFIDENCE_LEVEL_<column>' (one generate_confidence draw per row) to `df`.
    The share is `missing_percentage_racethn`, the rate defined for this cell,
    taken as a percentage of the non-null values. Returns None.
    """
    flag_col = 'IMPUTED_' + column
    eligible_rows = df[column].dropna().index

    # Use the RACETHN-specific rate, not the AGE rate left in `missing_percentage`.
    n_flagged = int(len(eligible_rows) * (missing_percentage_racethn / 100))
    imputed_values = np.random.choice(eligible_rows, size=n_flagged, replace=False)

    df[flag_col] = 0
    df.loc[imputed_values, flag_col] = 1
    df['CONFIDENCE_LEVEL_' + column] = df[flag_col].apply(generate_confidence)

# Call function for 'RACETHN' column
generate_imputation(pop_df, 'RACETHN')

import pandas as pd
import numpy as np

# Assuming pop_df is your DataFrame containing multiple columns

# Columns to process: every column whose lower-cased name contains none of these substrings
excluded_terms = ['age', 'race']
columns_to_impute = [
    col
    for col in pop_df.columns
    if all(term not in col.lower() for term in excluded_terms)
]

# One random imputation rate per selected column, drawn uniformly between 5.1 and 6.0 percent
missing_percentages = {
    col: np.random.uniform(5.1, 6.0)
    for col in columns_to_impute
}
# Confidence level for one value, chosen by the truthiness of its imputation flag
def generate_confidence(is_imputed):
    """Return a uniform random confidence level.

    A truthy `is_imputed` draws from [0, 100); a falsy one draws from [70, 100).
    """
    floor = 0 if is_imputed else 70
    return np.random.uniform(floor, 100)

def generate_imputation(df, column, missing_percentage):
    """Flag `missing_percentage` percent of the non-null values of `column` as imputed.

    Works in place on `df`: adds 'IMPUTED_<column>' (1 for the randomly sampled rows,
    0 elsewhere) and 'CONFIDENCE_LEVEL_<column>' (one generate_confidence draw per
    row). Returns None.
    """
    imputed_col = 'IMPUTED_' + column
    confidence_col = 'CONFIDENCE_LEVEL_' + column

    candidates = df[column].dropna().index
    sample_size = int(df[column].notnull().sum() * (missing_percentage / 100))
    imputed_values = np.random.choice(candidates, size=sample_size, replace=False)

    df[imputed_col] = 0
    df.loc[imputed_values, imputed_col] = 1
    df[confidence_col] = df[imputed_col].apply(generate_confidence)

# Loop through each column and create corresponding 'IMPUTED' and 'CONFIDENCE_LEVEL' columns
for col in columns_to_impute:
    generate_imputation(pop_df, col, missing_percentages[col])

C:\Users\86177\AppData\Local\Temp\ipykernel_18680\999154425.py:22: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df['IMPUTED_' + column] = 0
C:\Users\86177\AppData\Local\Temp\ipykernel_18680\999154425.py:24: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df['CONFIDENCE_LEVEL_' + column] = df['IMPUTED_' + column].apply(lambda x: generate_confidence(x))
C:\Users\86177\AppData\Local\Temp\ipykernel_18680\999154425.py:22: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df['IMPUTED_' + column] = 0
C:\Users\86177\AppData\Local\Temp\ipykernel_18680\999154425.py:24: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df['CONFIDENCE_LEVEL_' + column] = df['IMPUTED_' + column].apply(lambda x: generate_confidence(x))

pop_df.head()

def label_confidence_intervals(row):
    """Add a 'CI_LABEL_<attribute>' entry for every 'CONFIDENCE_LEVEL_<attribute>' entry.

    The wording depends on the confidence value: >= 90 "is", >= 75 "is probably",
    >= 35 "is possibly", >= 10 "is unlikely but might be", < 10 "is not". A value
    that satisfies none of these comparisons (e.g. NaN) gets an empty label.
    The row is modified in place and also returned.
    """
    prefix = 'CONFIDENCE_LEVEL_'
    # (inclusive lower bound, wording), checked from the highest band down
    bands = (
        (90, "is"),
        (75, "is probably"),
        (35, "is possibly"),
        (10, "is unlikely but might be"),
    )

    for col in row.index:
        if not col.startswith(prefix):
            continue

        ci_value = row[col]
        label = next((wording for floor, wording in bands if ci_value >= floor), None)
        if label is None:
            label = "is not" if ci_value < 10 else ""

        attribute_name = col.replace(prefix, '')  # attribute the confidence refers to
        row[f"CI_LABEL_{attribute_name}"] = label
    return row

# Apply the labeling function to each row
pop_df = pop_df.apply(label_confidence_intervals, axis=1)

pop_df.head()

# Bucket AGE into ten-year bands; with right=False each bin is closed on the left,
# i.e. [0, 10), [10, 20), ..., [110, 120)
age_bins = list(range(0, 121, 10))
age_labels = [f'{start}-{start + 9}' for start in age_bins[:-1]]
pop_df['AGE_INT'] = pd.cut(pop_df['AGE'], bins=age_bins, labels=age_labels, right=False)

pop_df.head()

C:\Users\86177\AppData\Local\Temp\ipykernel_18680\3809539463.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  pop_df['AGE_INT'] = pd.cut(pop_df['AGE'], bins=age_bins, labels=age_labels, right=False)

pop_df.to_csv('synthetic_population_dataset.csv')

	NAME	id	GENDER	AGE	RACETHN	EDUCCAT5	DIVISION	MARITAL_ACS	HHSIZECAT	CHILDRENCAT	...	CONFIDENCE_LEVEL_PREG_STAT	IMPUTED_CC_NUM	CONFIDENCE_LEVEL_CC_NUM	CONFIDENCE_LEVEL_cc_encoded	IMPUTED_cc_disclosed	CONFIDENCE_LEVEL_cc_disclosed	CONFIDENCE_LEVEL_NumChronicIllness
0	Luke Walsh	1	Male	25.0	White non-Hispanic	Some college	Mountain	Never married	3+	No children	...	76.481045	0	99.541838	85.298769	1	71.573091	88.177855
1	Matilde Izaguirre Checa	2	Female	70.0	Hispanic	HS Grad	West South Central	Divorced	1	No children	...	90.419122	0	90.982195	97.266417	0	94.458229	74.236974
2	Ryan Smith	3	Male	85.0	White non-Hispanic	Less than HS	Middle Atlantic	Now married	2	No children	...	93.107682	0	78.302624	78.672888	0	86.103928	79.358419
3	Matthew Grimes	4	Male	59.0	White non-Hispanic	HS Grad	Mountain	Now married	2	No children	...	95.171588	1	54.344487	84.718319	0	90.482390	77.533226
4	Miraan Rama	5	Female	19.0	Asian	Some college	Pacific	Never married	1	No children	...	97.779464	0	83.706609	87.997850	0	80.257402	98.153271

Import Pew Research Center Dataset¶

Appending new columns of data¶

LGBTQ+¶

HIV Status¶

Pregnancy Status¶

Religion: Include Non-Christian Distribution¶

Credit Card numbers¶

Illnesses¶

Imputations¶

	id	GENDER	AGE	RACETHN	EDUCCAT5	DIVISION	MARITAL_ACS	HHSIZECAT	CHILDRENCAT	CITIZEN_REC	...	TEXTIM_CPS	SOCIAL_CPS	VOLSUM	REGISTERED	VOTE14	PARTYSCALE5	RELIGCAT	IDEO3	FOLGOV	OWNGUN_GSS
19995	19996	Female	46.0	White non-Hispanic	Less than HS	Middle Atlantic	Now married	1	No children	No, not a U.S. citizen	...	Yes	Yes	Did not volunteer	No	Did not vote (includes too young to vote)	Lean Democrat	Unaffiliated	Moderate	Only now and then	Yes
19996	19997	Female	26.0	Hispanic	College grad	West South Central	Never married	2	No children	Yes, a U.S. citizen	...	Yes	Yes	Did not volunteer	Yes	Did not vote (includes too young to vote)	Ind/No Lean	Catholic	Moderate	Some of the time	No
19997	19998	Female	25.0	Black non-Hispanic	HS Grad	West North Central	Never married	3+	One or more children	No, not a U.S. citizen	...	No	Yes	Did not volunteer	No	Did not vote (includes too young to vote)	Democrat	Other	Liberal	Most of the time	No
19998	19999	Female	53.0	Hispanic	Some college	Mountain	Now married	3+	No children	Yes, a U.S. citizen	...	Yes	Yes	Did not volunteer	No	Did not vote (includes too young to vote)	Ind/No Lean	Mainline Protestant	Moderate	Most of the time	Yes
19999	20000	Female	26.0	White non-Hispanic	College grad	West South Central	Never married	2	No children	Yes, a U.S. citizen	...	Yes	No	Did not volunteer	No	Did not vote (includes too young to vote)	Democrat	Other	Moderate	Most of the time	No

	NAME	id	GENDER	AGE	RACETHN	EDUCCAT5	DIVISION	MARITAL_ACS	HHSIZECAT	CHILDRENCAT	...	RELIGCAT	IDEO3	FOLGOV	OWNGUN_GSS	SEXUALITY	HIV_STAT	PREG_STAT	CC_NUM	cc_encoded	cc_disclosed
19995	Gabrielle Francis	19996	Female	46.0	White non-Hispanic	Less than HS	Middle Atlantic	Now married	1	No children	...	Jehovah's Witness	Moderate	Only now and then	Yes	Heterosexual	negative	Positive	0	0	0
19996	Anselma Llobet Ibáñez	19997	Female	26.0	Hispanic	College grad	West South Central	Never married	2	No children	...	Buddhist	Moderate	Some of the time	No	Heterosexual	negative	Negative	2286010456900323	1	1
19997	Mary Carrillo	19998	Female	25.0	Black non-Hispanic	HS Grad	West North Central	Never married	3+	One or more children	...	Muslim	Liberal	Most of the time	No	Heterosexual	negative	Negative	2640673477975722	1	1
19998	Ruperta Pazos Alvarez	19999	Female	53.0	Hispanic	Some college	Mountain	Now married	3+	No children	...	Evangelical Protestant	Moderate	Most of the time	Yes	Heterosexual	negative	Negative	5196895632351314	1	0
19999	Catherine Ibarra	20000	Female	26.0	White non-Hispanic	College grad	West South Central	Never married	2	No children	...	Muslim	Moderate	Most of the time	No	Heterosexual	negative	Negative	2225505240379407	1	0