### Data Import

In [1]:
# Libraries
import pandas as pd
import numpy as np
import random as rnd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load up data
pd.set_option('display.max_rows', 10)
df = pd.read_csv('./data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3005 entries, 0 to 3004
Columns: 820 entries, ID to CASEDIF
dtypes: float64(93), int64(6), object(721)
memory usage: 18.8+ MB


# Feature Deletion

In [3]:
# Features with high amount of missing values
empty_values = df.isna().sum()
total_rows = len(df)
empty_percentages = (empty_values / total_rows) * 100
filtered_empty_values_ab25per = empty_percentages[empty_percentages > 25]
cols_to_drop = filtered_empty_values_ab25per.index.tolist()
df = df.drop(columns = cols_to_drop)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3005 entries, 0 to 3004
Columns: 556 entries, ID to CASEDIF
dtypes: float64(49), int64(6), object(501)
memory usage: 12.7+ MB


In [5]:
# Medecine features except for ones with high correlation with depression
medicine_cols_to_keep = ['PSYCHOTHERAGENTS', 'ANTIDEPRESSANTS', 'SSRIANTIDEPRESSA',
                      'ANXIOLYTICSSEDAT', 'ANTICONVULSANTS', 'BENZODIAZEPINES']
medicine_cols = [
    "antiinfectives",
    "amebicides",
    "antifungals",
    "antimalarialagen",
    "antituberagents",
    "cephalosporins",
    "leprostatics",
    "macrolidederivat",
    "miscantibiotics",
    "penicillins",
    "quinolones",
    "sulfonamides",
    "tetracyclines",
    "urinaryantiinfec",
    "antihyplipagents",
    "antineoplastics",
    "alkylatingagents",
    "antimetabolites",
    "hormonesantineop",
    "miscantineoplast",
    "biologicals",
    "recombinanthuman",
    "cardiovascularag",
    "angiotensinconve",
    "antiadrenergperi",
    "antiadrenergcent",
    "antianginalagent",
    "antiarrhythmicag",
    "betaadrenergicbl",
    "calciumchannelbl",
    "diuretics",
    "inotropicagents",
    "misccardiovascul",
    "peripheralvasodi",
    "vasodilators",
    "vasopressors",
    "antihypertensive",
    "angiotensiniiinh",
    "centralnervoussy",
    "analgesics",
    "miscanalgesics",
    "narcanalgs",
    "nonsteroidalanti",
    "salicylates",
    "analgesiccombina",
    "anticonvulsants",
    "antiemeticantive",
    "antiparkinsonage",
    "anxiolyticssedat",
    "barbiturates",
    "benzodiazepines",
    "miscanxiolyticss",
    "cnsstimulants",
    "musclerelaxants",
    "miscantidepressa",
    "miscantipsychoti",
    "psychothercombin",
    "misccentralnervo",
    "coagulationmodif",
    "anticoagulants",
    "antiplateletagen",
    "misccoagulationm",
    "gastrointestinal",
    "antacids",
    "anticholsantispa",
    "antidiarrheals",
    "digestiveenzymes",
    "gallstonesolubil",
    "gistimulants",
    "h2antagonists",
    "laxatives",
    "miscgiagents",
    "hormones",
    "adrenalcorticals",
    "antidiabeticagen",
    "mischormones",
    "sexhormones",
    "contraceptives",
    "thyroiddrugs",
    "immunosuppressiv",
    "miscagents",
    "antidotes",
    "chelatingagents",
    "cholinergicmuscl",
    "localinjectablea",
    "miscuncategorize",
    "genitourinarytra",
    "nutritionalprods",
    "ironproducts",
    "mineralsandelect",
    "vitamins",
    "vitaminmineral",
    "respiratoryagent",
    "antihistamines",
    "antitussives",
    "bronchodilators",
    "methylxanthines",
    "decongestants",
    "expectorants",
    "miscrespiratorya",
    "respiratoryinhal",
    "upperrespiratory",
    "topicalagents",
    "dermatologicalag",
    "topicalantiinfec",
    "topicalsteroids",
    "topicalanestheti",
    "misctopicalagent",
    "topicalacneagent",
    "mouthandthroatpr",
    "ophthalpreparati",
    "oticpreparations",
    "vaginalpreparati",
    "loopdiuretics",
    "potassiumsparing",
    "thiazidediuretic",
    "carbonicanhydras",
    "firstgenerationc",
    "thirdgenerationc",
    "ophthalantiinfec",
    "ophthalglaucomaa",
    "ophthalsteroids",
    "ophthalsteroidsw",
    "ophthalantiinfla",
    "miscophthalagent",
    "oticsteroidswith",
    "miscoticagents",
    "hmgcoareductasei",
    "miscantihyplipag",
    "skelmuscrels",
    "adrenergicbronch",
    "bronchodilatorco",
    "androgensandanab",
    "estrogens",
    "progestins",
    "sexhormonecombin",
    "narcanalgcombina",
    "antirheumatics",
    "antimigraineagen",
    "antigoutagents",
    "fiveht3receptora",
    "phenthiazantieme",
    "anticholantiemet",
    "miscantiemetics",
    "hydantoinanticon",
    "barbiturateantic",
    "benzodiazepinean",
    "miscanticonvulsa",
    "anticholantipark",
    "ssriantidepressa",
    "tricyclicantidep",
    "phenthiazantipsy",
    "plateletaggregat",
    "sulfonylureas",
    "nonsulfonylureas",
    "insulin",
    "alphaglucosidase",
    "bisphosphonates",
    "alternativemeds",
    "nutraceuticals",
    "herbalproducts",
    "penicillinaseres",
    "aminopenicillins",
    "betalactamaseinh",
    "adamantaneantivi",
    "purinenucleoside",
    "miscantituberage",
    "polyenes",
    "azoleantifungals",
    "miscantifungals",
    "antimalarialquin",
    "miscantimalarial",
    "lincomycinderiva",
    "fibricacidderiva",
    "psychotheragents",
    "leukotrienemodif",
    "nasallubricants",
    "nasalsteroids",
    "nasalantihistami",
    "nasalpreparation",
    "antidepressants",
    "monoamineoxidase",
    "antipsychotics",
    "bileacidsequestr",
    "anorexiants",
    "immunologicagent",
    "monoclonalantibo",
    "heparins",
    "coumarinsandinda",
    "impotenceagents",
    "urinaryantispasm",
    "urinaryphmodifie",
    "miscgenitourinar",
    "ophthalantihista",
    "miscvaginalagent",
    "antipsoriatics",
    "thiazolidinedion",
    "protonpumpinhibi",
    "cardioselectiveb",
    "noncardioselecti",
    "dopaminergicanti",
    "fiveaminosalic",
    "cox2inhibitors",
    "meglitinides",
    "fivealphareducti",
    "antihyperuricemi",
    "topicalantibioti",
    "topicalantifunga",
    "inhaledcorticost",
    "mastcellstabiliz",
    "anticholbronchod",
    "glucocorticoids",
    "mineralocorticoi",
    "agentsforpulmona",
    "macrolides",
    "ketolides",
    "phenylpiperazine",
    "tetracyclicantid",
    "ssnriantidepress",
    "miscantidiabetic",
    "dibenzazepineant",
    "cholinergicagoni",
    "cholinesterasein",
    "antidiabeticcomb",
    "cholesterolabsor",
    "antihyplipcombin",
    "smokingcessation",
    "othersupplements"
]

medicine_cols_to_drop = [x.upper() for x in medicine_cols]
    
medicine_cols_to_drop = [element for element in medicine_cols_to_drop if element not in medicine_cols_to_keep]

df = df.drop(columns = medicine_cols_to_drop)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3005 entries, 0 to 3004
Columns: 334 entries, ID to CASEDIF
dtypes: float64(49), int64(6), object(279)
memory usage: 7.7+ MB


In [7]:
# Delete useless features
useless_cols = ['ID', 'FI_ID', 'PATH', 'VERSION', 'INT_START',
               'WEIGHT_SEL', 'WEIGHT_ADJ', 'STRATUM', 
               'CLUSTER']

df = df.drop(columns = useless_cols)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3005 entries, 0 to 3004
Columns: 325 entries, GENDER to CASEDIF
dtypes: float64(47), int64(1), object(277)
memory usage: 7.5+ MB


# Depression Scale Creation and Entry Cleaning

In [9]:
df_pure = df

symptoms_array = [
    "NOTGETGO",  # Difficulty getting going
    "FLTDEP",    # Feeling of deep sadness or emptiness
    "NOSLEEP",   # Insomnia or sleeping too much
    "RESTLES",   # Feeling restless
    "NOTEAT",    # Changes in appetite or weight
    "CONFIDNT",  # Lack of confidence
    "FLTEFF",   # Feeling things are out of control
    "RELAXED",   # Unable to feel relaxed
    "WORRY"   # Worrying thoughts
]

df_phq9 = df_pure[symptoms_array]

missing_counts = df_phq9.isna().sum()
missing_counts_sorted = missing_counts.sort_values(ascending=False)

# Print the sorted counts
print(missing_counts_sorted)

RESTLES     222
CONFIDNT    216
WORRY       212
RELAXED     211
FLTEFF       13
NOTGETGO     10
FLTDEP        9
NOSLEEP       7
NOTEAT        7
dtype: int64


In [10]:
df_phq9.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3005 entries, 0 to 3004
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   NOTGETGO  2995 non-null   object
 1   FLTDEP    2996 non-null   object
 2   NOSLEEP   2998 non-null   object
 3   RESTLES   2783 non-null   object
 4   NOTEAT    2998 non-null   object
 5   CONFIDNT  2789 non-null   object
 6   FLTEFF    2992 non-null   object
 7   RELAXED   2794 non-null   object
 8   WORRY     2793 non-null   object
dtypes: object(9)
memory usage: 211.4+ KB


In [11]:
# Delete all entries that has a missing value in one of these features
df_phq9_2 = df_phq9.copy()
df_phq9_2 = df_phq9_2.dropna()
df_phq9_2.info() # los around 244 entries from that

<class 'pandas.core.frame.DataFrame'>
Index: 2763 entries, 0 to 3004
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   NOTGETGO  2763 non-null   object
 1   FLTDEP    2763 non-null   object
 2   NOSLEEP   2763 non-null   object
 3   RESTLES   2763 non-null   object
 4   NOTEAT    2763 non-null   object
 5   CONFIDNT  2763 non-null   object
 6   FLTEFF    2763 non-null   object
 7   RELAXED   2763 non-null   object
 8   WORRY     2763 non-null   object
dtypes: object(9)
memory usage: 215.9+ KB


In [12]:
# Making depression scale
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

for col in df_phq9_2.columns:
    df_phq9_2.loc[:, col] = label_encoder.fit_transform(df_phq9_2[col])

# Need to reverse the order from bad-good to good-bad
df_phq9_2.loc[:, 'CONFIDNT'] = df_phq9_2['CONFIDNT'].apply(lambda x: 3 - x)
df_phq9_2.loc[:, 'RELAXED'] = df_phq9_2['RELAXED'].apply(lambda x: 3 - x)

df_phq9_2['total_sum'] = df_phq9_2.sum(axis = 1)

In [13]:
# Categorize depression
def categorize_score(score):
    if 0 <= score <= 4:
        return 'Normal'
    elif 5 <= score <= 9:
        return 'Mild'
    elif 10 <= score <= 27:
        return 'ModerateSevere'

# Applying the categorization function to the 'Total_Sum' column
df_phq9_2['depression_category'] = df_phq9_2['total_sum'].apply(categorize_score)

In [14]:
# Seperate by Category
df_phq9_normal = df_phq9_2[df_phq9_2['depression_category'] == 'Normal']
df_phq9_mild = df_phq9_2[df_phq9_2['depression_category'] == 'Mild']
df_phq9_moderatesevere = df_phq9_2[df_phq9_2['depression_category'] == 'ModerateSevere']

In [15]:
df_phq9_normal.info()
df_phq9_mild.info()
df_phq9_moderatesevere.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1308 entries, 0 to 3003
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   NOTGETGO             1308 non-null   object
 1   FLTDEP               1308 non-null   object
 2   NOSLEEP              1308 non-null   object
 3   RESTLES              1308 non-null   object
 4   NOTEAT               1308 non-null   object
 5   CONFIDNT             1308 non-null   object
 6   FLTEFF               1308 non-null   object
 7   RELAXED              1308 non-null   object
 8   WORRY                1308 non-null   object
 9   total_sum            1308 non-null   object
 10  depression_category  1308 non-null   object
dtypes: object(11)
memory usage: 122.6+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 940 entries, 2 to 3004
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   NOTGETGO      

In [16]:
# Connect with original dataset
mild_indices = df_phq9_mild.index

df_mild_connection = df_pure.loc[mild_indices]
df_mild_connection['depression_category'] = 'mild'
df_mild_connection

moderatesevere_indices = df_phq9_moderatesevere.index

df_moderatesevere_connection = df_pure.loc[moderatesevere_indices]
df_moderatesevere_connection['depression_category'] = 'moderatesevere'
df_moderatesevere_connection

normal_indices = df_phq9_normal.index

df_normal_connection = df_pure.loc[normal_indices]
df_normal_connection['depression_category'] = 'normal'
df_normal_connection

df_appended = pd.concat([df_normal_connection, df_mild_connection, df_moderatesevere_connection], ignore_index=False)

symptoms_array = [
    "NOTGETGO",  # Difficulty getting going
    "FLTDEP",    # Feeling of deep sadness or emptiness
    "NOSLEEP",   # Insomnia or sleeping too much
    "RESTLES",   # Feeling restless
    "NOTEAT",    # Changes in appetite or weight
    "CONFIDNT",  # Lack of confidence
    "UNCNTRL",   # Feeling things are out of control
    "RELAXED",   # Unable to feel relaxed
    "WORRY"   # Worrying thoughts
]

df_appended = df_appended.drop(columns = symptoms_array)
df_appended['total_sum'] = df_phq9_2['total_sum']

In [17]:
df_appended

Unnamed: 0,GENDER,AGE,AGEGRP,DEGREE_RECODE,EDUC,RACE_RECODE,HISPANIC,ETHGRP,MILITARY,JAIL,...,IWLOC5,IWLOC6,STRUCTQ,BUILD,OTBUILD,COMBUILD,CASECOMP,CASEDIF,depression_category,total_sum
0,(2) female,62,(1) 57-64,(5) masters,(4) bachelors or more,(1) white/caucasian,(0) no,(1) white,(0) no,(0) no,...,(1) 1 (quiet),(1) 1 (no smell),(02) detached single family house,(4) very well kept,(3) fairly well kept (needs cosmetic work),(3) average,(11) eleventh case or more,(2) somewhat difficult,normal,2
1,(2) female,79,(3) 75-85,(2) high school diploma/equivalency,(3) voc cert/some college/assoc,(1) white/caucasian,(0) no,(1) white,(0) no,(0) no,...,(1) 1 (quiet),(1) 1 (no smell),(02) detached single family house,(4) very well kept,(3) fairly well kept (needs cosmetic work),(4) above average,(11) eleventh case or more,(3) not very difficult,normal,3
17,(1) male,58,(1) 57-64,(4) bachelors,(4) bachelors or more,(1) white/caucasian,(0) no,(1) white,(0) no,(0) no,...,(1) 1 (quiet),(1) 1 (no smell),(02) detached single family house,(3) fairly well kept (needs cosmetic work),(3) fairly well kept (needs cosmetic work),(3) average,(11) eleventh case or more,(2) somewhat difficult,normal,2
24,(1) male,79,(3) 75-85,(5) masters,(4) bachelors or more,(1) white/caucasian,(0) no,(1) white,(1) yes,(0) no,...,(1) 1 (quiet),(1) 1 (no smell),(02) detached single family house,(4) very well kept,(4) very well kept,(5) far above average,(08) eighth case,(2) somewhat difficult,normal,4
26,(2) female,68,(2) 65-74,"(6) law, md or phd",(4) bachelors or more,(1) white/caucasian,(0) no,(1) white,(0) no,(0) no,...,(1) 1 (quiet),(1) 1 (no smell),(02) detached single family house,(4) very well kept,(3) fairly well kept (needs cosmetic work),(4) above average,(11) eleventh case or more,(2) somewhat difficult,normal,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2988,(2) female,61,(1) 57-64,(1) none,(1) < hs,(1) white/caucasian,(1) yes,"(3) hispanic, non-black",(0) no,(0) no,...,(1) 1 (quiet),(1) 1 (no smell),(01) trailer,(3) fairly well kept (needs cosmetic work),(1) very poorly kept (needs major repairs),(4) above average,(11) eleventh case or more,(3) not very difficult,moderatesevere,14
2991,(2) female,70,(2) 65-74,(1) none,(1) < hs,"(3) asian, pacific islander, american indian o...",(1) yes,"(3) hispanic, non-black",(0) no,(0) no,...,(1) 1 (quiet),(1) 1 (no smell),(02) detached single family house,,,,(11) eleventh case or more,(3) not very difficult,moderatesevere,11
2992,(2) female,70,(2) 65-74,(3) associates,(3) voc cert/some college/assoc,(1) white/caucasian,(0) no,(1) white,(0) no,(0) no,...,(1) 1 (quiet),(4) 4,(02) detached single family house,(3) fairly well kept (needs cosmetic work),(3) fairly well kept (needs cosmetic work),(3) average,(11) eleventh case or more,(4) not at all difficult,moderatesevere,10
2993,(1) male,63,(1) 57-64,(4) bachelors,(4) bachelors or more,(1) white/caucasian,(0) no,(1) white,(0) no,(0) no,...,(2) 2,(1) 1 (no smell),(02) detached single family house,(4) very well kept,(4) very well kept,(3) average,(07) seventh case,(4) not at all difficult,moderatesevere,11


In [18]:
# Export for regression
df_appended.to_csv('3labelv4Regression.csv', index = False)

In [19]:
# Export for classification
df_appended_classification = df_appended.drop('total_sum', axis = 1)
df_appended_classification.to_csv('3labelv4Classification.csv', index = False)