File size: 9,004 Bytes
86cbe3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import sqlite3
import pandas as pd
from datetime import datetime, timedelta
import random
import os

# Ensure the data directory exists
DATA_DIR = 'data'
os.makedirs(DATA_DIR, exist_ok=True)

def create_clinical_trials_db():
    """Creates the clinical_trials.db database."""
    conn = sqlite3.connect(os.path.join(DATA_DIR, 'clinical_trials.db'))

    # Studies table
    studies_data = {
        'study_id': ['ONCO-2023-001', 'CARDIO-2023-047', 'NEURO-2024-012', 'DIAB-2023-089', 'RARE-2024-003'],
        'study_name': ['Phase III Immunotherapy Trial', 'Beta Blocker Efficacy Study', 'Alzheimer Prevention Trial', 'Insulin Resistance Study', 'Rare Disease Natural History'],
        'phase': ['Phase 3', 'Phase 2', 'Phase 2', 'Phase 3', 'Observational'],
        'status': ['RECRUITING', 'ACTIVE', 'PLANNING', 'COMPLETED', 'RECRUITING'],
        'sponsor': ['OncoPharm Inc', 'CardioHealth', 'NeuroGen', 'DiabetesCare', 'NIH'],
        'target_enrollment': [500, 200, 150, 800, 50],
        'current_enrollment': [237, 178, 0, 800, 12],
        'start_date': ['2023-03-15', '2023-06-01', '2024-01-15', '2023-01-10', '2024-02-01']
    }
    pd.DataFrame(studies_data).to_sql('studies', conn, index=False, if_exists='replace')

    # Patients table
    patients_data = {
        'patient_id': [f'PT{str(i).zfill(6)}' for i in range(1, 101)],
        'study_id': [random.choice(studies_data['study_id']) for _ in range(100)],
        'enrollment_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(100)],
        'age': [random.randint(18, 85) for _ in range(100)],
        'gender': [random.choice(['M', 'F']) for _ in range(100)],
        'status': [random.choice(['ENROLLED', 'COMPLETED', 'WITHDRAWN', 'SCREENING']) for _ in range(100)]
    }
    pd.DataFrame(patients_data).to_sql('patients', conn, index=False, if_exists='replace')

    # Adverse Events table
    adverse_events_data = {
        'event_id': list(range(1, 51)),
        'patient_id': [random.choice(patients_data['patient_id'][:50]) for _ in range(50)],
        'event_date': [(datetime.now() - timedelta(days=random.randint(1, 180))).strftime('%Y-%m-%d') for _ in range(50)],
        'event_type': [random.choice(['NAUSEA', 'HEADACHE', 'FATIGUE', 'RASH', 'FEVER']) for _ in range(50)],
        'severity': [random.choice(['MILD', 'MODERATE', 'SEVERE']) for _ in range(50)],
        'related_to_treatment': [random.choice(['YES', 'NO', 'UNKNOWN']) for _ in range(50)]
    }
    pd.DataFrame(adverse_events_data).to_sql('adverse_events', conn, index=False, if_exists='replace')

    # Add foreign keys (SQLite doesn't enforce, but documents relationships)
    conn.execute("CREATE INDEX idx_patients_study ON patients(study_id)")
    conn.execute("CREATE INDEX idx_events_patient ON adverse_events(patient_id)")
    conn.commit()
    conn.close()
    print("✅ Clinical Trials database created successfully!")

def create_laboratory_db():
    """Creates the laboratory.db database."""
    conn = sqlite3.connect(os.path.join(DATA_DIR, 'laboratory.db'))

    # Lab Tests table
    lab_tests_data = {
        'test_id': [f'LAB{str(i).zfill(8)}' for i in range(1, 201)],
        'patient_id': [f'PT{str(random.randint(1, 100)).zfill(6)}' for _ in range(200)],
        'test_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(200)],
        'test_type': [random.choice(['CBC', 'METABOLIC_PANEL', 'LIVER_FUNCTION', 'LIPID_PANEL', 'HBA1C']) for _ in range(200)],
        'ordered_by': [f'DR{str(random.randint(1, 20)).zfill(4)}' for _ in range(200)],
        'priority': [random.choice(['ROUTINE', 'URGENT', 'STAT']) for _ in range(200)]
    }
    pd.DataFrame(lab_tests_data).to_sql('lab_tests', conn, index=False, if_exists='replace')

    # Test Results table
    results_data = {
        'result_id': list(range(1, 601)),
        'test_id': [random.choice(lab_tests_data['test_id']) for _ in range(600)],
        'analyte': [random.choice(['GLUCOSE', 'WBC', 'RBC', 'PLATELETS', 'CREATININE', 'ALT', 'AST', 'CHOLESTEROL']) for _ in range(600)],
        'value': [round(random.uniform(1, 200), 2) for _ in range(600)],
        'unit': [random.choice(['mg/dL', 'K/uL', 'M/uL', 'g/dL', 'mmol/L']) for _ in range(600)],
        'reference_low': [round(random.uniform(1, 50), 2) for _ in range(600)],
        'reference_high': [round(random.uniform(100, 200), 2) for _ in range(600)],
        'flag': [random.choice(['NORMAL', 'HIGH', 'LOW', 'CRITICAL']) for _ in range(600)]
    }
    pd.DataFrame(results_data).to_sql('test_results', conn, index=False, if_exists='replace')

    # Biomarkers table (for research)
    biomarkers_data = {
        'biomarker_id': list(range(1, 31)),
        'patient_id': [f'PT{str(random.randint(1, 100)).zfill(6)}' for _ in range(30)],
        'biomarker_name': [random.choice(['PD-L1', 'BRCA1', 'EGFR', 'KRAS', 'HER2']) for _ in range(30)],
        'expression_level': [random.choice(['HIGH', 'MEDIUM', 'LOW', 'NEGATIVE']) for _ in range(30)],
        'test_method': [random.choice(['IHC', 'PCR', 'NGS', 'FLOW_CYTOMETRY']) for _ in range(30)],
        'collection_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(30)]
    }
    pd.DataFrame(biomarkers_data).to_sql('biomarkers', conn, index=False, if_exists='replace')

    conn.execute("CREATE INDEX idx_results_test ON test_results(test_id)")
    conn.execute("CREATE INDEX idx_tests_patient ON lab_tests(patient_id)")
    conn.commit()
    conn.close()
    print("✅ Laboratory database created successfully!")

def create_drug_discovery_db():
    """Creates the drug_discovery.db database."""
    conn = sqlite3.connect(os.path.join(DATA_DIR, 'drug_discovery.db'))

    # Compounds table
    compounds_data = {
        'compound_id': [f'CMP-{str(i).zfill(6)}' for i in range(1, 51)],
        'compound_name': [f'Compound-{chr(65+i//10)}{i%10}' for i in range(50)],
        'molecular_weight': [round(random.uniform(200, 800), 2) for _ in range(50)],
        'formula': [f'C{random.randint(10,30)}H{random.randint(10,40)}N{random.randint(0,5)}O{random.randint(1,10)}' for _ in range(50)],
        'development_stage': [random.choice(['DISCOVERY', 'LEAD_OPT', 'PRECLINICAL', 'CLINICAL', 'DISCONTINUED']) for _ in range(50)],
        'target_class': [random.choice(['KINASE', 'GPCR', 'ION_CHANNEL', 'PROTEASE', 'ANTIBODY']) for _ in range(50)]
    }
    pd.DataFrame(compounds_data).to_sql('compounds', conn, index=False, if_exists='replace')

    # Assay Results table
    assays_data = {
        'assay_id': list(range(1, 201)),
        'compound_id': [random.choice(compounds_data['compound_id']) for _ in range(200)],
        'assay_type': [random.choice(['BINDING', 'CELL_VIABILITY', 'ENZYME_INHIBITION', 'TOXICITY']) for _ in range(200)],
        'ic50_nm': [round(random.uniform(0.1, 10000), 2) for _ in range(200)],
        'efficacy_percent': [round(random.uniform(0, 100), 1) for _ in range(200)],
        'assay_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(200)],
        'scientist': [f'SCI-{random.randint(1, 10)}' for _ in range(200)]
    }
    pd.DataFrame(assays_data).to_sql('assay_results', conn, index=False, if_exists='replace')

    # Drug Targets table
    targets_data = {
        'target_id': [f'TGT-{str(i).zfill(4)}' for i in range(1, 21)],
        'target_name': [f'Protein-{i}' for i in range(1, 21)],
        'gene_symbol': [f'GENE{i}' for i in range(1, 21)],
        'pathway': [random.choice(['MAPK', 'PI3K/AKT', 'WNT', 'NOTCH', 'HEDGEHOG']) for _ in range(20)],
        'disease_area': [random.choice(['ONCOLOGY', 'CARDIOLOGY', 'NEUROLOGY', 'IMMUNOLOGY']) for _ in range(20)]
    }
    pd.DataFrame(targets_data).to_sql('drug_targets', conn, index=False, if_exists='replace')

    # Compound-Target Associations
    associations_data = {
        'association_id': list(range(1, 76)),
        'compound_id': [random.choice(compounds_data['compound_id']) for _ in range(75)],
        'target_id': [random.choice(targets_data['target_id']) for _ in range(75)],
        'affinity_nm': [round(random.uniform(0.1, 1000), 2) for _ in range(75)],
        'selectivity_fold': [round(random.uniform(1, 100), 1) for _ in range(75)]
    }
    pd.DataFrame(associations_data).to_sql('compound_targets', conn, index=False, if_exists='replace')

    conn.execute("CREATE INDEX idx_assays_compound ON assay_results(compound_id)")
    conn.execute("CREATE INDEX idx_associations_compound ON compound_targets(compound_id)")
    conn.execute("CREATE INDEX idx_associations_target ON compound_targets(target_id)")
    conn.commit()
    conn.close()
    print("✅ Drug Discovery database created successfully!")

if __name__ == "__main__":
    create_clinical_trials_db()
    create_laboratory_db()
    create_drug_discovery_db()
    print("\n🎉 All three Life Sciences databases created successfully in the 'data' directory!")