import sqlite3 import pandas as pd from datetime import datetime, timedelta import random import os # Ensure the data directory exists DATA_DIR = 'data' os.makedirs(DATA_DIR, exist_ok=True) def create_clinical_trials_db(): """Creates the clinical_trials.db database.""" conn = sqlite3.connect(os.path.join(DATA_DIR, 'clinical_trials.db')) # Studies table studies_data = { 'study_id': ['ONCO-2023-001', 'CARDIO-2023-047', 'NEURO-2024-012', 'DIAB-2023-089', 'RARE-2024-003'], 'study_name': ['Phase III Immunotherapy Trial', 'Beta Blocker Efficacy Study', 'Alzheimer Prevention Trial', 'Insulin Resistance Study', 'Rare Disease Natural History'], 'phase': ['Phase 3', 'Phase 2', 'Phase 2', 'Phase 3', 'Observational'], 'status': ['RECRUITING', 'ACTIVE', 'PLANNING', 'COMPLETED', 'RECRUITING'], 'sponsor': ['OncoPharm Inc', 'CardioHealth', 'NeuroGen', 'DiabetesCare', 'NIH'], 'target_enrollment': [500, 200, 150, 800, 50], 'current_enrollment': [237, 178, 0, 800, 12], 'start_date': ['2023-03-15', '2023-06-01', '2024-01-15', '2023-01-10', '2024-02-01'] } pd.DataFrame(studies_data).to_sql('studies', conn, index=False, if_exists='replace') # Patients table patients_data = { 'patient_id': [f'PT{str(i).zfill(6)}' for i in range(1, 101)], 'study_id': [random.choice(studies_data['study_id']) for _ in range(100)], 'enrollment_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(100)], 'age': [random.randint(18, 85) for _ in range(100)], 'gender': [random.choice(['M', 'F']) for _ in range(100)], 'status': [random.choice(['ENROLLED', 'COMPLETED', 'WITHDRAWN', 'SCREENING']) for _ in range(100)] } pd.DataFrame(patients_data).to_sql('patients', conn, index=False, if_exists='replace') # Adverse Events table adverse_events_data = { 'event_id': list(range(1, 51)), 'patient_id': [random.choice(patients_data['patient_id'][:50]) for _ in range(50)], 'event_date': [(datetime.now() - timedelta(days=random.randint(1, 180))).strftime('%Y-%m-%d') for _ in range(50)], 'event_type': [random.choice(['NAUSEA', 'HEADACHE', 'FATIGUE', 'RASH', 'FEVER']) for _ in range(50)], 'severity': [random.choice(['MILD', 'MODERATE', 'SEVERE']) for _ in range(50)], 'related_to_treatment': [random.choice(['YES', 'NO', 'UNKNOWN']) for _ in range(50)] } pd.DataFrame(adverse_events_data).to_sql('adverse_events', conn, index=False, if_exists='replace') # Add foreign keys (SQLite doesn't enforce, but documents relationships) conn.execute("CREATE INDEX idx_patients_study ON patients(study_id)") conn.execute("CREATE INDEX idx_events_patient ON adverse_events(patient_id)") conn.commit() conn.close() print("āœ… Clinical Trials database created successfully!") def create_laboratory_db(): """Creates the laboratory.db database.""" conn = sqlite3.connect(os.path.join(DATA_DIR, 'laboratory.db')) # Lab Tests table lab_tests_data = { 'test_id': [f'LAB{str(i).zfill(8)}' for i in range(1, 201)], 'patient_id': [f'PT{str(random.randint(1, 100)).zfill(6)}' for _ in range(200)], 'test_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(200)], 'test_type': [random.choice(['CBC', 'METABOLIC_PANEL', 'LIVER_FUNCTION', 'LIPID_PANEL', 'HBA1C']) for _ in range(200)], 'ordered_by': [f'DR{str(random.randint(1, 20)).zfill(4)}' for _ in range(200)], 'priority': [random.choice(['ROUTINE', 'URGENT', 'STAT']) for _ in range(200)] } pd.DataFrame(lab_tests_data).to_sql('lab_tests', conn, index=False, if_exists='replace') # Test Results table results_data = { 'result_id': list(range(1, 601)), 'test_id': [random.choice(lab_tests_data['test_id']) for _ in range(600)], 'analyte': [random.choice(['GLUCOSE', 'WBC', 'RBC', 'PLATELETS', 'CREATININE', 'ALT', 'AST', 'CHOLESTEROL']) for _ in range(600)], 'value': [round(random.uniform(1, 200), 2) for _ in range(600)], 'unit': [random.choice(['mg/dL', 'K/uL', 'M/uL', 'g/dL', 'mmol/L']) for _ in range(600)], 'reference_low': [round(random.uniform(1, 50), 2) for _ in range(600)], 'reference_high': [round(random.uniform(100, 200), 2) for _ in range(600)], 'flag': [random.choice(['NORMAL', 'HIGH', 'LOW', 'CRITICAL']) for _ in range(600)] } pd.DataFrame(results_data).to_sql('test_results', conn, index=False, if_exists='replace') # Biomarkers table (for research) biomarkers_data = { 'biomarker_id': list(range(1, 31)), 'patient_id': [f'PT{str(random.randint(1, 100)).zfill(6)}' for _ in range(30)], 'biomarker_name': [random.choice(['PD-L1', 'BRCA1', 'EGFR', 'KRAS', 'HER2']) for _ in range(30)], 'expression_level': [random.choice(['HIGH', 'MEDIUM', 'LOW', 'NEGATIVE']) for _ in range(30)], 'test_method': [random.choice(['IHC', 'PCR', 'NGS', 'FLOW_CYTOMETRY']) for _ in range(30)], 'collection_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(30)] } pd.DataFrame(biomarkers_data).to_sql('biomarkers', conn, index=False, if_exists='replace') conn.execute("CREATE INDEX idx_results_test ON test_results(test_id)") conn.execute("CREATE INDEX idx_tests_patient ON lab_tests(patient_id)") conn.commit() conn.close() print("āœ… Laboratory database created successfully!") def create_drug_discovery_db(): """Creates the drug_discovery.db database.""" conn = sqlite3.connect(os.path.join(DATA_DIR, 'drug_discovery.db')) # Compounds table compounds_data = { 'compound_id': [f'CMP-{str(i).zfill(6)}' for i in range(1, 51)], 'compound_name': [f'Compound-{chr(65+i//10)}{i%10}' for i in range(50)], 'molecular_weight': [round(random.uniform(200, 800), 2) for _ in range(50)], 'formula': [f'C{random.randint(10,30)}H{random.randint(10,40)}N{random.randint(0,5)}O{random.randint(1,10)}' for _ in range(50)], 'development_stage': [random.choice(['DISCOVERY', 'LEAD_OPT', 'PRECLINICAL', 'CLINICAL', 'DISCONTINUED']) for _ in range(50)], 'target_class': [random.choice(['KINASE', 'GPCR', 'ION_CHANNEL', 'PROTEASE', 'ANTIBODY']) for _ in range(50)] } pd.DataFrame(compounds_data).to_sql('compounds', conn, index=False, if_exists='replace') # Assay Results table assays_data = { 'assay_id': list(range(1, 201)), 'compound_id': [random.choice(compounds_data['compound_id']) for _ in range(200)], 'assay_type': [random.choice(['BINDING', 'CELL_VIABILITY', 'ENZYME_INHIBITION', 'TOXICITY']) for _ in range(200)], 'ic50_nm': [round(random.uniform(0.1, 10000), 2) for _ in range(200)], 'efficacy_percent': [round(random.uniform(0, 100), 1) for _ in range(200)], 'assay_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(200)], 'scientist': [f'SCI-{random.randint(1, 10)}' for _ in range(200)] } pd.DataFrame(assays_data).to_sql('assay_results', conn, index=False, if_exists='replace') # Drug Targets table targets_data = { 'target_id': [f'TGT-{str(i).zfill(4)}' for i in range(1, 21)], 'target_name': [f'Protein-{i}' for i in range(1, 21)], 'gene_symbol': [f'GENE{i}' for i in range(1, 21)], 'pathway': [random.choice(['MAPK', 'PI3K/AKT', 'WNT', 'NOTCH', 'HEDGEHOG']) for _ in range(20)], 'disease_area': [random.choice(['ONCOLOGY', 'CARDIOLOGY', 'NEUROLOGY', 'IMMUNOLOGY']) for _ in range(20)] } pd.DataFrame(targets_data).to_sql('drug_targets', conn, index=False, if_exists='replace') # Compound-Target Associations associations_data = { 'association_id': list(range(1, 76)), 'compound_id': [random.choice(compounds_data['compound_id']) for _ in range(75)], 'target_id': [random.choice(targets_data['target_id']) for _ in range(75)], 'affinity_nm': [round(random.uniform(0.1, 1000), 2) for _ in range(75)], 'selectivity_fold': [round(random.uniform(1, 100), 1) for _ in range(75)] } pd.DataFrame(associations_data).to_sql('compound_targets', conn, index=False, if_exists='replace') conn.execute("CREATE INDEX idx_assays_compound ON assay_results(compound_id)") conn.execute("CREATE INDEX idx_associations_compound ON compound_targets(compound_id)") conn.execute("CREATE INDEX idx_associations_target ON compound_targets(target_id)") conn.commit() conn.close() print("āœ… Drug Discovery database created successfully!") if __name__ == "__main__": create_clinical_trials_db() create_laboratory_db() create_drug_discovery_db() print("\nšŸŽ‰ All three Life Sciences databases created successfully in the 'data' directory!")