Spaces:
No application file
No application file
| import sqlite3 | |
| import pandas as pd | |
| from datetime import datetime, timedelta | |
| import random | |
| import os | |
| # Ensure the data directory exists | |
| DATA_DIR = 'data' | |
| os.makedirs(DATA_DIR, exist_ok=True) | |
| def create_clinical_trials_db(): | |
| """Creates the clinical_trials.db database.""" | |
| conn = sqlite3.connect(os.path.join(DATA_DIR, 'clinical_trials.db')) | |
| # Studies table | |
| studies_data = { | |
| 'study_id': ['ONCO-2023-001', 'CARDIO-2023-047', 'NEURO-2024-012', 'DIAB-2023-089', 'RARE-2024-003'], | |
| 'study_name': ['Phase III Immunotherapy Trial', 'Beta Blocker Efficacy Study', 'Alzheimer Prevention Trial', 'Insulin Resistance Study', 'Rare Disease Natural History'], | |
| 'phase': ['Phase 3', 'Phase 2', 'Phase 2', 'Phase 3', 'Observational'], | |
| 'status': ['RECRUITING', 'ACTIVE', 'PLANNING', 'COMPLETED', 'RECRUITING'], | |
| 'sponsor': ['OncoPharm Inc', 'CardioHealth', 'NeuroGen', 'DiabetesCare', 'NIH'], | |
| 'target_enrollment': [500, 200, 150, 800, 50], | |
| 'current_enrollment': [237, 178, 0, 800, 12], | |
| 'start_date': ['2023-03-15', '2023-06-01', '2024-01-15', '2023-01-10', '2024-02-01'] | |
| } | |
| pd.DataFrame(studies_data).to_sql('studies', conn, index=False, if_exists='replace') | |
| # Patients table | |
| patients_data = { | |
| 'patient_id': [f'PT{str(i).zfill(6)}' for i in range(1, 101)], | |
| 'study_id': [random.choice(studies_data['study_id']) for _ in range(100)], | |
| 'enrollment_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(100)], | |
| 'age': [random.randint(18, 85) for _ in range(100)], | |
| 'gender': [random.choice(['M', 'F']) for _ in range(100)], | |
| 'status': [random.choice(['ENROLLED', 'COMPLETED', 'WITHDRAWN', 'SCREENING']) for _ in range(100)] | |
| } | |
| pd.DataFrame(patients_data).to_sql('patients', conn, index=False, if_exists='replace') | |
| # Adverse Events table | |
| adverse_events_data = { | |
| 'event_id': list(range(1, 51)), | |
| 'patient_id': [random.choice(patients_data['patient_id'][:50]) for _ in range(50)], | |
| 'event_date': [(datetime.now() - timedelta(days=random.randint(1, 180))).strftime('%Y-%m-%d') for _ in range(50)], | |
| 'event_type': [random.choice(['NAUSEA', 'HEADACHE', 'FATIGUE', 'RASH', 'FEVER']) for _ in range(50)], | |
| 'severity': [random.choice(['MILD', 'MODERATE', 'SEVERE']) for _ in range(50)], | |
| 'related_to_treatment': [random.choice(['YES', 'NO', 'UNKNOWN']) for _ in range(50)] | |
| } | |
| pd.DataFrame(adverse_events_data).to_sql('adverse_events', conn, index=False, if_exists='replace') | |
| # Add foreign keys (SQLite doesn't enforce, but documents relationships) | |
| conn.execute("CREATE INDEX idx_patients_study ON patients(study_id)") | |
| conn.execute("CREATE INDEX idx_events_patient ON adverse_events(patient_id)") | |
| conn.commit() | |
| conn.close() | |
| print("✅ Clinical Trials database created successfully!") | |
| def create_laboratory_db(): | |
| """Creates the laboratory.db database.""" | |
| conn = sqlite3.connect(os.path.join(DATA_DIR, 'laboratory.db')) | |
| # Lab Tests table | |
| lab_tests_data = { | |
| 'test_id': [f'LAB{str(i).zfill(8)}' for i in range(1, 201)], | |
| 'patient_id': [f'PT{str(random.randint(1, 100)).zfill(6)}' for _ in range(200)], | |
| 'test_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(200)], | |
| 'test_type': [random.choice(['CBC', 'METABOLIC_PANEL', 'LIVER_FUNCTION', 'LIPID_PANEL', 'HBA1C']) for _ in range(200)], | |
| 'ordered_by': [f'DR{str(random.randint(1, 20)).zfill(4)}' for _ in range(200)], | |
| 'priority': [random.choice(['ROUTINE', 'URGENT', 'STAT']) for _ in range(200)] | |
| } | |
| pd.DataFrame(lab_tests_data).to_sql('lab_tests', conn, index=False, if_exists='replace') | |
| # Test Results table | |
| results_data = { | |
| 'result_id': list(range(1, 601)), | |
| 'test_id': [random.choice(lab_tests_data['test_id']) for _ in range(600)], | |
| 'analyte': [random.choice(['GLUCOSE', 'WBC', 'RBC', 'PLATELETS', 'CREATININE', 'ALT', 'AST', 'CHOLESTEROL']) for _ in range(600)], | |
| 'value': [round(random.uniform(1, 200), 2) for _ in range(600)], | |
| 'unit': [random.choice(['mg/dL', 'K/uL', 'M/uL', 'g/dL', 'mmol/L']) for _ in range(600)], | |
| 'reference_low': [round(random.uniform(1, 50), 2) for _ in range(600)], | |
| 'reference_high': [round(random.uniform(100, 200), 2) for _ in range(600)], | |
| 'flag': [random.choice(['NORMAL', 'HIGH', 'LOW', 'CRITICAL']) for _ in range(600)] | |
| } | |
| pd.DataFrame(results_data).to_sql('test_results', conn, index=False, if_exists='replace') | |
| # Biomarkers table (for research) | |
| biomarkers_data = { | |
| 'biomarker_id': list(range(1, 31)), | |
| 'patient_id': [f'PT{str(random.randint(1, 100)).zfill(6)}' for _ in range(30)], | |
| 'biomarker_name': [random.choice(['PD-L1', 'BRCA1', 'EGFR', 'KRAS', 'HER2']) for _ in range(30)], | |
| 'expression_level': [random.choice(['HIGH', 'MEDIUM', 'LOW', 'NEGATIVE']) for _ in range(30)], | |
| 'test_method': [random.choice(['IHC', 'PCR', 'NGS', 'FLOW_CYTOMETRY']) for _ in range(30)], | |
| 'collection_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(30)] | |
| } | |
| pd.DataFrame(biomarkers_data).to_sql('biomarkers', conn, index=False, if_exists='replace') | |
| conn.execute("CREATE INDEX idx_results_test ON test_results(test_id)") | |
| conn.execute("CREATE INDEX idx_tests_patient ON lab_tests(patient_id)") | |
| conn.commit() | |
| conn.close() | |
| print("✅ Laboratory database created successfully!") | |
| def create_drug_discovery_db(): | |
| """Creates the drug_discovery.db database.""" | |
| conn = sqlite3.connect(os.path.join(DATA_DIR, 'drug_discovery.db')) | |
| # Compounds table | |
| compounds_data = { | |
| 'compound_id': [f'CMP-{str(i).zfill(6)}' for i in range(1, 51)], | |
| 'compound_name': [f'Compound-{chr(65+i//10)}{i%10}' for i in range(50)], | |
| 'molecular_weight': [round(random.uniform(200, 800), 2) for _ in range(50)], | |
| 'formula': [f'C{random.randint(10,30)}H{random.randint(10,40)}N{random.randint(0,5)}O{random.randint(1,10)}' for _ in range(50)], | |
| 'development_stage': [random.choice(['DISCOVERY', 'LEAD_OPT', 'PRECLINICAL', 'CLINICAL', 'DISCONTINUED']) for _ in range(50)], | |
| 'target_class': [random.choice(['KINASE', 'GPCR', 'ION_CHANNEL', 'PROTEASE', 'ANTIBODY']) for _ in range(50)] | |
| } | |
| pd.DataFrame(compounds_data).to_sql('compounds', conn, index=False, if_exists='replace') | |
| # Assay Results table | |
| assays_data = { | |
| 'assay_id': list(range(1, 201)), | |
| 'compound_id': [random.choice(compounds_data['compound_id']) for _ in range(200)], | |
| 'assay_type': [random.choice(['BINDING', 'CELL_VIABILITY', 'ENZYME_INHIBITION', 'TOXICITY']) for _ in range(200)], | |
| 'ic50_nm': [round(random.uniform(0.1, 10000), 2) for _ in range(200)], | |
| 'efficacy_percent': [round(random.uniform(0, 100), 1) for _ in range(200)], | |
| 'assay_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(200)], | |
| 'scientist': [f'SCI-{random.randint(1, 10)}' for _ in range(200)] | |
| } | |
| pd.DataFrame(assays_data).to_sql('assay_results', conn, index=False, if_exists='replace') | |
| # Drug Targets table | |
| targets_data = { | |
| 'target_id': [f'TGT-{str(i).zfill(4)}' for i in range(1, 21)], | |
| 'target_name': [f'Protein-{i}' for i in range(1, 21)], | |
| 'gene_symbol': [f'GENE{i}' for i in range(1, 21)], | |
| 'pathway': [random.choice(['MAPK', 'PI3K/AKT', 'WNT', 'NOTCH', 'HEDGEHOG']) for _ in range(20)], | |
| 'disease_area': [random.choice(['ONCOLOGY', 'CARDIOLOGY', 'NEUROLOGY', 'IMMUNOLOGY']) for _ in range(20)] | |
| } | |
| pd.DataFrame(targets_data).to_sql('drug_targets', conn, index=False, if_exists='replace') | |
| # Compound-Target Associations | |
| associations_data = { | |
| 'association_id': list(range(1, 76)), | |
| 'compound_id': [random.choice(compounds_data['compound_id']) for _ in range(75)], | |
| 'target_id': [random.choice(targets_data['target_id']) for _ in range(75)], | |
| 'affinity_nm': [round(random.uniform(0.1, 1000), 2) for _ in range(75)], | |
| 'selectivity_fold': [round(random.uniform(1, 100), 1) for _ in range(75)] | |
| } | |
| pd.DataFrame(associations_data).to_sql('compound_targets', conn, index=False, if_exists='replace') | |
| conn.execute("CREATE INDEX idx_assays_compound ON assay_results(compound_id)") | |
| conn.execute("CREATE INDEX idx_associations_compound ON compound_targets(compound_id)") | |
| conn.execute("CREATE INDEX idx_associations_target ON compound_targets(target_id)") | |
| conn.commit() | |
| conn.close() | |
| print("✅ Drug Discovery database created successfully!") | |
| if __name__ == "__main__": | |
| create_clinical_trials_db() | |
| create_laboratory_db() | |
| create_drug_discovery_db() | |
| print("\n🎉 All three Life Sciences databases created successfully in the 'data' directory!") | |