Spaces:
No application file
No application file
File size: 9,004 Bytes
86cbe3c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 | import sqlite3
import pandas as pd
from datetime import datetime, timedelta
import random
import os
# Ensure the data directory exists
DATA_DIR = 'data'
os.makedirs(DATA_DIR, exist_ok=True)
def create_clinical_trials_db():
"""Creates the clinical_trials.db database."""
conn = sqlite3.connect(os.path.join(DATA_DIR, 'clinical_trials.db'))
# Studies table
studies_data = {
'study_id': ['ONCO-2023-001', 'CARDIO-2023-047', 'NEURO-2024-012', 'DIAB-2023-089', 'RARE-2024-003'],
'study_name': ['Phase III Immunotherapy Trial', 'Beta Blocker Efficacy Study', 'Alzheimer Prevention Trial', 'Insulin Resistance Study', 'Rare Disease Natural History'],
'phase': ['Phase 3', 'Phase 2', 'Phase 2', 'Phase 3', 'Observational'],
'status': ['RECRUITING', 'ACTIVE', 'PLANNING', 'COMPLETED', 'RECRUITING'],
'sponsor': ['OncoPharm Inc', 'CardioHealth', 'NeuroGen', 'DiabetesCare', 'NIH'],
'target_enrollment': [500, 200, 150, 800, 50],
'current_enrollment': [237, 178, 0, 800, 12],
'start_date': ['2023-03-15', '2023-06-01', '2024-01-15', '2023-01-10', '2024-02-01']
}
pd.DataFrame(studies_data).to_sql('studies', conn, index=False, if_exists='replace')
# Patients table
patients_data = {
'patient_id': [f'PT{str(i).zfill(6)}' for i in range(1, 101)],
'study_id': [random.choice(studies_data['study_id']) for _ in range(100)],
'enrollment_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(100)],
'age': [random.randint(18, 85) for _ in range(100)],
'gender': [random.choice(['M', 'F']) for _ in range(100)],
'status': [random.choice(['ENROLLED', 'COMPLETED', 'WITHDRAWN', 'SCREENING']) for _ in range(100)]
}
pd.DataFrame(patients_data).to_sql('patients', conn, index=False, if_exists='replace')
# Adverse Events table
adverse_events_data = {
'event_id': list(range(1, 51)),
'patient_id': [random.choice(patients_data['patient_id'][:50]) for _ in range(50)],
'event_date': [(datetime.now() - timedelta(days=random.randint(1, 180))).strftime('%Y-%m-%d') for _ in range(50)],
'event_type': [random.choice(['NAUSEA', 'HEADACHE', 'FATIGUE', 'RASH', 'FEVER']) for _ in range(50)],
'severity': [random.choice(['MILD', 'MODERATE', 'SEVERE']) for _ in range(50)],
'related_to_treatment': [random.choice(['YES', 'NO', 'UNKNOWN']) for _ in range(50)]
}
pd.DataFrame(adverse_events_data).to_sql('adverse_events', conn, index=False, if_exists='replace')
# Add foreign keys (SQLite doesn't enforce, but documents relationships)
conn.execute("CREATE INDEX idx_patients_study ON patients(study_id)")
conn.execute("CREATE INDEX idx_events_patient ON adverse_events(patient_id)")
conn.commit()
conn.close()
print("✅ Clinical Trials database created successfully!")
def create_laboratory_db():
"""Creates the laboratory.db database."""
conn = sqlite3.connect(os.path.join(DATA_DIR, 'laboratory.db'))
# Lab Tests table
lab_tests_data = {
'test_id': [f'LAB{str(i).zfill(8)}' for i in range(1, 201)],
'patient_id': [f'PT{str(random.randint(1, 100)).zfill(6)}' for _ in range(200)],
'test_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(200)],
'test_type': [random.choice(['CBC', 'METABOLIC_PANEL', 'LIVER_FUNCTION', 'LIPID_PANEL', 'HBA1C']) for _ in range(200)],
'ordered_by': [f'DR{str(random.randint(1, 20)).zfill(4)}' for _ in range(200)],
'priority': [random.choice(['ROUTINE', 'URGENT', 'STAT']) for _ in range(200)]
}
pd.DataFrame(lab_tests_data).to_sql('lab_tests', conn, index=False, if_exists='replace')
# Test Results table
results_data = {
'result_id': list(range(1, 601)),
'test_id': [random.choice(lab_tests_data['test_id']) for _ in range(600)],
'analyte': [random.choice(['GLUCOSE', 'WBC', 'RBC', 'PLATELETS', 'CREATININE', 'ALT', 'AST', 'CHOLESTEROL']) for _ in range(600)],
'value': [round(random.uniform(1, 200), 2) for _ in range(600)],
'unit': [random.choice(['mg/dL', 'K/uL', 'M/uL', 'g/dL', 'mmol/L']) for _ in range(600)],
'reference_low': [round(random.uniform(1, 50), 2) for _ in range(600)],
'reference_high': [round(random.uniform(100, 200), 2) for _ in range(600)],
'flag': [random.choice(['NORMAL', 'HIGH', 'LOW', 'CRITICAL']) for _ in range(600)]
}
pd.DataFrame(results_data).to_sql('test_results', conn, index=False, if_exists='replace')
# Biomarkers table (for research)
biomarkers_data = {
'biomarker_id': list(range(1, 31)),
'patient_id': [f'PT{str(random.randint(1, 100)).zfill(6)}' for _ in range(30)],
'biomarker_name': [random.choice(['PD-L1', 'BRCA1', 'EGFR', 'KRAS', 'HER2']) for _ in range(30)],
'expression_level': [random.choice(['HIGH', 'MEDIUM', 'LOW', 'NEGATIVE']) for _ in range(30)],
'test_method': [random.choice(['IHC', 'PCR', 'NGS', 'FLOW_CYTOMETRY']) for _ in range(30)],
'collection_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(30)]
}
pd.DataFrame(biomarkers_data).to_sql('biomarkers', conn, index=False, if_exists='replace')
conn.execute("CREATE INDEX idx_results_test ON test_results(test_id)")
conn.execute("CREATE INDEX idx_tests_patient ON lab_tests(patient_id)")
conn.commit()
conn.close()
print("✅ Laboratory database created successfully!")
def create_drug_discovery_db():
"""Creates the drug_discovery.db database."""
conn = sqlite3.connect(os.path.join(DATA_DIR, 'drug_discovery.db'))
# Compounds table
compounds_data = {
'compound_id': [f'CMP-{str(i).zfill(6)}' for i in range(1, 51)],
'compound_name': [f'Compound-{chr(65+i//10)}{i%10}' for i in range(50)],
'molecular_weight': [round(random.uniform(200, 800), 2) for _ in range(50)],
'formula': [f'C{random.randint(10,30)}H{random.randint(10,40)}N{random.randint(0,5)}O{random.randint(1,10)}' for _ in range(50)],
'development_stage': [random.choice(['DISCOVERY', 'LEAD_OPT', 'PRECLINICAL', 'CLINICAL', 'DISCONTINUED']) for _ in range(50)],
'target_class': [random.choice(['KINASE', 'GPCR', 'ION_CHANNEL', 'PROTEASE', 'ANTIBODY']) for _ in range(50)]
}
pd.DataFrame(compounds_data).to_sql('compounds', conn, index=False, if_exists='replace')
# Assay Results table
assays_data = {
'assay_id': list(range(1, 201)),
'compound_id': [random.choice(compounds_data['compound_id']) for _ in range(200)],
'assay_type': [random.choice(['BINDING', 'CELL_VIABILITY', 'ENZYME_INHIBITION', 'TOXICITY']) for _ in range(200)],
'ic50_nm': [round(random.uniform(0.1, 10000), 2) for _ in range(200)],
'efficacy_percent': [round(random.uniform(0, 100), 1) for _ in range(200)],
'assay_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(200)],
'scientist': [f'SCI-{random.randint(1, 10)}' for _ in range(200)]
}
pd.DataFrame(assays_data).to_sql('assay_results', conn, index=False, if_exists='replace')
# Drug Targets table
targets_data = {
'target_id': [f'TGT-{str(i).zfill(4)}' for i in range(1, 21)],
'target_name': [f'Protein-{i}' for i in range(1, 21)],
'gene_symbol': [f'GENE{i}' for i in range(1, 21)],
'pathway': [random.choice(['MAPK', 'PI3K/AKT', 'WNT', 'NOTCH', 'HEDGEHOG']) for _ in range(20)],
'disease_area': [random.choice(['ONCOLOGY', 'CARDIOLOGY', 'NEUROLOGY', 'IMMUNOLOGY']) for _ in range(20)]
}
pd.DataFrame(targets_data).to_sql('drug_targets', conn, index=False, if_exists='replace')
# Compound-Target Associations
associations_data = {
'association_id': list(range(1, 76)),
'compound_id': [random.choice(compounds_data['compound_id']) for _ in range(75)],
'target_id': [random.choice(targets_data['target_id']) for _ in range(75)],
'affinity_nm': [round(random.uniform(0.1, 1000), 2) for _ in range(75)],
'selectivity_fold': [round(random.uniform(1, 100), 1) for _ in range(75)]
}
pd.DataFrame(associations_data).to_sql('compound_targets', conn, index=False, if_exists='replace')
conn.execute("CREATE INDEX idx_assays_compound ON assay_results(compound_id)")
conn.execute("CREATE INDEX idx_associations_compound ON compound_targets(compound_id)")
conn.execute("CREATE INDEX idx_associations_target ON compound_targets(target_id)")
conn.commit()
conn.close()
print("✅ Drug Discovery database created successfully!")
if __name__ == "__main__":
create_clinical_trials_db()
create_laboratory_db()
create_drug_discovery_db()
print("\n🎉 All three Life Sciences databases created successfully in the 'data' directory!")
|