File size: 9,004 Bytes

86cbe3c

import sqlite3
import pandas as pd
from datetime import datetime, timedelta
import random
import os

# Ensure the data directory exists
DATA_DIR = 'data'
os.makedirs(DATA_DIR, exist_ok=True)

def create_clinical_trials_db():
    """Creates the clinical_trials.db database."""
    conn = sqlite3.connect(os.path.join(DATA_DIR, 'clinical_trials.db'))

    # Studies table
    studies_data = {
        'study_id': ['ONCO-2023-001', 'CARDIO-2023-047', 'NEURO-2024-012', 'DIAB-2023-089', 'RARE-2024-003'],
        'study_name': ['Phase III Immunotherapy Trial', 'Beta Blocker Efficacy Study', 'Alzheimer Prevention Trial', 'Insulin Resistance Study', 'Rare Disease Natural History'],
        'phase': ['Phase 3', 'Phase 2', 'Phase 2', 'Phase 3', 'Observational'],
        'status': ['RECRUITING', 'ACTIVE', 'PLANNING', 'COMPLETED', 'RECRUITING'],
        'sponsor': ['OncoPharm Inc', 'CardioHealth', 'NeuroGen', 'DiabetesCare', 'NIH'],
        'target_enrollment': [500, 200, 150, 800, 50],
        'current_enrollment': [237, 178, 0, 800, 12],
        'start_date': ['2023-03-15', '2023-06-01', '2024-01-15', '2023-01-10', '2024-02-01']
    }
    pd.DataFrame(studies_data).to_sql('studies', conn, index=False, if_exists='replace')

    # Patients table
    patients_data = {
        'patient_id': [f'PT{str(i).zfill(6)}' for i in range(1, 101)],
        'study_id': [random.choice(studies_data['study_id']) for _ in range(100)],
        'enrollment_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(100)],
        'age': [random.randint(18, 85) for _ in range(100)],
        'gender': [random.choice(['M', 'F']) for _ in range(100)],
        'status': [random.choice(['ENROLLED', 'COMPLETED', 'WITHDRAWN', 'SCREENING']) for _ in range(100)]
    }
    pd.DataFrame(patients_data).to_sql('patients', conn, index=False, if_exists='replace')

    # Adverse Events table
    adverse_events_data = {
        'event_id': list(range(1, 51)),
        'patient_id': [random.choice(patients_data['patient_id'][:50]) for _ in range(50)],
        'event_date': [(datetime.now() - timedelta(days=random.randint(1, 180))).strftime('%Y-%m-%d') for _ in range(50)],
        'event_type': [random.choice(['NAUSEA', 'HEADACHE', 'FATIGUE', 'RASH', 'FEVER']) for _ in range(50)],
        'severity': [random.choice(['MILD', 'MODERATE', 'SEVERE']) for _ in range(50)],
        'related_to_treatment': [random.choice(['YES', 'NO', 'UNKNOWN']) for _ in range(50)]
    }
    pd.DataFrame(adverse_events_data).to_sql('adverse_events', conn, index=False, if_exists='replace')

    # Add foreign keys (SQLite doesn't enforce, but documents relationships)
    conn.execute("CREATE INDEX idx_patients_study ON patients(study_id)")
    conn.execute("CREATE INDEX idx_events_patient ON adverse_events(patient_id)")
    conn.commit()
    conn.close()
    print("✅ Clinical Trials database created successfully!")

def create_laboratory_db():
    """Creates the laboratory.db database."""
    conn = sqlite3.connect(os.path.join(DATA_DIR, 'laboratory.db'))

    # Lab Tests table
    lab_tests_data = {
        'test_id': [f'LAB{str(i).zfill(8)}' for i in range(1, 201)],
        'patient_id': [f'PT{str(random.randint(1, 100)).zfill(6)}' for _ in range(200)],
        'test_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(200)],
        'test_type': [random.choice(['CBC', 'METABOLIC_PANEL', 'LIVER_FUNCTION', 'LIPID_PANEL', 'HBA1C']) for _ in range(200)],
        'ordered_by': [f'DR{str(random.randint(1, 20)).zfill(4)}' for _ in range(200)],
        'priority': [random.choice(['ROUTINE', 'URGENT', 'STAT']) for _ in range(200)]
    }
    pd.DataFrame(lab_tests_data).to_sql('lab_tests', conn, index=False, if_exists='replace')

    # Test Results table
    results_data = {
        'result_id': list(range(1, 601)),
        'test_id': [random.choice(lab_tests_data['test_id']) for _ in range(600)],
        'analyte': [random.choice(['GLUCOSE', 'WBC', 'RBC', 'PLATELETS', 'CREATININE', 'ALT', 'AST', 'CHOLESTEROL']) for _ in range(600)],
        'value': [round(random.uniform(1, 200), 2) for _ in range(600)],
        'unit': [random.choice(['mg/dL', 'K/uL', 'M/uL', 'g/dL', 'mmol/L']) for _ in range(600)],
        'reference_low': [round(random.uniform(1, 50), 2) for _ in range(600)],
        'reference_high': [round(random.uniform(100, 200), 2) for _ in range(600)],
        'flag': [random.choice(['NORMAL', 'HIGH', 'LOW', 'CRITICAL']) for _ in range(600)]
    }
    pd.DataFrame(results_data).to_sql('test_results', conn, index=False, if_exists='replace')

    # Biomarkers table (for research)
    biomarkers_data = {
        'biomarker_id': list(range(1, 31)),
        'patient_id': [f'PT{str(random.randint(1, 100)).zfill(6)}' for _ in range(30)],
        'biomarker_name': [random.choice(['PD-L1', 'BRCA1', 'EGFR', 'KRAS', 'HER2']) for _ in range(30)],
        'expression_level': [random.choice(['HIGH', 'MEDIUM', 'LOW', 'NEGATIVE']) for _ in range(30)],
        'test_method': [random.choice(['IHC', 'PCR', 'NGS', 'FLOW_CYTOMETRY']) for _ in range(30)],
        'collection_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(30)]
    }
    pd.DataFrame(biomarkers_data).to_sql('biomarkers', conn, index=False, if_exists='replace')

    conn.execute("CREATE INDEX idx_results_test ON test_results(test_id)")
    conn.execute("CREATE INDEX idx_tests_patient ON lab_tests(patient_id)")
    conn.commit()
    conn.close()
    print("✅ Laboratory database created successfully!")

def create_drug_discovery_db():
    """Creates the drug_discovery.db database."""
    conn = sqlite3.connect(os.path.join(DATA_DIR, 'drug_discovery.db'))

    # Compounds table
    compounds_data = {
        'compound_id': [f'CMP-{str(i).zfill(6)}' for i in range(1, 51)],
        'compound_name': [f'Compound-{chr(65+i//10)}{i%10}' for i in range(50)],
        'molecular_weight': [round(random.uniform(200, 800), 2) for _ in range(50)],
        'formula': [f'C{random.randint(10,30)}H{random.randint(10,40)}N{random.randint(0,5)}O{random.randint(1,10)}' for _ in range(50)],
        'development_stage': [random.choice(['DISCOVERY', 'LEAD_OPT', 'PRECLINICAL', 'CLINICAL', 'DISCONTINUED']) for _ in range(50)],
        'target_class': [random.choice(['KINASE', 'GPCR', 'ION_CHANNEL', 'PROTEASE', 'ANTIBODY']) for _ in range(50)]
    }
    pd.DataFrame(compounds_data).to_sql('compounds', conn, index=False, if_exists='replace')

    # Assay Results table
    assays_data = {
        'assay_id': list(range(1, 201)),
        'compound_id': [random.choice(compounds_data['compound_id']) for _ in range(200)],
        'assay_type': [random.choice(['BINDING', 'CELL_VIABILITY', 'ENZYME_INHIBITION', 'TOXICITY']) for _ in range(200)],
        'ic50_nm': [round(random.uniform(0.1, 10000), 2) for _ in range(200)],
        'efficacy_percent': [round(random.uniform(0, 100), 1) for _ in range(200)],
        'assay_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(200)],
        'scientist': [f'SCI-{random.randint(1, 10)}' for _ in range(200)]
    }
    pd.DataFrame(assays_data).to_sql('assay_results', conn, index=False, if_exists='replace')

    # Drug Targets table
    targets_data = {
        'target_id': [f'TGT-{str(i).zfill(4)}' for i in range(1, 21)],
        'target_name': [f'Protein-{i}' for i in range(1, 21)],
        'gene_symbol': [f'GENE{i}' for i in range(1, 21)],
        'pathway': [random.choice(['MAPK', 'PI3K/AKT', 'WNT', 'NOTCH', 'HEDGEHOG']) for _ in range(20)],
        'disease_area': [random.choice(['ONCOLOGY', 'CARDIOLOGY', 'NEUROLOGY', 'IMMUNOLOGY']) for _ in range(20)]
    }
    pd.DataFrame(targets_data).to_sql('drug_targets', conn, index=False, if_exists='replace')

    # Compound-Target Associations
    associations_data = {
        'association_id': list(range(1, 76)),
        'compound_id': [random.choice(compounds_data['compound_id']) for _ in range(75)],
        'target_id': [random.choice(targets_data['target_id']) for _ in range(75)],
        'affinity_nm': [round(random.uniform(0.1, 1000), 2) for _ in range(75)],
        'selectivity_fold': [round(random.uniform(1, 100), 1) for _ in range(75)]
    }
    pd.DataFrame(associations_data).to_sql('compound_targets', conn, index=False, if_exists='replace')

    conn.execute("CREATE INDEX idx_assays_compound ON assay_results(compound_id)")
    conn.execute("CREATE INDEX idx_associations_compound ON compound_targets(compound_id)")
    conn.execute("CREATE INDEX idx_associations_target ON compound_targets(target_id)")
    conn.commit()
    conn.close()
    print("✅ Drug Discovery database created successfully!")

if __name__ == "__main__":
    create_clinical_trials_db()
    create_laboratory_db()
    create_drug_discovery_db()
    print("\n🎉 All three Life Sciences databases created successfully in the 'data' directory!")