agent-mcp-sql / ops /scripts /generate_sample_databases.py
ohmygaugh's picture
1st pass at merging the code bases(does not run yet)
86cbe3c
import sqlite3
import pandas as pd
from datetime import datetime, timedelta
import random
import os
# Ensure the data directory exists
DATA_DIR = 'data'
os.makedirs(DATA_DIR, exist_ok=True)
def create_clinical_trials_db():
"""Creates the clinical_trials.db database."""
conn = sqlite3.connect(os.path.join(DATA_DIR, 'clinical_trials.db'))
# Studies table
studies_data = {
'study_id': ['ONCO-2023-001', 'CARDIO-2023-047', 'NEURO-2024-012', 'DIAB-2023-089', 'RARE-2024-003'],
'study_name': ['Phase III Immunotherapy Trial', 'Beta Blocker Efficacy Study', 'Alzheimer Prevention Trial', 'Insulin Resistance Study', 'Rare Disease Natural History'],
'phase': ['Phase 3', 'Phase 2', 'Phase 2', 'Phase 3', 'Observational'],
'status': ['RECRUITING', 'ACTIVE', 'PLANNING', 'COMPLETED', 'RECRUITING'],
'sponsor': ['OncoPharm Inc', 'CardioHealth', 'NeuroGen', 'DiabetesCare', 'NIH'],
'target_enrollment': [500, 200, 150, 800, 50],
'current_enrollment': [237, 178, 0, 800, 12],
'start_date': ['2023-03-15', '2023-06-01', '2024-01-15', '2023-01-10', '2024-02-01']
}
pd.DataFrame(studies_data).to_sql('studies', conn, index=False, if_exists='replace')
# Patients table
patients_data = {
'patient_id': [f'PT{str(i).zfill(6)}' for i in range(1, 101)],
'study_id': [random.choice(studies_data['study_id']) for _ in range(100)],
'enrollment_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(100)],
'age': [random.randint(18, 85) for _ in range(100)],
'gender': [random.choice(['M', 'F']) for _ in range(100)],
'status': [random.choice(['ENROLLED', 'COMPLETED', 'WITHDRAWN', 'SCREENING']) for _ in range(100)]
}
pd.DataFrame(patients_data).to_sql('patients', conn, index=False, if_exists='replace')
# Adverse Events table
adverse_events_data = {
'event_id': list(range(1, 51)),
'patient_id': [random.choice(patients_data['patient_id'][:50]) for _ in range(50)],
'event_date': [(datetime.now() - timedelta(days=random.randint(1, 180))).strftime('%Y-%m-%d') for _ in range(50)],
'event_type': [random.choice(['NAUSEA', 'HEADACHE', 'FATIGUE', 'RASH', 'FEVER']) for _ in range(50)],
'severity': [random.choice(['MILD', 'MODERATE', 'SEVERE']) for _ in range(50)],
'related_to_treatment': [random.choice(['YES', 'NO', 'UNKNOWN']) for _ in range(50)]
}
pd.DataFrame(adverse_events_data).to_sql('adverse_events', conn, index=False, if_exists='replace')
# Add foreign keys (SQLite doesn't enforce, but documents relationships)
conn.execute("CREATE INDEX idx_patients_study ON patients(study_id)")
conn.execute("CREATE INDEX idx_events_patient ON adverse_events(patient_id)")
conn.commit()
conn.close()
print("✅ Clinical Trials database created successfully!")
def create_laboratory_db():
"""Creates the laboratory.db database."""
conn = sqlite3.connect(os.path.join(DATA_DIR, 'laboratory.db'))
# Lab Tests table
lab_tests_data = {
'test_id': [f'LAB{str(i).zfill(8)}' for i in range(1, 201)],
'patient_id': [f'PT{str(random.randint(1, 100)).zfill(6)}' for _ in range(200)],
'test_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(200)],
'test_type': [random.choice(['CBC', 'METABOLIC_PANEL', 'LIVER_FUNCTION', 'LIPID_PANEL', 'HBA1C']) for _ in range(200)],
'ordered_by': [f'DR{str(random.randint(1, 20)).zfill(4)}' for _ in range(200)],
'priority': [random.choice(['ROUTINE', 'URGENT', 'STAT']) for _ in range(200)]
}
pd.DataFrame(lab_tests_data).to_sql('lab_tests', conn, index=False, if_exists='replace')
# Test Results table
results_data = {
'result_id': list(range(1, 601)),
'test_id': [random.choice(lab_tests_data['test_id']) for _ in range(600)],
'analyte': [random.choice(['GLUCOSE', 'WBC', 'RBC', 'PLATELETS', 'CREATININE', 'ALT', 'AST', 'CHOLESTEROL']) for _ in range(600)],
'value': [round(random.uniform(1, 200), 2) for _ in range(600)],
'unit': [random.choice(['mg/dL', 'K/uL', 'M/uL', 'g/dL', 'mmol/L']) for _ in range(600)],
'reference_low': [round(random.uniform(1, 50), 2) for _ in range(600)],
'reference_high': [round(random.uniform(100, 200), 2) for _ in range(600)],
'flag': [random.choice(['NORMAL', 'HIGH', 'LOW', 'CRITICAL']) for _ in range(600)]
}
pd.DataFrame(results_data).to_sql('test_results', conn, index=False, if_exists='replace')
# Biomarkers table (for research)
biomarkers_data = {
'biomarker_id': list(range(1, 31)),
'patient_id': [f'PT{str(random.randint(1, 100)).zfill(6)}' for _ in range(30)],
'biomarker_name': [random.choice(['PD-L1', 'BRCA1', 'EGFR', 'KRAS', 'HER2']) for _ in range(30)],
'expression_level': [random.choice(['HIGH', 'MEDIUM', 'LOW', 'NEGATIVE']) for _ in range(30)],
'test_method': [random.choice(['IHC', 'PCR', 'NGS', 'FLOW_CYTOMETRY']) for _ in range(30)],
'collection_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(30)]
}
pd.DataFrame(biomarkers_data).to_sql('biomarkers', conn, index=False, if_exists='replace')
conn.execute("CREATE INDEX idx_results_test ON test_results(test_id)")
conn.execute("CREATE INDEX idx_tests_patient ON lab_tests(patient_id)")
conn.commit()
conn.close()
print("✅ Laboratory database created successfully!")
def create_drug_discovery_db():
"""Creates the drug_discovery.db database."""
conn = sqlite3.connect(os.path.join(DATA_DIR, 'drug_discovery.db'))
# Compounds table
compounds_data = {
'compound_id': [f'CMP-{str(i).zfill(6)}' for i in range(1, 51)],
'compound_name': [f'Compound-{chr(65+i//10)}{i%10}' for i in range(50)],
'molecular_weight': [round(random.uniform(200, 800), 2) for _ in range(50)],
'formula': [f'C{random.randint(10,30)}H{random.randint(10,40)}N{random.randint(0,5)}O{random.randint(1,10)}' for _ in range(50)],
'development_stage': [random.choice(['DISCOVERY', 'LEAD_OPT', 'PRECLINICAL', 'CLINICAL', 'DISCONTINUED']) for _ in range(50)],
'target_class': [random.choice(['KINASE', 'GPCR', 'ION_CHANNEL', 'PROTEASE', 'ANTIBODY']) for _ in range(50)]
}
pd.DataFrame(compounds_data).to_sql('compounds', conn, index=False, if_exists='replace')
# Assay Results table
assays_data = {
'assay_id': list(range(1, 201)),
'compound_id': [random.choice(compounds_data['compound_id']) for _ in range(200)],
'assay_type': [random.choice(['BINDING', 'CELL_VIABILITY', 'ENZYME_INHIBITION', 'TOXICITY']) for _ in range(200)],
'ic50_nm': [round(random.uniform(0.1, 10000), 2) for _ in range(200)],
'efficacy_percent': [round(random.uniform(0, 100), 1) for _ in range(200)],
'assay_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(200)],
'scientist': [f'SCI-{random.randint(1, 10)}' for _ in range(200)]
}
pd.DataFrame(assays_data).to_sql('assay_results', conn, index=False, if_exists='replace')
# Drug Targets table
targets_data = {
'target_id': [f'TGT-{str(i).zfill(4)}' for i in range(1, 21)],
'target_name': [f'Protein-{i}' for i in range(1, 21)],
'gene_symbol': [f'GENE{i}' for i in range(1, 21)],
'pathway': [random.choice(['MAPK', 'PI3K/AKT', 'WNT', 'NOTCH', 'HEDGEHOG']) for _ in range(20)],
'disease_area': [random.choice(['ONCOLOGY', 'CARDIOLOGY', 'NEUROLOGY', 'IMMUNOLOGY']) for _ in range(20)]
}
pd.DataFrame(targets_data).to_sql('drug_targets', conn, index=False, if_exists='replace')
# Compound-Target Associations
associations_data = {
'association_id': list(range(1, 76)),
'compound_id': [random.choice(compounds_data['compound_id']) for _ in range(75)],
'target_id': [random.choice(targets_data['target_id']) for _ in range(75)],
'affinity_nm': [round(random.uniform(0.1, 1000), 2) for _ in range(75)],
'selectivity_fold': [round(random.uniform(1, 100), 1) for _ in range(75)]
}
pd.DataFrame(associations_data).to_sql('compound_targets', conn, index=False, if_exists='replace')
conn.execute("CREATE INDEX idx_assays_compound ON assay_results(compound_id)")
conn.execute("CREATE INDEX idx_associations_compound ON compound_targets(compound_id)")
conn.execute("CREATE INDEX idx_associations_target ON compound_targets(target_id)")
conn.commit()
conn.close()
print("✅ Drug Discovery database created successfully!")
if __name__ == "__main__":
create_clinical_trials_db()
create_laboratory_db()
create_drug_discovery_db()
print("\n🎉 All three Life Sciences databases created successfully in the 'data' directory!")