Spaces:

ohmygaugh
/

agent-mcp-sql

No application file

App Files Files Community

agent-mcp-sql / ops /scripts /generate_sample_databases.py

ohmygaugh

1st pass at merging the code bases(does not run yet)

86cbe3c 5 months ago

raw

history blame contribute delete

9 kB

	import sqlite3
	import pandas as pd
	from datetime import datetime, timedelta
	import random
	import os

	# Ensure the data directory exists
	DATA_DIR = 'data'
	os.makedirs(DATA_DIR, exist_ok=True)

	def create_clinical_trials_db():
	"""Creates the clinical_trials.db database."""
	conn = sqlite3.connect(os.path.join(DATA_DIR, 'clinical_trials.db'))

	# Studies table
	studies_data = {
	'study_id': ['ONCO-2023-001', 'CARDIO-2023-047', 'NEURO-2024-012', 'DIAB-2023-089', 'RARE-2024-003'],
	'study_name': ['Phase III Immunotherapy Trial', 'Beta Blocker Efficacy Study', 'Alzheimer Prevention Trial', 'Insulin Resistance Study', 'Rare Disease Natural History'],
	'phase': ['Phase 3', 'Phase 2', 'Phase 2', 'Phase 3', 'Observational'],
	'status': ['RECRUITING', 'ACTIVE', 'PLANNING', 'COMPLETED', 'RECRUITING'],
	'sponsor': ['OncoPharm Inc', 'CardioHealth', 'NeuroGen', 'DiabetesCare', 'NIH'],
	'target_enrollment': [500, 200, 150, 800, 50],
	'current_enrollment': [237, 178, 0, 800, 12],
	'start_date': ['2023-03-15', '2023-06-01', '2024-01-15', '2023-01-10', '2024-02-01']
	}
	pd.DataFrame(studies_data).to_sql('studies', conn, index=False, if_exists='replace')

	# Patients table
	patients_data = {
	'patient_id': [f'PT{str(i).zfill(6)}' for i in range(1, 101)],
	'study_id': [random.choice(studies_data['study_id']) for _ in range(100)],
	'enrollment_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(100)],
	'age': [random.randint(18, 85) for _ in range(100)],
	'gender': [random.choice(['M', 'F']) for _ in range(100)],
	'status': [random.choice(['ENROLLED', 'COMPLETED', 'WITHDRAWN', 'SCREENING']) for _ in range(100)]
	}
	pd.DataFrame(patients_data).to_sql('patients', conn, index=False, if_exists='replace')

	# Adverse Events table
	adverse_events_data = {
	'event_id': list(range(1, 51)),
	'patient_id': [random.choice(patients_data['patient_id'][:50]) for _ in range(50)],
	'event_date': [(datetime.now() - timedelta(days=random.randint(1, 180))).strftime('%Y-%m-%d') for _ in range(50)],
	'event_type': [random.choice(['NAUSEA', 'HEADACHE', 'FATIGUE', 'RASH', 'FEVER']) for _ in range(50)],
	'severity': [random.choice(['MILD', 'MODERATE', 'SEVERE']) for _ in range(50)],
	'related_to_treatment': [random.choice(['YES', 'NO', 'UNKNOWN']) for _ in range(50)]
	}
	pd.DataFrame(adverse_events_data).to_sql('adverse_events', conn, index=False, if_exists='replace')

	# Add foreign keys (SQLite doesn't enforce, but documents relationships)
	conn.execute("CREATE INDEX idx_patients_study ON patients(study_id)")
	conn.execute("CREATE INDEX idx_events_patient ON adverse_events(patient_id)")
	conn.commit()
	conn.close()
	print("✅ Clinical Trials database created successfully!")

	def create_laboratory_db():
	"""Creates the laboratory.db database."""
	conn = sqlite3.connect(os.path.join(DATA_DIR, 'laboratory.db'))

	# Lab Tests table
	lab_tests_data = {
	'test_id': [f'LAB{str(i).zfill(8)}' for i in range(1, 201)],
	'patient_id': [f'PT{str(random.randint(1, 100)).zfill(6)}' for _ in range(200)],
	'test_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(200)],
	'test_type': [random.choice(['CBC', 'METABOLIC_PANEL', 'LIVER_FUNCTION', 'LIPID_PANEL', 'HBA1C']) for _ in range(200)],
	'ordered_by': [f'DR{str(random.randint(1, 20)).zfill(4)}' for _ in range(200)],
	'priority': [random.choice(['ROUTINE', 'URGENT', 'STAT']) for _ in range(200)]
	}
	pd.DataFrame(lab_tests_data).to_sql('lab_tests', conn, index=False, if_exists='replace')

	# Test Results table
	results_data = {
	'result_id': list(range(1, 601)),
	'test_id': [random.choice(lab_tests_data['test_id']) for _ in range(600)],
	'analyte': [random.choice(['GLUCOSE', 'WBC', 'RBC', 'PLATELETS', 'CREATININE', 'ALT', 'AST', 'CHOLESTEROL']) for _ in range(600)],
	'value': [round(random.uniform(1, 200), 2) for _ in range(600)],
	'unit': [random.choice(['mg/dL', 'K/uL', 'M/uL', 'g/dL', 'mmol/L']) for _ in range(600)],
	'reference_low': [round(random.uniform(1, 50), 2) for _ in range(600)],
	'reference_high': [round(random.uniform(100, 200), 2) for _ in range(600)],
	'flag': [random.choice(['NORMAL', 'HIGH', 'LOW', 'CRITICAL']) for _ in range(600)]
	}
	pd.DataFrame(results_data).to_sql('test_results', conn, index=False, if_exists='replace')

	# Biomarkers table (for research)
	biomarkers_data = {
	'biomarker_id': list(range(1, 31)),
	'patient_id': [f'PT{str(random.randint(1, 100)).zfill(6)}' for _ in range(30)],
	'biomarker_name': [random.choice(['PD-L1', 'BRCA1', 'EGFR', 'KRAS', 'HER2']) for _ in range(30)],
	'expression_level': [random.choice(['HIGH', 'MEDIUM', 'LOW', 'NEGATIVE']) for _ in range(30)],
	'test_method': [random.choice(['IHC', 'PCR', 'NGS', 'FLOW_CYTOMETRY']) for _ in range(30)],
	'collection_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(30)]
	}
	pd.DataFrame(biomarkers_data).to_sql('biomarkers', conn, index=False, if_exists='replace')

	conn.execute("CREATE INDEX idx_results_test ON test_results(test_id)")
	conn.execute("CREATE INDEX idx_tests_patient ON lab_tests(patient_id)")
	conn.commit()
	conn.close()
	print("✅ Laboratory database created successfully!")

	def create_drug_discovery_db():
	"""Creates the drug_discovery.db database."""
	conn = sqlite3.connect(os.path.join(DATA_DIR, 'drug_discovery.db'))

	# Compounds table
	compounds_data = {
	'compound_id': [f'CMP-{str(i).zfill(6)}' for i in range(1, 51)],
	'compound_name': [f'Compound-{chr(65+i//10)}{i%10}' for i in range(50)],
	'molecular_weight': [round(random.uniform(200, 800), 2) for _ in range(50)],
	'formula': [f'C{random.randint(10,30)}H{random.randint(10,40)}N{random.randint(0,5)}O{random.randint(1,10)}' for _ in range(50)],
	'development_stage': [random.choice(['DISCOVERY', 'LEAD_OPT', 'PRECLINICAL', 'CLINICAL', 'DISCONTINUED']) for _ in range(50)],
	'target_class': [random.choice(['KINASE', 'GPCR', 'ION_CHANNEL', 'PROTEASE', 'ANTIBODY']) for _ in range(50)]
	}
	pd.DataFrame(compounds_data).to_sql('compounds', conn, index=False, if_exists='replace')

	# Assay Results table
	assays_data = {
	'assay_id': list(range(1, 201)),
	'compound_id': [random.choice(compounds_data['compound_id']) for _ in range(200)],
	'assay_type': [random.choice(['BINDING', 'CELL_VIABILITY', 'ENZYME_INHIBITION', 'TOXICITY']) for _ in range(200)],
	'ic50_nm': [round(random.uniform(0.1, 10000), 2) for _ in range(200)],
	'efficacy_percent': [round(random.uniform(0, 100), 1) for _ in range(200)],
	'assay_date': [(datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%Y-%m-%d') for _ in range(200)],
	'scientist': [f'SCI-{random.randint(1, 10)}' for _ in range(200)]
	}
	pd.DataFrame(assays_data).to_sql('assay_results', conn, index=False, if_exists='replace')

	# Drug Targets table
	targets_data = {
	'target_id': [f'TGT-{str(i).zfill(4)}' for i in range(1, 21)],
	'target_name': [f'Protein-{i}' for i in range(1, 21)],
	'gene_symbol': [f'GENE{i}' for i in range(1, 21)],
	'pathway': [random.choice(['MAPK', 'PI3K/AKT', 'WNT', 'NOTCH', 'HEDGEHOG']) for _ in range(20)],
	'disease_area': [random.choice(['ONCOLOGY', 'CARDIOLOGY', 'NEUROLOGY', 'IMMUNOLOGY']) for _ in range(20)]
	}
	pd.DataFrame(targets_data).to_sql('drug_targets', conn, index=False, if_exists='replace')

	# Compound-Target Associations
	associations_data = {
	'association_id': list(range(1, 76)),
	'compound_id': [random.choice(compounds_data['compound_id']) for _ in range(75)],
	'target_id': [random.choice(targets_data['target_id']) for _ in range(75)],
	'affinity_nm': [round(random.uniform(0.1, 1000), 2) for _ in range(75)],
	'selectivity_fold': [round(random.uniform(1, 100), 1) for _ in range(75)]
	}
	pd.DataFrame(associations_data).to_sql('compound_targets', conn, index=False, if_exists='replace')

	conn.execute("CREATE INDEX idx_assays_compound ON assay_results(compound_id)")
	conn.execute("CREATE INDEX idx_associations_compound ON compound_targets(compound_id)")
	conn.execute("CREATE INDEX idx_associations_target ON compound_targets(target_id)")
	conn.commit()
	conn.close()
	print("✅ Drug Discovery database created successfully!")

	if __name__ == "__main__":
	create_clinical_trials_db()
	create_laboratory_db()
	create_drug_discovery_db()
	print("\n🎉 All three Life Sciences databases created successfully in the 'data' directory!")