import duckdb import pandas as pd from parser import parse_symptoms from embeddings import compute_weighted_embedding def init_db(csv_path="data/DerivedKnowledgeGraph_final.csv"): con = duckdb.connect("medical.db") # Sequences con.execute("CREATE SEQUENCE IF NOT EXISTS disease_seq START 1;") con.execute("CREATE SEQUENCE IF NOT EXISTS symptom_seq START 1;") # Tables con.execute(""" CREATE TABLE IF NOT EXISTS disease ( disease_id INTEGER PRIMARY KEY DEFAULT nextval('disease_seq'), name TEXT UNIQUE ); """) con.execute(""" CREATE TABLE IF NOT EXISTS symptom ( symptom_id INTEGER PRIMARY KEY DEFAULT nextval('symptom_seq'), name TEXT UNIQUE ); """) con.execute(""" CREATE TABLE IF NOT EXISTS disease_symptom ( disease_id INTEGER, symptom_id INTEGER, incidence FLOAT, PRIMARY KEY (disease_id, symptom_id) ); """) con.execute(""" CREATE TABLE IF NOT EXISTS disease_embedding ( disease_id INTEGER, embedding DOUBLE[] ); """) # Load CSV df = pd.read_csv(csv_path) for _, row in df.iterrows(): disease = row.iloc[0].strip().lower() symptoms = parse_symptoms(row.iloc[1]) con.execute("INSERT OR IGNORE INTO disease (name) VALUES (?)", [disease]) disease_id = con.execute( "SELECT disease_id FROM disease WHERE name = ?", [disease] ).fetchone()[0] for symptom, incidence in symptoms: con.execute("INSERT OR IGNORE INTO symptom (name) VALUES (?)", [symptom]) symptom_id = con.execute( "SELECT symptom_id FROM symptom WHERE name = ?", [symptom] ).fetchone()[0] con.execute(""" INSERT OR IGNORE INTO disease_symptom VALUES (?, ?, ?) """, [disease_id, symptom_id, incidence]) embedding = compute_weighted_embedding(symptoms) con.execute( "INSERT INTO disease_embedding VALUES (?, ?)", [disease_id, embedding.tolist()] ) return con