| import duckdb |
| import pandas as pd |
| from parser import parse_symptoms |
| from embeddings import compute_weighted_embedding |
|
|
def init_db(csv_path="data/DerivedKnowledgeGraph_final.csv", db_path="medical.db"):
    """Create the schema and load the disease/symptom CSV into DuckDB.

    The first CSV column is read as the disease name and the second is handed
    to ``parse_symptoms``, whose result is iterated as ``(symptom, incidence)``
    pairs. For every row the disease, its symptoms, the disease/symptom links
    and one embedding (from ``compute_weighted_embedding``) are stored.

    The load runs inside a single transaction, so a failure part-way through
    leaves no partially loaded data behind. Calling the function again on an
    existing database does not duplicate rows.

    Args:
        csv_path: Path of the CSV file to load.
        db_path: DuckDB database file to open (created if missing).

    Returns:
        The open DuckDB connection; the caller is responsible for closing it.

    Raises:
        Exception: Whatever the CSV reader, the parser, the embedding code or
            DuckDB raised. The connection is closed before re-raising.
    """
    con = duckdb.connect(db_path)
    try:
        _create_schema(con)
        df = pd.read_csv(csv_path)

        con.begin()
        try:
            _load_rows(con, df)
        except Exception:
            con.rollback()
            raise
        con.commit()
    except Exception:
        # Do not leak the connection when initialisation fails.
        con.close()
        raise

    return con


def _create_schema(con):
    """Create the sequences and tables used by the loader if they are missing."""
    con.execute("CREATE SEQUENCE IF NOT EXISTS disease_seq START 1;")
    con.execute("CREATE SEQUENCE IF NOT EXISTS symptom_seq START 1;")

    con.execute("""
        CREATE TABLE IF NOT EXISTS disease (
            disease_id INTEGER PRIMARY KEY DEFAULT nextval('disease_seq'),
            name TEXT UNIQUE
        );
    """)

    con.execute("""
        CREATE TABLE IF NOT EXISTS symptom (
            symptom_id INTEGER PRIMARY KEY DEFAULT nextval('symptom_seq'),
            name TEXT UNIQUE
        );
    """)

    con.execute("""
        CREATE TABLE IF NOT EXISTS disease_symptom (
            disease_id INTEGER,
            symptom_id INTEGER,
            incidence FLOAT,
            PRIMARY KEY (disease_id, symptom_id)
        );
    """)

    con.execute("""
        CREATE TABLE IF NOT EXISTS disease_embedding (
            disease_id INTEGER,
            embedding DOUBLE[]
        );
    """)


def _load_rows(con, df):
    """Insert every row of ``df`` (disease, raw symptoms) through ``con``."""
    # Plain tuples avoid building a pandas Series per row as iterrows() does.
    for row in df.itertuples(index=False, name=None):
        disease = row[0].strip().lower()
        symptoms = parse_symptoms(row[1])

        con.execute("INSERT OR IGNORE INTO disease (name) VALUES (?)", [disease])
        disease_id = con.execute(
            "SELECT disease_id FROM disease WHERE name = ?",
            [disease],
        ).fetchone()[0]

        for symptom, incidence in symptoms:
            con.execute("INSERT OR IGNORE INTO symptom (name) VALUES (?)", [symptom])
            symptom_id = con.execute(
                "SELECT symptom_id FROM symptom WHERE name = ?",
                [symptom],
            ).fetchone()[0]

            con.execute("""
                INSERT OR IGNORE INTO disease_symptom
                VALUES (?, ?, ?)
            """, [disease_id, symptom_id, incidence])

        embedding = compute_weighted_embedding(symptoms)
        # disease_embedding has no key, so a plain INSERT would add a second
        # row for the same disease on a re-run or a repeated CSV row; drop any
        # previous embedding first so each disease keeps exactly one.
        con.execute(
            "DELETE FROM disease_embedding WHERE disease_id = ?",
            [disease_id],
        )
        con.execute(
            "INSERT INTO disease_embedding VALUES (?, ?)",
            [disease_id, embedding.tolist()],
        )
|
|