Spaces:
Running
Running
| import duckdb | |
| import pandas as pd | |
| from .parser import parse_symptoms | |
| from .embeddings import compute_weighted_embedding | |
| def init_db(csv_path="data/DerivedKnowledgeGraph_final.csv"): | |
| con = duckdb.connect("medical.db") | |
| # Sequences | |
| con.execute("CREATE SEQUENCE IF NOT EXISTS disease_seq START 1;") | |
| con.execute("CREATE SEQUENCE IF NOT EXISTS symptom_seq START 1;") | |
| # Tables | |
| con.execute(""" | |
| CREATE TABLE IF NOT EXISTS disease ( | |
| disease_id INTEGER PRIMARY KEY DEFAULT nextval('disease_seq'), | |
| name TEXT UNIQUE | |
| ); | |
| """) | |
| con.execute(""" | |
| CREATE TABLE IF NOT EXISTS symptom ( | |
| symptom_id INTEGER PRIMARY KEY DEFAULT nextval('symptom_seq'), | |
| name TEXT UNIQUE | |
| ); | |
| """) | |
| con.execute(""" | |
| CREATE TABLE IF NOT EXISTS disease_symptom ( | |
| disease_id INTEGER, | |
| symptom_id INTEGER, | |
| incidence FLOAT, | |
| PRIMARY KEY (disease_id, symptom_id) | |
| ); | |
| """) | |
| con.execute(""" | |
| CREATE TABLE IF NOT EXISTS disease_embedding ( | |
| disease_id INTEGER, | |
| embedding DOUBLE[] | |
| ); | |
| """) | |
| # Load CSV | |
| df = pd.read_csv(csv_path) | |
| for _, row in df.iterrows(): | |
| disease = row.iloc[0].strip().lower() | |
| symptoms = parse_symptoms(row.iloc[1]) | |
| con.execute("INSERT OR IGNORE INTO disease (name) VALUES (?)", [disease]) | |
| disease_id = con.execute( | |
| "SELECT disease_id FROM disease WHERE name = ?", | |
| [disease] | |
| ).fetchone()[0] | |
| for symptom, incidence in symptoms: | |
| con.execute("INSERT OR IGNORE INTO symptom (name) VALUES (?)", [symptom]) | |
| symptom_id = con.execute( | |
| "SELECT symptom_id FROM symptom WHERE name = ?", | |
| [symptom] | |
| ).fetchone()[0] | |
| con.execute(""" | |
| INSERT OR IGNORE INTO disease_symptom | |
| VALUES (?, ?, ?) | |
| """, [disease_id, symptom_id, incidence]) | |
| embedding = compute_weighted_embedding(symptoms) | |
| con.execute( | |
| "INSERT INTO disease_embedding VALUES (?, ?)", | |
| [disease_id, embedding.tolist()] | |
| ) | |
| return con | |