santanche's picture
fix (dbvec): wrong internal import
e0d8955
import duckdb
import pandas as pd
from parser import parse_symptoms
from embeddings import compute_weighted_embedding
def init_db(csv_path="data/DerivedKnowledgeGraph_final.csv"):
con = duckdb.connect("medical.db")
# Sequences
con.execute("CREATE SEQUENCE IF NOT EXISTS disease_seq START 1;")
con.execute("CREATE SEQUENCE IF NOT EXISTS symptom_seq START 1;")
# Tables
con.execute("""
CREATE TABLE IF NOT EXISTS disease (
disease_id INTEGER PRIMARY KEY DEFAULT nextval('disease_seq'),
name TEXT UNIQUE
);
""")
con.execute("""
CREATE TABLE IF NOT EXISTS symptom (
symptom_id INTEGER PRIMARY KEY DEFAULT nextval('symptom_seq'),
name TEXT UNIQUE
);
""")
con.execute("""
CREATE TABLE IF NOT EXISTS disease_symptom (
disease_id INTEGER,
symptom_id INTEGER,
incidence FLOAT,
PRIMARY KEY (disease_id, symptom_id)
);
""")
con.execute("""
CREATE TABLE IF NOT EXISTS disease_embedding (
disease_id INTEGER,
embedding DOUBLE[]
);
""")
# Load CSV
df = pd.read_csv(csv_path)
for _, row in df.iterrows():
disease = row.iloc[0].strip().lower()
symptoms = parse_symptoms(row.iloc[1])
con.execute("INSERT OR IGNORE INTO disease (name) VALUES (?)", [disease])
disease_id = con.execute(
"SELECT disease_id FROM disease WHERE name = ?",
[disease]
).fetchone()[0]
for symptom, incidence in symptoms:
con.execute("INSERT OR IGNORE INTO symptom (name) VALUES (?)", [symptom])
symptom_id = con.execute(
"SELECT symptom_id FROM symptom WHERE name = ?",
[symptom]
).fetchone()[0]
con.execute("""
INSERT OR IGNORE INTO disease_symptom
VALUES (?, ?, ?)
""", [disease_id, symptom_id, incidence])
embedding = compute_weighted_embedding(symptoms)
con.execute(
"INSERT INTO disease_embedding VALUES (?, ?)",
[disease_id, embedding.tolist()]
)
return con