sml-agents-publish-subscribe-dbvec

Sleeping

File size: 2,158 Bytes

import duckdb
import pandas as pd
from parser import parse_symptoms
from embeddings import compute_weighted_embedding

def init_db(csv_path="data/DerivedKnowledgeGraph_final.csv"):
    con = duckdb.connect("medical.db")

    # Sequences
    con.execute("CREATE SEQUENCE IF NOT EXISTS disease_seq START 1;")
    con.execute("CREATE SEQUENCE IF NOT EXISTS symptom_seq START 1;")

    # Tables
    con.execute("""
    CREATE TABLE IF NOT EXISTS disease (
        disease_id INTEGER PRIMARY KEY DEFAULT nextval('disease_seq'),
        name TEXT UNIQUE
    );
    """)

    con.execute("""
    CREATE TABLE IF NOT EXISTS symptom (
        symptom_id INTEGER PRIMARY KEY DEFAULT nextval('symptom_seq'),
        name TEXT UNIQUE
    );
    """)

    con.execute("""
    CREATE TABLE IF NOT EXISTS disease_symptom (
        disease_id INTEGER,
        symptom_id INTEGER,
        incidence FLOAT,
        PRIMARY KEY (disease_id, symptom_id)
    );
    """)

    con.execute("""
    CREATE TABLE IF NOT EXISTS disease_embedding (
        disease_id INTEGER,
        embedding DOUBLE[]
    );
    """)

    # Load CSV
    df = pd.read_csv(csv_path)

    for _, row in df.iterrows():
        disease = row.iloc[0].strip().lower()
        symptoms = parse_symptoms(row.iloc[1])

        con.execute("INSERT OR IGNORE INTO disease (name) VALUES (?)", [disease])
        disease_id = con.execute(
            "SELECT disease_id FROM disease WHERE name = ?",
            [disease]
        ).fetchone()[0]

        for symptom, incidence in symptoms:
            con.execute("INSERT OR IGNORE INTO symptom (name) VALUES (?)", [symptom])
            symptom_id = con.execute(
                "SELECT symptom_id FROM symptom WHERE name = ?",
                [symptom]
            ).fetchone()[0]

            con.execute("""
                INSERT OR IGNORE INTO disease_symptom
                VALUES (?, ?, ?)
            """, [disease_id, symptom_id, incidence])

        embedding = compute_weighted_embedding(symptoms)
        con.execute(
            "INSERT INTO disease_embedding VALUES (?, ?)",
            [disease_id, embedding.tolist()]
        )

    return con