File size: 2,158 Bytes
16d3318
 
e0d8955
 
16d3318
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import duckdb
import pandas as pd
from parser import parse_symptoms
from embeddings import compute_weighted_embedding

def init_db(csv_path="data/DerivedKnowledgeGraph_final.csv"):
    con = duckdb.connect("medical.db")

    # Sequences
    con.execute("CREATE SEQUENCE IF NOT EXISTS disease_seq START 1;")
    con.execute("CREATE SEQUENCE IF NOT EXISTS symptom_seq START 1;")

    # Tables
    con.execute("""
    CREATE TABLE IF NOT EXISTS disease (
        disease_id INTEGER PRIMARY KEY DEFAULT nextval('disease_seq'),
        name TEXT UNIQUE
    );
    """)

    con.execute("""
    CREATE TABLE IF NOT EXISTS symptom (
        symptom_id INTEGER PRIMARY KEY DEFAULT nextval('symptom_seq'),
        name TEXT UNIQUE
    );
    """)

    con.execute("""
    CREATE TABLE IF NOT EXISTS disease_symptom (
        disease_id INTEGER,
        symptom_id INTEGER,
        incidence FLOAT,
        PRIMARY KEY (disease_id, symptom_id)
    );
    """)

    con.execute("""
    CREATE TABLE IF NOT EXISTS disease_embedding (
        disease_id INTEGER,
        embedding DOUBLE[]
    );
    """)

    # Load CSV
    df = pd.read_csv(csv_path)

    for _, row in df.iterrows():
        disease = row.iloc[0].strip().lower()
        symptoms = parse_symptoms(row.iloc[1])

        con.execute("INSERT OR IGNORE INTO disease (name) VALUES (?)", [disease])
        disease_id = con.execute(
            "SELECT disease_id FROM disease WHERE name = ?",
            [disease]
        ).fetchone()[0]

        for symptom, incidence in symptoms:
            con.execute("INSERT OR IGNORE INTO symptom (name) VALUES (?)", [symptom])
            symptom_id = con.execute(
                "SELECT symptom_id FROM symptom WHERE name = ?",
                [symptom]
            ).fetchone()[0]

            con.execute("""
                INSERT OR IGNORE INTO disease_symptom
                VALUES (?, ?, ?)
            """, [disease_id, symptom_id, incidence])

        embedding = compute_weighted_embedding(symptoms)
        con.execute(
            "INSERT INTO disease_embedding VALUES (?, ?)",
            [disease_id, embedding.tolist()]
        )

    return con