File size: 4,693 Bytes
389c5f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import os
import re
import json
from dotenv import load_dotenv
from openai import OpenAI
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from rapidfuzz import fuzz

# === Setup ===
# Resolve data directories relative to this file so the script runs from any CWD.
base_dir = os.path.dirname(__file__)
raw_path = os.path.abspath(os.path.join(base_dir, "data", "raw"))
processed_path = os.path.abspath(os.path.join(base_dir, "data", "processed"))
# Ensure the output directory exists before any writes below.
os.makedirs(processed_path, exist_ok=True)

# Load the OpenAI key from config.env into the process environment.
# NOTE(review): `api_key` is bound but never passed anywhere — OpenAI() and
# OpenAIEmbeddings() read OPENAI_API_KEY from the environment themselves.
load_dotenv(os.path.join(base_dir, "config.env"))
api_key = os.environ.get("OPENAI_API_KEY")

# === Step 1: Read IDSR Text ===
# Whole disease-description corpus; parsed into per-disease dicts in Step 3.
with open(os.path.join(raw_path, "IDSR.txt"), encoding="utf-8") as f:
    text = f.read()

# === Step 2: Extract Keywords via GPT ===
# Instruction prefix for the chat call; the full IDSR text is appended below.
prompt = """
You are a helpful assistant. Extract a list of 30–50 key symptoms, signs, or diagnostic terms from the following disease descriptions.

Focus on words or phrases that are likely to appear in clinical case definitions or user queries β€” such as "fever", "skin lesions", "swollen lymph nodes", "positive blood smear", etc.

Only return the keywords or short phrases β€” one per line.

Text:
"""

# OpenAI() picks up OPENAI_API_KEY from the environment on its own.
client = OpenAI()
# temperature=0.0 keeps the extracted keyword list as reproducible as possible.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt + text}
    ],
    temperature=0.0
)

# Normalize keywords
# One keyword per non-blank line of the model's reply.
keywords = [line.strip() for line in response.choices[0].message.content.splitlines() if line.strip()]
def normalize_kw(kw: str) -> str:
    """Normalize one GPT-emitted keyword line.

    Strips leading list markers chat models commonly prepend — hyphens,
    real "•" bullets, "*", and "1." / "1)" numbering — then trims and
    lowercases.

    Fixes: the original called kw.lstrip("-β€’ "), whose strip-set is the
    mojibake bytes of the bullet (β, €, ’) rather than the bullet itself,
    so genuine "•" prefixes were never removed.
    """
    return re.sub(r"^\s*(?:[-•*]+|\d+[.)])\s*", "", kw).strip().lower()
# Apply normalization to every extracted keyword.
keywords = list(map(normalize_kw, keywords))

# Persist the keyword list, one entry per line, for later reuse.
kw_path = os.path.join(processed_path, "idsr_keywords.txt")
with open(kw_path, "w", encoding="utf-8") as f:
    f.writelines(f"{kw}\n" for kw in keywords)

print(f"βœ… Saved keywords to {kw_path}")

# === Step 3: Parse Disease Sections ===
def parse_disease_text(text):
    """Parse blank-line separated disease paragraphs into dicts.

    The first line of each paragraph becomes the "disease_name"; lines of
    the form "- Heading:" open a subsection, and the lines that follow are
    joined with single spaces into that subsection's value. Lines appearing
    after the name but before any subsection heading are discarded.
    """
    heading_re = re.compile(r"^-\s*(.+):\s*$")

    records = []
    record = None      # disease dict currently being built
    section = None     # title of the open subsection, if any
    pending = []       # raw lines buffered for the open subsection

    def commit_section():
        # Fold the buffered lines into the open subsection of the record.
        if record is not None and section and pending:
            record[section] = " ".join(ln.strip() for ln in pending).strip()

    # The trailing "" sentinel guarantees the final paragraph is flushed.
    for raw in text.strip().splitlines() + [""]:
        if not raw.strip():
            # Paragraph boundary: close out the record in progress.
            commit_section()
            if record:
                records.append(record)
            record = None
            section = None
            pending = []
        elif record is None:
            record = {"disease_name": raw.strip()}
        else:
            m = heading_re.match(raw)
            if m:
                commit_section()
                section = m.group(1).strip()
                pending = []
            else:
                pending.append(raw.rstrip())

    return records

# Split the raw IDSR text into one dict per disease paragraph.
disease_dicts = parse_disease_text(text)

# === Step 4: Convert to LangChain Documents ===
def convert_disease_dicts_to_documents(disease_dicts):
    """Render each parsed disease dict into one LangChain Document.

    Page content is "Disease: <name>" followed by every subsection as
    "<heading>:" plus its body, with blank lines between; the disease name
    is duplicated into metadata for downstream filtering.
    """
    documents = []
    for record in disease_dicts:
        name = record.get("disease_name", "")
        sections = []
        for heading, body in record.items():
            if heading != "disease_name":
                sections.append(f"{heading}:\n{body}")
        page_text = f"Disease: {name}\n\n" + "\n\n".join(sections)
        documents.append(
            Document(page_content=page_text, metadata={"disease_name": name})
        )
    return documents

# One Document per disease, ready for tagging and embedding.
documents = convert_disease_dicts_to_documents(disease_dicts)

# === Step 5: Tag Documents with Keywords ===
def tag_documents_with_keywords(documents, keywords, threshold=85):
    """Attach fuzzy keyword matches to each document's metadata.

    A keyword counts as matched when rapidfuzz's partial_ratio between the
    lowercased keyword and the lowercased page content reaches `threshold`.
    Documents are mutated in place; a new list of the same documents is
    returned.
    """
    for doc in documents:
        haystack = doc.page_content.lower()
        hits = [
            kw for kw in keywords
            if fuzz.partial_ratio(kw.lower(), haystack) >= threshold
        ]
        doc.metadata["matched_keywords"] = hits
    return list(documents)

# Enrich every document's metadata with its fuzzy-matched keywords.
tagged_documents = tag_documents_with_keywords(documents, keywords)

# Save JSON version
json_path = os.path.join(processed_path, "tagged_documents.json")
with open(json_path, "w", encoding="utf-8") as f:
    # NOTE(review): Document.dict() is the pydantic-v1-style serializer; on
    # recent langchain-core releases it is deprecated in favor of
    # model_dump() — confirm against the pinned langchain version.
    json.dump([doc.dict() for doc in tagged_documents], f, ensure_ascii=False, indent=2)

print(f"βœ… Saved tagged documents to {json_path}")

# === Step 6: Build and Save FAISS Vectorstore ===
# Embed every tagged document (OpenAIEmbeddings reads OPENAI_API_KEY from the
# environment) and persist the FAISS index for downstream similarity search.
embedding_model = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(tagged_documents, embedding_model)
vs_path = os.path.join(processed_path, "disease_vectorstore")
vectorstore.save_local(vs_path)

print(f"βœ… Saved FAISS vectorstore to {vs_path}")