File size: 4,693 Bytes
389c5f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import os
import re
import json
from dotenv import load_dotenv
from openai import OpenAI
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from rapidfuzz import fuzz

# === Setup ===
# Resolve data directories relative to this file so the script runs from any CWD.
base_dir = os.path.dirname(__file__)
raw_path = os.path.abspath(os.path.join(base_dir, "data", "raw"))
processed_path = os.path.abspath(os.path.join(base_dir, "data", "processed"))
# Ensure the output directory exists before any writes below.
os.makedirs(processed_path, exist_ok=True)

# Load the OpenAI key from config.env into the process environment.
# NOTE(review): `api_key` is bound but never passed anywhere — OpenAI() and
# OpenAIEmbeddings() read OPENAI_API_KEY from the environment themselves.
load_dotenv(os.path.join(base_dir, "config.env"))
api_key = os.environ.get("OPENAI_API_KEY")

# === Step 1: Read IDSR Text ===
# Whole disease-description corpus; parsed into per-disease dicts in Step 3.
with open(os.path.join(raw_path, "IDSR.txt"), encoding="utf-8") as f:
    text = f.read()

# === Step 2: Extract Keywords via GPT ===
# Instruction prefix for the chat call; the full IDSR text is appended below.
prompt = """
You are a helpful assistant. Extract a list of 30–50 key symptoms, signs, or diagnostic terms from the following disease descriptions.

Focus on words or phrases that are likely to appear in clinical case definitions or user queries β€” such as "fever", "skin lesions", "swollen lymph nodes", "positive blood smear", etc.

Only return the keywords or short phrases β€” one per line.

Text:
"""

# OpenAI() picks up OPENAI_API_KEY from the environment on its own.
client = OpenAI()
# temperature=0.0 keeps the extracted keyword list as reproducible as possible.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt + text}
    ],
    temperature=0.0
)

# Normalize keywords
# One keyword per non-blank line of the model's reply.
keywords = [line.strip() for line in response.choices[0].message.content.splitlines() if line.strip()]
def normalize_kw(kw: str) -> str:
    """Normalize one GPT-emitted keyword line.

    Strips leading list markers chat models commonly prepend — hyphens,
    real "•" bullets, "*", and "1." / "1)" numbering — then trims and
    lowercases.

    Fixes: the original called kw.lstrip("-β€’ "), whose strip-set is the
    mojibake bytes of the bullet (β, €, ’) rather than the bullet itself,
    so genuine "•" prefixes were never removed.
    """
    return re.sub(r"^\s*(?:[-•*]+|\d+[.)])\s*", "", kw).strip().lower()
# Apply normalization to every extracted keyword.
keywords = list(map(normalize_kw, keywords))

# Persist the keyword list, one entry per line, for later reuse.
kw_path = os.path.join(processed_path, "idsr_keywords.txt")
with open(kw_path, "w", encoding="utf-8") as f:
    f.writelines(f"{kw}\n" for kw in keywords)

print(f"βœ… Saved keywords to {kw_path}")

# === Step 3: Parse Disease Sections ===
def parse_disease_text(text):
    """Parse blank-line separated disease paragraphs into dicts.

    The first line of each paragraph becomes the "disease_name"; lines of
    the form "- Heading:" open a subsection, and the lines that follow are
    joined with single spaces into that subsection's value. Lines appearing
    after the name but before any subsection heading are discarded.
    """
    heading_re = re.compile(r"^-\s*(.+):\s*$")

    records = []
    record = None      # disease dict currently being built
    section = None     # title of the open subsection, if any
    pending = []       # raw lines buffered for the open subsection

    def commit_section():
        # Fold the buffered lines into the open subsection of the record.
        if record is not None and section and pending:
            record[section] = " ".join(ln.strip() for ln in pending).strip()

    # The trailing "" sentinel guarantees the final paragraph is flushed.
    for raw in text.strip().splitlines() + [""]:
        if not raw.strip():
            # Paragraph boundary: close out the record in progress.
            commit_section()
            if record:
                records.append(record)
            record = None
            section = None
            pending = []
        elif record is None:
            record = {"disease_name": raw.strip()}
        else:
            m = heading_re.match(raw)
            if m:
                commit_section()
                section = m.group(1).strip()
                pending = []
            else:
                pending.append(raw.rstrip())

    return records

# Split the raw IDSR text into one dict per disease paragraph.
disease_dicts = parse_disease_text(text)

# === Step 4: Convert to LangChain Documents ===
def convert_disease_dicts_to_documents(disease_dicts):
    """Render each parsed disease dict into one LangChain Document.

    Page content is "Disease: <name>" followed by every subsection as
    "<heading>:" plus its body, with blank lines between; the disease name
    is duplicated into metadata for downstream filtering.
    """
    documents = []
    for record in disease_dicts:
        name = record.get("disease_name", "")
        sections = []
        for heading, body in record.items():
            if heading != "disease_name":
                sections.append(f"{heading}:\n{body}")
        page_text = f"Disease: {name}\n\n" + "\n\n".join(sections)
        documents.append(
            Document(page_content=page_text, metadata={"disease_name": name})
        )
    return documents

# One Document per disease, ready for tagging and embedding.
documents = convert_disease_dicts_to_documents(disease_dicts)

# === Step 5: Tag Documents with Keywords ===
def tag_documents_with_keywords(documents, keywords, threshold=85):
    """Attach fuzzy keyword matches to each document's metadata.

    A keyword counts as matched when rapidfuzz's partial_ratio between the
    lowercased keyword and the lowercased page content reaches `threshold`.
    Documents are mutated in place; a new list of the same documents is
    returned.
    """
    for doc in documents:
        haystack = doc.page_content.lower()
        hits = [
            kw for kw in keywords
            if fuzz.partial_ratio(kw.lower(), haystack) >= threshold
        ]
        doc.metadata["matched_keywords"] = hits
    return list(documents)

# Enrich every document's metadata with its fuzzy-matched keywords.
tagged_documents = tag_documents_with_keywords(documents, keywords)

# Save JSON version
json_path = os.path.join(processed_path, "tagged_documents.json")
with open(json_path, "w", encoding="utf-8") as f:
    # NOTE(review): Document.dict() is the pydantic-v1-style serializer; on
    # recent langchain-core releases it is deprecated in favor of
    # model_dump() — confirm against the pinned langchain version.
    json.dump([doc.dict() for doc in tagged_documents], f, ensure_ascii=False, indent=2)

print(f"βœ… Saved tagged documents to {json_path}")

# === Step 6: Build and Save FAISS Vectorstore ===
# Embed every tagged document (OpenAIEmbeddings reads OPENAI_API_KEY from the
# environment) and persist the FAISS index for downstream similarity search.
embedding_model = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(tagged_documents, embedding_model)
vs_path = os.path.join(processed_path, "disease_vectorstore")
vectorstore.save_local(vs_path)

print(f"βœ… Saved FAISS vectorstore to {vs_path}")