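"""Preprocess IDSR disease descriptions for retrieval.

Pipeline: read the raw IDSR text, extract clinical keywords via GPT,
parse the text into per-disease sections, tag each disease document with
fuzzy-matched keywords, and build a FAISS vectorstore for the app.
"""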
import os
import re
import json

from dotenv import load_dotenv
from openai import OpenAI
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from rapidfuzz import fuzz
# === Setup ===
base_dir = os.path.dirname(__file__)
raw_path = os.path.abspath(os.path.join(base_dir, "data", "raw"))
processed_path = os.path.abspath(os.path.join(base_dir, "data", "processed"))
os.makedirs(processed_path, exist_ok=True)

load_dotenv(os.path.join(base_dir, "config.env"))
api_key = os.environ.get("OPENAI_API_KEY")
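# Fail fast if the key is missing (assumption: config.env is expected to
# define OPENAI_API_KEY; the OpenAI client below also reads it from the
# environment, so this check only exists to give a clearer error message).
if not api_key:
    raise RuntimeError("OPENAI_API_KEY not set in config.env or the environment")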
# === Step 1: Read IDSR Text ===
with open(os.path.join(raw_path, "IDSR.txt"), encoding="utf-8") as f:
    text = f.read()
# === Step 2: Extract Keywords via GPT ===
prompt = """
You are a helpful assistant. Extract a list of 30-50 key symptoms, signs, or diagnostic terms from the following disease descriptions.
Focus on words or phrases that are likely to appear in clinical case definitions or user queries, such as "fever", "skin lesions", "swollen lymph nodes", "positive blood smear", etc.
Only return the keywords or short phrases, one per line.

Text:
"""
client = OpenAI()
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt + text},
    ],
    temperature=0.0,
)
# Normalize keywords returned by the model
keywords = [line.strip() for line in response.choices[0].message.content.splitlines() if line.strip()]

def normalize_kw(kw):
    # Strip leading list markers ("-", "•") and lowercase for fuzzy matching
    return kw.lstrip("-• ").strip().lower()

keywords = [normalize_kw(kw) for kw in keywords]
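# Optional safeguard: dedupe while preserving order, since the model may
# occasionally repeat a term across lines.
keywords = list(dict.fromkeys(keywords))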
# Save keywords
kw_path = os.path.join(processed_path, "idsr_keywords.txt")
with open(kw_path, "w", encoding="utf-8") as f:
    for keyword in keywords:
        f.write(f"{keyword}\n")

print(f"✅ Saved keywords to {kw_path}")
# === Step 3: Parse Disease Sections ===
def parse_disease_text(text):
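    """Split the raw IDSR text into one dict per disease.

    Assumed input format (inferred from the parsing rules below):
    blank-line-separated blocks, each beginning with the disease name on
    its own line, followed by subsections written as "- Subsection name:"
    header lines with their body text underneath.
    """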
    diseases = []
    lines = text.strip().splitlines()
    current_disease = None
    current_subsection = None
    buffer = []

    def finalize_subsection():
        # Flush buffered lines into the current subsection, if one is open
        if current_disease is not None and current_subsection and buffer:
            content = " ".join(line.strip() for line in buffer).strip()
            current_disease[current_subsection] = content

    subsection_pattern = re.compile(r"^-\s*(.+):\s*$")

    for line in lines + [""]:  # trailing sentinel flushes the final disease
        if not line.strip():
            finalize_subsection()
            if current_disease:
                diseases.append(current_disease)
            current_disease = None
            current_subsection = None
            buffer = []
            continue
        if current_disease is None:
            current_disease = {"disease_name": line.strip()}
            continue
        match = subsection_pattern.match(line)
        if match:
            finalize_subsection()
            current_subsection = match.group(1).strip()
            buffer = []
        else:
            buffer.append(line.rstrip())

    return diseases
disease_dicts = parse_disease_text(text)

# === Step 4: Convert to LangChain Documents ===
def convert_disease_dicts_to_documents(disease_dicts):
    docs = []
    for disease in disease_dicts:
        disease_name = disease.get("disease_name", "")
        subsections = [f"{key}:\n{value}" for key, value in disease.items() if key != "disease_name"]
        full_text = f"Disease: {disease_name}\n\n" + "\n\n".join(subsections)
        docs.append(Document(page_content=full_text, metadata={"disease_name": disease_name}))
    return docs

documents = convert_disease_dicts_to_documents(disease_dicts)
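# Each page_content now has the shape (illustrative):
#   Disease: <disease name>
#
#   <Subsection name>:
#   <subsection text>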
# === Step 5: Tag Documents with Keywords ===
def tag_documents_with_keywords(documents, keywords, threshold=85):
    tagged = []
    for doc in documents:
        content = doc.page_content.lower()
        matched = [kw for kw in keywords if fuzz.partial_ratio(kw.lower(), content) >= threshold]
        doc.metadata["matched_keywords"] = matched
        tagged.append(doc)
    return tagged
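# Note: fuzz.partial_ratio scores the best-matching substring on a 0-100
# scale, so threshold=85 tolerates minor spelling and inflection
# differences between a keyword and the document text.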
tagged_documents = tag_documents_with_keywords(documents, keywords)
# Save JSON version
json_path = os.path.join(processed_path, "tagged_documents.json")
with open(json_path, "w", encoding="utf-8") as f:
    json.dump([doc.dict() for doc in tagged_documents], f, ensure_ascii=False, indent=2)

print(f"✅ Saved tagged documents to {json_path}")
# === Step 6: Build and Save FAISS Vectorstore ===
embedding_model = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(tagged_documents, embedding_model)

vs_path = os.path.join(processed_path, "disease_vectorstore")
vectorstore.save_local(vs_path)
print(f"✅ Saved FAISS vectorstore to {vs_path}")