File size: 5,718 Bytes
25fcb73 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
# =================================================================================
# data_processing.py: Process and prepare raw data
# =================================================================================
import json
import re
from llama_index.core import Document
from tqdm import tqdm
import config
def clean_text(text: str) -> str:
    """Strip common FDA-document noise from *text*.

    Removes "REVISED: MM/YYYY" revision stamps, collapses runs of
    whitespace into single spaces (trimming the ends), and deletes
    horizontal-rule runs of three or more '-', '=', or '*' characters.
    """
    # Order matters: the revision stamp is removed first, then whitespace
    # is normalized, and rule characters are stripped last.
    without_stamp = re.sub(r'REVISED:\s*\d{1,2}/\d{4}', '', text)
    normalized = re.sub(r'\s{2,}', ' ', without_stamp).strip()
    return re.sub(r'[\-=*]{3,}', '', normalized)
def load_and_prepare_documents(json_path=config.RAW_DATA_PATH):
    """
    Loads drug data from a JSON file, filters for high-quality entries,
    cleans the text, and returns a list of LlamaIndex Document objects.

    Args:
        json_path: Path to the raw JSON file (a list of openFDA label entries).
            Defaults to config.RAW_DATA_PATH.

    Returns:
        list[Document]: One Document per non-empty, cleaned label section,
        with brand name, generic name, and section name in the metadata.
    """
    print(f"Loading data from: {json_path}...")
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Map openFDA section keys to human-readable section names.
    # Built once here rather than on every loop iteration.
    sections_to_process = {
        "indications_and_usage": "Indications and Usage",
        "adverse_reactions": "Adverse Reactions",
        "drug_interactions": "Drug Interactions",
        "contraindications": "Contraindications",
        "warnings": "Warnings",
        "boxed_warning": "Boxed Warning",
        "mechanism_of_action": "Mechanism of Action",
        "pharmacokinetics": "Pharmacokinetics",
        "dosage_and_administration": "Dosage and Administration",
        "how_supplied": "How Supplied",
        "storage_and_handling": "Storage and Handling",
        "information_for_patients": "Information for Patients",
        "pregnancy": "Pregnancy",
        "nursing_mothers": "Nursing Mothers",
        "pediatric_use": "Pediatric Use",
        "geriatric_use": "Geriatric Use"
    }

    all_docs = []
    print("Filtering, cleaning, and converting data to 'Document' objects...")
    for entry in tqdm(data, desc="Processing drug data"):
        if not entry:
            continue
        openfda = entry.get("openfda", {})
        brand_name_list = openfda.get("brand_name")
        generic_name_list = openfda.get("generic_name")
        # 1. Skip entries with neither a brand nor a generic name.
        if not brand_name_list and not generic_name_list:
            continue
        # 2. Skip entries missing the crucial section that says what the drug is for.
        if "indications_and_usage" not in entry:
            continue
        brand_name = brand_name_list[0] if brand_name_list else "Unknown Brand"
        generic_name = generic_name_list[0] if generic_name_list else "Unknown Generic"
        for key, section_name in sections_to_process.items():
            text_list = entry.get(key)
            if text_list and isinstance(text_list, list) and text_list[0] and text_list[0].strip():
                cleaned_text = clean_text(text_list[0])
                if cleaned_text:
                    metadata = {"brand_name": brand_name, "generic_name": generic_name, "section": section_name}
                    # BUG FIX: llama_index's Document takes `text=`, not LangChain's
                    # `page_content=` — this now matches the import at the top of
                    # the file and the sibling load_and_prepare_fda_documents().
                    doc = Document(text=cleaned_text, metadata=metadata)
                    all_docs.append(doc)
    print(f"Created a total of {len(all_docs)} 'Document' objects after filtering.")
    return all_docs
def load_and_process_all():
    """
    Aggregates documents from every configured data source.

    Only the cleaned FDA drug-label source is currently active; the
    HealthCareMagic and MedQuad loaders are disabled pending their
    dedicated processing modules.
    """
    # FDA drug labels — the single active source today.
    all_docs = list(load_and_prepare_fda_documents())

    # Disabled sources (re-enable by wiring up their processing modules):
    #   healthcare_data_processing.load_and_prepare_documents(config.HEALTHCARE_MAGIC_PATH)
    #   medquad_data_processing.load_and_prepare_documents(config.MEDQUAD_PATH)

    print(f"Total documents loaded from all sources: {len(all_docs)}")
    return all_docs
def load_and_prepare_fda_documents(json_path=config.CLEANED_DATA_PATH):
    """
    Loads cleaned drug data from a JSON Lines file and converts it into
    a list of LlamaIndex Document objects for the RAG pipeline.

    Args:
        json_path: Path to a JSONL file where each line is one section
            record with a "content" field plus identifying metadata.
            Defaults to config.CLEANED_DATA_PATH.

    Returns:
        list[Document]: One Document per record with non-empty content.
        Returns [] if the file is missing or a line contains malformed JSON.
    """
    print(f"Loading cleaned drug data from: {json_path}...")
    all_docs = []
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            for line in tqdm(f, desc="Processing cleaned drug data"):
                # ROBUSTNESS FIX: skip blank/whitespace-only lines (e.g. a
                # trailing newline at EOF). Previously json.loads raised
                # JSONDecodeError on such a line, which discarded EVERY
                # document parsed so far and returned [].
                if not line.strip():
                    continue
                entry = json.loads(line)
                content = entry.get("content")
                if not content:
                    continue
                metadata = {
                    "doc_id": entry.get("doc_id"),
                    "brand_name": entry.get("brand_name"),
                    "generic_name": entry.get("generic_name"),
                    "section": entry.get("section"),
                    "source": "FDA Drug Labels"
                }
                # The text for the document is just the content of the section
                doc = Document(text=content, metadata=metadata)
                all_docs.append(doc)
    except FileNotFoundError:
        print(f"Error: The file '{json_path}' was not found.")
        return []
    except json.JSONDecodeError as e:
        print(f"Error: Could not decode JSON from a line in '{json_path}'. Details: {e}")
        return []
    print(f"Created {len(all_docs)} 'Document' objects from the cleaned FDA data.")
    return all_docs
|