File size: 5,718 Bytes
25fcb73 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
# =================================================================================
# data_processing.py: Process and prepare raw data
# =================================================================================
import json
import re
from llama_index.core import Document
from tqdm import tqdm
import config
def clean_text(text: str) -> str:
    """Strip common FDA-document noise from *text*.

    Removes "REVISED: MM/YYYY" revision stamps, collapses runs of
    whitespace into single spaces (trimming the ends), and deletes
    horizontal-rule runs of three or more '-', '=', or '*' characters.
    """
    # Order matters: the revision stamp is removed first, then whitespace
    # is normalized, and rule characters are stripped last.
    without_stamp = re.sub(r'REVISED:\s*\d{1,2}/\d{4}', '', text)
    normalized = re.sub(r'\s{2,}', ' ', without_stamp).strip()
    return re.sub(r'[\-=*]{3,}', '', normalized)
def load_and_prepare_documents(json_path=config.RAW_DATA_PATH):
    """
    Loads drug data from a JSON file, filters for high-quality entries,
    cleans the text, and returns a list of LlamaIndex Document objects.

    Args:
        json_path: Path to the raw JSON file (a list of openFDA label entries).
            Defaults to config.RAW_DATA_PATH.

    Returns:
        list[Document]: One Document per non-empty, cleaned label section,
        with brand name, generic name, and section name in the metadata.
    """
    print(f"Loading data from: {json_path}...")
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Map openFDA section keys to human-readable section names.
    # Built once here rather than on every loop iteration.
    sections_to_process = {
        "indications_and_usage": "Indications and Usage",
        "adverse_reactions": "Adverse Reactions",
        "drug_interactions": "Drug Interactions",
        "contraindications": "Contraindications",
        "warnings": "Warnings",
        "boxed_warning": "Boxed Warning",
        "mechanism_of_action": "Mechanism of Action",
        "pharmacokinetics": "Pharmacokinetics",
        "dosage_and_administration": "Dosage and Administration",
        "how_supplied": "How Supplied",
        "storage_and_handling": "Storage and Handling",
        "information_for_patients": "Information for Patients",
        "pregnancy": "Pregnancy",
        "nursing_mothers": "Nursing Mothers",
        "pediatric_use": "Pediatric Use",
        "geriatric_use": "Geriatric Use"
    }

    all_docs = []
    print("Filtering, cleaning, and converting data to 'Document' objects...")
    for entry in tqdm(data, desc="Processing drug data"):
        if not entry:
            continue
        openfda = entry.get("openfda", {})
        brand_name_list = openfda.get("brand_name")
        generic_name_list = openfda.get("generic_name")
        # 1. Skip entries with neither a brand nor a generic name.
        if not brand_name_list and not generic_name_list:
            continue
        # 2. Skip entries missing the crucial section that says what the drug is for.
        if "indications_and_usage" not in entry:
            continue
        brand_name = brand_name_list[0] if brand_name_list else "Unknown Brand"
        generic_name = generic_name_list[0] if generic_name_list else "Unknown Generic"
        for key, section_name in sections_to_process.items():
            text_list = entry.get(key)
            if text_list and isinstance(text_list, list) and text_list[0] and text_list[0].strip():
                cleaned_text = clean_text(text_list[0])
                if cleaned_text:
                    metadata = {"brand_name": brand_name, "generic_name": generic_name, "section": section_name}
                    # BUG FIX: llama_index's Document takes `text=`, not LangChain's
                    # `page_content=` — this now matches the import at the top of
                    # the file and the sibling load_and_prepare_fda_documents().
                    doc = Document(text=cleaned_text, metadata=metadata)
                    all_docs.append(doc)
    print(f"Created a total of {len(all_docs)} 'Document' objects after filtering.")
    return all_docs
def load_and_process_all():
    """
    Aggregates documents from every configured data source.

    Only the cleaned FDA drug-label source is currently active; the
    HealthCareMagic and MedQuad loaders are disabled pending their
    dedicated processing modules.
    """
    # FDA drug labels — the single active source today.
    all_docs = list(load_and_prepare_fda_documents())

    # Disabled sources (re-enable by wiring up their processing modules):
    #   healthcare_data_processing.load_and_prepare_documents(config.HEALTHCARE_MAGIC_PATH)
    #   medquad_data_processing.load_and_prepare_documents(config.MEDQUAD_PATH)

    print(f"Total documents loaded from all sources: {len(all_docs)}")
    return all_docs
def load_and_prepare_fda_documents(json_path=config.CLEANED_DATA_PATH):
    """
    Loads cleaned drug data from a JSON Lines file and converts it into
    a list of LlamaIndex Document objects for the RAG pipeline.

    Args:
        json_path: Path to a JSONL file where each line is one section
            record with a "content" field plus identifying metadata.
            Defaults to config.CLEANED_DATA_PATH.

    Returns:
        list[Document]: One Document per record with non-empty content.
        Returns [] if the file is missing or a line contains malformed JSON.
    """
    print(f"Loading cleaned drug data from: {json_path}...")
    all_docs = []
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            for line in tqdm(f, desc="Processing cleaned drug data"):
                # ROBUSTNESS FIX: skip blank/whitespace-only lines (e.g. a
                # trailing newline at EOF). Previously json.loads raised
                # JSONDecodeError on such a line, which discarded EVERY
                # document parsed so far and returned [].
                if not line.strip():
                    continue
                entry = json.loads(line)
                content = entry.get("content")
                if not content:
                    continue
                metadata = {
                    "doc_id": entry.get("doc_id"),
                    "brand_name": entry.get("brand_name"),
                    "generic_name": entry.get("generic_name"),
                    "section": entry.get("section"),
                    "source": "FDA Drug Labels"
                }
                # The text for the document is just the content of the section
                doc = Document(text=content, metadata=metadata)
                all_docs.append(doc)
    except FileNotFoundError:
        print(f"Error: The file '{json_path}' was not found.")
        return []
    except json.JSONDecodeError as e:
        print(f"Error: Could not decode JSON from a line in '{json_path}'. Details: {e}")
        return []
    print(f"Created {len(all_docs)} 'Document' objects from the cleaned FDA data.")
    return all_docs
|