File size: 5,718 Bytes
25fcb73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# =================================================================================
# data_processing.py: Process and prepare raw data
# =================================================================================
import json
import re
from llama_index.core import Document
from tqdm import tqdm
import config

def clean_text(text: str) -> str:
    """Strip common FDA-document noise from *text*.

    Removes 'REVISED: mm/yyyy' revision stamps, collapses runs of
    whitespace into single spaces (and trims the ends), then deletes
    separator runs of three or more '-', '=', or '*' characters.
    """
    without_stamp = re.sub(r'REVISED:\s*\d{1,2}/\d{4}', '', text)
    collapsed = re.sub(r'\s{2,}', ' ', without_stamp).strip()
    return re.sub(r'[\-=*]{3,}', '', collapsed)

def load_and_prepare_documents(json_path=config.RAW_DATA_PATH):
    """Load raw drug data from a JSON file, filter for usable entries,
    clean the text, and return a list of LlamaIndex Document objects.

    An entry is kept only if it has a brand or generic name (under
    'openfda') and an 'indications_and_usage' section. Each retained
    label section becomes one Document carrying brand name, generic
    name, and section title as metadata.
    """
    print(f"Loading data from: {json_path}...")
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Human-readable titles for the label sections we extract.
    # Built once here: the mapping is loop-invariant, so there is no
    # reason to rebuild it for every entry.
    sections_to_process = {
        "indications_and_usage": "Indications and Usage",
        "adverse_reactions": "Adverse Reactions",
        "drug_interactions": "Drug Interactions",
        "contraindications": "Contraindications",
        "warnings": "Warnings",
        "boxed_warning": "Boxed Warning",
        "mechanism_of_action": "Mechanism of Action",
        "pharmacokinetics": "Pharmacokinetics",
        "dosage_and_administration": "Dosage and Administration",
        "how_supplied": "How Supplied",
        "storage_and_handling": "Storage and Handling",
        "information_for_patients": "Information for Patients",
        "pregnancy": "Pregnancy",
        "nursing_mothers": "Nursing Mothers",
        "pediatric_use": "Pediatric Use",
        "geriatric_use": "Geriatric Use"
    }

    all_docs = []
    print("Filtering, cleaning, and converting data to 'Document' objects...")
    for entry in tqdm(data, desc="Processing drug data"):
        if not entry:
            continue

        # 1. Ensure the entry has a brand or generic name.
        openfda = entry.get("openfda", {})
        brand_name_list = openfda.get("brand_name")
        generic_name_list = openfda.get("generic_name")

        if not brand_name_list and not generic_name_list:
            continue  # Skip entries with no name

        # 2. Ensure it's likely a real drug by checking for a crucial section.
        if "indications_and_usage" not in entry:
            continue  # Skip entries that don't say what the drug is for

        brand_name = brand_name_list[0] if brand_name_list else "Unknown Brand"
        generic_name = generic_name_list[0] if generic_name_list else "Unknown Generic"

        for key, section_name in sections_to_process.items():
            text_list = entry.get(key)
            if text_list and isinstance(text_list, list) and text_list[0] and text_list[0].strip():
                cleaned_text = clean_text(text_list[0])
                if cleaned_text:
                    metadata = {"brand_name": brand_name, "generic_name": generic_name, "section": section_name}
                    # Bug fix: Document here is llama_index.core.Document,
                    # whose constructor takes `text=` — `page_content=` is
                    # the LangChain kwarg. This now matches the usage in
                    # load_and_prepare_fda_documents.
                    doc = Document(text=cleaned_text, metadata=metadata)
                    all_docs.append(doc)

    print(f"Created a total of {len(all_docs)} 'Document' objects after filtering.")
    return all_docs

def load_and_process_all():
    """Aggregate documents from every configured data source.

    Only the FDA drug-label source is active right now; the
    HealthCareMagic and MedQuad loaders are disabled pending integration.
    """
    combined = []

    # FDA drug labels (cleaned JSONL).
    combined.extend(load_and_prepare_fda_documents())

    # TODO: wire in the HealthCareMagic and MedQuad processing modules
    # (healthcare_data_processing / medquad_data_processing) and extend
    # `combined` with their documents here.

    print(f"Total documents loaded from all sources: {len(combined)}")
    return combined

def load_and_prepare_fda_documents(json_path=config.CLEANED_DATA_PATH):
    """Load cleaned drug data from a JSON Lines file and convert it into
    a list of LlamaIndex Document objects for the RAG pipeline.

    Each line is one JSON object; lines without a 'content' field are
    skipped. Returns [] (after printing an error) if the file is missing
    or a line contains malformed JSON.
    """
    print(f"Loading cleaned drug data from: {json_path}...")
    all_docs = []
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            for line in tqdm(f, desc="Processing cleaned drug data"):
                # Robustness fix: tolerate blank/whitespace-only lines
                # (e.g. a trailing newline at EOF). Previously such a line
                # raised JSONDecodeError and discarded the entire result.
                line = line.strip()
                if not line:
                    continue
                entry = json.loads(line)

                content = entry.get("content")
                if not content:
                    continue

                metadata = {
                    "doc_id": entry.get("doc_id"),
                    "brand_name": entry.get("brand_name"),
                    "generic_name": entry.get("generic_name"),
                    "section": entry.get("section"),
                    "source": "FDA Drug Labels"
                }

                # The text for the document is just the content of the section
                doc = Document(text=content, metadata=metadata)
                all_docs.append(doc)

    except FileNotFoundError:
        print(f"Error: The file '{json_path}' was not found.")
        return []
    except json.JSONDecodeError as e:
        print(f"Error: Could not decode JSON from a line in '{json_path}'. Details: {e}")
        return []

    print(f"Created {len(all_docs)} 'Document' objects from the cleaned FDA data.")
    return all_docs