Zeggai committed on
Commit
1d6e6be
·
verified ·
1 Parent(s): 6bca910

Delete ingest_and_index_supplementary_doc_two.py

Browse files
ingest_and_index_supplementary_doc_two.py DELETED
@@ -1,228 +0,0 @@
1
- import os
2
- import sys
3
- from pathlib import Path
4
- import argparse
5
- import traceback
6
- from typing import List
7
-
8
- from llama_index.core import (
9
- SimpleDirectoryReader,
10
- VectorStoreIndex,
11
- StorageContext,
12
- Settings,
13
- Document
14
- )
15
- from llama_index.core.node_parser import SentenceSplitter
16
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
17
- from llama_index.llms.google_genai import GoogleGenAI
18
-
19
- # For Mistral OCR parsing
20
- import requests
21
- import base64
22
-
23
- # Base directory where indices for supplementary documents will be stored
24
- SUPPLEMENTARY_INDEXES_BASE_PATH_FOR_AGENT = "./storage/supplementary_indices/test"
25
-
26
def configure_indexing_settings():
    """Configure the global LlamaIndex Settings for indexing.

    Installs the free BAAI/bge-small-en-v1.5 HuggingFace embedding model,
    and — only when GOOGLE_API_KEY is present in the environment — an
    optional Gemini LLM. No LLM is required for pure indexing.
    """
    print("Configuring free BGE embedding model for medical document indexing...")
    # Free BGE model: no API key needed, works well on medical text.
    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    print("Embedding Model: BAAI/bge-small-en-v1.5 (FREE)")

    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        # Indexing works without an LLM; leave it unset.
        Settings.llm = None
        print("LLM (Optional for indexing): Not configured (GOOGLE_API_KEY not set).")
    else:
        Settings.llm = GoogleGenAI(model_name="models/gemini-1.5-flash-latest", api_key=api_key)
        print(f"LLM (Optional for indexing): {Settings.llm.model}")
41
-
42
def parse_pdf_with_mistral_ocr(pdf_path: str) -> str:
    """
    Parse a PDF with Mistral OCR for better medical document understanding.

    Falls back to SimpleDirectoryReader when MISTRAL_API_KEY is not set or
    when the Mistral API call fails for any reason.

    Args:
        pdf_path: Path to the PDF file to extract text from.

    Returns:
        The extracted text, or "" if nothing could be extracted.
    """
    # SECURITY FIX: the key must come from the environment. The original
    # code hard-coded a live API secret here, which leaks the credential to
    # anyone with repo access and contradicts the fallback messages below.
    mistral_api_key = os.getenv("MISTRAL_API_KEY")

    print(f" 🔑 API Key Status: {'Found' if mistral_api_key else 'Not Found'}")

    if not mistral_api_key:
        print(" ⚠️ MISTRAL_API_KEY not found in environment, falling back to SimpleDirectoryReader...")
        print(" 💡 Tip: Restart your terminal/IDE after setting environment variables")
        return _parse_pdf_with_simple_reader(pdf_path)

    try:
        print(" 🔍 Using Mistral OCR for PDF parsing...")

        # Base64-encode the PDF so it can travel inside the JSON payload.
        with open(pdf_path, "rb") as pdf_file:
            pdf_base64 = base64.b64encode(pdf_file.read()).decode('utf-8')

        headers = {
            "Authorization": f"Bearer {mistral_api_key}",
            "Content-Type": "application/json"
        }

        payload = {
            "model": "pixtral-12b-2409",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Extract all text from this medical document. Preserve structure, headings, and formatting. Pay special attention to medical terminology, dosages, and clinical data."
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                # NOTE(review): sends the PDF as a data URI in an
                                # image_url part — confirm the Mistral vision API
                                # accepts application/pdf here.
                                "url": f"data:application/pdf;base64,{pdf_base64}"
                            }
                        }
                    ]
                }
            ]
        }

        response = requests.post("https://api.mistral.ai/v1/chat/completions",
                                 headers=headers, json=payload, timeout=60)

        if response.status_code == 200:
            result = response.json()
            extracted_text = result["choices"][0]["message"]["content"]
            print(" ✅ Successfully extracted text using Mistral OCR")
            return extracted_text

        print(f" ⚠️ Mistral OCR failed (status: {response.status_code}), falling back...")
        # Raised (and caught just below) so every failure funnels through the
        # same fallback path.
        raise RuntimeError(f"Mistral API error: {response.status_code}")

    except Exception as e:
        print(f" ⚠️ Mistral OCR error: {e}, falling back to SimpleDirectoryReader...")
        return _parse_pdf_with_simple_reader(pdf_path)


def _parse_pdf_with_simple_reader(pdf_path: str) -> str:
    """Plain text extraction via SimpleDirectoryReader (no OCR)."""
    reader = SimpleDirectoryReader(input_files=[pdf_path])
    documents = reader.load_data()
    return documents[0].text if documents else ""
114
-
115
def process_and_index_document(doc_file_path_str: str, index_persist_path_str: str) -> bool:
    """
    Parse, chunk, embed, and persist one medical PDF as a vector index.

    Pipeline: Mistral OCR (with plain-reader fallback) -> sentence-level
    chunking -> vector index built with the globally configured embedding
    model (free BGE; see configure_indexing_settings) -> persisted to disk.

    Args:
        doc_file_path_str: Path to the source PDF document.
        index_persist_path_str: Directory to persist the index into
            (created if missing).

    Returns:
        True on success; False if the file is missing, extraction or
        chunking yields nothing, or indexing raises.
    """
    doc_file_path = Path(doc_file_path_str)
    index_persist_path = Path(index_persist_path_str)

    print(f"\n--- Processing Medical Document: {doc_file_path.name} ---")
    print(f" Index target directory: {index_persist_path}")

    if not doc_file_path.exists():
        print(f" ❌ Error: Document not found at {doc_file_path}")
        return False

    try:
        # Step 1: Parse with Mistral OCR (falls back internally on failure).
        print(" 📄 Parsing PDF with enhanced OCR...")
        extracted_text = parse_pdf_with_mistral_ocr(str(doc_file_path))

        if not extracted_text.strip():
            print(" ❌ No text extracted from document.")
            return False

        # Wrap the raw text in a Document carrying provenance metadata.
        document = Document(
            text=extracted_text,
            metadata={
                "source_document": doc_file_path.name,
                "file_path": str(doc_file_path),
                "document_type": "medical_pdf"
            }
        )

        # Step 2: Smart chunking optimized for medical content.
        print(" 🧩 Applying smart chunking optimized for medical content...")
        node_parser = SentenceSplitter(
            chunk_size=512,    # Good size for medical content
            chunk_overlap=50,  # Preserve context between chunks
            separator=" ",
        )

        nodes = node_parser.get_nodes_from_documents([document], show_progress=True)

        if not nodes:
            print(" ❌ Chunking returned no nodes.")
            return False

        print(f" ✅ Created {len(nodes)} optimized chunks")

        # Tag every chunk with its position for traceability at query time.
        for i, node in enumerate(nodes):
            node.metadata.update({
                "source_document": doc_file_path.name,
                "chunk_id": i,
                "total_chunks": len(nodes)
            })

        # Step 3: Build the index. FIX: the original comment and message said
        # "OpenAI embeddings", but this script configures free BGE embeddings
        # (see configure_indexing_settings and the stats line below).
        print(f" 🔨 Building vector index with BGE embeddings...")
        storage_context = StorageContext.from_defaults()
        vector_index = VectorStoreIndex(nodes, storage_context=storage_context)

        print(" 💾 Persisting index...")
        index_persist_path.mkdir(parents=True, exist_ok=True)
        vector_index.storage_context.persist(persist_dir=str(index_persist_path))

        print(f" ✅ Successfully created optimized medical document index!")
        print(f" 📊 Stats: {len(nodes)} chunks, FREE BGE embeddings, Enhanced OCR")

        return True

    except Exception as e:
        print(f" ❌ Error during processing: {e}")
        traceback.print_exc()
        return False
190
-
191
- if __name__ == "__main__":
192
- parser = argparse.ArgumentParser(description="Enhanced medical PDF indexing with Mistral OCR and optimized retrieval.")
193
- parser.add_argument("document_path", help="Full path to the medical PDF document.")
194
- parser.add_argument("index_output_dir", help="Directory name for the index output.")
195
- args = parser.parse_args()
196
-
197
- print("🏥 Medical Document RAG Indexer - Enhanced Edition")
198
- print("=" * 60)
199
-
200
- configure_indexing_settings()
201
-
202
- # Ensure base directory exists
203
- Path(SUPPLEMENTARY_INDEXES_BASE_PATH_FOR_AGENT).mkdir(parents=True, exist_ok=True)
204
-
205
- # Construct full index path
206
- index_output_path = Path(args.index_output_dir)
207
- if not index_output_path.is_absolute():
208
- index_output_path = Path(SUPPLEMENTARY_INDEXES_BASE_PATH_FOR_AGENT) / args.index_output_dir
209
-
210
- print(f"\n📋 Processing: {args.document_path}")
211
- print(f"💾 Index destination: {index_output_path}")
212
-
213
- success = process_and_index_document(args.document_path, str(index_output_path))
214
-
215
- if success:
216
- print(f"\n🎉 SUCCESS! Medical document index ready at: {index_output_path}")
217
- print("\n💡 Setup tips:")
218
- print(" - Uses FREE BGE embeddings (no API key needed)")
219
- print(" - Set MISTRAL_API_KEY for enhanced OCR (optional)")
220
- print(" - Restart terminal/IDE after setting environment variables")
221
- else:
222
- print(f"\n❌ FAILED to create index for: {args.document_path}")
223
-
224
- # Usage examples:
225
- # python ingest_and_index_supplementary_doc.py "./data/medical_guideline.pdf" "guideline_index"
226
- # python ingest_and_index_supplementary_doc.py "./data/clinical_trial.pdf" "trial_index"
227
-
228
- #python ingest_and_index_supplementary_doc.py "./data/supplementary_docs/Immunization in Practice_WHO_eng_2015.pdf" "medical_index"