Spaces:
Sleeping
Sleeping
Delete ingest_and_index_supplementary_doc_two.py
Browse files
ingest_and_index_supplementary_doc_two.py
DELETED
|
@@ -1,228 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import sys
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
import argparse
|
| 5 |
-
import traceback
|
| 6 |
-
from typing import List
|
| 7 |
-
|
| 8 |
-
from llama_index.core import (
|
| 9 |
-
SimpleDirectoryReader,
|
| 10 |
-
VectorStoreIndex,
|
| 11 |
-
StorageContext,
|
| 12 |
-
Settings,
|
| 13 |
-
Document
|
| 14 |
-
)
|
| 15 |
-
from llama_index.core.node_parser import SentenceSplitter
|
| 16 |
-
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
| 17 |
-
from llama_index.llms.google_genai import GoogleGenAI
|
| 18 |
-
|
| 19 |
-
# For Mistral OCR parsing
|
| 20 |
-
import requests
|
| 21 |
-
import base64
|
| 22 |
-
|
| 23 |
-
# Base directory where indices for supplementary documents will be stored
|
| 24 |
-
SUPPLEMENTARY_INDEXES_BASE_PATH_FOR_AGENT = "./storage/supplementary_indices/test"
|
| 25 |
-
|
| 26 |
-
def configure_indexing_settings():
    """Configure global LlamaIndex ``Settings`` for indexing.

    Installs the free HuggingFace BGE embedding model
    (``BAAI/bge-small-en-v1.5``) as the global embedder, and — only when
    ``GOOGLE_API_KEY`` is set in the environment — a Gemini LLM. The LLM is
    optional for pure indexing, so its absence is reported, not fatal.
    """
    print("Configuring free BGE embedding model for medical document indexing...")
    # Free BGE model - excellent quality for medical content, no API key needed.
    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    # Fix: this was an f-string with no placeholders; plain string literal.
    print("Embedding Model: BAAI/bge-small-en-v1.5 (FREE)")

    gemini_api_key = os.getenv("GOOGLE_API_KEY")
    if gemini_api_key:
        Settings.llm = GoogleGenAI(model_name="models/gemini-1.5-flash-latest", api_key=gemini_api_key)
        print(f"LLM (Optional for indexing): {Settings.llm.model}")
    else:
        # Indexing only needs embeddings; an LLM is not required.
        Settings.llm = None
        print("LLM (Optional for indexing): Not configured (GOOGLE_API_KEY not set).")
|
| 42 |
-
def parse_pdf_with_mistral_ocr(pdf_path: str) -> str:
    """
    Parse a PDF using the Mistral vision API ("OCR") for better medical
    document understanding.

    Falls back to SimpleDirectoryReader when MISTRAL_API_KEY is not set or
    when the API call fails for any reason, so this function always returns
    a (possibly empty) text string rather than raising.

    Args:
        pdf_path: Filesystem path to the PDF to extract text from.

    Returns:
        The extracted text, or "" if nothing could be extracted.
    """

    def _fallback_read() -> str:
        # Shared fallback: plain extraction via LlamaIndex's default reader.
        reader = SimpleDirectoryReader(input_files=[pdf_path])
        documents = reader.load_data()
        return documents[0].text if documents else ""

    # SECURITY FIX: the API key was previously hard-coded in source (a leaked
    # secret). Read it from the environment instead — never commit secrets.
    # The previously committed key must be revoked/rotated.
    mistral_api_key = os.getenv("MISTRAL_API_KEY")

    print(f" 🔑 API Key Status: {'Found' if mistral_api_key else 'Not Found'}")

    if not mistral_api_key:
        print(" ⚠️ MISTRAL_API_KEY not found in environment, falling back to SimpleDirectoryReader...")
        print(" 💡 Tip: Restart your terminal/IDE after setting environment variables")
        return _fallback_read()

    try:
        print(" 🔍 Using Mistral OCR for PDF parsing...")

        # Read and base64-encode the PDF for transport in the JSON payload.
        with open(pdf_path, "rb") as pdf_file:
            pdf_base64 = base64.b64encode(pdf_file.read()).decode('utf-8')

        headers = {
            "Authorization": f"Bearer {mistral_api_key}",
            "Content-Type": "application/json"
        }

        # NOTE(review): the PDF is sent as a data-URL under "image_url" to a
        # vision chat model; confirm this is the intended Mistral endpoint
        # (Mistral also exposes a dedicated /v1/ocr document endpoint).
        payload = {
            "model": "pixtral-12b-2409",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Extract all text from this medical document. Preserve structure, headings, and formatting. Pay special attention to medical terminology, dosages, and clinical data."
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:application/pdf;base64,{pdf_base64}"
                            }
                        }
                    ]
                }
            ]
        }

        response = requests.post("https://api.mistral.ai/v1/chat/completions",
                                 headers=headers, json=payload, timeout=60)

        if response.status_code == 200:
            result = response.json()
            extracted_text = result["choices"][0]["message"]["content"]
            print(" ✅ Successfully extracted text using Mistral OCR")
            return extracted_text
        else:
            print(f" ⚠️ Mistral OCR failed (status: {response.status_code}), falling back...")
            raise Exception(f"Mistral API error: {response.status_code}")

    except Exception as e:
        # Deliberate best-effort: any failure (I/O, network, parsing) drops
        # back to the plain reader rather than aborting the pipeline.
        print(f" ⚠️ Mistral OCR error: {e}, falling back to SimpleDirectoryReader...")
        return _fallback_read()
|
| 115 |
-
def process_and_index_document(doc_file_path_str: str, index_persist_path_str: str) -> bool:
    """
    Parse, chunk, embed, and persist a vector index for one medical PDF.

    Pipeline: Mistral-OCR parse (with plain-reader fallback) -> sentence
    chunking -> vector index built with the globally configured free BGE
    embeddings -> persist to disk.

    Args:
        doc_file_path_str: Path to the source PDF.
        index_persist_path_str: Directory where the index is persisted
            (created if missing).

    Returns:
        True on success, False on any failure (missing file, empty
        extraction, empty chunking, or an exception during indexing).
    """
    doc_file_path = Path(doc_file_path_str)
    index_persist_path = Path(index_persist_path_str)

    print(f"\n--- Processing Medical Document: {doc_file_path.name} ---")
    print(f" Index target directory: {index_persist_path}")

    if not doc_file_path.exists():
        print(f" ❌ Error: Document not found at {doc_file_path}")
        return False

    try:
        # Step 1: Parse with Mistral OCR (falls back internally on failure).
        print(" 📄 Parsing PDF with enhanced OCR...")
        extracted_text = parse_pdf_with_mistral_ocr(str(doc_file_path))

        if not extracted_text.strip():
            print(" ❌ No text extracted from document.")
            return False

        # Wrap the raw text with provenance metadata.
        document = Document(
            text=extracted_text,
            metadata={
                "source_document": doc_file_path.name,
                "file_path": str(doc_file_path),
                "document_type": "medical_pdf"
            }
        )

        # Step 2: Smart chunking optimized for medical content.
        print(" 🧩 Applying smart chunking optimized for medical content...")
        node_parser = SentenceSplitter(
            chunk_size=512,    # Good size for medical content
            chunk_overlap=50,  # Preserve context between chunks
            separator=" ",
        )

        nodes = node_parser.get_nodes_from_documents([document], show_progress=True)

        if not nodes:
            print(" ❌ Chunking returned no nodes.")
            return False

        print(f" ✅ Created {len(nodes)} optimized chunks")

        # Tag every node with its position so retrieval results are traceable.
        for i, node in enumerate(nodes):
            node.metadata.update({
                "source_document": doc_file_path.name,
                "chunk_id": i,
                "total_chunks": len(nodes)
            })

        # Step 3: Build the index. Fix: the old message claimed "OpenAI
        # embeddings"; this file configures free BGE embeddings (see
        # configure_indexing_settings and the success message below).
        print(" 🔨 Building vector index with free BGE embeddings...")
        storage_context = StorageContext.from_defaults()
        vector_index = VectorStoreIndex(nodes, storage_context=storage_context)

        print(" 💾 Persisting index...")
        index_persist_path.mkdir(parents=True, exist_ok=True)
        vector_index.storage_context.persist(persist_dir=str(index_persist_path))

        print(" ✅ Successfully created optimized medical document index!")
        print(f" 📊 Stats: {len(nodes)} chunks, FREE BGE embeddings, Enhanced OCR")

        return True

    except Exception as e:
        # Broad catch is intentional at this pipeline boundary: report,
        # dump the traceback, and signal failure to the caller.
        print(f" ❌ Error during processing: {e}")
        traceback.print_exc()
        return False
|
| 191 |
-
if __name__ == "__main__":
    # CLI entry point: <document_path> <index_output_dir>
    cli = argparse.ArgumentParser(description="Enhanced medical PDF indexing with Mistral OCR and optimized retrieval.")
    cli.add_argument("document_path", help="Full path to the medical PDF document.")
    cli.add_argument("index_output_dir", help="Directory name for the index output.")
    cli_args = cli.parse_args()

    print("🏥 Medical Document RAG Indexer - Enhanced Edition")
    print("=" * 60)

    configure_indexing_settings()

    # Make sure the base storage directory for supplementary indices exists.
    Path(SUPPLEMENTARY_INDEXES_BASE_PATH_FOR_AGENT).mkdir(parents=True, exist_ok=True)

    # Relative output names are resolved under the supplementary base path;
    # absolute paths are honored as-is.
    index_output_path = Path(cli_args.index_output_dir)
    if not index_output_path.is_absolute():
        index_output_path = Path(SUPPLEMENTARY_INDEXES_BASE_PATH_FOR_AGENT) / cli_args.index_output_dir

    print(f"\n📋 Processing: {cli_args.document_path}")
    print(f"💾 Index destination: {index_output_path}")

    succeeded = process_and_index_document(cli_args.document_path, str(index_output_path))

    if succeeded:
        print(f"\n🎉 SUCCESS! Medical document index ready at: {index_output_path}")
        print("\n💡 Setup tips:")
        print(" - Uses FREE BGE embeddings (no API key needed)")
        print(" - Set MISTRAL_API_KEY for enhanced OCR (optional)")
        print(" - Restart terminal/IDE after setting environment variables")
    else:
        print(f"\n❌ FAILED to create index for: {cli_args.document_path}")

# Usage examples:
# python ingest_and_index_supplementary_doc.py "./data/medical_guideline.pdf" "guideline_index"
# python ingest_and_index_supplementary_doc.py "./data/clinical_trial.pdf" "trial_index"

#python ingest_and_index_supplementary_doc.py "./data/supplementary_docs/Immunization in Practice_WHO_eng_2015.pdf" "medical_index"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|