Spaces:
Sleeping
Sleeping
Delete ingest_and_index_supplementary_doc_two.py
Browse files
ingest_and_index_supplementary_doc_two.py
DELETED
|
@@ -1,228 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import sys
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
import argparse
|
| 5 |
-
import traceback
|
| 6 |
-
from typing import List
|
| 7 |
-
|
| 8 |
-
from llama_index.core import (
|
| 9 |
-
SimpleDirectoryReader,
|
| 10 |
-
VectorStoreIndex,
|
| 11 |
-
StorageContext,
|
| 12 |
-
Settings,
|
| 13 |
-
Document
|
| 14 |
-
)
|
| 15 |
-
from llama_index.core.node_parser import SentenceSplitter
|
| 16 |
-
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
| 17 |
-
from llama_index.llms.google_genai import GoogleGenAI
|
| 18 |
-
|
| 19 |
-
# For Mistral OCR parsing
|
| 20 |
-
import requests
|
| 21 |
-
import base64
|
| 22 |
-
|
| 23 |
-
# Base directory where indices for supplementary documents will be stored
|
| 24 |
-
SUPPLEMENTARY_INDEXES_BASE_PATH_FOR_AGENT = "./storage/supplementary_indices/test"
|
| 25 |
-
|
| 26 |
-
def configure_indexing_settings():
    """Configure global LlamaIndex ``Settings`` for indexing.

    Installs the free HuggingFace BGE embedding model
    (``BAAI/bge-small-en-v1.5``) as the global embedder, and — only when
    ``GOOGLE_API_KEY`` is set in the environment — a Gemini LLM. The LLM is
    optional for pure indexing, so its absence is reported, not fatal.
    """
    print("Configuring free BGE embedding model for medical document indexing...")
    # Free BGE model - excellent quality for medical content, no API key needed.
    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    # Fix: this was an f-string with no placeholders; plain string literal.
    print("Embedding Model: BAAI/bge-small-en-v1.5 (FREE)")

    gemini_api_key = os.getenv("GOOGLE_API_KEY")
    if gemini_api_key:
        Settings.llm = GoogleGenAI(model_name="models/gemini-1.5-flash-latest", api_key=gemini_api_key)
        print(f"LLM (Optional for indexing): {Settings.llm.model}")
    else:
        # Indexing only needs embeddings; an LLM is not required.
        Settings.llm = None
        print("LLM (Optional for indexing): Not configured (GOOGLE_API_KEY not set).")
|
| 42 |
-
def parse_pdf_with_mistral_ocr(pdf_path: str) -> str:
    """
    Parse a PDF using the Mistral vision API ("OCR") for better medical
    document understanding.

    Falls back to SimpleDirectoryReader when MISTRAL_API_KEY is not set or
    when the API call fails for any reason, so this function always returns
    a (possibly empty) text string rather than raising.

    Args:
        pdf_path: Filesystem path to the PDF to extract text from.

    Returns:
        The extracted text, or "" if nothing could be extracted.
    """

    def _fallback_read() -> str:
        # Shared fallback: plain extraction via LlamaIndex's default reader.
        reader = SimpleDirectoryReader(input_files=[pdf_path])
        documents = reader.load_data()
        return documents[0].text if documents else ""

    # SECURITY FIX: the API key was previously hard-coded in source (a leaked
    # secret). Read it from the environment instead — never commit secrets.
    # The previously committed key must be revoked/rotated.
    mistral_api_key = os.getenv("MISTRAL_API_KEY")

    print(f" 🔑 API Key Status: {'Found' if mistral_api_key else 'Not Found'}")

    if not mistral_api_key:
        print(" ⚠️ MISTRAL_API_KEY not found in environment, falling back to SimpleDirectoryReader...")
        print(" 💡 Tip: Restart your terminal/IDE after setting environment variables")
        return _fallback_read()

    try:
        print(" 🔍 Using Mistral OCR for PDF parsing...")

        # Read and base64-encode the PDF for transport in the JSON payload.
        with open(pdf_path, "rb") as pdf_file:
            pdf_base64 = base64.b64encode(pdf_file.read()).decode('utf-8')

        headers = {
            "Authorization": f"Bearer {mistral_api_key}",
            "Content-Type": "application/json"
        }

        # NOTE(review): the PDF is sent as a data-URL under "image_url" to a
        # vision chat model; confirm this is the intended Mistral endpoint
        # (Mistral also exposes a dedicated /v1/ocr document endpoint).
        payload = {
            "model": "pixtral-12b-2409",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Extract all text from this medical document. Preserve structure, headings, and formatting. Pay special attention to medical terminology, dosages, and clinical data."
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:application/pdf;base64,{pdf_base64}"
                            }
                        }
                    ]
                }
            ]
        }

        response = requests.post("https://api.mistral.ai/v1/chat/completions",
                                 headers=headers, json=payload, timeout=60)

        if response.status_code == 200:
            result = response.json()
            extracted_text = result["choices"][0]["message"]["content"]
            print(" ✅ Successfully extracted text using Mistral OCR")
            return extracted_text
        else:
            print(f" ⚠️ Mistral OCR failed (status: {response.status_code}), falling back...")
            raise Exception(f"Mistral API error: {response.status_code}")

    except Exception as e:
        # Deliberate best-effort: any failure (I/O, network, parsing) drops
        # back to the plain reader rather than aborting the pipeline.
        print(f" ⚠️ Mistral OCR error: {e}, falling back to SimpleDirectoryReader...")
        return _fallback_read()
|
| 115 |
-
def process_and_index_document(doc_file_path_str: str, index_persist_path_str: str) -> bool:
    """
    Parse, chunk, embed, and persist a vector index for one medical PDF.

    Pipeline: Mistral-OCR parse (with plain-reader fallback) -> sentence
    chunking -> vector index built with the globally configured free BGE
    embeddings -> persist to disk.

    Args:
        doc_file_path_str: Path to the source PDF.
        index_persist_path_str: Directory where the index is persisted
            (created if missing).

    Returns:
        True on success, False on any failure (missing file, empty
        extraction, empty chunking, or an exception during indexing).
    """
    doc_file_path = Path(doc_file_path_str)
    index_persist_path = Path(index_persist_path_str)

    print(f"\n--- Processing Medical Document: {doc_file_path.name} ---")
    print(f" Index target directory: {index_persist_path}")

    if not doc_file_path.exists():
        print(f" ❌ Error: Document not found at {doc_file_path}")
        return False

    try:
        # Step 1: Parse with Mistral OCR (falls back internally on failure).
        print(" 📄 Parsing PDF with enhanced OCR...")
        extracted_text = parse_pdf_with_mistral_ocr(str(doc_file_path))

        if not extracted_text.strip():
            print(" ❌ No text extracted from document.")
            return False

        # Wrap the raw text with provenance metadata.
        document = Document(
            text=extracted_text,
            metadata={
                "source_document": doc_file_path.name,
                "file_path": str(doc_file_path),
                "document_type": "medical_pdf"
            }
        )

        # Step 2: Smart chunking optimized for medical content.
        print(" 🧩 Applying smart chunking optimized for medical content...")
        node_parser = SentenceSplitter(
            chunk_size=512,    # Good size for medical content
            chunk_overlap=50,  # Preserve context between chunks
            separator=" ",
        )

        nodes = node_parser.get_nodes_from_documents([document], show_progress=True)

        if not nodes:
            print(" ❌ Chunking returned no nodes.")
            return False

        print(f" ✅ Created {len(nodes)} optimized chunks")

        # Tag every node with its position so retrieval results are traceable.
        for i, node in enumerate(nodes):
            node.metadata.update({
                "source_document": doc_file_path.name,
                "chunk_id": i,
                "total_chunks": len(nodes)
            })

        # Step 3: Build the index. Fix: the old message claimed "OpenAI
        # embeddings"; this file configures free BGE embeddings (see
        # configure_indexing_settings and the success message below).
        print(" 🔨 Building vector index with free BGE embeddings...")
        storage_context = StorageContext.from_defaults()
        vector_index = VectorStoreIndex(nodes, storage_context=storage_context)

        print(" 💾 Persisting index...")
        index_persist_path.mkdir(parents=True, exist_ok=True)
        vector_index.storage_context.persist(persist_dir=str(index_persist_path))

        print(" ✅ Successfully created optimized medical document index!")
        print(f" 📊 Stats: {len(nodes)} chunks, FREE BGE embeddings, Enhanced OCR")

        return True

    except Exception as e:
        # Broad catch is intentional at this pipeline boundary: report,
        # dump the traceback, and signal failure to the caller.
        print(f" ❌ Error during processing: {e}")
        traceback.print_exc()
        return False
|
| 191 |
-
if __name__ == "__main__":
    # CLI entry point: <document_path> <index_output_dir>
    cli = argparse.ArgumentParser(description="Enhanced medical PDF indexing with Mistral OCR and optimized retrieval.")
    cli.add_argument("document_path", help="Full path to the medical PDF document.")
    cli.add_argument("index_output_dir", help="Directory name for the index output.")
    cli_args = cli.parse_args()

    print("🏥 Medical Document RAG Indexer - Enhanced Edition")
    print("=" * 60)

    configure_indexing_settings()

    # Make sure the base storage directory for supplementary indices exists.
    Path(SUPPLEMENTARY_INDEXES_BASE_PATH_FOR_AGENT).mkdir(parents=True, exist_ok=True)

    # Relative output names are resolved under the supplementary base path;
    # absolute paths are honored as-is.
    index_output_path = Path(cli_args.index_output_dir)
    if not index_output_path.is_absolute():
        index_output_path = Path(SUPPLEMENTARY_INDEXES_BASE_PATH_FOR_AGENT) / cli_args.index_output_dir

    print(f"\n📋 Processing: {cli_args.document_path}")
    print(f"💾 Index destination: {index_output_path}")

    succeeded = process_and_index_document(cli_args.document_path, str(index_output_path))

    if succeeded:
        print(f"\n🎉 SUCCESS! Medical document index ready at: {index_output_path}")
        print("\n💡 Setup tips:")
        print(" - Uses FREE BGE embeddings (no API key needed)")
        print(" - Set MISTRAL_API_KEY for enhanced OCR (optional)")
        print(" - Restart terminal/IDE after setting environment variables")
    else:
        print(f"\n❌ FAILED to create index for: {cli_args.document_path}")

# Usage examples:
# python ingest_and_index_supplementary_doc.py "./data/medical_guideline.pdf" "guideline_index"
# python ingest_and_index_supplementary_doc.py "./data/clinical_trial.pdf" "trial_index"

#python ingest_and_index_supplementary_doc.py "./data/supplementary_docs/Immunization in Practice_WHO_eng_2015.pdf" "medical_index"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|