Spaces:

VaxGuide
/

Agentic_RAG

Sleeping

App Files Files Community

Zeggai committed on May 27, 2025

Commit

6bca910

verified ·

1 Parent(s): 3f2644d

Delete build_section_indices.py

Browse files

Files changed (1) hide show

build_section_indices.py +0 -128

build_section_indices.py DELETED Viewed

@@ -1,128 +0,0 @@
-# build_section_indices.py
-import os
-import re
-from pathlib import Path
-# In build_section_indices.py
-import time # Import time for a small delay if needed (unlikely fix, but for debugging)
-from pathlib import Path
-# (Keep other imports the same: os, re, ingest function, Settings, LlamaIndex classes, etc.)
-from document_uploader import ingest_section_docs_unstructured
-from global_settings import STORAGE_PATH
-from llama_index.core import Settings, VectorStoreIndex, StorageContext
-from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-from llama_index.llms.google_genai import GoogleGenAI # Or other LLM
-# Base directory for persisted section indices; the __main__ block creates one
-# "section_<id>_index" subdirectory per section under it.
-SECTION_INDEX_BASE_PATH = "./storage/section_indices"
-def create_index_for_section(section_pdf_path: Path, section_index_dir: Path):
-    """
-    Build a vector index for one section PDF and save it to disk.
-
-    The PDF is ingested into nodes, a VectorStoreIndex is built on a default
-    StorageContext (no persist directory given), and that context is then
-    persisted into ``section_index_dir``, which is created if missing.
-
-    Args:
-        section_pdf_path: Path of the section PDF to ingest.
-        section_index_dir: Directory the index is persisted into.
-
-    Returns:
-        True when the index was built and persisted. False when ingestion
-        produced no nodes, or when building or persisting raised an exception
-        (the error and its traceback are printed, not re-raised).
-    """
-    import traceback
-
-    pdf_name = section_pdf_path.name
-    print(f"\n--- Processing Section: {pdf_name} ---")
-    print(f"   Index target directory: {section_index_dir}")
-
-    # Step 1: turn this one PDF into nodes. Summaries stay off so that
-    # ingestion does not make LLM calls.
-    nodes = ingest_section_docs_unstructured(
-        input_path=section_pdf_path.parent,
-        process_filename=pdf_name,
-        use_summaries=False,
-    )
-    if not nodes:
-        print(f"   ❌ Ingestion returned no nodes for {pdf_name}. Skipping index creation.")
-        return False
-    print(f"   Ingested {len(nodes)} nodes for this section.")
-
-    # Step 2: build the index against a default storage context, before
-    # anything touches the target directory.
-    try:
-        print("   Building VectorStoreIndex in memory...")
-        default_context = StorageContext.from_defaults()
-        section_index = VectorStoreIndex(nodes, storage_context=default_context)
-        print("   Index built successfully in memory.")
-    except Exception as build_error:
-        print(f"   ❌ Error building index in memory for {pdf_name}: {build_error}")
-        traceback.print_exc()
-        return False
-
-    # Step 3: write the built index out to the target directory.
-    try:
-        print(f"   Persisting index to disk: {section_index_dir}...")
-        # Create the directory right before writing into it.
-        section_index_dir.mkdir(parents=True, exist_ok=True)
-        section_index.storage_context.persist(persist_dir=str(section_index_dir))
-        print(f"   ✅ Successfully persisted index for {pdf_name} to {section_index_dir}")
-    except Exception as persist_error:
-        print(f"   ❌ Error persisting index to disk for {pdf_name}: {persist_error}")
-        # A failure at this stage may be permissions or path related.
-        print(f"   Check write permissions for the directory: {section_index_dir.parent}")
-        traceback.print_exc()
-        return False
-    return True
-# --- Main Indexing Orchestration ---
-# Script entry point: configures the global LlamaIndex Settings, then builds one
-# persisted index per PDF in STORAGE_PATH named like "<number>. <title>.pdf".
-if __name__ == "__main__":
-    print("--- Starting Per-Section Index Building Process (In-Memory First) ---")
-    # --- Global Configuration ---
-    print("Configuring Embedding model...")
-    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
-    print(f"Embedding Model: {Settings.embed_model.model_name}")
-    # The LLM is optional for index building; it is only set up when
-    # GOOGLE_API_KEY is present in the environment.
-    gemini_api_key = os.getenv("GOOGLE_API_KEY")
-    if gemini_api_key:
-        # GoogleGenAI's documented keyword for the model id is `model`;
-        # `model_name` is not, so the requested model could be silently ignored.
-        Settings.llm = GoogleGenAI(model="models/gemini-1.5-flash-latest", api_key=gemini_api_key)
-    else:
-        Settings.llm = None
-    print(f"LLM Configured: {Settings.llm}")
-    # --- End Configuration ---
-    # --- Define Paths ---
-    source_docs_path = Path(STORAGE_PATH)
-    base_index_path = Path(SECTION_INDEX_BASE_PATH)
-    base_index_path.mkdir(parents=True, exist_ok=True)
-    print(f"Source documents location: {source_docs_path}")
-    print(f"Base index storage location: {base_index_path}")
-    if not source_docs_path.is_dir():
-        # Without this, a wrong STORAGE_PATH just reports "0 section file(s)".
-        print(f"Warning: source documents directory not found: {source_docs_path}")
-    # --- Find Section Files and Process ---
-    # Group 1 is the numeric section id, group 2 the title (unused here).
-    filename_pattern = re.compile(r"^(\d+)\.\s+(.*)\.pdf$", re.IGNORECASE)
-    processed_count = 0
-    error_count = 0
-    # NOTE: existing section index directories are not cleared before a rebuild;
-    # delete them manually first if a clean rebuild is required.
-    # sorted() makes the processing order (and the log) reproducible; glob()
-    # alone yields files in arbitrary order.
-    for pdf_file in sorted(source_docs_path.glob("*.pdf")):
-        match = filename_pattern.match(pdf_file.name)
-        if not match:
-            print(f"Skipping file (doesn't match section pattern): {pdf_file.name}")
-            continue
-        section_id = match.group(1).strip()
-        section_index_full_path = base_index_path / f"section_{section_id}_index"
-        if create_index_for_section(pdf_file, section_index_full_path):
-            processed_count += 1
-        else:
-            error_count += 1
-    print("\n--- Indexing Summary ---")
-    print(f"Successfully processed and indexed {processed_count} section file(s).")
-    if error_count > 0:
-        print(f"Encountered errors for {error_count} section file(s). Check logs above.")
-    print(f"Section indices stored under: {base_index_path}")