Delete document_uploader.py
document_uploader.py  +0 -200
DELETED
@@ -1,200 +0,0 @@
# --- Imports ---
import os
import re
from pathlib import Path
from global_settings import STORAGE_PATH, CACHE_FILE
from logging_functions import log_action

# LlamaIndex Core Imports
from llama_index.core import SimpleDirectoryReader, Settings
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
# Import the Unstructured Node Parser
from llama_index.core.node_parser import UnstructuredElementNodeParser  # For parsing PDFs directly
from llama_index.core.extractors import SummaryExtractor  # Optional

# Import Embedding Model
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Import LLM (Gemini) - Optional, only if SummaryExtractor runs
from llama_index.llms.google_genai import GoogleGenAI

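# NOTE (assumptions inferred from the imports above): this module expects the
# pip packages llama-index-core, llama-index-embeddings-huggingface and
# llama-index-llms-google-genai, plus the 'unstructured' library used by
# UnstructuredElementNodeParser. global_settings is expected to define
# STORAGE_PATH (the folder holding section PDFs) and CACHE_FILE (the
# ingestion-cache path); logging_functions must provide log_action().
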
# --- Function Definition ---
def ingest_section_docs_unstructured(
    input_path=STORAGE_PATH,
    cache_path=CACHE_FILE,
    process_filename=None,
    use_summaries=False
):
    """
    Ingests one or more SECTION document files (PDFs) using SimpleDirectoryReader
    followed by UnstructuredElementNodeParser in the pipeline.
    Adds section metadata based on filenames.

    Args:
        input_path (str): Path to the directory containing section PDF documents.
        cache_path (str): Path to the ingestion cache file.
        process_filename (str, optional): If provided, only process the document with this filename. Defaults to None (process all).
        use_summaries (bool): Whether to include SummaryExtractor. Defaults to False.

    Returns:
        list: A list of processed BaseNode objects with section metadata, or an empty list on failure.
    """

    # --- LLM & Embedding Configuration ---
    print("Configuring LLM (Gemini if needed) and Embedding models...")
    gemini_api_key = os.getenv("GOOGLE_API_KEY")
    if use_summaries:
        if not gemini_api_key:
            print("Warning: GOOGLE_API_KEY not set, but summaries requested. Disabling summaries.")
            Settings.llm = None
            use_summaries = False
        else:
            # GoogleGenAI takes the model name via its 'model' argument
            Settings.llm = GoogleGenAI(model="models/gemini-1.5-flash-latest", api_key=gemini_api_key)
            print(f"Gemini LLM configured: {Settings.llm.model}")
    else:
        Settings.llm = None
        print("LLM not configured as summaries are disabled.")

    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    print(f"Embedding Model: {Settings.embed_model.model_name}")
    # --- End Configuration ---

    # --- Load Data (Basic Text Extraction) ---
    print(f"Attempting to load documents from: {input_path}")
    # SimpleDirectoryReader will do basic PDF text extraction here.
    # We are NOT using LlamaParse in file_extractor anymore.
    reader_kwargs = {"filename_as_id": True, "required_exts": [".pdf"]}
    documents_to_process = []

    if process_filename:
        print(f"Attempting to load specific section file: {process_filename}")
        file_path = Path(input_path) / process_filename
        if not file_path.exists() or file_path.suffix.lower() != ".pdf":
            print(f"Error: Specified file '{process_filename}' not found or not a PDF in '{input_path}'.")
            return []
        # Load single PDF
        reader = SimpleDirectoryReader(input_files=[file_path], **reader_kwargs)
    else:
        print(f"Loading all PDF files from directory: {input_path}")
        reader = SimpleDirectoryReader(input_path, **reader_kwargs)

    try:
        # loaded_docs are basic Document objects with raw text extracted by the reader
        loaded_docs = reader.load_data(show_progress=True)
        print(f"Successfully loaded {len(loaded_docs)} documents (basic extraction).")
        documents_to_process = loaded_docs
    except Exception as e:
        print(f"Error loading documents: {e}")
        import traceback
        traceback.print_exc()
        return []

    if not documents_to_process:
        print("No documents loaded. Exiting ingestion.")
        return []

    # --- Add Section Metadata Based on Filename ---
    # Apply this to the initially loaded docs BEFORE the pipeline re-parses them
    print("Adding section metadata based on filenames...")
    docs_with_metadata = []
    filename_pattern = re.compile(r"^(\d+)\.\s+(.*)\.pdf$", re.IGNORECASE)
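    # Illustration (hypothetical filename): "2. Section title.pdf" matches the
    # pattern above with section_id="2" and section_title="Section title";
    # filenames that do not follow "Number. Title.pdf" fall back to "unknown" below.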
    for doc in documents_to_process:
        filename = doc.metadata.get('file_name', doc.id_)  # Use id_ if filename missing
        section_id = "unknown"
        section_title = "unknown"
        match = filename_pattern.match(filename)
        if match:
            section_id = match.group(1).strip()
            section_title = match.group(2).strip()
        else:
            print(f"Warning: Filename '{filename}' did not match expected pattern 'Number. Title.pdf'")

        doc.metadata['section_id'] = section_id
        doc.metadata['section_title'] = section_title
        docs_with_metadata.append(doc)
        log_action(f"File '{filename}' (Section {section_id}) loaded.", action_type="LOAD")
    # --- End Metadata Addition ---

    # --- Caching Logic ---
    try:
        cache = IngestionCache.from_persist_path(cache_path)
        print("Cache file found. Running using cache...")
    except FileNotFoundError:
        cache = IngestionCache()
        print("No cache file found or error reading cache. Running without...")

    # --- Define the Ingestion Pipeline (Unstructured Parser FIRST) ---
    print("Defining ingestion pipeline (Unstructured Parser, Embedding)...")

    # 1. UnstructuredElementNodeParser will take the raw Documents and re-parse them
    #    using the 'unstructured' library for better layout/element detection.
    node_parser = UnstructuredElementNodeParser()

    # 2. (Optional) Summary Extractor
    summary_extractor = SummaryExtractor(summaries=['self']) if use_summaries and Settings.llm else None

    # 3. Embedding Model (using Settings)
    embed_model = Settings.embed_model

    transformations = [node_parser]  # Unstructured parser goes first!
    if summary_extractor:
        transformations.append(summary_extractor)
    transformations.append(embed_model)

    pipeline = IngestionPipeline(
        transformations=transformations,
        cache=cache
    )
    print(f"Pipeline transformations: {[type(t).__name__ for t in pipeline.transformations]}")

    # --- Run Pipeline ---
    print("Running ingestion pipeline (Unstructured Parsing, Embedding)...")
    # Pass the initial Documents (with added metadata) to the pipeline;
    # UnstructuredElementNodeParser will process them first.
    final_nodes = pipeline.run(documents=docs_with_metadata, show_progress=True)
    print(f"Ingestion pipeline complete. Processed/Generated {len(final_nodes)} final nodes.")

    # --- Node Inspection ---
    if final_nodes:
        print("\n--- Inspecting Final Nodes (Post-Pipeline) ---")
        num_nodes_to_inspect = min(len(final_nodes), 3)
        for i in range(num_nodes_to_inspect):
            node_to_inspect = final_nodes[i]
            print(f"\n--- Node {i} (ID: {node_to_inspect.node_id}) ---")
            print("Metadata:")
            print(node_to_inspect.metadata)  # Verify section_id etc.
            print("\nContent (first 500 chars):")
            print(node_to_inspect.text[:500] + "...")
            print("-" * 20)

    # --- Persist Cache ---
    print(f"Persisting cache to {cache_path}...")
    pipeline.cache.persist(cache_path)
    print("Cache persisted.")

    return final_nodes

# --- Script Execution ---
if __name__ == "__main__":
    print("Starting Section Document Ingestion using Unstructured...")
    # 1. Place section PDFs in STORAGE_PATH.
    # 2. Ensure unstructured dependencies are installed (see above).
    # 3. Set GOOGLE_API_KEY if using summaries.

    generate_summaries = False  # Keep False to avoid LLM calls initially
    process_this_file = None  # Set to filename like "2. REPERES SUR LES MALADIES....pdf" or None

    if process_this_file:
        print(f"Processing single file: {process_this_file}")
    else:
        print(f"Processing all PDF files found in: {STORAGE_PATH}")

    nodes_output = ingest_section_docs_unstructured(
        process_filename=process_this_file,
        use_summaries=generate_summaries
    )

    print(f"\nIngestion process finished. {len(nodes_output) if nodes_output else 0} nodes processed.")
    # ... rest of main block ...
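    # --- Optional downstream sketch (an assumption, not part of the original module) ---
    # A minimal illustration of what could be done with the returned nodes:
    # build a LlamaIndex VectorStoreIndex, retrieve a few nodes, and print their
    # section metadata. 'build_demo_index' is a hypothetical flag added only for
    # this sketch and is left False so the ingestion script has no extra side effects.
    build_demo_index = False
    if build_demo_index and nodes_output:
        from llama_index.core import VectorStoreIndex

        demo_index = VectorStoreIndex(nodes=nodes_output, embed_model=Settings.embed_model)
        retriever = demo_index.as_retriever(similarity_top_k=3)
        for result in retriever.retrieve("What does section 2 cover?"):
            print(result.node.metadata.get("section_id"), result.score)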