# --- Imports ---
import os
import re
from pathlib import Path

from global_settings import STORAGE_PATH, CACHE_FILE
from logging_functions import log_action

# LlamaIndex Core Imports
from llama_index.core import SimpleDirectoryReader, Settings
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
# Unstructured node parser: re-parses raw Documents with the 'unstructured'
# library for better layout/element detection in PDFs.
from llama_index.core.node_parser import UnstructuredElementNodeParser
from llama_index.core.extractors import SummaryExtractor  # Optional
# Embedding model
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# LLM (Gemini) - only needed when SummaryExtractor runs
from llama_index.llms.google_genai import GoogleGenAI


# --- Function Definition ---
def ingest_section_docs_unstructured(
    input_path=STORAGE_PATH,
    cache_path=CACHE_FILE,
    process_filename=None,
    use_summaries=False,
):
    """
    Ingest one or more SECTION document files (PDFs).

    Documents are first loaded with SimpleDirectoryReader (basic text
    extraction), then re-parsed by UnstructuredElementNodeParser inside the
    ingestion pipeline. Section metadata (id/title) is derived from the
    filename pattern "Number. Title.pdf" BEFORE the pipeline runs, so it is
    propagated onto the final nodes.

    Args:
        input_path (str): Directory containing section PDF documents.
        cache_path (str): Path to the ingestion cache file.
        process_filename (str, optional): If provided, only process the
            document with this filename. Defaults to None (process all).
        use_summaries (bool): Whether to include SummaryExtractor
            (requires GOOGLE_API_KEY). Defaults to False.

    Returns:
        list: Processed BaseNode objects with section metadata, or an
        empty list on failure.
    """
    # --- LLM & Embedding Configuration ---
    print("Configuring LLM (Gemini if needed) and Embedding models...")
    gemini_api_key = os.getenv("GOOGLE_API_KEY")
    if use_summaries:
        if not gemini_api_key:
            print("Warning: GOOGLE_API_KEY not set, but summaries requested. Disabling summaries.")
            Settings.llm = None
            use_summaries = False
        else:
            # FIX: the GoogleGenAI constructor takes 'model' (the attribute
            # read on the next line), not 'model_name'.
            Settings.llm = GoogleGenAI(
                model="models/gemini-1.5-flash-latest",
                api_key=gemini_api_key,
            )
            print(f"Gemini LLM configured: {Settings.llm.model}")
    else:
        Settings.llm = None
        print("LLM not configured as summaries are disabled.")

    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    print(f"Embedding Model: {Settings.embed_model.model_name}")
    # --- End Configuration ---

    # --- Load Data (Basic Text Extraction) ---
    print(f"Attempting to load documents from: {input_path}")
    # SimpleDirectoryReader does basic PDF text extraction here.
    # We are NOT using LlamaParse in file_extractor anymore.
    reader_kwargs = {"filename_as_id": True, "required_exts": [".pdf"]}
    documents_to_process = []

    if process_filename:
        print(f"Attempting to load specific section file: {process_filename}")
        file_path = Path(input_path) / process_filename
        if not file_path.exists() or file_path.suffix.lower() != ".pdf":
            print(f"Error: Specified file '{process_filename}' not found or not a PDF in '{input_path}'.")
            return []
        # Load a single PDF
        reader = SimpleDirectoryReader(input_files=[file_path], **reader_kwargs)
    else:
        print(f"Loading all PDF files from directory: {input_path}")
        reader = SimpleDirectoryReader(input_path, **reader_kwargs)

    try:
        # loaded_docs are basic Document objects with raw text extracted by the reader
        loaded_docs = reader.load_data(show_progress=True)
        print(f"Successfully loaded {len(loaded_docs)} documents (basic extraction).")
        documents_to_process = loaded_docs
    except Exception as e:
        print(f"Error loading documents: {e}")
        import traceback
        traceback.print_exc()
        return []

    if not documents_to_process:
        print("No documents loaded. Exiting ingestion.")
        return []

    # --- Add Section Metadata Based on Filename ---
    # Apply this to the initially loaded docs BEFORE the pipeline re-parses them.
    print("Adding section metadata based on filenames...")
    docs_with_metadata = []
    # Expected filename shape: "<number>. <title>.pdf"
    filename_pattern = re.compile(r"^(\d+)\.\s+(.*)\.pdf$", re.IGNORECASE)
    for doc in documents_to_process:
        filename = doc.metadata.get('file_name', doc.id_)  # Use id_ if filename missing
        section_id = "unknown"
        section_title = "unknown"
        match = filename_pattern.match(filename)
        if match:
            section_id = match.group(1).strip()
            section_title = match.group(2).strip()
        else:
            # FIX: interpolate the actual filename (the f-string previously
            # contained a literal placeholder instead of {filename}).
            print(f"Warning: Filename '{filename}' did not match expected pattern 'Number. Title.pdf'")
        doc.metadata['section_id'] = section_id
        doc.metadata['section_title'] = section_title
        docs_with_metadata.append(doc)
        # FIX: log the actual filename (same missing-placeholder defect).
        log_action(f"File '{filename}' (Section {section_id}) loaded.", action_type="LOAD")
    # --- End Metadata Addition ---

    # --- Caching Logic ---
    try:
        cache = IngestionCache.from_persist_path(cache_path)
        print("Cache file found. Running using cache...")
    except Exception:
        # FIX: broadened from FileNotFoundError so a corrupt or unreadable
        # cache file also falls back to a fresh cache — matching the message
        # "...or error reading cache" below.
        cache = IngestionCache()
        print("No cache file found or error reading cache. Running without...")

    # --- Define the Ingestion Pipeline (Unstructured Parser FIRST) ---
    print("Defining ingestion pipeline (Unstructured Parser, Embedding)...")
    # 1. UnstructuredElementNodeParser re-parses the raw Documents using the
    #    'unstructured' library for better layout/element detection.
    node_parser = UnstructuredElementNodeParser()
    # 2. (Optional) Summary extractor — only when summaries are enabled AND an LLM is configured.
    summary_extractor = SummaryExtractor(summaries=['self']) if use_summaries and Settings.llm else None
    # 3. Embedding model (from global Settings)
    embed_model = Settings.embed_model

    transformations = [node_parser]  # Unstructured parser goes first!
    if summary_extractor:
        transformations.append(summary_extractor)
    transformations.append(embed_model)

    pipeline = IngestionPipeline(
        transformations=transformations,
        cache=cache
    )
    print(f"Pipeline transformations: {[type(t).__name__ for t in pipeline.transformations]}")

    # --- Run Pipeline ---
    print("Running ingestion pipeline (Unstructured Parsing, Embedding)...")
    # Pass the initial Documents (with added metadata) to the pipeline;
    # UnstructuredElementNodeParser will process them first.
    final_nodes = pipeline.run(documents=docs_with_metadata, show_progress=True)
    print(f"Ingestion pipeline complete. Processed/Generated {len(final_nodes)} final nodes.")

    # --- Node Inspection ---
    if final_nodes:
        print("\n--- Inspecting Final Nodes (Post-Pipeline) ---")
        num_nodes_to_inspect = min(len(final_nodes), 3)
        for i in range(num_nodes_to_inspect):
            node_to_inspect = final_nodes[i]
            print(f"\n--- Node {i} (ID: {node_to_inspect.node_id}) ---")
            print("Metadata:")
            print(node_to_inspect.metadata)  # Verify section_id etc.
            print("\nContent (first 500 chars):")
            print(node_to_inspect.text[:500] + "...")
            print("-" * 20)

    # --- Persist Cache ---
    print(f"Persisting cache to {cache_path}...")
    pipeline.cache.persist(cache_path)
    print("Cache persisted.")

    return final_nodes


# --- Script Execution ---
if __name__ == "__main__":
    print("Starting Section Document Ingestion using Unstructured...")
    # 1. Place section PDFs in STORAGE_PATH.
    # 2. Ensure unstructured dependencies are installed (see above).
    # 3. Set GOOGLE_API_KEY if using summaries.
    generate_summaries = False  # Keep False to avoid LLM calls initially
    process_this_file = None  # Set to a filename like "2. REPERES SUR LES MALADIES....pdf" or None

    if process_this_file:
        print(f"Processing single file: {process_this_file}")
    else:
        print(f"Processing all PDF files found in: {STORAGE_PATH}")

    nodes_output = ingest_section_docs_unstructured(
        process_filename=process_this_file,
        use_summaries=generate_summaries
    )
    print(f"\nIngestion process finished. {len(nodes_output) if nodes_output else 0} nodes processed.")
    # ... rest of main block ...