Spaces:
Sleeping
Sleeping
| # --- Imports --- | |
| import os | |
| import re | |
| from pathlib import Path | |
| from global_settings import STORAGE_PATH, CACHE_FILE | |
| from logging_functions import log_action | |
| # LlamaIndex Core Imports | |
| from llama_index.core import SimpleDirectoryReader, Settings | |
| from llama_index.core.ingestion import IngestionPipeline, IngestionCache | |
| # Import the Unstructured Node Parser | |
| from llama_index.core.node_parser import UnstructuredElementNodeParser # For parsing PDFs directly | |
| from llama_index.core.extractors import SummaryExtractor # Optional | |
| # Import Embedding Model | |
| from llama_index.embeddings.huggingface import HuggingFaceEmbedding | |
| # Import LLM (Gemini) - Optional, only if SummaryExtractor runs | |
| from llama_index.llms.google_genai import GoogleGenAI | |
| # --- Function Definition --- | |
def ingest_section_docs_unstructured(
    input_path=STORAGE_PATH,
    cache_path=CACHE_FILE,
    process_filename=None,
    use_summaries=False,
):
    """
    Ingest section PDF documents using SimpleDirectoryReader followed by
    UnstructuredElementNodeParser inside an IngestionPipeline.

    Section metadata (id / title) is derived from each filename, which is
    expected to look like "<number>. <title>.pdf".

    Args:
        input_path (str): Directory containing the section PDF documents.
        cache_path (str): Path of the ingestion cache file.
        process_filename (str, optional): If provided, only this file is
            processed. Defaults to None (process every PDF found).
        use_summaries (bool): Whether to include SummaryExtractor (requires
            GOOGLE_API_KEY). Defaults to False.

    Returns:
        list: Processed BaseNode objects with section metadata, or an empty
        list on failure.
    """
    # --- LLM & Embedding configuration ---
    print("Configuring LLM (Gemini if needed) and Embedding models...")
    gemini_api_key = os.getenv("GOOGLE_API_KEY")
    if use_summaries:
        if not gemini_api_key:
            print("Warning: GOOGLE_API_KEY not set, but summaries requested. Disabling summaries.")
            Settings.llm = None
            use_summaries = False
        else:
            # NOTE(review): recent GoogleGenAI integrations take `model=`;
            # confirm `model_name=` is accepted by the installed client version.
            Settings.llm = GoogleGenAI(model_name="models/gemini-1.5-flash-latest", api_key=gemini_api_key)
            print(f"Gemini LLM configured: {Settings.llm.model}")
    else:
        Settings.llm = None
        print("LLM not configured as summaries are disabled.")
    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    print(f"Embedding Model: {Settings.embed_model.model_name}")

    # --- Load data (basic PDF text extraction; LlamaParse is NOT used) ---
    print(f"Attempting to load documents from: {input_path}")
    reader_kwargs = {"filename_as_id": True, "required_exts": [".pdf"]}
    if process_filename:
        print(f"Attempting to load specific section file: {process_filename}")
        file_path = Path(input_path) / process_filename
        if not file_path.exists() or file_path.suffix.lower() != ".pdf":
            print(f"Error: Specified file '{process_filename}' not found or not a PDF in '{input_path}'.")
            return []
        reader = SimpleDirectoryReader(input_files=[file_path], **reader_kwargs)
    else:
        print(f"Loading all PDF files from directory: {input_path}")
        reader = SimpleDirectoryReader(input_path, **reader_kwargs)
    try:
        documents_to_process = reader.load_data(show_progress=True)
        print(f"Successfully loaded {len(documents_to_process)} documents (basic extraction).")
    except Exception as e:
        print(f"Error loading documents: {e}")
        import traceback
        traceback.print_exc()
        return []
    if not documents_to_process:
        print("No documents loaded. Exiting ingestion.")
        return []

    # Stamp section metadata on the raw Documents BEFORE the pipeline
    # re-parses them, so it survives into the final nodes.
    docs_with_metadata = _add_section_metadata(documents_to_process)

    cache = _load_ingestion_cache(cache_path)

    # --- Pipeline: Unstructured parser first, optional summaries, then embeddings ---
    print("Defining ingestion pipeline (Unstructured Parser, Embedding)...")
    node_parser = UnstructuredElementNodeParser()
    summary_extractor = SummaryExtractor(summaries=['self']) if use_summaries and Settings.llm else None
    transformations = [node_parser]  # Unstructured parser goes first!
    if summary_extractor:
        transformations.append(summary_extractor)
    transformations.append(Settings.embed_model)
    pipeline = IngestionPipeline(transformations=transformations, cache=cache)
    print(f"Pipeline transformations: {[type(t).__name__ for t in pipeline.transformations]}")

    # --- Run pipeline ---
    print("Running ingestion pipeline (Unstructured Parsing, Embedding)...")
    final_nodes = pipeline.run(documents=docs_with_metadata, show_progress=True)
    print(f"Ingestion pipeline complete. Processed/Generated {len(final_nodes)} final nodes.")

    _inspect_nodes(final_nodes)

    # --- Persist cache ---
    print(f"Persisting cache to {cache_path}...")
    pipeline.cache.persist(cache_path)
    print("Cache persisted.")
    return final_nodes


def _add_section_metadata(documents):
    """Stamp each Document with section_id / section_title parsed from its filename."""
    print("Adding section metadata based on filenames...")
    filename_pattern = re.compile(r"^(\d+)\.\s+(.*)\.pdf$", re.IGNORECASE)
    docs_with_metadata = []
    for doc in documents:
        filename = doc.metadata.get('file_name', doc.id_)  # fall back to id_ if filename missing
        section_id = "unknown"
        section_title = "unknown"
        match = filename_pattern.match(filename)
        if match:
            section_id = match.group(1).strip()
            section_title = match.group(2).strip()
        else:
            # Bug fix: report the actual filename (the message previously
            # contained a hard-coded placeholder instead of the variable).
            print(f"Warning: Filename '{filename}' did not match expected pattern 'Number. Title.pdf'")
        doc.metadata['section_id'] = section_id
        doc.metadata['section_title'] = section_title
        docs_with_metadata.append(doc)
        log_action(f"File '{filename}' (Section {section_id}) loaded.", action_type="LOAD")
    return docs_with_metadata


def _load_ingestion_cache(cache_path):
    """Load the ingestion cache from disk, or start fresh when missing or unreadable."""
    try:
        cache = IngestionCache.from_persist_path(cache_path)
        print("Cache file found. Running using cache...")
        return cache
    except Exception:
        # Bug fix: the message already promised "or error reading cache", but only
        # FileNotFoundError was caught — a corrupt cache file would crash ingestion.
        print("No cache file found or error reading cache. Running without...")
        return IngestionCache()


def _inspect_nodes(nodes, max_nodes=3):
    """Print metadata and a content preview for the first few nodes (debug aid)."""
    if not nodes:
        return
    print("\n--- Inspecting Final Nodes (Post-Pipeline) ---")
    for i in range(min(len(nodes), max_nodes)):
        node = nodes[i]
        print(f"\n--- Node {i} (ID: {node.node_id}) ---")
        print("Metadata:")
        print(node.metadata)  # verify section_id / section_title survived the pipeline
        print("\nContent (first 500 chars):")
        print(node.text[:500] + "...")
        print("-" * 20)
| # --- Script Execution --- | |
if __name__ == "__main__":
    print("Starting Section Document Ingestion using Unstructured...")
    # Prerequisites:
    #   1. Place section PDFs in STORAGE_PATH.
    #   2. Ensure unstructured dependencies are installed (see above).
    #   3. Set GOOGLE_API_KEY if using summaries.
    generate_summaries = False  # keep False to avoid LLM calls initially
    process_this_file = None  # e.g. "2. REPERES SUR LES MALADIES....pdf", or None for all
    print(
        f"Processing single file: {process_this_file}"
        if process_this_file
        else f"Processing all PDF files found in: {STORAGE_PATH}"
    )
    nodes_output = ingest_section_docs_unstructured(
        process_filename=process_this_file,
        use_summaries=generate_summaries,
    )
    node_count = len(nodes_output) if nodes_output else 0
    print(f"\nIngestion process finished. {node_count} nodes processed.")
    # ... rest of main block ...