Delete document_uploader.py
document_uploader.py  +0 -200
DELETED
@@ -1,200 +0,0 @@
# --- Imports ---
import os
import re
from pathlib import Path
from global_settings import STORAGE_PATH, CACHE_FILE
from logging_functions import log_action

# LlamaIndex Core Imports
from llama_index.core import SimpleDirectoryReader, Settings
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
# Import the Unstructured Node Parser
from llama_index.core.node_parser import UnstructuredElementNodeParser  # For parsing PDFs directly
from llama_index.core.extractors import SummaryExtractor  # Optional

# Import Embedding Model
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Import LLM (Gemini) - Optional, only if SummaryExtractor runs
from llama_index.llms.google_genai import GoogleGenAI

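# NOTE (assumptions inferred from the imports above): this module expects the
# pip packages llama-index-core, llama-index-embeddings-huggingface and
# llama-index-llms-google-genai, plus the 'unstructured' library used by
# UnstructuredElementNodeParser. global_settings is expected to define
# STORAGE_PATH (the folder holding section PDFs) and CACHE_FILE (the
# ingestion-cache path); logging_functions must provide log_action().
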
# --- Function Definition ---
def ingest_section_docs_unstructured(
    input_path=STORAGE_PATH,
    cache_path=CACHE_FILE,
    process_filename=None,
    use_summaries=False
):
    """
    Ingests one or more SECTION document files (PDFs) using SimpleDirectoryReader
    followed by UnstructuredElementNodeParser in the pipeline.
    Adds section metadata based on filenames.

    Args:
        input_path (str): Path to the directory containing section PDF documents.
        cache_path (str): Path to the ingestion cache file.
        process_filename (str, optional): If provided, only process the document with this filename. Defaults to None (process all).
        use_summaries (bool): Whether to include SummaryExtractor. Defaults to False.

    Returns:
        list: A list of processed BaseNode objects with section metadata, or an empty list on failure.
    """

    # --- LLM & Embedding Configuration ---
    print("Configuring LLM (Gemini if needed) and Embedding models...")
    gemini_api_key = os.getenv("GOOGLE_API_KEY")
    if use_summaries:
        if not gemini_api_key:
            print("Warning: GOOGLE_API_KEY not set, but summaries requested. Disabling summaries.")
            Settings.llm = None
            use_summaries = False
        else:
            # GoogleGenAI takes the model name via its 'model' argument
            Settings.llm = GoogleGenAI(model="models/gemini-1.5-flash-latest", api_key=gemini_api_key)
            print(f"Gemini LLM configured: {Settings.llm.model}")
    else:
        Settings.llm = None
        print("LLM not configured as summaries are disabled.")

    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    print(f"Embedding Model: {Settings.embed_model.model_name}")
    # --- End Configuration ---

    # --- Load Data (Basic Text Extraction) ---
    print(f"Attempting to load documents from: {input_path}")
    # SimpleDirectoryReader will do basic PDF text extraction here.
    # We are NOT using LlamaParse in file_extractor anymore.
    reader_kwargs = {"filename_as_id": True, "required_exts": [".pdf"]}
    documents_to_process = []

    if process_filename:
        print(f"Attempting to load specific section file: {process_filename}")
        file_path = Path(input_path) / process_filename
        if not file_path.exists() or file_path.suffix.lower() != ".pdf":
            print(f"Error: Specified file '{process_filename}' not found or not a PDF in '{input_path}'.")
            return []
        # Load single PDF
        reader = SimpleDirectoryReader(input_files=[file_path], **reader_kwargs)
    else:
        print(f"Loading all PDF files from directory: {input_path}")
        reader = SimpleDirectoryReader(input_path, **reader_kwargs)

    try:
        # loaded_docs are basic Document objects with raw text extracted by the reader
        loaded_docs = reader.load_data(show_progress=True)
        print(f"Successfully loaded {len(loaded_docs)} documents (basic extraction).")
        documents_to_process = loaded_docs
    except Exception as e:
        print(f"Error loading documents: {e}")
        import traceback
        traceback.print_exc()
        return []

    if not documents_to_process:
        print("No documents loaded. Exiting ingestion.")
        return []

    # --- Add Section Metadata Based on Filename ---
    # Apply this to the initially loaded docs BEFORE the pipeline re-parses them
    print("Adding section metadata based on filenames...")
    docs_with_metadata = []
    filename_pattern = re.compile(r"^(\d+)\.\s+(.*)\.pdf$", re.IGNORECASE)
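    # Illustration (hypothetical filename): "2. Section title.pdf" matches the
    # pattern above with section_id="2" and section_title="Section title";
    # filenames that do not follow "Number. Title.pdf" fall back to "unknown" below.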
    for doc in documents_to_process:
        filename = doc.metadata.get('file_name', doc.id_)  # Use id_ if filename missing
        section_id = "unknown"
        section_title = "unknown"
        match = filename_pattern.match(filename)
        if match:
            section_id = match.group(1).strip()
            section_title = match.group(2).strip()
        else:
            print(f"Warning: Filename '{filename}' did not match expected pattern 'Number. Title.pdf'")

        doc.metadata['section_id'] = section_id
        doc.metadata['section_title'] = section_title
        docs_with_metadata.append(doc)
        log_action(f"File '{filename}' (Section {section_id}) loaded.", action_type="LOAD")
    # --- End Metadata Addition ---

    # --- Caching Logic ---
    try:
        cache = IngestionCache.from_persist_path(cache_path)
        print("Cache file found. Running using cache...")
    except FileNotFoundError:
        cache = IngestionCache()
        print("No cache file found or error reading cache. Running without...")

    # --- Define the Ingestion Pipeline (Unstructured Parser FIRST) ---
    print("Defining ingestion pipeline (Unstructured Parser, Embedding)...")

    # 1. UnstructuredElementNodeParser will take the raw Documents and re-parse them
    #    using the 'unstructured' library for better layout/element detection.
    node_parser = UnstructuredElementNodeParser()

    # 2. (Optional) Summary Extractor
    summary_extractor = SummaryExtractor(summaries=['self']) if use_summaries and Settings.llm else None

    # 3. Embedding Model (using Settings)
    embed_model = Settings.embed_model

    transformations = [node_parser]  # Unstructured parser goes first!
    if summary_extractor:
        transformations.append(summary_extractor)
    transformations.append(embed_model)

    pipeline = IngestionPipeline(
        transformations=transformations,
        cache=cache
    )
    print(f"Pipeline transformations: {[type(t).__name__ for t in pipeline.transformations]}")

    # --- Run Pipeline ---
    print("Running ingestion pipeline (Unstructured Parsing, Embedding)...")
    # Pass the initial Documents (with added metadata) to the pipeline;
    # UnstructuredElementNodeParser will process them first.
    final_nodes = pipeline.run(documents=docs_with_metadata, show_progress=True)
    print(f"Ingestion pipeline complete. Processed/Generated {len(final_nodes)} final nodes.")

    # --- Node Inspection ---
    if final_nodes:
        print("\n--- Inspecting Final Nodes (Post-Pipeline) ---")
        num_nodes_to_inspect = min(len(final_nodes), 3)
        for i in range(num_nodes_to_inspect):
            node_to_inspect = final_nodes[i]
            print(f"\n--- Node {i} (ID: {node_to_inspect.node_id}) ---")
            print("Metadata:")
            print(node_to_inspect.metadata)  # Verify section_id etc.
            print("\nContent (first 500 chars):")
            print(node_to_inspect.text[:500] + "...")
            print("-" * 20)

    # --- Persist Cache ---
    print(f"Persisting cache to {cache_path}...")
    pipeline.cache.persist(cache_path)
    print("Cache persisted.")

    return final_nodes

# --- Script Execution ---
if __name__ == "__main__":
    print("Starting Section Document Ingestion using Unstructured...")
    # 1. Place section PDFs in STORAGE_PATH.
    # 2. Ensure unstructured dependencies are installed (see above).
    # 3. Set GOOGLE_API_KEY if using summaries.

    generate_summaries = False  # Keep False to avoid LLM calls initially
    process_this_file = None  # Set to filename like "2. REPERES SUR LES MALADIES....pdf" or None

    if process_this_file:
        print(f"Processing single file: {process_this_file}")
    else:
        print(f"Processing all PDF files found in: {STORAGE_PATH}")

    nodes_output = ingest_section_docs_unstructured(
        process_filename=process_this_file,
        use_summaries=generate_summaries
    )

    print(f"\nIngestion process finished. {len(nodes_output) if nodes_output else 0} nodes processed.")
    # ... rest of main block ...
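    # --- Optional downstream sketch (an assumption, not part of the original module) ---
    # A minimal illustration of what could be done with the returned nodes:
    # build a LlamaIndex VectorStoreIndex, retrieve a few nodes, and print their
    # section metadata. 'build_demo_index' is a hypothetical flag added only for
    # this sketch and is left False so the ingestion script has no extra side effects.
    build_demo_index = False
    if build_demo_index and nodes_output:
        from llama_index.core import VectorStoreIndex

        demo_index = VectorStoreIndex(nodes=nodes_output, embed_model=Settings.embed_model)
        retriever = demo_index.as_retriever(similarity_top_k=3)
        for result in retriever.retrieve("What does section 2 cover?"):
            print(result.node.metadata.get("section_id"), result.score)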