Zeggai committed on
Commit
6bca910
·
verified ·
1 Parent(s): 3f2644d

Delete build_section_indices.py

Browse files
Files changed (1) hide show
  1. build_section_indices.py +0 -128
build_section_indices.py DELETED
@@ -1,128 +0,0 @@
1
- # build_section_indices.py
2
-
3
- import os
4
- import re
5
- from pathlib import Path
6
- # In build_section_indices.py
7
-
8
- import time # Import time for a small delay if needed (unlikely fix, but for debugging)
9
- from pathlib import Path
10
- # (Keep other imports the same: os, re, ingest function, Settings, LlamaIndex classes, etc.)
11
- from document_uploader import ingest_section_docs_unstructured
12
- from global_settings import STORAGE_PATH
13
- from llama_index.core import Settings, VectorStoreIndex, StorageContext
14
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
15
- from llama_index.llms.google_genai import GoogleGenAI # Or other LLM
16
-
17
# --- Define SECTION_INDEX_BASE_PATH ---
# Root directory under which one sub-directory per section index is persisted
# (see create_index_for_section / the __main__ orchestration below).
SECTION_INDEX_BASE_PATH = "./storage/section_indices"
19
-
20
def create_index_for_section(section_pdf_path: Path, section_index_dir: Path):
    """Build and persist a VectorStoreIndex for one section PDF.

    The section file is ingested into nodes, indexed entirely in memory
    first, and only then written out to ``section_index_dir``.

    Returns True on success, False if ingestion yielded no nodes or if
    either the build or the persist step raised.
    """
    print(f"\n--- Processing Section: {section_pdf_path.name} ---")
    print(f"  Index target directory: {section_index_dir}")

    # Step 1: ingest just this one PDF; summaries disabled so no LLM
    # calls happen during ingestion.
    section_nodes = ingest_section_docs_unstructured(
        input_path=section_pdf_path.parent,
        process_filename=section_pdf_path.name,
        use_summaries=False  # Keep LLM calls out of ingestion for now
    )

    # Guard: nothing to index means nothing to persist.
    if not section_nodes:
        print(f"  ❌ Ingestion returned no nodes for {section_pdf_path.name}. Skipping index creation.")
        return False

    print(f"  Ingested {len(section_nodes)} nodes for this section.")

    # Step 2: build the index against a default (in-memory) storage
    # context; persistence happens separately in step 3.
    try:
        print(f"  Building VectorStoreIndex in memory...")
        in_memory_ctx = StorageContext.from_defaults()
        section_index = VectorStoreIndex(section_nodes, storage_context=in_memory_ctx)
        print(f"  Index built successfully in memory.")
    except Exception as e:
        print(f"  ❌ Error building index in memory for {section_pdf_path.name}: {e}")
        import traceback
        traceback.print_exc()
        return False

    # Step 3: flush the in-memory index to its target directory.
    try:
        print(f"  Persisting index to disk: {section_index_dir}...")
        # Make sure the destination exists immediately before persisting.
        section_index_dir.mkdir(parents=True, exist_ok=True)
        # time.sleep(0.1) # Tiny delay - very unlikely needed but safe to try once
        section_index.storage_context.persist(persist_dir=str(section_index_dir))
        print(f"  ✅ Successfully persisted index for {section_pdf_path.name} to {section_index_dir}")
        return True
    except Exception as e:
        print(f"  ❌ Error persisting index to disk for {section_pdf_path.name}: {e}")
        # A failure here is most often permissions or path related.
        print(f"  Check write permissions for the directory: {section_index_dir.parent}")
        import traceback
        traceback.print_exc()
        return False
76
-
77
# --- Main Indexing Orchestration ---
# Script entry point: configures models, then builds one index per
# section PDF found under STORAGE_PATH via create_index_for_section.
if __name__ == "__main__":
    print("--- Starting Per-Section Index Building Process (In-Memory First) ---")

    # --- Global Configuration ---
    # Models are shared process-wide through llama_index's Settings object.
    print("Configuring Embedding model...")
    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    print(f"Embedding Model: {Settings.embed_model.model_name}")

    # LLM is optional here — index building only needs the embed model.
    # NOTE(review): GoogleGenAI is given model_name=; the llama-index
    # wrapper's usual keyword is model= — confirm against the installed
    # llama-index-llms-google-genai version.
    gemini_api_key = os.getenv("GOOGLE_API_KEY")
    Settings.llm = (
        GoogleGenAI(model_name="models/gemini-1.5-flash-latest", api_key=gemini_api_key)
        if gemini_api_key
        else None
    )
    print(f"LLM Configured: {Settings.llm}")
    # --- End Configuration ---

    # --- Define Paths ---
    source_docs_path = Path(STORAGE_PATH)
    base_index_path = Path(SECTION_INDEX_BASE_PATH)
    base_index_path.mkdir(parents=True, exist_ok=True)
    print(f"Source documents location: {source_docs_path}")
    print(f"Base index storage location: {base_index_path}")

    # --- Find Section Files and Process ---
    # Expected file names look like "12. Some Title.pdf"; group(1) is the
    # numeric section id used to name the per-section index directory.
    section_name_re = re.compile(r"^(\d+)\.\s+(.*)\.pdf$", re.IGNORECASE)
    indexed_total = 0
    failed_total = 0

    # --- !!! DELETE Existing Section Index Dirs before running !!! ---
    # print(f"Warning: Consider manually deleting contents of {base_index_path} before running.")
    # ---

    for candidate_pdf in source_docs_path.glob("*.pdf"):
        name_match = section_name_re.match(candidate_pdf.name)
        if name_match is None:
            print(f"Skipping file (doesn't match section pattern): {candidate_pdf.name}")
            continue
        section_number = name_match.group(1).strip()
        target_dir = base_index_path / f"section_{section_number}_index"
        if create_index_for_section(candidate_pdf, target_dir):
            indexed_total += 1
        else:
            failed_total += 1

    print("\n--- Indexing Summary ---")
    print(f"Successfully processed and indexed {indexed_total} section file(s).")
    if failed_total > 0:
        print(f"Encountered errors for {failed_total} section file(s). Check logs above.")
    print(f"Section indices stored under: {base_index_path}")