menikev committed on
Commit fac81b3 · verified · 1 Parent(s): c6db8fa

Update src/ingest_documents.py

Files changed (1)
  1. src/ingest_documents.py +217 -130
src/ingest_documents.py CHANGED
@@ -1,130 +1,217 @@
- """
- PDF Ingestion Pipeline for KnowYourRight Bot
- - Loads PDFs from /data/raw
- - Checks if pages are scanned or text-based
- - Runs OCR when needed
- - Splits into chunks for embedding
- - Generates embeddings using open-source models
- - Saves into ChromaDB vector store
- """
-
- import os
- import sys
- import fitz  # PyMuPDF
- import pytesseract
- from PIL import Image
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain_community.vectorstores import Chroma
- from langchain.docstore.document import Document
- from dotenv import load_dotenv
- from huggingface_hub import login
-
- # Load environment variables from .env file
- load_dotenv()
-
- # Get token from env
- hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
- if not hf_token:
-     print("[ERROR] Missing Hugging Face token. Add it to .env as HUGGINGFACE_HUB_TOKEN")
-     sys.exit(1)
-
- # Login to Hugging Face
- login(token=hf_token)
-
- # Paths
- RAW_DATA_DIR = "data/raw"
- PROCESSED_DATA_DIR = "data/processed"
- VECTOR_DB_DIR = "vector_db"
-
- os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
- os.makedirs(VECTOR_DB_DIR, exist_ok=True)
-
- # Detect Tesseract path (Windows vs Linux)
- if os.name == "nt":  # Windows
-     default_tess_path = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
-     if not os.path.exists(default_tess_path):
-         print("[ERROR] Tesseract not found. Install from: https://github.com/UB-Mannheim/tesseract/wiki")
-         sys.exit(1)
-     pytesseract.pytesseract.tesseract_cmd = default_tess_path
- else:  # Linux/Mac
-     pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"
-
- def is_scanned_page(page):
-     """Check if PDF page contains text or is image-based."""
-     text = page.get_text().strip()
-     return len(text) == 0
-
- def extract_text_from_pdf(pdf_path):
-     """Extract text from PDF with OCR for scanned pages."""
-     doc = fitz.open(pdf_path)
-     all_text = []
-     for page_num, page in enumerate(doc):
-         if is_scanned_page(page):
-             pix = page.get_pixmap(dpi=300)
-             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-             text = pytesseract.image_to_string(img)
-             print(f"[OCR] Page {page_num + 1}: {len(text.strip())} chars extracted")
-         else:
-             text = page.get_text()
-             print(f"[TEXT] Page {page_num + 1}: {len(text.strip())} chars extracted")
-         if text.strip():
-             all_text.append(text)
-     return "\n".join(all_text)
-
- def save_clean_text(filename, text):
-     """Save extracted text to processed folder."""
-     clean_path = os.path.join(PROCESSED_DATA_DIR, filename.replace(".pdf", ".txt"))
-     with open(clean_path, "w", encoding="utf-8") as f:
-         f.write(text)
-     return clean_path
-
- def chunk_text(file_path):
-     """Split text into overlapping chunks."""
-     with open(file_path, "r", encoding="utf-8") as f:
-         text = f.read()
-     splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
-     chunks = splitter.split_text(text)
-     print(f"[CHUNKS] {file_path}: {len(chunks)} chunks created")
-     docs = [Document(page_content=chunk, metadata={"source": file_path}) for chunk in chunks]
-     return docs
-
- def embed_and_store(documents):
-     """Generate embeddings and store in Chroma vector DB."""
-     if not documents:
-         print("[ERROR] No documents to embed. Exiting.")
-         sys.exit(1)
-
-     embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")
-
-     # Test embedding
-     test_vec = embedding_model.embed_query("Hello world")
-     if not test_vec or all(v == 0 for v in test_vec):
-         print("[ERROR] Embedding model returned empty vectors. Check Hugging Face token or model access.")
-         sys.exit(1)
-
-     vectordb = Chroma.from_documents(documents, embedding_model, persist_directory=VECTOR_DB_DIR)
-     vectordb.persist()
-     print(f"[OK] Stored {len(documents)} chunks in vector DB at {VECTOR_DB_DIR}")
-
- def main():
-     all_docs = []
-     for filename in os.listdir(RAW_DATA_DIR):
-         if filename.endswith(".pdf"):
-             pdf_path = os.path.join(RAW_DATA_DIR, filename)
-             print(f"[LOAD] Processing {filename}...")
-             text = extract_text_from_pdf(pdf_path)
-
-             if not text.strip():
-                 print(f"[WARNING] No text extracted from {filename}, skipping...")
-                 continue
-
-             clean_path = save_clean_text(filename, text)
-             docs = chunk_text(clean_path)
-             all_docs.extend(docs)
-
-     embed_and_store(all_docs)
-     print("[DONE] All documents processed and stored.")
-
- if __name__ == "__main__":
-     main()
+ #!/usr/bin/env python3
+ """
+ Complete ingestion script - processes all documents, extracts sections,
+ and creates a unified collection with section-aware metadata.
+ """
+
+ import os
+ import sys
+ import shutil
+ import re
+ from pathlib import Path
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import Chroma
+ from langchain.docstore.document import Document
+ from dotenv import load_dotenv
+ from huggingface_hub import login
+
+
+ def extract_section_reference(text: str) -> str:
+     """
+     Extract section/article/part reference from a chunk of text.
+     Handles variations like "Section 13", "Sec. 13", "Article 45", "Part IV", "Chapter 2".
+     """
+     patterns = [
+         r"(Section\s+\d+[A-Za-z0-9\-]*)",
+         r"(Sec\.\s*\d+[A-Za-z0-9\-]*)",
+         r"(Article\s+\d+[A-Za-z0-9\-]*)",
+         r"(Art\.\s*\d+[A-Za-z0-9\-]*)",
+         r"(Part\s+[IVXLC]+)",
+         r"(Chapter\s+\d+)",
+         r"(Cap\.\s*[A-Za-z0-9\-]+)"
+     ]
+     for pattern in patterns:
+         match = re.search(pattern, text, re.IGNORECASE)
+         if match:
+             return match.group(1).strip()
+     return "Unknown Section"
+
+
+ def main():
+     """Complete ingestion with section-aware metadata."""
+
+     print("COMPLETE LEGAL DOCUMENT INGESTION (Section-Aware)")
+     print("="*60)
+
+     # Load environment
+     load_dotenv()
+     hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
+
+     if not hf_token:
+         print("ERROR: Missing HUGGINGFACE_HUB_TOKEN in .env file")
+         sys.exit(1)
+
+     try:
+         login(token=hf_token)
+         print("✓ Logged in to Hugging Face")
+     except Exception as e:
+         print(f"ERROR: Hugging Face login failed: {e}")
+         sys.exit(1)
+
+     # Find processed text files
+     processed_dir = Path("data/processed")
+     if not processed_dir.exists():
+         print("ERROR: data/processed directory not found")
+         sys.exit(1)
+
+     text_files = list(processed_dir.glob("*_text.txt"))
+     if not text_files:
+         print("ERROR: No processed text files found")
+         sys.exit(1)
+
+     print(f"Found {len(text_files)} files to process:")
+
+     for file in text_files:
+         size = file.stat().st_size
+         print(f"  {file.name}: {size:,} bytes")
+
+     # Process files into documents
+     all_documents = []
+
+     for text_file in text_files:
+         print(f"\nProcessing: {text_file.name}")
+
+         try:
+             with open(text_file, 'r', encoding='utf-8') as f:
+                 content = f.read()
+
+             if not content.strip():
+                 print(f"  Skipping empty file: {text_file.name}")
+                 continue
+
+             # Split into manageable chunks
+             text_splitter = RecursiveCharacterTextSplitter(
+                 chunk_size=800,
+                 chunk_overlap=150,
+                 separators=['\n\n', '\n', '. ', ' ']
+             )
+
+             chunks = text_splitter.split_text(content)
+
+             # Clean human-readable source name (remove _text, underscores, .pdf)
+             source_name = (
+                 text_file.stem.replace('_text', '')
+                 .replace('_', ' ')
+                 .replace('-', ' ')
+                 .strip()
+             )
+
+             print(f"  Created {len(chunks)} chunks from {len(content):,} characters")
+
+             # Create documents with section-aware metadata
+             for i, chunk in enumerate(chunks):
+                 if len(chunk.strip()) > 20:  # Only meaningful chunks
+                     section_ref = extract_section_reference(chunk)
+
+                     # Infer doc type
+                     doc_type = "general"
+                     if "constitution" in text_file.name.lower():
+                         doc_type = "constitution"
+                     elif "labour" in text_file.name.lower():
+                         doc_type = "labour_law"
+                     elif "fccpa" in text_file.name.lower():
+                         doc_type = "consumer_protection"
+                     elif "data_protection" in text_file.name.lower():
+                         doc_type = "data_protection"
+
+                     doc = Document(
+                         page_content=chunk.strip(),
+                         metadata={
+                             'source': source_name,  # clean name, no .pdf
+                             'document_type': doc_type,
+                             'chunk_index': i,
+                             'total_chunks': len(chunks),
+                             'file_path': str(text_file),
+                             'content_length': len(chunk.strip()),
+                             'section': section_ref
+                         }
+                     )
+                     all_documents.append(doc)
+
+         except Exception as e:
+             print(f"  ERROR processing {text_file.name}: {e}")
+             continue
+
+     print(f"\nTotal documents prepared: {len(all_documents)}")
+
+     if not all_documents:
+         print("ERROR: No documents prepared for ingestion")
+         sys.exit(1)
+
+     # Initialize embedding model
+     try:
+         print("\nInitializing embedding model...")
+         embedding_model = HuggingFaceEmbeddings(
+             model_name="BAAI/bge-small-en",
+             model_kwargs={'device': 'cpu'}
+         )
+
+         test_embedding = embedding_model.embed_query("test legal document")
+         print(f"✓ Embedding model ready (dimension: {len(test_embedding)})")
+
+     except Exception as e:
+         print(f"ERROR: Embedding model initialization failed: {e}")
+         sys.exit(1)
+
+     # Create vector database
+     try:
+         print("\nCreating complete vector database (with sections)...")
+
+         vector_db_path = Path("vector_db")
+         if vector_db_path.exists():
+             shutil.rmtree(vector_db_path)
+             print("  Removed existing database")
+
+         vectordb = Chroma.from_documents(
+             documents=all_documents,
+             embedding=embedding_model,
+             persist_directory="vector_db",
+             collection_name="legal_documents"
+         )
+
+         count = vectordb._collection.count()
+         print(f"✓ Successfully stored {count} documents")
+
+         # Test search functionality
+         print("\nTesting search functionality (showing sections)...")
+
+         test_queries = [
+             "constitutional rights",
+             "FCCPA tribunal consumer protection",
+             "labour law employment worker",
+             "data protection privacy"
+         ]
+
+         for query in test_queries:
+             results = vectordb.similarity_search(query, k=3)
+             print(f"\n'{query}': {len(results)} results")
+             for doc in results:
+                 print(f"  [{doc.metadata.get('document_type')}] "
+                       f"{doc.metadata.get('section', 'Unknown Section')} — "
+                       f"{doc.metadata.get('source')}")
+                 print(f"    Preview: {doc.page_content[:120]}...")
+
+         print(f"\n✓ Complete ingestion successful!")
+         print(f"✓ Database contains {count} legal document chunks")
+         print(f"✓ Ready for legal question answering with section references")
+
+     except Exception as e:
+         print(f"ERROR: Vector database creation failed: {e}")
+         import traceback
+         traceback.print_exc()
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
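
For reference, a downstream consumer (for example the bot's question-answering step) can reopen the store this script persists by using the same embedding model, persist directory, and collection name. The following is a minimal sketch, not part of this commit; the script name, query text, and variable names are illustrative, and it assumes the same langchain_community versions imported above.

# query_store.py - minimal sketch of reading the store created by ingest_documents.py.
# Assumes the same langchain_community versions; not part of this commit.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Must match the model used at ingestion time, or similarity scores are meaningless.
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

# Re-open the persisted collection written by ingest_documents.py.
vectordb = Chroma(
    persist_directory="vector_db",
    collection_name="legal_documents",
    embedding_function=embedding_model,
)

# Retrieve the top chunks for a question and show the section-aware metadata.
for doc in vectordb.similarity_search("constitutional rights", k=3):
    meta = doc.metadata
    print(f"[{meta.get('document_type')}] {meta.get('section')} ({meta.get('source')})")
    print(doc.page_content[:200])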