menikev committed on
Commit fac81b3 · verified · 1 Parent(s): c6db8fa

Update src/ingest_documents.py

Files changed (1)
  1. src/ingest_documents.py +217 -130
src/ingest_documents.py CHANGED
@@ -1,130 +1,217 @@
- """
- PDF Ingestion Pipeline for KnowYourRight Bot
- - Loads PDFs from /data/raw
- - Checks if pages are scanned or text-based
- - Runs OCR when needed
- - Splits into chunks for embedding
- - Generates embeddings using open-source models
- - Saves into ChromaDB vector store
- """
-
- import os
- import sys
- import fitz  # PyMuPDF
- import pytesseract
- from PIL import Image
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain_community.vectorstores import Chroma
- from langchain.docstore.document import Document
- from dotenv import load_dotenv
- from huggingface_hub import login
-
- # Load environment variables from .env file
- load_dotenv()
-
- # Get token from env
- hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
- if not hf_token:
-     print("[ERROR] Missing Hugging Face token. Add it to .env as HUGGINGFACE_HUB_TOKEN")
-     sys.exit(1)
-
- # Login to Hugging Face
- login(token=hf_token)
-
- # Paths
- RAW_DATA_DIR = "data/raw"
- PROCESSED_DATA_DIR = "data/processed"
- VECTOR_DB_DIR = "vector_db"
-
- os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
- os.makedirs(VECTOR_DB_DIR, exist_ok=True)
-
- # Detect Tesseract path (Windows vs Linux)
- if os.name == "nt":  # Windows
-     default_tess_path = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
-     if not os.path.exists(default_tess_path):
-         print("[ERROR] Tesseract not found. Install from: https://github.com/UB-Mannheim/tesseract/wiki")
-         sys.exit(1)
-     pytesseract.pytesseract.tesseract_cmd = default_tess_path
- else:  # Linux/Mac
-     pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"
-
- def is_scanned_page(page):
-     """Check if PDF page contains text or is image-based."""
-     text = page.get_text().strip()
-     return len(text) == 0
-
- def extract_text_from_pdf(pdf_path):
-     """Extract text from PDF with OCR for scanned pages."""
-     doc = fitz.open(pdf_path)
-     all_text = []
-     for page_num, page in enumerate(doc):
-         if is_scanned_page(page):
-             pix = page.get_pixmap(dpi=300)
-             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-             text = pytesseract.image_to_string(img)
-             print(f"[OCR] Page {page_num + 1}: {len(text.strip())} chars extracted")
-         else:
-             text = page.get_text()
-             print(f"[TEXT] Page {page_num + 1}: {len(text.strip())} chars extracted")
-         if text.strip():
-             all_text.append(text)
-     return "\n".join(all_text)
-
- def save_clean_text(filename, text):
-     """Save extracted text to processed folder."""
-     clean_path = os.path.join(PROCESSED_DATA_DIR, filename.replace(".pdf", ".txt"))
-     with open(clean_path, "w", encoding="utf-8") as f:
-         f.write(text)
-     return clean_path
-
- def chunk_text(file_path):
-     """Split text into overlapping chunks."""
-     with open(file_path, "r", encoding="utf-8") as f:
-         text = f.read()
-     splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
-     chunks = splitter.split_text(text)
-     print(f"[CHUNKS] {file_path}: {len(chunks)} chunks created")
-     docs = [Document(page_content=chunk, metadata={"source": file_path}) for chunk in chunks]
-     return docs
-
- def embed_and_store(documents):
-     """Generate embeddings and store in Chroma vector DB."""
-     if not documents:
-         print("[ERROR] No documents to embed. Exiting.")
-         sys.exit(1)
-
-     embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")
-
-     # Test embedding
-     test_vec = embedding_model.embed_query("Hello world")
-     if not test_vec or all(v == 0 for v in test_vec):
-         print("[ERROR] Embedding model returned empty vectors. Check Hugging Face token or model access.")
-         sys.exit(1)
-
-     vectordb = Chroma.from_documents(documents, embedding_model, persist_directory=VECTOR_DB_DIR)
-     vectordb.persist()
-     print(f"[OK] Stored {len(documents)} chunks in vector DB at {VECTOR_DB_DIR}")
-
- def main():
-     all_docs = []
-     for filename in os.listdir(RAW_DATA_DIR):
-         if filename.endswith(".pdf"):
-             pdf_path = os.path.join(RAW_DATA_DIR, filename)
-             print(f"[LOAD] Processing {filename}...")
-             text = extract_text_from_pdf(pdf_path)
-
-             if not text.strip():
-                 print(f"[WARNING] No text extracted from {filename}, skipping...")
-                 continue
-
-             clean_path = save_clean_text(filename, text)
-             docs = chunk_text(clean_path)
-             all_docs.extend(docs)
-
-     embed_and_store(all_docs)
-     print("[DONE] All documents processed and stored.")
-
- if __name__ == "__main__":
-     main()
+ #!/usr/bin/env python3
+ """
+ Complete ingestion script - processes all documents, extracts sections,
+ and creates a unified collection with section-aware metadata.
+ """
+
+ import os
+ import sys
+ import shutil
+ import re
+ from pathlib import Path
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import Chroma
+ from langchain.docstore.document import Document
+ from dotenv import load_dotenv
+ from huggingface_hub import login
+
+
+ def extract_section_reference(text: str) -> str:
+     """
+     Extract section/article/part reference from a chunk of text.
+     Handles variations like "Section 13", "Sec. 13", "Article 45", "Part IV", "Chapter 2".
+     """
+     patterns = [
+         r"(Section\s+\d+[A-Za-z0-9\-]*)",
+         r"(Sec\.\s*\d+[A-Za-z0-9\-]*)",
+         r"(Article\s+\d+[A-Za-z0-9\-]*)",
+         r"(Art\.\s*\d+[A-Za-z0-9\-]*)",
+         r"(Part\s+[IVXLC]+)",
+         r"(Chapter\s+\d+)",
+         r"(Cap\.\s*[A-Za-z0-9\-]+)"
+     ]
+     for pattern in patterns:
+         match = re.search(pattern, text, re.IGNORECASE)
+         if match:
+             return match.group(1).strip()
+     return "Unknown Section"
+
+
+ def main():
+     """Complete ingestion with section-aware metadata."""
+
+     print("COMPLETE LEGAL DOCUMENT INGESTION (Section-Aware)")
+     print("="*60)
+
+     # Load environment
+     load_dotenv()
+     hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
+
+     if not hf_token:
+         print("ERROR: Missing HUGGINGFACE_HUB_TOKEN in .env file")
+         sys.exit(1)
+
+     try:
+         login(token=hf_token)
+         print("✓ Logged in to Hugging Face")
+     except Exception as e:
+         print(f"ERROR: Hugging Face login failed: {e}")
+         sys.exit(1)
+
+     # Find processed text files
+     processed_dir = Path("data/processed")
+     if not processed_dir.exists():
+         print("ERROR: data/processed directory not found")
+         sys.exit(1)
+
+     text_files = list(processed_dir.glob("*_text.txt"))
+     if not text_files:
+         print("ERROR: No processed text files found")
+         sys.exit(1)
+
+     print(f"Found {len(text_files)} files to process:")
+
+     for file in text_files:
+         size = file.stat().st_size
+         print(f"  {file.name}: {size:,} bytes")
+
+     # Process files into documents
+     all_documents = []
+
+     for text_file in text_files:
+         print(f"\nProcessing: {text_file.name}")
+
+         try:
+             with open(text_file, 'r', encoding='utf-8') as f:
+                 content = f.read()
+
+             if not content.strip():
+                 print(f"  Skipping empty file: {text_file.name}")
+                 continue
+
+             # Split into manageable chunks
+             text_splitter = RecursiveCharacterTextSplitter(
+                 chunk_size=800,
+                 chunk_overlap=150,
+                 separators=['\n\n', '\n', '. ', ' ']
+             )
+
+             chunks = text_splitter.split_text(content)
+
+             # Clean human-readable source name (remove _text, underscores, .pdf)
+             source_name = (
+                 text_file.stem.replace('_text', '')
+                 .replace('_', ' ')
+                 .replace('-', ' ')
+                 .strip()
+             )
+
+             print(f"  Created {len(chunks)} chunks from {len(content):,} characters")
+
+             # Create documents with section-aware metadata
+             for i, chunk in enumerate(chunks):
+                 if len(chunk.strip()) > 20:  # Only meaningful chunks
+                     section_ref = extract_section_reference(chunk)
+
+                     # Infer doc type
+                     doc_type = "general"
+                     if "constitution" in text_file.name.lower():
+                         doc_type = "constitution"
+                     elif "labour" in text_file.name.lower():
+                         doc_type = "labour_law"
+                     elif "fccpa" in text_file.name.lower():
+                         doc_type = "consumer_protection"
+                     elif "data_protection" in text_file.name.lower():
+                         doc_type = "data_protection"
+
+                     doc = Document(
+                         page_content=chunk.strip(),
+                         metadata={
+                             'source': source_name,  # clean name, no .pdf
+                             'document_type': doc_type,
+                             'chunk_index': i,
+                             'total_chunks': len(chunks),
+                             'file_path': str(text_file),
+                             'content_length': len(chunk.strip()),
+                             'section': section_ref
+                         }
+                     )
+                     all_documents.append(doc)
+
+         except Exception as e:
+             print(f"  ERROR processing {text_file.name}: {e}")
+             continue
+
+     print(f"\nTotal documents prepared: {len(all_documents)}")
+
+     if not all_documents:
+         print("ERROR: No documents prepared for ingestion")
+         sys.exit(1)
+
+     # Initialize embedding model
+     try:
+         print("\nInitializing embedding model...")
+         embedding_model = HuggingFaceEmbeddings(
+             model_name="BAAI/bge-small-en",
+             model_kwargs={'device': 'cpu'}
+         )
+
+         test_embedding = embedding_model.embed_query("test legal document")
+         print(f"✓ Embedding model ready (dimension: {len(test_embedding)})")
+
+     except Exception as e:
+         print(f"ERROR: Embedding model initialization failed: {e}")
+         sys.exit(1)
+
+     # Create vector database
+     try:
+         print("\nCreating complete vector database (with sections)...")
+
+         vector_db_path = Path("vector_db")
+         if vector_db_path.exists():
+             shutil.rmtree(vector_db_path)
+             print("  Removed existing database")
+
+         vectordb = Chroma.from_documents(
+             documents=all_documents,
+             embedding=embedding_model,
+             persist_directory="vector_db",
+             collection_name="legal_documents"
+         )
+
+         count = vectordb._collection.count()
+         print(f"✓ Successfully stored {count} documents")
+
+         # Test search functionality
+         print("\nTesting search functionality (showing sections)...")
+
+         test_queries = [
+             "constitutional rights",
+             "FCCPA tribunal consumer protection",
+             "labour law employment worker",
+             "data protection privacy"
+         ]
+
+         for query in test_queries:
+             results = vectordb.similarity_search(query, k=3)
+             print(f"\n'{query}': {len(results)} results")
+             for doc in results:
+                 print(f"  [{doc.metadata.get('document_type')}] "
+                       f"{doc.metadata.get('section', 'Unknown Section')} — "
+                       f"{doc.metadata.get('source')}")
+                 print(f"    Preview: {doc.page_content[:120]}...")
+
+         print(f"\n✓ Complete ingestion successful!")
+         print(f"✓ Database contains {count} legal document chunks")
+         print(f"✓ Ready for legal question answering with section references")
+
+     except Exception as e:
+         print(f"ERROR: Vector database creation failed: {e}")
+         import traceback
+         traceback.print_exc()
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
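
For reference, a downstream consumer (for example the bot's question-answering step) can reopen the store this script persists by using the same embedding model, persist directory, and collection name. The following is a minimal sketch, not part of this commit; the script name, query text, and variable names are illustrative, and it assumes the same langchain_community versions imported above.

# query_store.py - minimal sketch of reading the store created by ingest_documents.py.
# Assumes the same langchain_community versions; not part of this commit.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Must match the model used at ingestion time, or similarity scores are meaningless.
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

# Re-open the persisted collection written by ingest_documents.py.
vectordb = Chroma(
    persist_directory="vector_db",
    collection_name="legal_documents",
    embedding_function=embedding_model,
)

# Retrieve the top chunks for a question and show the section-aware metadata.
for doc in vectordb.similarity_search("constitutional rights", k=3):
    meta = doc.metadata
    print(f"[{meta.get('document_type')}] {meta.get('section')} ({meta.get('source')})")
    print(doc.page_content[:200])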