#!/usr/bin/env python3
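"""Build a section-aware Chroma vector store from pre-processed .txt files.

Designed for constrained hosts (e.g. Hugging Face Spaces): CPU-only
embeddings and no PDF parsing at runtime; the processed text files are
expected to already be committed to the repo.
"""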
import os
import sys
import shutil
import re
from pathlib import Path

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.docstore.document import Document


def extract_section_reference(text: str) -> str:
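    """Return the first Section/Article/Part/Chapter reference found in
    text, or "Unknown Section" if none of the patterns match."""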
    patterns = [
        r"(Section\s+\d+[A-Za-z0-9\-]*)",
        r"(Article\s+\d+[A-Za-z0-9\-]*)",
        r"(Part\s+[IVXLC]+)",
        r"(Chapter\s+\d+)",
    ]
    for p in patterns:
        m = re.search(p, text, re.IGNORECASE)
        if m:
            return m.group(1).strip()
    return "Unknown Section"


def _discover_processed_dirs(project_root: Path):
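    """Return the candidate processed-text directories that exist on disk."""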
    candidates = [
        project_root / "data" / "processed",
        project_root / "src" / "data" / "processed",
    ]
    return [p for p in candidates if p.exists()]


def main():
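    """Discover processed text, chunk it, embed it, and persist a Chroma store."""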
    print("=== INGEST: Section-aware build (Spaces-friendly) ===")
    project_root = Path(__file__).resolve().parent.parent
    print(f"[dbg] project_root: {project_root}")

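    # Load environment overrides (e.g. VECTOR_DB_DIR) from a local .env, if present.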
    load_dotenv()

    processed_dirs = _discover_processed_dirs(project_root)
    if not processed_dirs:
        print("ERROR: No processed directories found.")
        print("Expected one of: ./data/processed or ./src/data/processed")
        sys.exit(1)

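    # Gather every processed .txt file across the discovered directories.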
    text_files = []
    for d in processed_dirs:
        text_files += list(d.glob("*.txt"))
    text_files = sorted(text_files)

    if not text_files:
        print("ERROR: No .txt files found in processed directories.")
        print("Make sure you committed your processed text files to the repo.")
        sys.exit(1)

    print(f"Found {len(text_files)} processed files:")
    for f in text_files:
        try:
            rel = f.relative_to(project_root)
        except ValueError:
            rel = f
        print("  -", rel)

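    # Paragraph-first splitting with overlap keeps sentences (and the section
    # headers we extract below) intact across chunk boundaries.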
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800, chunk_overlap=150, separators=["\n\n", "\n", ". ", " "]
    )

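    # Convert each file into metadata-rich Document chunks.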
    docs = []
    for tf in text_files:
        try:
            content = tf.read_text(encoding="utf-8")
        except Exception as e:
            print(f"[warn] Could not read {tf}: {e}")
            continue

        if not content.strip():
            print(f"[warn] Empty file, skipping: {tf}")
            continue

        chunks = splitter.split_text(content)
        base = tf.stem
        source_pdfish = base.replace("_text", "").replace("_TXT", "")

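        # Infer the legal domain from the filename so retrieval can later
        # filter on the document_type metadata field.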
        lowname = tf.name.lower()
        if "constitution" in lowname:
            doc_type = "constitution"
        elif "labour" in lowname:
            doc_type = "labour_law"
        elif "fccpa" in lowname:
            doc_type = "consumer_protection"
        elif "data_protection" in lowname or "ndpr" in lowname:
            doc_type = "data_protection"
        else:
            doc_type = "general"

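        # Drop near-empty chunks; tag the rest with section and provenance
        # metadata. Note: total_chunks counts chunks before this filter.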
        for i, ch in enumerate(chunks):
            ch = ch.strip()
            if len(ch) < 25:
                continue
            section = extract_section_reference(ch)
            docs.append(
                Document(
                    page_content=ch,
                    metadata={
                        "document_type": doc_type,
                        "section": section,
                        "source": source_pdfish,
                        "chunk_index": i,
                        "total_chunks": len(chunks),
                        "file_path": str(tf.relative_to(project_root)),
                        "content_length": len(ch),
                    },
                )
            )

    if not docs:
        print("ERROR: No chunks prepared. Check your .txt content.")
        sys.exit(1)

    print(f"Prepared {len(docs)} chunks total.")

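    # bge-small-en is a compact, CPU-friendly embedding model; the embed_query
    # call below doubles as a warm-up and sanity check on the embedding dim.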
    print("Initializing embeddings...")
    embed = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en", model_kwargs={"device": "cpu"})
    test = embed.embed_query("hello")
    print(f"[dbg] embedding dim: {len(test)}")

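    # Rebuild the store from scratch on every run. Note: a relative
    # VECTOR_DB_DIR resolves against the current working directory, not
    # project_root.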
    persist_dir = Path(os.getenv("VECTOR_DB_DIR", "vector_db"))
    if persist_dir.exists():
        shutil.rmtree(persist_dir)
        print("[dbg] removed existing vector_db")
    persist_dir.mkdir(parents=True, exist_ok=True)

    print(f"Building Chroma at: {persist_dir}")
    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embed,
        persist_directory=str(persist_dir),
        collection_name="legal_documents",
    )

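    # _collection is a private attribute, but it is the usual way to get a
    # row count out of the langchain Chroma wrapper.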
    count = vectordb._collection.count()
    print(f"✅ Ingestion complete. Stored {count} chunks in 'legal_documents'.")
    if count == 0:
        print("ERROR: Zero chunks after build. Investigate your input files.")
        sys.exit(1)


if __name__ == "__main__":
    main()
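
# A quick retrieval smoke test (a sketch, assuming the defaults above; run it
# in a separate session after ingestion):
#
#   from langchain_chroma import Chroma
#   from langchain_community.embeddings import HuggingFaceEmbeddings
#
#   embed = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en",
#                                 model_kwargs={"device": "cpu"})
#   db = Chroma(persist_directory="vector_db",
#               embedding_function=embed,
#               collection_name="legal_documents")
#   for doc in db.similarity_search("notice period for termination", k=3):
#       print(doc.metadata.get("section"), "|", doc.page_content[:80])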