#!/usr/bin/env python3
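"""Build a section-aware Chroma vector store from pre-processed .txt files.

Designed for constrained hosts (e.g. Hugging Face Spaces): CPU-only
embeddings and no PDF parsing at runtime; the processed text files are
expected to already be committed to the repo.
"""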
import os
import sys
import shutil
import re
from pathlib import Path

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.docstore.document import Document


def extract_section_reference(text: str) -> str:
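    """Return the first Section/Article/Part/Chapter reference found in
    text, or "Unknown Section" if none of the patterns match."""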
    patterns = [
        r"(Section\s+\d+[A-Za-z0-9\-]*)",
        r"(Article\s+\d+[A-Za-z0-9\-]*)",
        r"(Part\s+[IVXLC]+)",
        r"(Chapter\s+\d+)",
    ]
    for p in patterns:
        m = re.search(p, text, re.IGNORECASE)
        if m:
            return m.group(1).strip()
    return "Unknown Section"


def _discover_processed_dirs(project_root: Path):
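    """Return the candidate processed-text directories that exist on disk."""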
    candidates = [
        project_root / "data" / "processed",
        project_root / "src" / "data" / "processed",
    ]
    return [p for p in candidates if p.exists()]


def main():
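    """Discover processed text, chunk it, embed it, and persist a Chroma store."""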
    print("=== INGEST: Section-aware build (Spaces-friendly) ===")
    project_root = Path(__file__).resolve().parent.parent
    print(f"[dbg] project_root: {project_root}")

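    # Load environment overrides (e.g. VECTOR_DB_DIR) from a local .env, if present.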
    load_dotenv()

    processed_dirs = _discover_processed_dirs(project_root)
    if not processed_dirs:
        print("ERROR: No processed directories found.")
        print("Expected one of: ./data/processed or ./src/data/processed")
        sys.exit(1)

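    # Gather every processed .txt file across the discovered directories.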
    text_files = []
    for d in processed_dirs:
        text_files += list(d.glob("*.txt"))
    text_files = sorted(text_files)

    if not text_files:
        print("ERROR: No .txt files found in processed directories.")
        print("Make sure you committed your processed text files to the repo.")
        sys.exit(1)

    print(f"Found {len(text_files)} processed files:")
    for f in text_files:
        try:
            rel = f.relative_to(project_root)
        except ValueError:
            rel = f
        print("  -", rel)

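    # Paragraph-first splitting with overlap keeps sentences (and the section
    # headers we extract below) intact across chunk boundaries.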
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800, chunk_overlap=150, separators=["\n\n", "\n", ". ", " "]
    )

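    # Convert each file into metadata-rich Document chunks.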
    docs = []
    for tf in text_files:
        try:
            content = tf.read_text(encoding="utf-8")
        except Exception as e:
            print(f"[warn] Could not read {tf}: {e}")
            continue

        if not content.strip():
            print(f"[warn] Empty file, skipping: {tf}")
            continue

        chunks = splitter.split_text(content)
        base = tf.stem
        source_pdfish = base.replace("_text", "").replace("_TXT", "")

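        # Infer the legal domain from the filename so retrieval can later
        # filter on the document_type metadata field.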
        lowname = tf.name.lower()
        if "constitution" in lowname:
            doc_type = "constitution"
        elif "labour" in lowname:
            doc_type = "labour_law"
        elif "fccpa" in lowname:
            doc_type = "consumer_protection"
        elif "data_protection" in lowname or "ndpr" in lowname:
            doc_type = "data_protection"
        else:
            doc_type = "general"

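        # Drop near-empty chunks; tag the rest with section and provenance
        # metadata. Note: total_chunks counts chunks before this filter.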
        for i, ch in enumerate(chunks):
            ch = ch.strip()
            if len(ch) < 25:
                continue
            section = extract_section_reference(ch)
            docs.append(
                Document(
                    page_content=ch,
                    metadata={
                        "document_type": doc_type,
                        "section": section,
                        "source": source_pdfish,
                        "chunk_index": i,
                        "total_chunks": len(chunks),
                        "file_path": str(tf.relative_to(project_root)),
                        "content_length": len(ch),
                    },
                )
            )

    if not docs:
        print("ERROR: No chunks prepared. Check your .txt content.")
        sys.exit(1)

    print(f"Prepared {len(docs)} chunks total.")

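    # bge-small-en is a compact, CPU-friendly embedding model; the embed_query
    # call below doubles as a warm-up and sanity check on the embedding dim.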
    print("Initializing embeddings...")
    embed = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en", model_kwargs={"device": "cpu"})
    test = embed.embed_query("hello")
    print(f"[dbg] embedding dim: {len(test)}")

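    # Rebuild the store from scratch on every run. Note: a relative
    # VECTOR_DB_DIR resolves against the current working directory, not
    # project_root.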
    persist_dir = Path(os.getenv("VECTOR_DB_DIR", "vector_db"))
    if persist_dir.exists():
        shutil.rmtree(persist_dir)
        print("[dbg] removed existing vector_db")
    persist_dir.mkdir(parents=True, exist_ok=True)

    print(f"Building Chroma at: {persist_dir}")
    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embed,
        persist_directory=str(persist_dir),
        collection_name="legal_documents",
    )

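    # _collection is a private attribute, but it is the usual way to get a
    # row count out of the langchain Chroma wrapper.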
    count = vectordb._collection.count()
    print(f"✅ Ingestion complete. Stored {count} chunks in 'legal_documents'.")
    if count == 0:
        print("ERROR: Zero chunks after build. Investigate your input files.")
        sys.exit(1)


if __name__ == "__main__":
    main()
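
# A quick retrieval smoke test (a sketch, assuming the defaults above; run it
# in a separate session after ingestion):
#
#   from langchain_chroma import Chroma
#   from langchain_community.embeddings import HuggingFaceEmbeddings
#
#   embed = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en",
#                                 model_kwargs={"device": "cpu"})
#   db = Chroma(persist_directory="vector_db",
#               embedding_function=embed,
#               collection_name="legal_documents")
#   for doc in db.similarity_search("notice period for termination", k=3):
#       print(doc.metadata.get("section"), "|", doc.page_content[:80])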