#!/usr/bin/env python3
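"""Ingest processed legal-document text files into a Chroma vector store.

Walks the processed-text directories, splits each .txt file into
overlapping chunks, tags every chunk with section and document-type
metadata, and builds a fresh 'legal_documents' Chroma collection.
"""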
import os
import re
import shutil
import sys
from pathlib import Path

from dotenv import load_dotenv
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings


def extract_section_reference(text: str) -> str:
    """Return the first section/article/part/chapter reference found in text."""
    patterns = [
        r"(Section\s+\d+[A-Za-z0-9\-]*)",
        r"(Article\s+\d+[A-Za-z0-9\-]*)",
        r"(Part\s+[IVXLC]+)",
        r"(Chapter\s+\d+)",
    ]
    for p in patterns:
        m = re.search(p, text, re.IGNORECASE)
        if m:
            return m.group(1).strip()
    return "Unknown Section"


def _discover_processed_dirs(project_root: Path):
    """Return the candidate processed-text directories that actually exist."""
    candidates = [
        project_root / "data" / "processed",
        project_root / "src" / "data" / "processed",
    ]
    return [p for p in candidates if p.exists()]


def main():
    print("=== INGEST: Section-aware build (Spaces-friendly) ===")
    project_root = Path(__file__).resolve().parent.parent
    print(f"[dbg] project_root: {project_root}")
    load_dotenv()

    processed_dirs = _discover_processed_dirs(project_root)
    if not processed_dirs:
        print("ERROR: No processed directories found.")
        print("Expected one of: ./data/processed or ./src/data/processed")
        sys.exit(1)
    # Collect every processed .txt file across the discovered directories.
    text_files = []
    for d in processed_dirs:
        text_files += list(d.glob("*.txt"))
    text_files = sorted(text_files)
    if not text_files:
        print("ERROR: No .txt files found in processed directories.")
        print("Make sure you committed your processed text files to the repo.")
        sys.exit(1)

    print(f"Found {len(text_files)} processed files:")
    for f in text_files:
        try:
            rel = f.relative_to(project_root)
        except ValueError:
            rel = f
        print(" -", rel)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800, chunk_overlap=150, separators=["\n\n", "\n", ". ", " "]
    )
    docs = []
    for tf in text_files:
        try:
            content = tf.read_text(encoding="utf-8")
        except Exception as e:
            print(f"[warn] Could not read {tf}: {e}")
            continue
        if not content.strip():
            print(f"[warn] Empty file, skipping: {tf}")
            continue

        chunks = splitter.split_text(content)
        base = tf.stem
        source_pdfish = base.replace("_text", "").replace("_TXT", "")

        # Infer the document type from the filename.
        lowname = tf.name.lower()
        if "constitution" in lowname:
            doc_type = "constitution"
        elif "labour" in lowname:
            doc_type = "labour_law"
        elif "fccpa" in lowname:
            doc_type = "consumer_protection"
        elif "data_protection" in lowname or "ndpr" in lowname:
            doc_type = "data_protection"
        else:
            doc_type = "general"

        for i, ch in enumerate(chunks):
            ch = ch.strip()
            if len(ch) < 25:
                # Skip fragments too short to be useful retrieval hits.
                continue
            section = extract_section_reference(ch)
            docs.append(
                Document(
                    page_content=ch,
                    metadata={
                        "document_type": doc_type,
                        "section": section,
                        "source": source_pdfish,
                        "chunk_index": i,
                        "total_chunks": len(chunks),
                        "file_path": str(tf.relative_to(project_root)),
                        "content_length": len(ch),
                    },
                )
            )
    if not docs:
        print("ERROR: No chunks prepared. Check your .txt content.")
        sys.exit(1)
    print(f"Prepared {len(docs)} chunks total.")

    print("Initializing embeddings...")
    embed = HuggingFaceEmbeddings(
        model_name="BAAI/bge-small-en", model_kwargs={"device": "cpu"}
    )
    # Sanity-check the model with a throwaway query before the full build.
    test = embed.embed_query("hello")
    print(f"[dbg] embedding dim: {len(test)}")
    # Rebuild the vector store from scratch on every run.
    persist_dir = Path(os.getenv("VECTOR_DB_DIR", "vector_db"))
    if persist_dir.exists():
        shutil.rmtree(persist_dir)
        print("[dbg] removed existing vector_db")
    persist_dir.mkdir(parents=True, exist_ok=True)

    print(f"Building Chroma at: {persist_dir}")
    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embed,
        persist_directory=str(persist_dir),
        collection_name="legal_documents",
    )
    # _collection is a private attribute of the langchain_chroma wrapper,
    # but it is the simplest way to confirm how many chunks were stored.
    count = vectordb._collection.count()
    if count == 0:
        print("ERROR: Zero chunks after build. Investigate your input files.")
        sys.exit(1)
    print(f"✅ Ingestion complete. Stored {count} chunks in 'legal_documents'.")


if __name__ == "__main__":
    main()
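# Usage sketch (the path is an assumption: project_root is derived from
# Path(__file__).resolve().parent.parent, so the script is expected to live
# one directory below the project root, e.g. scripts/ingest.py):
#   python scripts/ingest.py
# Set VECTOR_DB_DIR to override the default "vector_db" output directory.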