#!/usr/bin/env python3
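"""Build a section-aware Chroma vector store from processed legal text files.

Reads .txt files from the processed-data directories, splits them into
overlapping chunks, tags each chunk with section and provenance metadata,
and persists the embeddings to a local Chroma collection.
"""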
import os
import sys
import shutil
import re
from pathlib import Path

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.docstore.document import Document
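

# Pull the first Section/Article/Part/Chapter reference out of a chunk so it
# can be stored as metadata and cited alongside retrieved text.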
def extract_section_reference(text: str) -> str:
    patterns = [
        r"(Section\s+\d+[A-Za-z0-9\-]*)",
        r"(Article\s+\d+[A-Za-z0-9\-]*)",
        r"(Part\s+[IVXLC]+)",
        r"(Chapter\s+\d+)",
    ]
    for p in patterns:
        m = re.search(p, text, re.IGNORECASE)
        if m:
            return m.group(1).strip()
    return "Unknown Section"
def _discover_processed_dirs(project_root: Path):
    candidates = [
        project_root / "data" / "processed",
        project_root / "src" / "data" / "processed",
    ]
    return [p for p in candidates if p.exists()]


def main():
    print("=== INGEST: Section-aware build (Spaces-friendly) ===")
    project_root = Path(__file__).resolve().parent.parent
    print(f"[dbg] project_root: {project_root}")
    load_dotenv()

    processed_dirs = _discover_processed_dirs(project_root)
    if not processed_dirs:
        print("ERROR: No processed directories found.")
        print("Expected one of: ./data/processed or ./src/data/processed")
        sys.exit(1)
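
    # Collect every processed .txt file across the discovered directories.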
    text_files = []
    for d in processed_dirs:
        text_files += list(d.glob("*.txt"))
    text_files = sorted(text_files)
    if not text_files:
        print("ERROR: No .txt files found in processed directories.")
        print("Make sure you committed your processed text files to the repo.")
        sys.exit(1)

    print(f"Found {len(text_files)} processed files:")
    for f in text_files:
        try:
            rel = f.relative_to(project_root)
        except Exception:
            rel = f
        print(" -", rel)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800, chunk_overlap=150, separators=["\n\n", "\n", ". ", " "]
    )
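
    # Read each file, split it, and wrap the chunks in Documents with metadata.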
    docs = []
    for tf in text_files:
        try:
            content = tf.read_text(encoding="utf-8")
        except Exception as e:
            print(f"[warn] Could not read {tf}: {e}")
            continue
        if not content.strip():
            print(f"[warn] Empty file, skipping: {tf}")
            continue

        chunks = splitter.split_text(content)
        base = tf.stem
        source_pdfish = base.replace("_text", "").replace("_TXT", "")
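
        # Infer the document type from the filename so queries can filter by statute.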
        lowname = tf.name.lower()
        if "constitution" in lowname:
            doc_type = "constitution"
        elif "labour" in lowname:
            doc_type = "labour_law"
        elif "fccpa" in lowname:
            doc_type = "consumer_protection"
        elif "data_protection" in lowname or "ndpr" in lowname:
            doc_type = "data_protection"
        else:
            doc_type = "general"
        for i, ch in enumerate(chunks):
            ch = ch.strip()
            if len(ch) < 25:
                continue
            section = extract_section_reference(ch)
            docs.append(
                Document(
                    page_content=ch,
                    metadata={
                        "document_type": doc_type,
                        "section": section,
                        "source": source_pdfish,
                        "chunk_index": i,
                        "total_chunks": len(chunks),
                        "file_path": str(tf.relative_to(project_root)),
                        "content_length": len(ch),
                    },
                )
            )

    if not docs:
        print("ERROR: No chunks prepared. Check your .txt content.")
        sys.exit(1)
    print(f"Prepared {len(docs)} chunks total.")
| print("Initializing embeddings...") | |
| embed = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en", model_kwargs={"device": "cpu"}) | |
| test = embed.embed_query("hello") | |
| print(f"[dbg] embedding dim: {len(test)}") | |
    persist_dir = Path(os.getenv("VECTOR_DB_DIR", "vector_db"))
    if persist_dir.exists():
        shutil.rmtree(persist_dir)
        print("[dbg] removed existing vector_db")
    persist_dir.mkdir(parents=True, exist_ok=True)

    print(f"Building Chroma at: {persist_dir}")
    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embed,
        persist_directory=str(persist_dir),
        collection_name="legal_documents",
    )
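    # NOTE: _collection is a private Chroma attribute, used here only for a
    # quick sanity count of what was stored.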
    count = vectordb._collection.count()
    if count == 0:
        print("ERROR: Zero chunks after build. Investigate your input files.")
        sys.exit(1)
    print(f"✅ Ingestion complete. Stored {count} chunks in 'legal_documents'.")


if __name__ == "__main__":
    main()