import hashlib
import logging
import re
import unicodedata
from pathlib import Path

import ftfy
import unidecode
from langchain_chroma import Chroma
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Configure logging so messages show up in Docker logs
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def clean_text(text: str) -> str:
    """Scrub structural artifacts, encoding damage, and whitespace noise from extracted PDF text."""
    # 1. Structural scrubbing: strip page markers and decorative rules
    text = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\b\d+\s*/\s*\d+\b', '', text)  # "3 / 12" style page counters
    text = re.sub(r'^\s*-\s*\d+\s*-\s*$', '', text, flags=re.MULTILINE)  # "- 4 -" centered page numbers
    text = re.sub(r'[-*_]{3,}', '', text)  # horizontal rules
    # 2. Encoding repairs: fix mojibake, transliterate to ASCII, normalize Unicode
    text = ftfy.fix_text(text)
    text = unidecode.unidecode(text)
    text = unicodedata.normalize('NFKC', text)
    # 3. Whitespace normalization
    text = re.sub(r'[\t\xa0]', ' ', text)  # tabs and non-breaking spaces
    text = re.sub(r'(?<=[a-z])\n(?=[a-z])', ' ', text)  # rejoin mid-sentence line breaks
    text = re.sub(r' +', ' ', text)
    return text.strip()
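
# Illustrative sanity check (not part of the original file):
#   clean_text("Page 1 of 2\nrevenue grew\nsharply")  ->  "revenue grew sharply"
# The page marker is stripped and the mid-sentence line break is rejoined.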

def build_index(data_dir: str, persist_dir: str):
    """Process raw PDFs into a cleaned, chunked Chroma vector store."""
    logger.info(f"Starting ingestion from: {data_dir}")
    loader = DirectoryLoader(data_dir, glob="**/*.pdf", loader_cls=PyPDFLoader)
    raw_docs = loader.load()
    logger.info(f"Loaded {len(raw_docs)} documents.")
    # Google's text-embedding-004 model (via langchain-google-genai)
    embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200, chunk_overlap=150, add_start_index=True
    )
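    # Sizing note (illustrative, not in the original file): with chunk_size=1200
    # and chunk_overlap=150 the splitter advances ~1050 characters per chunk, so
    # a ~3,000-character page yields roughly three overlapping chunks;
    # add_start_index=True records each chunk's character offset in its metadata.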
    final_chunks = []
    for i, doc in enumerate(raw_docs):
        logger.info(f"Processing doc {i+1}/{len(raw_docs)}: {doc.metadata.get('source', 'unknown')}")
        cleaned_content = clean_text(doc.page_content)
        source_name = Path(doc.metadata.get("source", "unknown")).name
        # Metadata for citations: source file, page number, and a content hash for deduplication
        metadata = {
            "source": source_name,
            "page": doc.metadata.get("page", 0),  # PyPDFLoader page numbers are 0-indexed
            "chunk_hash": hashlib.md5(cleaned_content.encode()).hexdigest()
        }
        chunks = splitter.create_documents([cleaned_content], metadatas=[metadata])
        final_chunks.extend(chunks)
    logger.info(f"Total chunks created: {len(final_chunks)}")
    logger.info(f"Persisting to VectorDB at: {persist_dir}")
    vectorstore = Chroma.from_documents(
        documents=final_chunks,
        embedding=embeddings,
        persist_directory=persist_dir
    )
    logger.info("VectorDB successfully built and persisted.")
    return vectorstore
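
# Usage sketch (illustrative; the paths and entry point below are assumptions,
# not part of the original file). GoogleGenerativeAIEmbeddings expects a
# GOOGLE_API_KEY in the environment.
if __name__ == "__main__":
    build_index(data_dir="./data", persist_dir="./chroma_db")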