"""Ingestion pipeline: clean noisy PDF text and build a persistent Chroma vector store."""
import re
import unicodedata
import hashlib
from pathlib import Path
import ftfy
import unidecode
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
import logging
# Configure logging to show up in Docker logs
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def clean_text(text: str) -> str:
    """Scrub structural and encoding noise from extracted PDF text.

    Removes page-number artifacts and horizontal rules, repairs mojibake,
    transliterates to ASCII, and normalizes whitespace so downstream
    chunking sees clean, contiguous prose.

    Args:
        text: Raw page text as produced by the PDF loader.

    Returns:
        Cleaned text with leading/trailing whitespace stripped.
    """
    # 1. Structural scrubbing: "Page X of Y", "3 / 10" counters,
    #    standalone "- 4 -" page markers, and ASCII horizontal rules.
    text = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\b\d+\s*/\s*\d+\b', '', text)
    text = re.sub(r'^\s*-\s*\d+\s*-\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'[-*_]{3,}', '', text)
    # 2. Encoding repairs: fix mojibake first, then transliterate to ASCII.
    #    (NFKC after unidecode is effectively a no-op on pure-ASCII output
    #    but is kept as a cheap safety net.)
    text = ftfy.fix_text(text)
    text = unidecode.unidecode(text)
    text = unicodedata.normalize('NFKC', text)
    # 3. Whitespace normalization.
    text = re.sub(r'[\t\xa0]', ' ', text)
    text = re.sub(r'(?<=[a-z])\n(?=[a-z])', ' ', text)  # join mid-sentence line breaks
    text = re.sub(r' +', ' ', text)
    # FIX: the scrubbing above leaves runs of blank lines where headers,
    # footers, and rules used to be; collapse them so chunks are not
    # padded with vertical whitespace.
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
def build_index(data_dir: str, persist_dir: str):
    """Ingest PDFs from ``data_dir`` into a persistent Chroma vector store.

    Recursively loads every PDF under ``data_dir``, cleans each page's
    text, splits it into overlapping chunks, then embeds and persists the
    chunks under ``persist_dir``.

    Args:
        data_dir: Directory tree searched recursively for ``*.pdf`` files.
        persist_dir: Directory where the Chroma store is persisted.

    Returns:
        The populated ``Chroma`` vector store.
    """
    logger.info(f"Starting ingestion from: {data_dir}")
    loader = DirectoryLoader(data_dir, glob="**/*.pdf", loader_cls=PyPDFLoader)
    raw_docs = loader.load()
    logger.info(f"Loaded {len(raw_docs)} documents.")
    # Gemini 2025 standard embedding model
    embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200, chunk_overlap=150, add_start_index=True
    )
    final_chunks = []
    for i, doc in enumerate(raw_docs):
        logger.info(f"Processing doc {i+1}/{len(raw_docs)}: {doc.metadata.get('source', 'unknown')}")
        cleaned_content = clean_text(doc.page_content)
        if not cleaned_content:
            # Scrubbing can reduce boilerplate-only pages to nothing; skip them.
            continue
        source_name = Path(doc.metadata.get("source", "unknown")).name
        # Metadata carried onto every chunk for citation purposes.
        # NOTE(review): PyPDFLoader page numbers are typically 0-based —
        # confirm the default of 1 is intended for missing pages.
        metadata = {
            "source": source_name,
            "page": doc.metadata.get("page", 1),
        }
        chunks = splitter.create_documents([cleaned_content], metadatas=[metadata])
        # FIX: the hash was previously computed over the entire cleaned page
        # and copied onto every chunk, so sibling chunks all shared one
        # "chunk_hash". Hash each chunk's own content so the value actually
        # identifies that chunk (dedup/citation key, not a security use).
        for chunk in chunks:
            chunk.metadata["chunk_hash"] = hashlib.md5(
                chunk.page_content.encode()
            ).hexdigest()
        final_chunks.extend(chunks)
    logger.info(f"Total chunks created: {len(final_chunks)}")
    logger.info(f"Persisting to VectorDB at: {persist_dir}")
    vectorstore = Chroma.from_documents(
        documents=final_chunks,
        embedding=embeddings,
        persist_directory=persist_dir
    )
    logger.info("VectorDB successfully built and persisted.")
    return vectorstore