# processing.py — robust loader supporting .md/.txt/.pdf and safe FAISS load/rebuild
"""Document ingestion and FAISS index management.

Reads source files matched by ``Config.DOC_GLOB`` (.pdf/.md/.txt/.markdown/.rst),
cleans and chunks their text, and builds — or safely loads/rebuilds — a FAISS
vector index in ``Config.INDEX_DIR``.
"""

import os
import re
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
from glob import glob
from typing import List

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Embeddings: prefer the dedicated langchain_huggingface package
# (avoids deprecation warnings on newer langchain versions).
try:
    from langchain_huggingface import HuggingFaceEmbeddings
except Exception:
    from langchain.embeddings import HuggingFaceEmbeddings

# Document loaders: prefer langchain_community but fall back to langchain.
try:
    from langchain_community.document_loaders import PyPDFLoader, TextLoader
except Exception:
    from langchain.document_loaders import PyPDFLoader, TextLoader

# FAISS vectorstore (langchain-community implementation).
from langchain_community.vectorstores import FAISS

from config import Config

# Extensions handled by the plain-text loader.
_TEXT_EXTS = {".txt", ".md", ".markdown", ".rst"}


def _abs(p: str) -> str:
    """Return *p* with ``~`` expanded and made absolute."""
    return os.path.abspath(os.path.expanduser(p))


def clean_text(text: str) -> str:
    """Replace non-ASCII runs with spaces and collapse all whitespace."""
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    return re.sub(r"\s+", " ", text).strip()


def _load_text_file(path: str) -> List[Document]:
    """Load .txt and .md files using TextLoader; return list[Document].

    Each document's text is cleaned and its ``source`` metadata set to the
    file's basename.
    """
    loader = TextLoader(path, encoding="utf-8")
    docs = loader.load()
    for d in docs:
        d.page_content = clean_text(d.page_content)
        d.metadata["source"] = os.path.basename(path)
    return docs


def _load_pdf(path: str) -> List[Document]:
    """Load a PDF page-by-page, cleaning text and tagging the source filename."""
    loader = PyPDFLoader(path)
    pages = loader.load_and_split()
    docs = []
    for p in pages:
        p.page_content = clean_text(p.page_content)
        p.metadata["source"] = os.path.basename(path)
        docs.append(p)
    return docs


def _load_one(path: str) -> List[Document]:
    """Dispatch one file to the appropriate loader; return [] if unsupported."""
    ext = os.path.splitext(path)[1].lower()
    if ext in _TEXT_EXTS:
        return _load_text_file(path)
    if ext == ".pdf":
        return _load_pdf(path)
    # Fallback attempt: try the text loader; skip the file on any failure.
    try:
        return _load_text_file(path)
    except Exception:
        print(f"Skipping unsupported file type: {path}")
        return []


def process_documents() -> List[Document]:
    """
    Reads files matched by Config.DOC_GLOB and returns splitted document chunks.
    Supports .pdf, .md, .txt. Add more extensions if needed.

    Raises:
        RuntimeError: when no files match the glob, or no documents load.
    """
    files = glob(Config.DOC_GLOB)
    if not files:
        raise RuntimeError(
            f"No files found for DOC_GLOB={Config.DOC_GLOB} (cwd={os.getcwd()})"
        )

    # Load files concurrently — loading is I/O-bound, and the original code
    # created this executor without ever submitting work to it.
    docs: List[Document] = []
    with ThreadPoolExecutor() as ex:
        for loaded in ex.map(_load_one, files):
            docs.extend(loaded)

    if not docs:
        raise RuntimeError(
            "No documents loaded from files — check DOC_GLOB and file contents."
        )

    # Split into chunks sized per Config.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=Config.CHUNK_SIZE, chunk_overlap=Config.CHUNK_OVERLAP
    )
    return splitter.split_documents(docs)


@lru_cache(maxsize=1)
def _get_embeddings():
    """Build the HuggingFace embedding model once and cache it for reuse."""
    return HuggingFaceEmbeddings(
        model_name=Config.EMBEDDING_MODEL,
        model_kwargs={"device": getattr(Config, "EMBEDDING_DEVICE", "cpu")},
    )


def load_or_create_index(force_rebuild: bool = False):
    """
    Load FAISS index from Config.INDEX_DIR or create it from source documents.
    If loading fails, it will rebuild from documents.
    Set force_rebuild=True to force rebuild.

    Returns:
        A FAISS vectorstore instance.

    Raises:
        RuntimeError: if rebuilding yields no chunks to index.
    """
    emb = _get_embeddings()
    index_dir = _abs(Config.INDEX_DIR)
    os.makedirs(index_dir, exist_ok=True)

    # Try loading existing index if not forced to rebuild.
    if os.path.isdir(index_dir) and not force_rebuild:
        try:
            print(f"📂 Attempting to load existing FAISS index from {index_dir} ...")
            # NOTE: allow_dangerous_deserialization loads pickled data — only
            # safe because the index dir is written by this process itself.
            return FAISS.load_local(
                index_dir, emb, allow_dangerous_deserialization=True
            )
        except Exception as e:
            print("⚠️ Failed to load existing FAISS index:", e)
            try:
                print("Index dir listing:", os.listdir(index_dir))
            except Exception as e2:
                print("Could not list index dir:", e2)
            print("Will attempt to rebuild the index from source documents.")

    # Rebuild index from scratch.
    print("📁 Building FAISS index from source documents...")
    chunks = process_documents()
    if not chunks:
        raise RuntimeError("No chunks to index after processing documents.")
    index = FAISS.from_documents(chunks, emb)
    index.save_local(index_dir)
    print("✅ FAISS index built and saved to", index_dir)
    return index