# processing.py — robust loader supporting .md/.txt/.pdf and safe FAISS load/rebuild
import os
import re
from glob import glob
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor
from typing import List
# Document loaders
#from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Try new embeddings package first (avoids deprecation warnings)
try:
from langchain_huggingface import HuggingFaceEmbeddings
except Exception:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from config import Config
# document loaders: prefer langchain_community but fall back to langchain
try:
from langchain_community.document_loaders import TextLoader, PyPDFLoader
except Exception:
from langchain.document_loaders import TextLoader, PyPDFLoader
# embeddings: prefer langchain_huggingface if present
# FAISS vectorstore (langchain-community implementation)
try:
from langchain_community.vectorstores import FAISS
except Exception:
# older langchain may expose vectorstores differently; this keeps it explicit
from langchain_community.vectorstores import FAISS
def _abs(p: str) -> str:
return os.path.abspath(os.path.expanduser(p))
def clean_text(text: str) -> str:
    """Drop non-ASCII characters, collapse whitespace runs, and trim the result."""
    ascii_only = re.sub(r"[^\x00-\x7F]+", " ", text)
    collapsed = re.sub(r"\s+", " ", ascii_only)
    return collapsed.strip()
def _load_text_file(path: str) -> List[Document]:
    """Load a .txt/.md file with TextLoader, cleaning text and tagging its source.

    Each returned Document has its ``page_content`` ASCII-normalized via
    ``clean_text`` and ``metadata["source"]`` set to the file's basename.
    """
    source_name = os.path.basename(path)
    loaded = TextLoader(path, encoding="utf-8").load()
    for doc in loaded:
        doc.page_content = clean_text(doc.page_content)
        doc.metadata["source"] = source_name
    return loaded
def _load_pdf(path: str) -> List[Document]:
    """Load a PDF with PyPDFLoader, returning one cleaned Document per page.

    Page text is ASCII-normalized via ``clean_text`` and each page's
    ``metadata["source"]`` is set to the file's basename.
    """
    source_name = os.path.basename(path)
    pages = PyPDFLoader(path).load_and_split()
    cleaned: List[Document] = []
    for page in pages:
        page.page_content = clean_text(page.page_content)
        page.metadata["source"] = source_name
        cleaned.append(page)
    return cleaned
def _load_one(path: str) -> List[Document]:
    """Dispatch a single file to the appropriate loader based on its extension."""
    ext = os.path.splitext(path)[1].lower()
    if ext in (".txt", ".md", ".markdown", ".rst"):
        return _load_text_file(path)
    if ext == ".pdf":
        return _load_pdf(path)
    # Unknown extension: best-effort attempt as plain text, skip on failure.
    try:
        return _load_text_file(path)
    except Exception:
        print(f"Skipping unsupported file type: {path}")
        return []


def process_documents() -> List[Document]:
    """
    Read every file matched by Config.DOC_GLOB and return split document chunks.

    Supports .pdf, .md, .markdown, .rst and .txt; files with other extensions
    are attempted as plain text and skipped if that fails.

    Returns:
        List of Document chunks produced by RecursiveCharacterTextSplitter
        using Config.CHUNK_SIZE / Config.CHUNK_OVERLAP.

    Raises:
        RuntimeError: if the glob matches no files, or no documents load.
    """
    files = glob(Config.DOC_GLOB)
    if not files:
        raise RuntimeError(
            f"No files found for DOC_GLOB={Config.DOC_GLOB} (cwd={os.getcwd()})"
        )
    docs: List[Document] = []
    # BUG FIX: the executor was previously created but never used — the loop
    # inside the `with` loaded files serially. Files now load concurrently via
    # ex.map, which still yields results in the original file order.
    with ThreadPoolExecutor() as ex:
        for loaded in ex.map(_load_one, files):
            docs.extend(loaded)
    if not docs:
        raise RuntimeError(
            "No documents loaded from files - check DOC_GLOB and file contents."
        )
    # Split into chunks sized per Config.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=Config.CHUNK_SIZE,
        chunk_overlap=Config.CHUNK_OVERLAP,
    )
    return splitter.split_documents(docs)
@lru_cache(maxsize=1)
def _get_embeddings():
    """Construct the HuggingFace embedding model once and reuse it (cached).

    The device defaults to "cpu" unless Config declares EMBEDDING_DEVICE.
    """
    device = getattr(Config, "EMBEDDING_DEVICE", "cpu")
    return HuggingFaceEmbeddings(
        model_name=Config.EMBEDDING_MODEL,
        model_kwargs={"device": device},
    )
def load_or_create_index(force_rebuild: bool = False):
    """
    Load the FAISS index from Config.INDEX_DIR, or build it from source documents.

    If loading fails for any reason the index is rebuilt from scratch.

    Args:
        force_rebuild: skip the load attempt and rebuild unconditionally.

    Returns:
        A FAISS vectorstore ready for similarity search.

    Raises:
        RuntimeError: if rebuilding yields no chunks to index.
    """
    emb = _get_embeddings()
    index_dir = _abs(Config.INDEX_DIR)
    os.makedirs(index_dir, exist_ok=True)
    # Try loading the existing index unless a rebuild was requested.
    # (The directory is guaranteed to exist after makedirs above, so the
    # previous os.path.isdir() check was redundant.)
    if not force_rebuild:
        try:
            print(f"Attempting to load existing FAISS index from {index_dir} ...")
            # allow_dangerous_deserialization is required because FAISS docstore
            # metadata is pickled; only safe because this app wrote the index.
            return FAISS.load_local(index_dir, emb, allow_dangerous_deserialization=True)
        except Exception as e:
            print("Failed to load existing FAISS index:", e)
            try:
                print("Index dir listing:", os.listdir(index_dir))
            except Exception as e2:
                print("Could not list index dir:", e2)
            print("Will attempt to rebuild the index from source documents.")
    # Rebuild index from the source documents.
    print("Building FAISS index from source documents...")
    chunks = process_documents()
    if not chunks:
        raise RuntimeError("No chunks to index after processing documents.")
    index = FAISS.from_documents(chunks, emb)
    index.save_local(index_dir)
    # BUG FIX: this message was a string literal split across two physical
    # lines (a SyntaxError); the garbled emoji prefixes were also dropped.
    print("FAISS index built and saved to", index_dir)
    return index