File size: 4,968 Bytes
166ec24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# processing.py  β€” robust loader supporting .md/.txt/.pdf and safe FAISS load/rebuild

import os
import re
from glob import glob
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor
from typing import List

# Document loaders
#from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Try new embeddings package first (avoids deprecation warnings)
try:
    from langchain_huggingface import HuggingFaceEmbeddings
except Exception:
    from langchain.embeddings import HuggingFaceEmbeddings

from langchain_community.vectorstores import FAISS
from langchain.schema import Document

from config import Config

# document loaders: prefer langchain_community but fall back to langchain
try:
    from langchain_community.document_loaders import TextLoader, PyPDFLoader
except Exception:
    from langchain.document_loaders import TextLoader, PyPDFLoader



# embeddings: prefer langchain_huggingface if present


# FAISS vectorstore (langchain-community implementation)
try:
    from langchain_community.vectorstores import FAISS
except Exception:
    # older langchain may expose vectorstores differently; this keeps it explicit
    from langchain_community.vectorstores import FAISS


def _abs(p: str) -> str:
    return os.path.abspath(os.path.expanduser(p))

def clean_text(text: str) -> str:
    """Replace non-ASCII runs with a space and collapse all whitespace.

    Returns the cleaned string with leading/trailing whitespace stripped.
    """
    ascii_only = re.sub(r"[^\x00-\x7F]+", " ", text)
    collapsed = re.sub(r"\s+", " ", ascii_only)
    return collapsed.strip()

def _load_text_file(path: str) -> List[Document]:
    """Load a UTF-8 text file (.txt/.md) via TextLoader.

    Each returned Document has its content normalized with clean_text()
    and its ``source`` metadata set to the file's basename.
    """
    source_name = os.path.basename(path)
    documents = TextLoader(path, encoding="utf-8").load()
    for doc in documents:
        doc.page_content = clean_text(doc.page_content)
        doc.metadata["source"] = source_name
    return documents

def _load_pdf(path: str) -> List[Document]:
    """Load a PDF page-by-page with PyPDFLoader.

    Each page's text is normalized with clean_text() and tagged with the
    file's basename under the ``source`` metadata key.
    """
    source_name = os.path.basename(path)
    cleaned_pages = []
    for page in PyPDFLoader(path).load_and_split():
        page.page_content = clean_text(page.page_content)
        page.metadata["source"] = source_name
        cleaned_pages.append(page)
    return cleaned_pages

def process_documents() -> List[Document]:
    """
    Read files matched by Config.DOC_GLOB and return split document chunks.

    Supports .pdf plus common plain-text extensions (.txt, .md, .markdown,
    .rst). Any other extension is attempted with the text loader and
    skipped (with a message) on failure.

    Returns:
        List[Document]: chunks produced by RecursiveCharacterTextSplitter
        using Config.CHUNK_SIZE / Config.CHUNK_OVERLAP.

    Raises:
        RuntimeError: if no files match DOC_GLOB, or no documents could
            be loaded from the matched files.
    """
    files = glob(Config.DOC_GLOB)
    if not files:
        raise RuntimeError(
            f"No files found for DOC_GLOB={Config.DOC_GLOB} (cwd={os.getcwd()})"
        )

    text_exts = {".txt", ".md", ".markdown", ".rst"}

    def _load_one(path: str) -> List[Document]:
        # Per-file loader dispatch; returns [] when the file is unsupported.
        ext = os.path.splitext(path)[1].lower()
        if ext in text_exts:
            return _load_text_file(path)
        if ext == ".pdf":
            return _load_pdf(path)
        # Fallback attempt for unknown extensions: try the text loader.
        try:
            return _load_text_file(path)
        except Exception:
            print(f"Skipping unsupported file type: {path}")
            return []

    # BUG FIX: the executor was previously created but never used — every
    # file was loaded sequentially on the main thread. File loading is
    # I/O-bound, so fan it out across worker threads; ex.map preserves the
    # input order of `files` in its results.
    docs: List[Document] = []
    with ThreadPoolExecutor() as ex:
        for loaded in ex.map(_load_one, files):
            docs.extend(loaded)

    if not docs:
        raise RuntimeError("No documents loaded from files — check DOC_GLOB and file contents.")

    # Split into overlapping chunks for embedding / retrieval.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=Config.CHUNK_SIZE,
        chunk_overlap=Config.CHUNK_OVERLAP,
    )
    return splitter.split_documents(docs)

@lru_cache(maxsize=1)
def _get_embeddings():
    """Build the HuggingFace embeddings model once and cache it for reuse."""
    # Default to CPU when Config does not declare an embedding device.
    device = getattr(Config, "EMBEDDING_DEVICE", "cpu")
    return HuggingFaceEmbeddings(
        model_name=Config.EMBEDDING_MODEL,
        model_kwargs={"device": device},
    )

def load_or_create_index(force_rebuild: bool = False):
    """
    Load the FAISS index from Config.INDEX_DIR, or build it from documents.

    Args:
        force_rebuild: when True, skip loading and always rebuild the index
            from the chunks returned by process_documents().

    Returns:
        A FAISS vectorstore, either loaded from disk or freshly built
        (and saved back to Config.INDEX_DIR).

    Raises:
        RuntimeError: when a rebuild is needed but no documents/chunks are
            available (propagated from process_documents()).
    """
    emb = _get_embeddings()
    index_dir = _abs(Config.INDEX_DIR)
    os.makedirs(index_dir, exist_ok=True)

    # BUG FIX: the old code checked os.path.isdir(index_dir) right after
    # makedirs(..., exist_ok=True), so the check was always true and a
    # fresh empty directory produced a noisy, guaranteed load failure.
    # Gate the load on the saved index file itself (FAISS.save_local
    # writes "index.faiss" by default).
    index_file = os.path.join(index_dir, "index.faiss")
    if os.path.isfile(index_file) and not force_rebuild:
        try:
            print(f"📂 Attempting to load existing FAISS index from {index_dir} ...")
            # The saved index contains pickled data; deserialization is
            # only safe because we load indexes this app wrote itself.
            return FAISS.load_local(index_dir, emb, allow_dangerous_deserialization=True)
        except Exception as e:
            print("⚠️ Failed to load existing FAISS index:", e)
            try:
                print("Index dir listing:", os.listdir(index_dir))
            except Exception as e2:
                print("Could not list index dir:", e2)
            print("Will attempt to rebuild the index from source documents.")

    # Rebuild the index from the source documents.
    print("📝 Building FAISS index from source documents...")
    chunks = process_documents()
    if not chunks:
        raise RuntimeError("No chunks to index after processing documents.")
    index = FAISS.from_documents(chunks, emb)
    index.save_local(index_dir)
    print("✅ FAISS index built and saved to", index_dir)
    return index