# processing.py — robust loader supporting .md/.txt/.pdf and safe FAISS load/rebuild
import os
import re
from glob import glob
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor
from typing import List
# Document loaders
#from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Try new embeddings package first (avoids deprecation warnings)
try:
from langchain_huggingface import HuggingFaceEmbeddings
except Exception:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from config import Config
# document loaders: prefer langchain_community but fall back to langchain
try:
from langchain_community.document_loaders import TextLoader, PyPDFLoader
except Exception:
from langchain.document_loaders import TextLoader, PyPDFLoader
# embeddings: prefer langchain_huggingface if present
# FAISS vectorstore (langchain-community implementation)
try:
from langchain_community.vectorstores import FAISS
except Exception:
# older langchain may expose vectorstores differently; this keeps it explicit
from langchain_community.vectorstores import FAISS
def _abs(p: str) -> str:
return os.path.abspath(os.path.expanduser(p))
def clean_text(text: str) -> str:
    """Drop non-ASCII characters, collapse whitespace runs, and trim the result."""
    ascii_only = re.sub(r"[^\x00-\x7F]+", " ", text)
    collapsed = re.sub(r"\s+", " ", ascii_only)
    return collapsed.strip()
def _load_text_file(path: str) -> List[Document]:
    """Load a .txt/.md file with TextLoader, cleaning text and tagging its source.

    Each returned Document has its ``page_content`` ASCII-normalized via
    ``clean_text`` and ``metadata["source"]`` set to the file's basename.
    """
    source_name = os.path.basename(path)
    loaded = TextLoader(path, encoding="utf-8").load()
    for doc in loaded:
        doc.page_content = clean_text(doc.page_content)
        doc.metadata["source"] = source_name
    return loaded
def _load_pdf(path: str) -> List[Document]:
    """Load a PDF with PyPDFLoader, returning one cleaned Document per page.

    Page text is ASCII-normalized via ``clean_text`` and each page's
    ``metadata["source"]`` is set to the file's basename.
    """
    source_name = os.path.basename(path)
    pages = PyPDFLoader(path).load_and_split()
    cleaned: List[Document] = []
    for page in pages:
        page.page_content = clean_text(page.page_content)
        page.metadata["source"] = source_name
        cleaned.append(page)
    return cleaned
def _load_one(path: str) -> List[Document]:
    """Dispatch a single file to the appropriate loader based on its extension."""
    ext = os.path.splitext(path)[1].lower()
    if ext in (".txt", ".md", ".markdown", ".rst"):
        return _load_text_file(path)
    if ext == ".pdf":
        return _load_pdf(path)
    # Unknown extension: best-effort attempt as plain text, skip on failure.
    try:
        return _load_text_file(path)
    except Exception:
        print(f"Skipping unsupported file type: {path}")
        return []


def process_documents() -> List[Document]:
    """
    Read every file matched by Config.DOC_GLOB and return split document chunks.

    Supports .pdf, .md, .markdown, .rst and .txt; files with other extensions
    are attempted as plain text and skipped if that fails.

    Returns:
        List of Document chunks produced by RecursiveCharacterTextSplitter
        using Config.CHUNK_SIZE / Config.CHUNK_OVERLAP.

    Raises:
        RuntimeError: if the glob matches no files, or no documents load.
    """
    files = glob(Config.DOC_GLOB)
    if not files:
        raise RuntimeError(
            f"No files found for DOC_GLOB={Config.DOC_GLOB} (cwd={os.getcwd()})"
        )
    docs: List[Document] = []
    # BUG FIX: the executor was previously created but never used — the loop
    # inside the `with` loaded files serially. Files now load concurrently via
    # ex.map, which still yields results in the original file order.
    with ThreadPoolExecutor() as ex:
        for loaded in ex.map(_load_one, files):
            docs.extend(loaded)
    if not docs:
        raise RuntimeError(
            "No documents loaded from files - check DOC_GLOB and file contents."
        )
    # Split into chunks sized per Config.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=Config.CHUNK_SIZE,
        chunk_overlap=Config.CHUNK_OVERLAP,
    )
    return splitter.split_documents(docs)
@lru_cache(maxsize=1)
def _get_embeddings():
    """Construct the HuggingFace embedding model once and reuse it (cached).

    The device defaults to "cpu" unless Config declares EMBEDDING_DEVICE.
    """
    device = getattr(Config, "EMBEDDING_DEVICE", "cpu")
    return HuggingFaceEmbeddings(
        model_name=Config.EMBEDDING_MODEL,
        model_kwargs={"device": device},
    )
def load_or_create_index(force_rebuild: bool = False):
    """
    Load the FAISS index from Config.INDEX_DIR, or build it from source documents.

    If loading fails for any reason the index is rebuilt from scratch.

    Args:
        force_rebuild: skip the load attempt and rebuild unconditionally.

    Returns:
        A FAISS vectorstore ready for similarity search.

    Raises:
        RuntimeError: if rebuilding yields no chunks to index.
    """
    emb = _get_embeddings()
    index_dir = _abs(Config.INDEX_DIR)
    os.makedirs(index_dir, exist_ok=True)
    # Try loading the existing index unless a rebuild was requested.
    # (The directory is guaranteed to exist after makedirs above, so the
    # previous os.path.isdir() check was redundant.)
    if not force_rebuild:
        try:
            print(f"Attempting to load existing FAISS index from {index_dir} ...")
            # allow_dangerous_deserialization is required because FAISS docstore
            # metadata is pickled; only safe because this app wrote the index.
            return FAISS.load_local(index_dir, emb, allow_dangerous_deserialization=True)
        except Exception as e:
            print("Failed to load existing FAISS index:", e)
            try:
                print("Index dir listing:", os.listdir(index_dir))
            except Exception as e2:
                print("Could not list index dir:", e2)
            print("Will attempt to rebuild the index from source documents.")
    # Rebuild index from the source documents.
    print("Building FAISS index from source documents...")
    chunks = process_documents()
    if not chunks:
        raise RuntimeError("No chunks to index after processing documents.")
    index = FAISS.from_documents(chunks, emb)
    index.save_local(index_dir)
    # BUG FIX: this message was a string literal split across two physical
    # lines (a SyntaxError); the garbled emoji prefixes were also dropped.
    print("FAISS index built and saved to", index_dir)
    return index