Spaces:
Sleeping
Sleeping
| from pathlib import Path | |
| import argparse | |
| import sys | |
| import os | |
| from langchain_community.document_loaders import TextLoader, PyPDFLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| import os | |
| from dotenv import load_dotenv | |
# Read a local .env file (if one exists) before the environment lookups below.
load_dotenv()  # still works locally

# Credentials and model names are taken from the environment. os.getenv
# returns None for any variable that is not set.
HF_API_TOKEN = os.getenv("HUGGING_FACE_API_TOKEN")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
EMBED_MODEL_NAME = os.getenv("HUGGING_FACE_EMBEDDING_MODEL")
LLM_MODEL_NAME = os.getenv("LLM_MODEL")

# All paths are anchored at the directory that contains this file.
ROOT_DIR = Path(__file__).parent
INDEX_DIR = ROOT_DIR / "data_index"
DATA_DIR = ROOT_DIR / "data"
def load_documents(data_dir: Path):
    """Recursively load every .txt, .md and .pdf file found under *data_dir*.

    Directories and files with any other suffix are ignored. A file whose
    loader raises is reported on stderr and skipped, so one bad file does not
    abort the whole run.

    Args:
        data_dir: Root directory to walk.

    Returns:
        A list with everything the loaders returned, in traversal order.

    Raises:
        RuntimeError: If nothing at all could be loaded.
    """
    loaded = []
    for entry in data_dir.rglob("*"):
        if path_is_directory := entry.is_dir():
            continue
        extension = entry.suffix.lower()
        try:
            if extension in (".txt", ".md"):
                loader = TextLoader(str(entry), encoding="utf-8")
            elif extension == ".pdf":
                loader = PyPDFLoader(str(entry))
            else:
                # Unsupported file type: nothing to load.
                continue
            loaded.extend(loader.load())
        except Exception as err:
            # Best effort: report the failure and move on to the next file.
            print(f"[skip] {entry.name}: {err}", file=sys.stderr)

    if loaded:
        return loaded
    raise RuntimeError(f"No documents found in {data_dir}. Put .txt/.md/.pdf files there.")
def build_vectorstore(docs):
    """Chunk *docs*, embed the chunks and return them as a FAISS vector store.

    Args:
        docs: Documents to index, as accepted by the splitter's
            ``split_documents``.

    Returns:
        The store produced by ``FAISS.from_documents``.
    """
    # Split with chunk_size=800 and chunk_overlap=120 so neighbouring chunks
    # share some context.
    chunker = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=120)
    pieces = chunker.split_documents(docs)

    # The embedding model is selected by the module-level EMBED_MODEL_NAME.
    embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)
    return FAISS.from_documents(pieces, embedder)
def main():
    """Load the documents under DATA_DIR and save a FAISS index to INDEX_DIR.

    The embedding model name is written next to the index in
    ``embeddings_model.txt``.

    Raises:
        RuntimeError: If the embedding model name is not configured, or (from
            ``load_documents``) if no documents could be loaded.
    """
    parser = argparse.ArgumentParser(description="Ingest documents and build FAISS index.")
    # No options are defined; the call is kept so --help works and stray
    # arguments are rejected.
    parser.parse_args()

    # Fail fast: without a model name the run cannot succeed, and writing
    # None with write_text() below would raise TypeError. Check before the
    # potentially slow document loading.
    if not EMBED_MODEL_NAME:
        raise RuntimeError(
            "HUGGING_FACE_EMBEDDING_MODEL is not set; cannot build the index."
        )

    print(f"Loading documents from {DATA_DIR}")
    docs = load_documents(DATA_DIR)
    print(f"Loaded {len(docs)} documents. Building index…")
    vs = build_vectorstore(docs)

    INDEX_DIR.mkdir(parents=True, exist_ok=True)
    vs.save_local(str(INDEX_DIR))
    # Persist embedding model name for safety
    (INDEX_DIR / "embeddings_model.txt").write_text(EMBED_MODEL_NAME, encoding="utf-8")
    print(f"Index saved to {INDEX_DIR.resolve()}")
# Build the index only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()