Spaces:
Sleeping
Sleeping
| """Ingest documents from data/ into a Chroma vectorstore using OpenAI embeddings. | |
| Usage: | |
| python -m src.ingest --data-dir ./data --persist-dir ./vectorstore | |
| """ | |
| import os | |
| os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false") | |
| os.environ.setdefault("LANGCHAIN_DISABLE_TELEMETRY", "true") | |
| os.environ.setdefault("CHROMA_TELEMETRY_ENABLED", "false") | |
| import argparse | |
| from typing import List | |
| from langchain.document_loaders import TextLoader, CSVLoader, PyPDFLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| try: | |
| from langchain_openai import OpenAIEmbeddings | |
| except Exception: | |
| from langchain.embeddings import OpenAIEmbeddings | |
| from langchain.vectorstores import Chroma | |
def load_documents_from_dir(data_dir: str) -> List:
    """Load every supported file found directly inside ``data_dir``.

    Files are visited in sorted name order and subdirectories are skipped
    (there is no recursion). The loader is picked from the case-insensitive
    file extension:

    * ``.txt`` / ``.md`` -> ``TextLoader`` (read as UTF-8)
    * ``.csv``           -> ``CSVLoader`` (read as UTF-8)
    * ``.pdf``           -> ``PyPDFLoader``

    Any other extension is reported on stdout and skipped.

    PDF loading is best-effort: a failure prints a warning that names the
    actual error and processing continues with the next file. Errors raised
    by the text and CSV loaders are not caught here and propagate.

    Args:
        data_dir: Directory whose immediate files are loaded.

    Returns:
        A list holding the results of each loader's ``load()`` call,
        concatenated in file-name order.

    Raises:
        OSError: If ``data_dir`` cannot be listed (raised by ``os.listdir``).
    """
    docs = []
    for fname in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, fname)
        if os.path.isdir(path):
            continue

        # Lower-case once so the extension checks are case-insensitive.
        lowered = fname.lower()
        if lowered.endswith((".txt", ".md")):
            docs.extend(TextLoader(path, encoding="utf-8").load())
        elif lowered.endswith(".csv"):
            docs.extend(CSVLoader(path, encoding="utf-8").load())
        elif lowered.endswith(".pdf"):
            try:
                docs.extend(PyPDFLoader(path).load())
            except ImportError as exc:
                # Presumably the optional PDF backend is missing; keep the
                # original installation hint and add the import error text.
                print(f"Warning: Could not load PDF {path}. Ensure pypdf is installed. ({exc})")
            except Exception as exc:
                # Deliberately broad: one unreadable PDF must not abort the
                # whole ingest run, but the real cause is reported instead of
                # being blamed on a missing dependency.
                print(f"Warning: Could not load PDF {path}: {type(exc).__name__}: {exc}")
        else:
            print(f"Skipping unknown file type: {path}")
    return docs
def ingest(
    data_dir: str = "./data",
    persist_dir: str = "./vectorstore",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
) -> None:
    """Load, split, embed and persist the documents found in ``data_dir``.

    Steps: load documents with ``load_documents_from_dir``, split them with
    ``RecursiveCharacterTextSplitter``, embed the chunks with
    ``OpenAIEmbeddings`` and write them to a Chroma store located at
    ``persist_dir``. Progress is printed to stdout.

    Args:
        data_dir: Directory containing the source files.
        persist_dir: Directory for the Chroma store; created if missing.
        chunk_size: ``chunk_size`` passed to the text splitter.
        chunk_overlap: ``chunk_overlap`` passed to the text splitter.

    Raises:
        AssertionError: If ``OPENAI_API_KEY`` is unset or empty.
    """
    # Raised explicitly rather than via ``assert`` so the check still runs
    # under ``python -O``; the exception type and message are kept as before.
    if not os.environ.get("OPENAI_API_KEY"):
        raise AssertionError("OPENAI_API_KEY must be set in environment")

    print(f"Loading documents from {data_dir}")
    docs = load_documents_from_dir(data_dir)
    print(f"Loaded {len(docs)} documents")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_docs = text_splitter.split_documents(docs)
    print(f"Split into {len(split_docs)} chunks")

    # Nothing to embed: stop before creating the embedding client or the
    # store instead of handing an empty batch to them.
    if not split_docs:
        print("No chunks to ingest; vectorstore was not written")
        return

    embeddings = OpenAIEmbeddings()
    os.makedirs(persist_dir, exist_ok=True)
    db = Chroma.from_documents(split_docs, embeddings, persist_directory=persist_dir)

    # NOTE(review): newer Chroma integrations appear to persist automatically
    # and no longer expose ``persist()`` - call it only when it exists.
    persist = getattr(db, "persist", None)
    if callable(persist):
        persist()
    print(f"Vectorstore persisted to {persist_dir}")
if __name__ == "__main__":
    # Command-line entry point: every flag maps onto one ``ingest`` parameter
    # and carries the same default as that parameter.
    cli = argparse.ArgumentParser()
    option_table = (
        ("--data-dir", str, "./data"),
        ("--persist-dir", str, "./vectorstore"),
        ("--chunk-size", int, 1000),
        ("--chunk-overlap", int, 200),
    )
    for flag, value_type, fallback in option_table:
        cli.add_argument(flag, type=value_type, default=fallback)

    opts = cli.parse_args()
    ingest(
        data_dir=opts.data_dir,
        persist_dir=opts.persist_dir,
        chunk_size=opts.chunk_size,
        chunk_overlap=opts.chunk_overlap,
    )