cmd0160's picture
Adding base files
9797603
raw
history blame
2.81 kB
"""Ingest documents from data/ into a Chroma vectorstore using OpenAI embeddings.
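
Supported inputs are the .txt, .md, .csv and .pdf files found directly inside
the data directory; sub-directories and other file types are skipped.

Usage:
    python -m src.ingest --data-dir ./data --persist-dir ./vectorstore \
        [--chunk-size 1000] [--chunk-overlap 200]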
"""
import os
# Default the telemetry-related environment variables to their "off" values.
# setdefault() only fills in variables that are not already present, so a
# value exported by the caller is never overridden.
# NOTE(review): these lines sit above the langchain/Chroma imports, presumably
# so the libraries see the settings when they are first imported — confirm
# before reordering the imports.
os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false")
os.environ.setdefault("LANGCHAIN_DISABLE_TELEMETRY", "true")
os.environ.setdefault("CHROMA_TELEMETRY_ENABLED", "false")
import argparse
from typing import List
from langchain.document_loaders import TextLoader, CSVLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
try:
from langchain_openai import OpenAIEmbeddings
except Exception:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
def load_documents_from_dir(data_dir: str) -> List:
    """Load every supported file located directly inside ``data_dir``.

    Entries are visited in sorted filename order and sub-directories are
    skipped (there is no recursion). The loader is picked from the
    case-insensitive file extension:

    * ``.txt`` / ``.md`` -> ``TextLoader`` opened as UTF-8
    * ``.csv``           -> ``CSVLoader`` opened as UTF-8
    * ``.pdf``           -> ``PyPDFLoader`` (best effort: a failure only
      prints a warning and the file is skipped)

    Files with any other extension are reported on stdout and skipped.
    Errors raised while loading text or CSV files are not caught and
    propagate to the caller.

    Args:
        data_dir: Directory whose immediate files should be loaded.

    Returns:
        A list holding the documents returned by the loaders, in the order
        the files were visited.

    Raises:
        OSError: If ``data_dir`` cannot be listed (raised by ``os.listdir``).
    """
    docs = []
    for fname in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, fname)
        if os.path.isdir(path):
            continue

        # Lower-case once so the extension checks are case-insensitive.
        lowered = fname.lower()
        if lowered.endswith((".txt", ".md")):
            docs.extend(TextLoader(path, encoding="utf-8").load())
        elif lowered.endswith(".csv"):
            docs.extend(CSVLoader(path, encoding="utf-8").load())
        elif lowered.endswith(".pdf"):
            try:
                docs.extend(PyPDFLoader(path).load())
            except Exception as exc:
                # Deliberately best-effort: one unreadable PDF must not abort
                # the whole run. The real error is appended because a missing
                # pypdf install is only one of several possible causes.
                print(
                    f"Warning: Could not load PDF {path}. Ensure pypdf is installed."
                    f" ({type(exc).__name__}: {exc})"
                )
        else:
            print(f"Skipping unknown file type: {path}")
    return docs
def ingest(
    data_dir: str = "./data",
    persist_dir: str = "./vectorstore",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
) -> None:
    """Load, split, embed and persist the documents found in ``data_dir``.

    Steps, with progress printed to stdout along the way:

    1. Verify that ``OPENAI_API_KEY`` is set to a non-empty value.
    2. Load the documents with ``load_documents_from_dir``.
    3. Split them with ``RecursiveCharacterTextSplitter``.
    4. Embed the chunks with ``OpenAIEmbeddings`` into a Chroma store
       rooted at ``persist_dir`` (created if missing) and persist it.

    Args:
        data_dir: Directory containing the source documents.
        persist_dir: Directory in which the Chroma vectorstore is persisted.
        chunk_size: ``chunk_size`` forwarded to the text splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the text splitter.

    Raises:
        AssertionError: If ``OPENAI_API_KEY`` is missing or empty.
    """
    # Explicit check rather than an `assert` statement: asserts are removed
    # when Python runs with -O, which would silently skip this validation.
    # AssertionError is kept so existing callers see the same exception type.
    if not os.environ.get("OPENAI_API_KEY"):
        raise AssertionError("OPENAI_API_KEY must be set in environment")

    print(f"Loading documents from {data_dir}")
    docs = load_documents_from_dir(data_dir)
    print(f"Loaded {len(docs)} documents")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_docs = text_splitter.split_documents(docs)
    print(f"Split into {len(split_docs)} chunks")

    embeddings = OpenAIEmbeddings()
    # Make sure the target directory exists before Chroma writes into it.
    os.makedirs(persist_dir, exist_ok=True)
    db = Chroma.from_documents(split_docs, embeddings, persist_directory=persist_dir)
    db.persist()
    print(f"Vectorstore persisted to {persist_dir}")
if __name__ == "__main__":
    # Command-line entry point: every flag mirrors an ingest() parameter and
    # carries the same default value.
    cli = argparse.ArgumentParser()
    for flag, kind, fallback in (
        ("--data-dir", str, "./data"),
        ("--persist-dir", str, "./vectorstore"),
        ("--chunk-size", int, 1000),
        ("--chunk-overlap", int, 200),
    ):
        cli.add_argument(flag, type=kind, default=fallback)
    opts = cli.parse_args()
    ingest(
        data_dir=opts.data_dir,
        persist_dir=opts.persist_dir,
        chunk_size=opts.chunk_size,
        chunk_overlap=opts.chunk_overlap,
    )