# neuro-tutor-rag / ingest.py
# (Hugging Face Hub page residue, kept as comments so the file parses:)
# Deevyankar's picture
# Create ingest.py
# d56daa4 verified
import os
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
# Folder (relative to the repo root) that SimpleDirectoryReader scans for PDFs.
PDF_DIR = "data"
# Name of the Chroma collection the embedded chunks are written into.
COLLECTION_NAME = "neuro_course"
def get_persist_dir() -> str:
    """Return the directory where the Chroma database should live.

    Hugging Face Spaces mounts persistent storage at /data when it is
    enabled in the Space settings; otherwise we fall back to a local
    folder, which may be wiped when the Space restarts.
    """
    if os.path.exists("/data"):
        return "/data/chroma"
    return "storage/chroma"
def main():
    """Ingest PDFs from PDF_DIR into a persistent Chroma vector index.

    Pipeline: load PDFs -> split into sentence-window chunks -> embed with
    OpenAI -> store vectors in a persistent Chroma collection, and persist
    the LlamaIndex docstore/index metadata alongside it.

    Raises:
        RuntimeError: if OPENAI_API_KEY is not set, or no documents are
            found in PDF_DIR.
    """
    # Embeddings require an OpenAI key; fail fast with a clear message.
    if not os.getenv("OPENAI_API_KEY"):
        raise RuntimeError("OPENAI_API_KEY is missing. Add it as a Space secret.")

    persist_dir = get_persist_dir()
    os.makedirs(persist_dir, exist_ok=True)

    # Load PDF(s) from the data folder in the repo.
    docs = SimpleDirectoryReader(PDF_DIR).load_data()
    if not docs:
        # Robustness: an empty corpus would silently build an empty index.
        raise RuntimeError(f"No documents found in '{PDF_DIR}'.")

    # Chunking (good default for slides/handouts).
    splitter = SentenceSplitter(chunk_size=900, chunk_overlap=120)
    nodes = splitter.get_nodes_from_documents(docs)

    # Persistent Chroma collection; PersistentClient writes vectors to disk
    # under persist_dir on its own.
    client = chromadb.PersistentClient(path=persist_dir)
    collection = client.get_or_create_collection(COLLECTION_NAME)
    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    embed_model = OpenAIEmbedding(model="text-embedding-3-small")
    index = VectorStoreIndex(
        nodes,
        storage_context=storage_context,
        embed_model=embed_model,
    )

    # FIX: persist() with no argument writes docstore/index metadata to the
    # default "./storage" directory, not persist_dir — keep everything in
    # one place so query-time loading finds both Chroma and the metadata.
    index.storage_context.persist(persist_dir=persist_dir)

    print("✅ Ingestion complete.")
    print(f"Persist dir: {persist_dir}")
    print(f"Docs: {len(docs)} | Chunks: {len(nodes)}")
    print(f"Collection: {COLLECTION_NAME}")
# Run ingestion only when executed as a script, not on import.
if __name__ == "__main__":
    main()