# NOTE(review): removed copy/paste residue from the Hugging Face Spaces UI
# ("Spaces: Sleeping") that was not part of the script.
# Stdlib.
import os

# Third-party: LlamaIndex core + integrations, and Chroma for persistent vectors.
import chromadb
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

# Repo folder containing the source PDFs to ingest.
PDF_DIR = "data"
# Name of the Chroma collection that stores the embedded chunks.
COLLECTION_NAME = "neuro_course"
def get_persist_dir() -> str:
    """Pick the directory where the Chroma database should live.

    Hugging Face Spaces mount persistent storage at /data when it is enabled
    in the Space settings; otherwise fall back to a repo-local folder, which
    may be wiped whenever the Space restarts.
    """
    if os.path.exists("/data"):
        return "/data/chroma"
    return "storage/chroma"
def main() -> None:
    """Embed the PDFs in PDF_DIR into a persistent Chroma collection.

    Raises:
        RuntimeError: if OPENAI_API_KEY is not set, if PDF_DIR does not
            exist, or if no documents could be loaded from it.
    """
    if not os.getenv("OPENAI_API_KEY"):
        raise RuntimeError("OPENAI_API_KEY is missing. Add it as a Space secret.")

    persist_dir = get_persist_dir()
    os.makedirs(persist_dir, exist_ok=True)

    # Fail early with a clear message instead of an opaque reader error.
    if not os.path.isdir(PDF_DIR):
        raise RuntimeError(f"PDF directory '{PDF_DIR}' does not exist.")

    # Load PDF(s) from the data folder in the repo.
    docs = SimpleDirectoryReader(PDF_DIR).load_data()
    if not docs:
        raise RuntimeError(f"No documents found in '{PDF_DIR}'.")

    # Chunking (good default for slides/handouts).
    splitter = SentenceSplitter(chunk_size=900, chunk_overlap=120)
    nodes = splitter.get_nodes_from_documents(docs)

    # Persistent Chroma collection.
    client = chromadb.PersistentClient(path=persist_dir)
    collection = client.get_or_create_collection(COLLECTION_NAME)
    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    embed_model = OpenAIEmbedding(model="text-embedding-3-small")
    index = VectorStoreIndex(
        nodes,
        storage_context=storage_context,
        embed_model=embed_model,
    )
    # BUG FIX: persist() without arguments writes index metadata (docstore,
    # index store) to the default "./storage" folder, NOT to persist_dir —
    # so on a Space with /data enabled, that metadata was lost on restart.
    index.storage_context.persist(persist_dir=persist_dir)

    print("✅ Ingestion complete.")
    print(f"Persist dir: {persist_dir}")
    print(f"Docs: {len(docs)} | Chunks: {len(nodes)}")
    print(f"Collection: {COLLECTION_NAME}")
# Script entry point: run ingestion when executed directly.
if __name__ == "__main__":
    main()