# Spaces: Sleeping
"""Build a persistent Chroma vector store from the text files in ``DATA_DIR``.

Each ``*.txt`` file directly inside ``DATA_DIR`` (no recursion) is loaded,
split into overlapping chunks with ``text_splitter``, embedded with
``embedding`` and written to ``CHROMA_DIR``.

The ingestion runs at module level, so importing or executing this file
performs the work immediately and leaves ``db`` bound to the resulting store.

Raises:
    FileNotFoundError: if ``DATA_DIR`` does not exist (from ``os.listdir``).
    ValueError: if no chunks were produced, i.e. there is nothing to ingest.
"""

import os

from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

DATA_DIR = "data"  # folder scanned for *.txt source files
CHROMA_DIR = "chroma_db"  # folder the vector store is persisted to

embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

all_docs = []
# os.listdir() yields entries in arbitrary order; sorting makes the chunk
# order (and therefore the ingestion run) reproducible across machines.
for filename in sorted(os.listdir(DATA_DIR)):
    source_path = os.path.join(DATA_DIR, filename)
    # Only regular files are loadable; a directory that merely happens to be
    # named "*.txt" would make TextLoader fail.
    if not (filename.endswith(".txt") and os.path.isfile(source_path)):
        continue
    loader = TextLoader(source_path)
    docs = loader.load()
    chunks = text_splitter.split_documents(docs)
    all_docs.extend(chunks)

# Fail early with an actionable message instead of handing Chroma an empty
# batch. NOTE(review): newer Chroma releases appear to reject empty batches
# with an opaque validation error, while older ones may persist a useless
# empty store - confirm against the pinned version.
if not all_docs:
    raise ValueError(
        f"No text chunks were produced from the .txt files in {DATA_DIR!r}; "
        "nothing to ingest."
    )

db = Chroma.from_documents(all_docs, embedding, persist_directory=CHROMA_DIR)
# NOTE(review): persist() is kept from the original script; whether it is
# still required depends on the installed Chroma/LangChain versions.
db.persist()
print("✅ Ingestion complete")