Spaces:
Runtime error
Runtime error
| import os | |
| import orjson | |
| from langchain.text_splitter import CharacterTextSplitter | |
| from langchain.embeddings import OpenAIEmbeddings | |
| from langchain.vectorstores import Chroma | |
| from langchain.document_loaders import DirectoryLoader | |
# JSON document holding the research-method records to be chunked.
SRC_JSON = "research_methods_info.json"
# Directory that receives one .txt file per generated chunk.
OUT_DIR = "/tmp/chunks"
def preprocess_chunks():
    """Split every research-method record in SRC_JSON into text chunk files.

    Reads the JSON document at ``SRC_JSON``, builds one labelled text per
    entry of its ``"methods"`` list, splits that text with a
    ``CharacterTextSplitter`` (chunk_size=1000, chunk_overlap=200) and writes
    each chunk to ``OUT_DIR`` as ``<record index>_<chunk index>.txt``.

    Fields that are missing, ``None`` or blank are left out of the text, and
    a record without any usable field produces no file. Finally prints how
    many files this call wrote.
    """
    os.makedirs(OUT_DIR, exist_ok=True)
    with open(SRC_JSON, "rb") as f:
        data = orjson.loads(f.read())

    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    # (section label, JSON key) pairs; append further fields here if needed.
    sections = (("Name", "name"), ("Description", "description"))

    written = 0
    # ``or []`` also covers a "methods" key that is present but null.
    for idx, rec in enumerate(data.get("methods") or []):
        parts = []
        for label, key in sections:
            value = rec.get(key)
            # Test the field value itself: the labelled string is never
            # blank, so filtering on it would keep empty sections.
            if value is None or not str(value).strip():
                continue
            parts.append(f"{label}:\n{value}")
        if not parts:
            continue

        text = "\n\n".join(parts)
        for j, chunk in enumerate(splitter.split_text(text)):
            path = os.path.join(OUT_DIR, f"{idx:03d}_{j:02d}.txt")
            with open(path, "w", encoding="utf-8") as outf:
                outf.write(chunk)
            written += 1

    # Report what this run wrote, not whatever already sat in OUT_DIR.
    print(f"✅ {written} dosya yazıldı → {OUT_DIR}/")
# Directory where the Chroma vector store is persisted.
CHROMADB_DIR = "/tmp/chromadb"
def embed_chunks():
    """Embed the chunk files in OUT_DIR and persist them as a Chroma store.

    Loads every file matching ``**/*.txt`` under ``OUT_DIR``, builds a Chroma
    store from those documents using ``OpenAIEmbeddings`` and persists it to
    ``CHROMADB_DIR``. Progress messages are printed before and after.
    """
    print("⚙️ Generating embeddings and persisting to chromadb/ …")

    loader = DirectoryLoader(OUT_DIR, glob="**/*.txt")
    documents = loader.load()

    embedding_fn = OpenAIEmbeddings()
    store = Chroma.from_documents(
        documents,
        embedding_fn,
        persist_directory=CHROMADB_DIR,
    )
    store.persist()

    print("✅ Embedding işlemi tamamlandı.")
# When executed as a script (not imported), run the whole pipeline:
# write the chunk files first, then embed them.
if __name__ == "__main__":
    preprocess_chunks()
    embed_chunks()