"""Split research-method records into text chunks and embed them into Chroma."""

import os

import orjson
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import DirectoryLoader

SRC_JSON = "research_methods_info.json"
OUT_DIR = "/tmp/chunks"
CHROMADB_DIR = "/tmp/chromadb"

# (section label, record key) pairs rendered into each record's text.
# Add other fields here if needed.
_FIELDS = (
    ("Name", "name"),
    ("Description", "description"),
)


def _record_text(rec):
    """Render one record as labelled sections separated by blank lines.

    Fields that are missing, ``None`` or whitespace-only are skipped so the
    result never contains a bare label such as ``"Name:\\n"``.

    Args:
        rec: Mapping holding the record's fields (looked up via ``.get``).

    Returns:
        The joined section text, or ``""`` when no field has content.
    """
    sections = []
    for label, key in _FIELDS:
        value = rec.get(key)
        if value is None:
            continue
        value = str(value)
        # Filter on the value itself: the labelled section is never blank,
        # so testing the section (as the old code did) filtered nothing.
        if value.strip():
            sections.append(f"{label}:\n{value}")
    return "\n\n".join(sections)


def preprocess_chunks():
    """Read ``SRC_JSON`` and write each record's text chunks into ``OUT_DIR``.

    Records are taken from the top-level ``"methods"`` list of the JSON
    document. Each record's text is split with ``CharacterTextSplitter``
    (chunk_size=1000, chunk_overlap=200) and every chunk is written to
    ``OUT_DIR/<record index>_<chunk index>.txt`` as UTF-8. The number of
    files written during this call is printed at the end.
    """
    os.makedirs(OUT_DIR, exist_ok=True)

    with open(SRC_JSON, "rb") as f:
        data = orjson.loads(f.read())

    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    # NOTE: files left in OUT_DIR by earlier runs are not removed here, and
    # embed_chunks() loads every *.txt it finds under OUT_DIR.
    written = 0
    for idx, rec in enumerate(data.get("methods", [])):
        text = _record_text(rec)
        for j, chunk in enumerate(splitter.split_text(text)):
            path = os.path.join(OUT_DIR, f"{idx:03d}_{j:02d}.txt")
            with open(path, "w", encoding="utf-8") as outf:
                outf.write(chunk)
            written += 1

    # Report what this run wrote rather than whatever the directory holds.
    print(f"✅ {written} dosya yazıldı → {OUT_DIR}/")


def embed_chunks():
    """Embed every ``*.txt`` file under ``OUT_DIR`` and persist to Chroma.

    Documents are loaded with ``DirectoryLoader`` using the recursive glob
    ``**/*.txt``, embedded with a default-configured ``OpenAIEmbeddings()``,
    and stored in a Chroma collection persisted at ``CHROMADB_DIR``.
    """
    print("⚙️ Generating embeddings and persisting to chromadb/ …")
    docs = DirectoryLoader(OUT_DIR, glob="**/*.txt").load()
    db = Chroma.from_documents(
        docs,
        OpenAIEmbeddings(),
        persist_directory=CHROMADB_DIR,
    )
    db.persist()
    print("✅ Embedding işlemi tamamlandı.")


# When run as a standalone script
if __name__ == "__main__":
    preprocess_chunks()
    embed_chunks()