erds / preprocess_chunks.py
mfirat007's picture
Update preprocess_chunks.py
ce21f60 verified
raw
history blame
1.54 kB
import os
import orjson
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import DirectoryLoader
# Path of the input JSON file; preprocess_chunks() opens it in binary mode and
# parses it with orjson.
SRC_JSON = "research_methods_info.json"
# Directory that preprocess_chunks() writes chunk .txt files into and that
# embed_chunks() loads them back from.
OUT_DIR = "/tmp/chunks"
def _record_to_text(rec):
    """Render one record as labelled text sections, omitting blank fields.

    Returns an empty string when every field is missing or blank.
    """
    sections = []
    # Add other (label, key) pairs here if more fields are needed.
    for label, key in (("Name", "name"), ("Description", "description")):
        value = rec.get(key)
        # Filter on the field value itself: filtering the labelled string
        # never drops anything because the label alone is non-blank, and a
        # None value would otherwise be rendered as the text "None".
        if value is None or not str(value).strip():
            continue
        sections.append(f"{label}:\n{value}")
    return "\n\n".join(sections)


def preprocess_chunks():
    """Split every record under the "methods" key of SRC_JSON into chunk files.

    Each record's name and description are joined into one text, split with
    CharacterTextSplitter (chunk_size=1000, chunk_overlap=200), and every
    chunk is written to OUT_DIR as "<record index>_<chunk index>.txt"
    (zero-padded to 3 and 2 digits). Records with no usable text produce no
    files but still consume their index, so file names of the remaining
    records are unaffected.

    Prints the number of files written by this call.

    NOTE: files already present in OUT_DIR from earlier runs are not removed.
    """
    os.makedirs(OUT_DIR, exist_ok=True)
    with open(SRC_JSON, "rb") as f:
        data = orjson.loads(f.read())

    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    written = 0
    for idx, rec in enumerate(data.get("methods", [])):
        text = _record_to_text(rec)
        if not text:
            continue
        for j, chunk in enumerate(splitter.split_text(text)):
            path = os.path.join(OUT_DIR, f"{idx:03d}_{j:02d}.txt")
            with open(path, "w", encoding="utf-8") as outf:
                outf.write(chunk)
            written += 1

    # Report what this run actually wrote rather than the directory size,
    # which would also count leftovers from previous runs.
    print(f"✅ {written} dosya yazıldı → {OUT_DIR}/")
# Directory passed to Chroma as persist_directory by embed_chunks().
CHROMADB_DIR = "/tmp/chromadb"
def embed_chunks():
    """Embed the chunk files in OUT_DIR and persist them to a Chroma store.

    Loads every file matching "**/*.txt" under OUT_DIR, builds a Chroma
    vector store from the loaded documents using a default-constructed
    OpenAIEmbeddings, and persists it under CHROMADB_DIR. Progress is
    reported on stdout before and after.
    """
    print("⚙️ Generating embeddings and persisting to chromadb/ …")

    # Load the chunk files as documents.
    chunk_loader = DirectoryLoader(OUT_DIR, glob="**/*.txt")
    documents = chunk_loader.load()

    # Build the store from the documents, then flush it to disk.
    embedder = OpenAIEmbeddings()
    store = Chroma.from_documents(
        documents,
        embedder,
        persist_directory=CHROMADB_DIR,
    )
    store.persist()

    print("✅ Embedding işlemi tamamlandı.")
# When run as a standalone script, execute the whole pipeline: write the
# chunk files first, then embed them.
if __name__ == "__main__":
    preprocess_chunks()
    embed_chunks()