Spaces:
Runtime error
Runtime error
File size: 1,536 Bytes
7bef56d 0d89fd2 7bef56d 0d89fd2 c8928ae 7bef56d 0d89fd2 0f443b7 ce21f60 0f443b7 ce21f60 0f443b7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
import os
import orjson
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import DirectoryLoader
# JSON file read by preprocess_chunks(); the entries under its "methods" key are the records that get chunked.
SRC_JSON = "research_methods_info.json"
# Directory where preprocess_chunks() writes chunk .txt files and from which embed_chunks() loads them.
OUT_DIR = "/tmp/chunks"
def preprocess_chunks() -> None:
    """Split every record of the source JSON into text chunk files.

    Reads ``SRC_JSON`` and, for each entry under its ``"methods"`` key,
    builds a labelled text from the ``name`` and ``description`` fields,
    splits it with ``CharacterTextSplitter`` (chunk_size=1000,
    chunk_overlap=200) and writes every chunk as UTF-8 to
    ``OUT_DIR/<record index>_<chunk index>.txt``. ``OUT_DIR`` is created
    when it does not exist; files already present there are not removed.

    Fields that are missing, ``None`` or blank are left out entirely, so a
    chunk never contains a dangling ``"Description:"`` header. A record
    with no usable field produces no file.

    Returns:
        None. Prints how many chunk files this call wrote.
    """
    os.makedirs(OUT_DIR, exist_ok=True)
    with open(SRC_JSON, "rb") as f:
        data = orjson.loads(f.read())

    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    written = 0
    # `or []` also covers an explicit `"methods": null` in the JSON.
    for idx, rec in enumerate(data.get("methods") or []):
        # (label, value) pairs; add other fields here if needed.
        fields = [
            ("Name", rec.get("name")),
            ("Description", rec.get("description")),
        ]
        # Filter on the VALUE, not on the labelled string: the label alone
        # makes the labelled string non-blank, so filtering it drops nothing.
        parts = [
            f"{label}:\n{value}"
            for label, value in fields
            if value is not None and str(value).strip()
        ]
        if not parts:
            continue

        text = "\n\n".join(parts)
        for j, chunk in enumerate(splitter.split_text(text)):
            out_path = os.path.join(OUT_DIR, f"{idx:03d}_{j:02d}.txt")
            with open(out_path, "w", encoding="utf-8") as outf:
                outf.write(chunk)
            written += 1

    # Report what this run wrote, not whatever happens to be in OUT_DIR
    # (which may still hold files from earlier runs).
    print(f"✅ {written} dosya yazıldı → {OUT_DIR}/")
# Directory that embed_chunks() passes to Chroma as persist_directory.
CHROMADB_DIR = "/tmp/chromadb"
def embed_chunks():
    """Embed the chunk files in ``OUT_DIR`` and persist them with Chroma.

    Loads every ``*.txt`` file found (recursively) under ``OUT_DIR``, builds
    a Chroma store from those documents using ``OpenAIEmbeddings`` with
    ``CHROMADB_DIR`` as the persist directory, and calls ``persist()`` on it.
    Progress messages are printed before and after.
    """
    print("⚙️ Generating embeddings and persisting to chromadb/ …")

    # NOTE(review): DirectoryLoader runs with its default loader class here;
    # confirm whatever parser dependency it needs is installed where this runs.
    loader = DirectoryLoader(OUT_DIR, glob="**/*.txt")
    documents = loader.load()

    embedding_fn = OpenAIEmbeddings()
    store = Chroma.from_documents(
        documents,
        embedding_fn,
        persist_directory=CHROMADB_DIR,
    )
    store.persist()

    print("✅ Embedding işlemi tamamlandı.")
# When run as a standalone script: write the chunk files, then embed them.
if __name__ == "__main__":
    preprocess_chunks()
    embed_chunks()
|