File size: 1,536 Bytes
7bef56d
0d89fd2
7bef56d
0d89fd2
 
c8928ae
7bef56d
0d89fd2
0f443b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce21f60
0f443b7
 
 
ce21f60
0f443b7
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import os
import orjson
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import DirectoryLoader

# Input: JSON file containing the method records to be chunked.
SRC_JSON = "research_methods_info.json"
# Output: directory where the individual chunk .txt files are written.
OUT_DIR = "/tmp/chunks"

def preprocess_chunks():
    """Split each method record from SRC_JSON into text chunks on disk.

    Reads SRC_JSON (expects a top-level ``"methods"`` list of dicts with
    ``"name"`` and ``"description"`` keys), joins the fields of each record
    into one document, splits it with ``CharacterTextSplitter`` (1000-char
    chunks, 200-char overlap), and writes one UTF-8 ``.txt`` file per chunk
    into OUT_DIR, named ``<record_idx:03d>_<chunk_idx:02d>.txt``.

    Side effects: creates OUT_DIR if missing and writes chunk files there.
    Raises FileNotFoundError if SRC_JSON does not exist.
    """
    os.makedirs(OUT_DIR, exist_ok=True)

    # orjson.loads wants bytes, hence the binary read.
    with open(SRC_JSON, "rb") as f:
        data = orjson.loads(f.read())

    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    # Count only the files written in THIS run. The previous
    # len(os.listdir(OUT_DIR)) also counted stale chunks left over from
    # earlier runs (the directory is never cleaned), so the reported
    # count could be wrong after a re-run.
    written = 0
    for idx, rec in enumerate(data.get("methods", [])):
        parts = [
            f"Name:\n{rec.get('name','')}",
            f"Description:\n{rec.get('description','')}",
            # add further fields here if needed
        ]
        # Drop sections that are blank so we don't split empty text.
        text = "\n\n".join([p for p in parts if p.strip()])
        chunks = splitter.split_text(text)

        for j, chunk in enumerate(chunks):
            with open(f"{OUT_DIR}/{idx:03d}_{j:02d}.txt", "w", encoding="utf-8") as outf:
                outf.write(chunk)
            written += 1

    print(f"✅ {written} dosya yazıldı → {OUT_DIR}/")

CHROMADB_DIR = "/tmp/chromadb"
def embed_chunks():
    """Embed every chunk file under OUT_DIR and persist the vector store.

    Loads all ``*.txt`` files (recursively) from OUT_DIR, computes OpenAI
    embeddings for them, builds a Chroma store, and persists it to
    CHROMADB_DIR.
    """
    print("⚙️ Generating embeddings and persisting to chromadb/ …")
    loader = DirectoryLoader(OUT_DIR, glob="**/*.txt")
    documents = loader.load()
    vector_store = Chroma.from_documents(
        documents,
        OpenAIEmbeddings(),
        persist_directory=CHROMADB_DIR,
    )
    vector_store.persist()
    print("✅ Embedding işlemi tamamlandı.")


# Eğer bağımsız çalıştırılırsa
if __name__ == "__main__":
    preprocess_chunks()
    embed_chunks()