File size: 1,502 Bytes
565e754
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import os
import sys
sys.path.append(os.getcwd())
import time
import datetime

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
import pandas as pd

from src.config import pyro_source, CHANNEL_ID
from src.data.clean import clean_df
from src.db_utils.sql_utils import sql_dump_df, sql_get_by_date
from src.db_utils.qdrant_utils import qdrant_insert
from src.data.splitter import Splitter


# Daily ingestion: pull today's channel posts into SQL, then chunk, embed,
# and upsert them into a Qdrant vector index.

today = datetime.datetime.today()

# --- Load into SQL ---
# Reuse `today` instead of calling datetime.today() a second time: the
# original called it twice, so a run straddling midnight could load posts
# for one date but query sql_get_by_date() below for another.
posts = pyro_source.load_days(
    channel_id=CHANNEL_ID,
    from_date=today,
)

df = pd.DataFrame(posts)
df = clean_df(df)

sql_dump_df(df, "posts", if_exists="append")

# --- Load into Qdrant ---
splitter_mode = "recursive"
model_name = "deepvk/USER-bge-m3"
# Index name encodes both the splitter and the embedding model, e.g.
# "recursive_USER-bge-m3".
vector_index_name = f"{splitter_mode}_{model_name.split('/')[1]}"

splitter = Splitter(splitter_mode, chunk_size=256, chunk_overlap=64)
emb = HuggingFaceEmbeddings(
    model_name=model_name,
    encode_kwargs={"normalize_embeddings": True},
)

batch_size = 16
rows = sql_get_by_date(today.date().isoformat())
for i in range(0, len(rows), batch_size):
    batch = rows[i:i + batch_size]
    dfs = []
    for r in batch:
        chunks = splitter.split_text(r["content"])
        vectors = emb.embed_documents(chunks)
        # `ctid` (Postgres physical row id) links each chunk back to its row.
        dfs.append(pd.DataFrame({"doc_id": r["ctid"], "text": chunks, "vector": vectors}))

    # `i + len(batch)` is the true upper bound; the original printed
    # `offset + batch_size`, which overshoots on the final partial batch.
    # (The separate `offset` counter duplicated `i` and is removed.)
    print(f"{i} - {i + len(batch)}:", qdrant_insert(pd.concat(dfs), vector_index_name))

    # Brief pause so consecutive batch inserts don't hammer Qdrant.
    time.sleep(0.3)