import os
import sys
sys.path.append(os.getcwd())
import time
import datetime
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
import pandas as pd
from src.config import pyro_source, CHANNEL_ID
from src.data.clean import clean_df
from src.db_utils.sql_utils import sql_dump_df, sql_get_by_date
from src.db_utils.qdrant_utils import qdrant_insert
from src.data.splitter import Splitter
# Single timestamp for the whole run: both the Telegram fetch below and the
# SQL re-read in the qdrant section must refer to the same calendar day.
today = datetime.datetime.today()
# NOTE(review): naive local datetime — presumably the pipeline host and the
# channel share a timezone; confirm.

# --- Load today's posts into SQL ---
posts = pyro_source.load_days(
    channel_id=CHANNEL_ID,
    # Reuse `today` instead of calling datetime.datetime.today() again:
    # a second call could land on the next day if the script crosses
    # midnight, desynchronizing this load from the later sql_get_by_date.
    from_date=today,
)
df = pd.DataFrame(posts)
df = clean_df(df)  # project-level normalization of the raw post records
sql_dump_df(df, "posts", if_exists="append")
# --- Qdrant ingestion setup ---
splitter_mode = "recursive"
model_name = "deepvk/USER-bge-m3"
# Collection name = splitting strategy + bare model id (repo name without org).
vector_index_name = "{}_{}".format(splitter_mode, model_name.rsplit("/", 1)[1])
splitter = Splitter(splitter_mode, chunk_size=256, chunk_overlap=64)
# Normalized embeddings so qdrant cosine/dot scores are comparable.
emb = HuggingFaceEmbeddings(
    model_name=model_name,
    encode_kwargs={"normalize_embeddings": True},
)
batch_size = 16
# Re-read today's rows from SQL so each chunk carries the ctid assigned on insert.
rows = sql_get_by_date(today.date().isoformat())
# Embed and insert in batches; `start` is the batch offset (the original kept a
# separate `offset` counter that merely duplicated the loop index).
for start in range(0, len(rows), batch_size):
    frames = []
    for row in rows[start:start + batch_size]:
        chunks = splitter.split_text(row["content"])
        vectors = emb.embed_documents(chunks)  # one vector per chunk
        frames.append(
            pd.DataFrame({"doc_id": row["ctid"], "text": chunks, "vector": vectors})
        )
    # Clamp the label to the real batch end: the old `offset + batch_size`
    # overstated the final short batch.
    end = min(start + batch_size, len(rows))
    print(f"{start} - {end}:", qdrant_insert(pd.concat(frames), vector_index_name))
    time.sleep(0.3)  # gentle throttle between qdrant inserts