Spaces:
Sleeping
Sleeping
# Ingestion script: loads channel posts into SQL, then splits, embeds and
# indexes the rows stored for today's date in Qdrant.
import os
import sys

# Put the current working directory on the import path before the `src.*`
# imports below are resolved.
sys.path.append(os.getcwd())

import time
import datetime

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
import pandas as pd

from src.config import pyro_source, CHANNEL_ID
from src.data.clean import clean_df
from src.db_utils.sql_utils import sql_dump_df, sql_get_by_date
from src.db_utils.qdrant_utils import qdrant_insert
from src.data.splitter import Splitter

# Captured once so that the load window and the SQL date query below agree
# even if the run crosses midnight.
today = datetime.datetime.today()

# Load into SQL
posts = pyro_source.load_days(
    channel_id=CHANNEL_ID,
    from_date=today,
)
df = pd.DataFrame(posts)
df = clean_df(df)
sql_dump_df(df, "posts", if_exists="append")

# Load into Qdrant
splitter_mode = "recursive"
model_name = "deepvk/USER-bge-m3"
# Index name is "<splitter mode>_<model name without the organisation prefix>".
vector_index_name = f"{splitter_mode}_{model_name.split('/')[1]}"
splitter = Splitter(splitter_mode, chunk_size=256, chunk_overlap=64)
emb = HuggingFaceEmbeddings(
    model_name=model_name,
    encode_kwargs={"normalize_embeddings": True},
)

batch_size = 16
# Rows are read by ISO date string; each row is indexed by "content" and "ctid".
rows = sql_get_by_date(today.date().isoformat())
for start in range(0, len(rows), batch_size):
    batch = rows[start:start + batch_size]
    frames = []
    for row in batch:
        chunks = splitter.split_text(row["content"])
        vectors = emb.embed_documents(chunks)
        # One frame per source row: the scalar doc_id is broadcast across
        # that row's chunks and their vectors.
        frames.append(
            pd.DataFrame({"doc_id": row["ctid"], "text": chunks, "vector": vectors})
        )
    result = qdrant_insert(pd.concat(frames), vector_index_name)
    # Report the real end of the batch; the last batch may be shorter.
    print(f"{start} - {start + len(batch)}:", result)
    # NOTE(review): pause between batches, presumably to throttle requests
    # to Qdrant — confirm it is still needed.
    time.sleep(0.3)