Архипов Дмитрий
committed on
Commit
·
565e754
1
Parent(s):
1272224
test
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- Dockerfile +22 -1
- app.py +429 -0
- classification_results.csv +0 -0
- contestin.ipynb +331 -0
- env.example.txt +23 -0
- frontend.py +404 -0
- news_classification_langgraph.ipynb +697 -0
- news_classification_pipeline.ipynb +0 -0
- qa_evaluation_example.ipynb +465 -0
- question_generation.ipynb +591 -0
- requirements.txt +34 -3
- root.crt +59 -0
- server.py +194 -0
- src/__init__.py +5 -0
- src/config.py +49 -0
- src/data/__init__.py +0 -0
- src/data/__pycache__/__init__.cpython-313.pyc +0 -0
- src/data/__pycache__/parser.cpython-313.pyc +0 -0
- src/data/__pycache__/splitter.cpython-313.pyc +0 -0
- src/data/clean.py +71 -0
- src/data/parser.py +126 -0
- src/data/splitter.py +132 -0
- src/dataset/rbc/channel_rbc_news_posts.csv +0 -0
- src/dataset/test_cases.csv +0 -0
- src/db_utils/__init__.py +0 -0
- src/db_utils/__pycache__/__init__.cpython-313.pyc +0 -0
- src/db_utils/__pycache__/history_utils.cpython-313.pyc +0 -0
- src/db_utils/__pycache__/qdrant_utils.cpython-313.pyc +0 -0
- src/db_utils/__pycache__/sql_utils.cpython-313.pyc +0 -0
- src/db_utils/db_example_usage.ipynb +881 -0
- src/db_utils/history_utils.py +269 -0
- src/db_utils/qdrant_utils.py +58 -0
- src/db_utils/sql_utils.py +92 -0
- src/evaluation/__init__.py +16 -0
- src/evaluation/__pycache__/__init__.cpython-313.pyc +0 -0
- src/evaluation/__pycache__/qa_evaluator.cpython-313.pyc +0 -0
- src/evaluation/qa_evaluator.py +254 -0
- src/evaluation/score_system.ipynb +687 -0
- src/parser/__pycache__/__init__.cpython-313.pyc +0 -0
- src/parser/__pycache__/pyrosource.cpython-313.pyc +0 -0
- src/rag/__init__.py +1 -0
- src/rag/__pycache__/__init__.cpython-313.pyc +0 -0
- src/rag/__pycache__/llm.cpython-313.pyc +0 -0
- src/rag/__pycache__/question_enricher.cpython-313.pyc +0 -0
- src/rag/__pycache__/rag.cpython-313.pyc +0 -0
- src/rag/__pycache__/retriever.cpython-313.pyc +0 -0
- src/rag/llm.py +13 -0
- src/rag/question_enricher.py +99 -0
- src/rag/rag.py +90 -0
- src/rag/retriever.py +27 -0
Dockerfile
CHANGED
|
@@ -1,5 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
FROM python:3.13.5-slim
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
WORKDIR /app
|
| 4 |
|
| 5 |
RUN apt-get update && apt-get install -y \
|
|
@@ -10,6 +28,9 @@ RUN apt-get update && apt-get install -y \
|
|
| 10 |
|
| 11 |
COPY requirements.txt ./
|
| 12 |
COPY src/ ./src/
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
RUN pip3 install -r requirements.txt
|
| 15 |
|
|
@@ -17,4 +38,4 @@ EXPOSE 8501
|
|
| 17 |
|
| 18 |
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
| 19 |
|
| 20 |
-
ENTRYPOINT ["streamlit", "run", "
|
|
|
|
| 1 |
+
ARG DB_USER
|
| 2 |
+
ARG DB_PASS
|
| 3 |
+
ARG DB_HOST
|
| 4 |
+
ARG DB_PORT
|
| 5 |
+
ARG DB_NAME
|
| 6 |
+
ARG PATH_TO_CERT
|
| 7 |
+
ARG QDRANT_URL
|
| 8 |
+
ARG OPENROUTER_API_KEY
|
| 9 |
+
|
| 10 |
FROM python:3.13.5-slim
|
| 11 |
|
| 12 |
+
ENV DB_USER=$DB_USER
|
| 13 |
+
ENV DB_PASS=$DB_PASS
|
| 14 |
+
ENV DB_HOST=$DB_HOST
|
| 15 |
+
ENV DB_PORT=$DB_PORT
|
| 16 |
+
ENV DB_NAME=$DB_NAME
|
| 17 |
+
ENV PATH_TO_CERT=$PATH_TO_CERT
|
| 18 |
+
ENV QDRANT_URL=$QDRANT_URL
|
| 19 |
+
ENV OPENROUTER_API_KEY=$OPENROUTER_API_KEY
|
| 20 |
+
|
| 21 |
WORKDIR /app
|
| 22 |
|
| 23 |
RUN apt-get update && apt-get install -y \
|
|
|
|
| 28 |
|
| 29 |
COPY requirements.txt ./
|
| 30 |
COPY src/ ./src/
|
| 31 |
+
COPY frontend.py ./
|
| 32 |
+
COPY .streamlit/ ./.streamlit/
|
| 33 |
+
COPY root.crt ./
|
| 34 |
|
| 35 |
RUN pip3 install -r requirements.txt
|
| 36 |
|
|
|
|
| 38 |
|
| 39 |
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
| 40 |
|
| 41 |
+
ENTRYPOINT ["streamlit", "run", "frontend.py"]
|
app.py
ADDED
|
@@ -0,0 +1,429 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Streamlit Frontend для RAG вопросно-ответной системы
|
| 3 |
+
Чат-интерфейс с поддержкой нескольких диалогов
|
| 4 |
+
"""
|
| 5 |
+
import streamlit as st
|
| 6 |
+
from datetime import datetime, timedelta
|
| 7 |
+
from typing import List, Dict, Optional
|
| 8 |
+
import uuid
|
| 9 |
+
|
| 10 |
+
from src import RAG
|
| 11 |
+
from src.db_utils.history_utils import (
|
| 12 |
+
init_history_table,
|
| 13 |
+
log_query,
|
| 14 |
+
get_all_history,
|
| 15 |
+
get_history_by_dialogue,
|
| 16 |
+
search_history,
|
| 17 |
+
get_history_stats,
|
| 18 |
+
delete_history,
|
| 19 |
+
get_recent_dialogues
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# --- Инициализация RAG и БД ---
|
| 24 |
+
@st.cache_resource(show_spinner=False)
|
| 25 |
+
def get_rag():
|
| 26 |
+
"""Initialize RAG once and cache it"""
|
| 27 |
+
return RAG(
|
| 28 |
+
embed_model_name = "Qwen/Qwen3-Embedding-0.6B",
|
| 29 |
+
embed_index_name = "recursive_Qwen3-Embedding-0.6B"
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@st.cache_resource(show_spinner=False)
|
| 34 |
+
def init_db():
|
| 35 |
+
"""Initialize database once and cache it"""
|
| 36 |
+
try:
|
| 37 |
+
init_history_table()
|
| 38 |
+
return True
|
| 39 |
+
except Exception as e:
|
| 40 |
+
st.error(f"⚠️ Не удалось инициализировать таблицу истории: {e}")
|
| 41 |
+
return False
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# --- Session State Management ---
|
| 45 |
+
def init_session_state():
|
| 46 |
+
"""Initialize session state with caching"""
|
| 47 |
+
if "current_dialogue_id" not in st.session_state:
|
| 48 |
+
st.session_state.current_dialogue_id = None
|
| 49 |
+
if "chat_list" not in st.session_state:
|
| 50 |
+
st.session_state.chat_list = []
|
| 51 |
+
if "current_chat_messages" not in st.session_state:
|
| 52 |
+
st.session_state.current_chat_messages = []
|
| 53 |
+
if "chat_list_loaded" not in st.session_state:
|
| 54 |
+
st.session_state.chat_list_loaded = False
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def generate_dialogue_id() -> str:
|
| 58 |
+
"""Generate unique dialogue ID"""
|
| 59 |
+
return f"chat_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:6]}"
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def get_chat_display_name(dialogue_id: str, first_query: str = None) -> str:
|
| 63 |
+
"""Get display name for chat - always from DB, no caching"""
|
| 64 |
+
if first_query:
|
| 65 |
+
# Use first 40 chars of first query as name
|
| 66 |
+
name = first_query[:40] + "..." if len(first_query) > 40 else first_query
|
| 67 |
+
return name
|
| 68 |
+
|
| 69 |
+
return "Новый диалог"
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# --- Chat Management Functions ---
|
| 73 |
+
|
| 74 |
+
def load_chats_list():
|
| 75 |
+
"""Load and cache chats list from DB"""
|
| 76 |
+
try:
|
| 77 |
+
st.session_state.chat_list = get_recent_dialogues(limit=50)
|
| 78 |
+
st.session_state.chat_list_loaded = True
|
| 79 |
+
except Exception as e:
|
| 80 |
+
st.error(f"❌ Ошибка при загрузке чатов: {e}")
|
| 81 |
+
st.session_state.chat_list = []
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def create_new_chat():
|
| 85 |
+
"""Create a new chat"""
|
| 86 |
+
new_id = generate_dialogue_id()
|
| 87 |
+
st.session_state.current_dialogue_id = new_id
|
| 88 |
+
st.session_state.current_chat_messages = []
|
| 89 |
+
st.session_state.needs_rerun = True
|
| 90 |
+
return new_id
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def switch_to_chat(dialogue_id: str):
|
| 94 |
+
"""Switch to an existing chat and load its messages"""
|
| 95 |
+
st.session_state.current_dialogue_id = dialogue_id
|
| 96 |
+
load_current_chat_messages()
|
| 97 |
+
st.session_state.needs_rerun = True
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def load_current_chat_messages():
|
| 101 |
+
"""Load messages for current chat from DB and cache"""
|
| 102 |
+
if not st.session_state.current_dialogue_id:
|
| 103 |
+
st.session_state.current_chat_messages = []
|
| 104 |
+
return
|
| 105 |
+
|
| 106 |
+
try:
|
| 107 |
+
st.session_state.current_chat_messages = get_history_by_dialogue(
|
| 108 |
+
st.session_state.current_dialogue_id
|
| 109 |
+
)
|
| 110 |
+
except Exception as e:
|
| 111 |
+
st.error(f"❌ Ошибка при загрузке сообщений: {e}")
|
| 112 |
+
st.session_state.current_chat_messages = []
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def get_current_chat_messages() -> List[Dict]:
|
| 116 |
+
"""Get cached messages for current chat"""
|
| 117 |
+
return st.session_state.current_chat_messages
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def send_message(query: str) -> Optional[Dict]:
|
| 121 |
+
"""Send a message in current chat and update cache"""
|
| 122 |
+
try:
|
| 123 |
+
if not st.session_state.current_dialogue_id:
|
| 124 |
+
create_new_chat()
|
| 125 |
+
|
| 126 |
+
# Get RAG and invoke with cached history
|
| 127 |
+
rag = get_rag()
|
| 128 |
+
|
| 129 |
+
# Use cached messages
|
| 130 |
+
current_history = get_current_chat_messages()
|
| 131 |
+
|
| 132 |
+
# Pass history to RAG (it will use last N messages internally for enrichment)
|
| 133 |
+
result = rag.invoke(query, history=current_history)
|
| 134 |
+
|
| 135 |
+
# Log to history DB
|
| 136 |
+
query_id = log_query(
|
| 137 |
+
query=query,
|
| 138 |
+
answer=result.get("answer", ""),
|
| 139 |
+
reason=result.get("reason", ""),
|
| 140 |
+
dialogue_id=st.session_state.current_dialogue_id
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
result["query_id"] = query_id
|
| 144 |
+
|
| 145 |
+
# Update only current messages, not all chats
|
| 146 |
+
load_current_chat_messages()
|
| 147 |
+
|
| 148 |
+
# Mark that we need to refresh chat list (but don't do it immediately)
|
| 149 |
+
st.session_state.chat_list_loaded = False
|
| 150 |
+
st.session_state.needs_rerun = True
|
| 151 |
+
|
| 152 |
+
return result
|
| 153 |
+
except Exception as e:
|
| 154 |
+
st.error(f"❌ Ошибка при отправке сообщения: {e}")
|
| 155 |
+
return None
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def delete_chat(dialogue_id: str) -> bool:
|
| 159 |
+
"""Delete a chat from DB and update cache"""
|
| 160 |
+
try:
|
| 161 |
+
delete_history(dialogue_id=dialogue_id)
|
| 162 |
+
|
| 163 |
+
# If deleted current chat, clear selection
|
| 164 |
+
if st.session_state.current_dialogue_id == dialogue_id:
|
| 165 |
+
st.session_state.current_dialogue_id = None
|
| 166 |
+
st.session_state.current_chat_messages = []
|
| 167 |
+
|
| 168 |
+
# Mark that we need to reload chat list
|
| 169 |
+
st.session_state.chat_list_loaded = False
|
| 170 |
+
st.session_state.needs_rerun = True
|
| 171 |
+
|
| 172 |
+
return True
|
| 173 |
+
except Exception as e:
|
| 174 |
+
st.error(f"❌ Ошибка при удалении чата: {e}")
|
| 175 |
+
return False
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
# --- Page: Chat Interface ---
|
| 181 |
+
def page_chat():
|
| 182 |
+
"""Main chat interface page"""
|
| 183 |
+
|
| 184 |
+
# Custom CSS to fix chat input at the bottom + keyboard shortcuts
|
| 185 |
+
st.markdown("""
|
| 186 |
+
<style>
|
| 187 |
+
/* Fix chat input at the bottom of main content area */
|
| 188 |
+
section[data-testid="stSidebar"] ~ div .stChatInput {
|
| 189 |
+
position: fixed;
|
| 190 |
+
bottom: 0;
|
| 191 |
+
background: white;
|
| 192 |
+
padding: 1rem;
|
| 193 |
+
z-index: 999;
|
| 194 |
+
border-top: 1px solid #e6e6e6;
|
| 195 |
+
margin-left: 0;
|
| 196 |
+
}
|
| 197 |
+
|
| 198 |
+
/* Add padding to main content to prevent overlap with fixed input */
|
| 199 |
+
.main .block-container {
|
| 200 |
+
padding-bottom: 100px;
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
/* Dark mode support */
|
| 204 |
+
[data-testid="stAppViewContainer"][data-theme="dark"] section[data-testid="stSidebar"] ~ div .stChatInput {
|
| 205 |
+
background: rgb(14, 17, 23);
|
| 206 |
+
border-top: 1px solid #333;
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
/* Adjust width to account for sidebar */
|
| 210 |
+
@media (min-width: 768px) {
|
| 211 |
+
section[data-testid="stSidebar"] ~ div .stChatInput {
|
| 212 |
+
left: var(--sidebar-width, 21rem);
|
| 213 |
+
right: 0;
|
| 214 |
+
}
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
/* When sidebar is collapsed */
|
| 218 |
+
section[data-testid="stSidebar"][aria-expanded="false"] ~ div .stChatInput {
|
| 219 |
+
left: 0;
|
| 220 |
+
}
|
| 221 |
+
</style>
|
| 222 |
+
|
| 223 |
+
<script>
|
| 224 |
+
// Add keyboard shortcuts support
|
| 225 |
+
document.addEventListener('DOMContentLoaded', function() {
|
| 226 |
+
// Find chat input field
|
| 227 |
+
const observer = new MutationObserver(function(mutations) {
|
| 228 |
+
const chatInput = document.querySelector('textarea[data-testid="stChatInput"]');
|
| 229 |
+
if (chatInput && !chatInput.hasAttribute('data-shortcut-attached')) {
|
| 230 |
+
chatInput.setAttribute('data-shortcut-attached', 'true');
|
| 231 |
+
|
| 232 |
+
// Add keyboard event listener
|
| 233 |
+
chatInput.addEventListener('keydown', function(e) {
|
| 234 |
+
// Enter (without Shift) - send message
|
| 235 |
+
if (e.key === 'Enter' && !e.shiftKey) {
|
| 236 |
+
e.preventDefault();
|
| 237 |
+
// Trigger the send button
|
| 238 |
+
const sendButton = document.querySelector('button[kind="primary"]');
|
| 239 |
+
if (sendButton) {
|
| 240 |
+
sendButton.click();
|
| 241 |
+
}
|
| 242 |
+
}
|
| 243 |
+
// Ctrl+Enter or Cmd+Enter - send message (alternative)
|
| 244 |
+
else if (e.key === 'Enter' && (e.ctrlKey || e.metaKey)) {
|
| 245 |
+
e.preventDefault();
|
| 246 |
+
const sendButton = document.querySelector('button[kind="primary"]');
|
| 247 |
+
if (sendButton) {
|
| 248 |
+
sendButton.click();
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
// Shift+Enter - new line (default behavior)
|
| 252 |
+
});
|
| 253 |
+
}
|
| 254 |
+
});
|
| 255 |
+
|
| 256 |
+
observer.observe(document.body, {
|
| 257 |
+
childList: true,
|
| 258 |
+
subtree: true
|
| 259 |
+
});
|
| 260 |
+
});
|
| 261 |
+
</script>
|
| 262 |
+
""", unsafe_allow_html=True)
|
| 263 |
+
|
| 264 |
+
# Check if we have a current chat
|
| 265 |
+
if not st.session_state.current_dialogue_id:
|
| 266 |
+
# Show welcome screen
|
| 267 |
+
st.title("💬 Чат с RAG системой")
|
| 268 |
+
st.markdown("---")
|
| 269 |
+
|
| 270 |
+
col1, col2, col3 = st.columns([1, 2, 1])
|
| 271 |
+
with col2:
|
| 272 |
+
st.info("👋 Добро пожаловать! Создайте новый чат или выберите существующий из списка слева.")
|
| 273 |
+
|
| 274 |
+
if st.button("🆕 Начать новый чат", type="primary", use_container_width=True):
|
| 275 |
+
create_new_chat()
|
| 276 |
+
|
| 277 |
+
return
|
| 278 |
+
|
| 279 |
+
# Get cached messages
|
| 280 |
+
current_messages = get_current_chat_messages()
|
| 281 |
+
|
| 282 |
+
# Display chat header
|
| 283 |
+
if current_messages:
|
| 284 |
+
chat_name = get_chat_display_name(
|
| 285 |
+
st.session_state.current_dialogue_id,
|
| 286 |
+
current_messages[0]["query"]
|
| 287 |
+
)
|
| 288 |
+
else:
|
| 289 |
+
chat_name = "Новый диалог"
|
| 290 |
+
|
| 291 |
+
col1, col2 = st.columns([4, 1])
|
| 292 |
+
with col1:
|
| 293 |
+
st.title(f"💬 {chat_name}")
|
| 294 |
+
with col2:
|
| 295 |
+
if st.button("🗑️ Удалить чат", use_container_width=True):
|
| 296 |
+
if delete_chat(st.session_state.current_dialogue_id):
|
| 297 |
+
st.success("✅ Чат удален")
|
| 298 |
+
|
| 299 |
+
st.markdown("---")
|
| 300 |
+
|
| 301 |
+
# Chat messages container - load from DB
|
| 302 |
+
if not current_messages:
|
| 303 |
+
st.info("📝 Начните диалог, задав первый вопрос ниже")
|
| 304 |
+
else:
|
| 305 |
+
# Display all messages
|
| 306 |
+
for msg in current_messages:
|
| 307 |
+
# User message
|
| 308 |
+
with st.chat_message("user"):
|
| 309 |
+
st.markdown(msg["query"])
|
| 310 |
+
timestamp_str = msg.get("timestamp", "")
|
| 311 |
+
try:
|
| 312 |
+
dt = datetime.fromisoformat(timestamp_str)
|
| 313 |
+
st.caption(f"🕐 {dt.strftime('%H:%M:%S')}")
|
| 314 |
+
except:
|
| 315 |
+
pass
|
| 316 |
+
|
| 317 |
+
# Assistant message
|
| 318 |
+
with st.chat_message("assistant"):
|
| 319 |
+
st.markdown(msg["answer"])
|
| 320 |
+
|
| 321 |
+
# Show reasoning in expander
|
| 322 |
+
if msg.get("reason"):
|
| 323 |
+
with st.expander("📝 Обоснование"):
|
| 324 |
+
st.markdown(msg["reason"])
|
| 325 |
+
|
| 326 |
+
# Input area - fixed at the bottom via CSS
|
| 327 |
+
query = st.chat_input(
|
| 328 |
+
"Введите ваш вопрос...",
|
| 329 |
+
key="chat_input"
|
| 330 |
+
)
|
| 331 |
+
|
| 332 |
+
if query:
|
| 333 |
+
# Send message and get response
|
| 334 |
+
with st.spinner("🤔 Думаю..."):
|
| 335 |
+
result = send_message(query)
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
# --- Main App ---
|
| 340 |
+
def main():
|
| 341 |
+
st.set_page_config(
|
| 342 |
+
page_title="RAG Chat System",
|
| 343 |
+
page_icon="💬",
|
| 344 |
+
layout="wide",
|
| 345 |
+
initial_sidebar_state="expanded"
|
| 346 |
+
)
|
| 347 |
+
|
| 348 |
+
# Initialize session state FIRST (before any other operations)
|
| 349 |
+
init_session_state()
|
| 350 |
+
|
| 351 |
+
# Initialize needs_rerun flag if not exists
|
| 352 |
+
if "needs_rerun" not in st.session_state:
|
| 353 |
+
st.session_state.needs_rerun = False
|
| 354 |
+
|
| 355 |
+
# Initialize history table once using cache
|
| 356 |
+
init_db()
|
| 357 |
+
|
| 358 |
+
# Load chats list if not loaded yet
|
| 359 |
+
if not st.session_state.chat_list_loaded:
|
| 360 |
+
load_chats_list()
|
| 361 |
+
|
| 362 |
+
# Sidebar
|
| 363 |
+
with st.sidebar:
|
| 364 |
+
st.title("💬 RAG Chat")
|
| 365 |
+
|
| 366 |
+
# New chat button
|
| 367 |
+
if st.button("➕ Новый чат", use_container_width=True, type="primary"):
|
| 368 |
+
create_new_chat()
|
| 369 |
+
|
| 370 |
+
st.markdown("---")
|
| 371 |
+
|
| 372 |
+
# Chats list - use cached
|
| 373 |
+
col1, col2 = st.columns([3, 1])
|
| 374 |
+
with col1:
|
| 375 |
+
st.subheader("📝 Ваши чаты")
|
| 376 |
+
with col2:
|
| 377 |
+
if st.button("🔄", help="Обновить список чатов"):
|
| 378 |
+
st.session_state.chat_list_loaded = False
|
| 379 |
+
load_chats_list()
|
| 380 |
+
|
| 381 |
+
if not st.session_state.chat_list:
|
| 382 |
+
st.info("Нет чатов. Создайте новый!")
|
| 383 |
+
else:
|
| 384 |
+
# Display chats from cache
|
| 385 |
+
for chat in st.session_state.chat_list:
|
| 386 |
+
dialogue_id = chat["dialogue_id"]
|
| 387 |
+
message_count = chat.get("message_count", 0)
|
| 388 |
+
started_at = chat.get("started_at", "")
|
| 389 |
+
|
| 390 |
+
# Get chat name (only load history if chat has messages)
|
| 391 |
+
if message_count > 0:
|
| 392 |
+
history = get_history_by_dialogue(dialogue_id)
|
| 393 |
+
first_query = history[0]["query"] if history else None
|
| 394 |
+
else:
|
| 395 |
+
first_query = None
|
| 396 |
+
chat_name = get_chat_display_name(dialogue_id, first_query)
|
| 397 |
+
|
| 398 |
+
# Format time
|
| 399 |
+
try:
|
| 400 |
+
dt = datetime.fromisoformat(started_at)
|
| 401 |
+
time_str = dt.strftime('%d.%m %H:%M')
|
| 402 |
+
except:
|
| 403 |
+
time_str = ""
|
| 404 |
+
|
| 405 |
+
# Check if this is current chat
|
| 406 |
+
is_current = dialogue_id == st.session_state.current_dialogue_id
|
| 407 |
+
|
| 408 |
+
# Format button text with chat name and metadata
|
| 409 |
+
button_text = f"{'📌' if is_current else '💬'} {chat_name}\n💬 {message_count} • {time_str}"
|
| 410 |
+
|
| 411 |
+
if st.button(
|
| 412 |
+
button_text,
|
| 413 |
+
key=f"chat_{dialogue_id}",
|
| 414 |
+
use_container_width=True,
|
| 415 |
+
type="primary" if is_current else "secondary"
|
| 416 |
+
):
|
| 417 |
+
switch_to_chat(dialogue_id)
|
| 418 |
+
|
| 419 |
+
# Handle rerun at the end if needed
|
| 420 |
+
if st.session_state.needs_rerun:
|
| 421 |
+
st.session_state.needs_rerun = False
|
| 422 |
+
st.rerun()
|
| 423 |
+
|
| 424 |
+
# Main content area
|
| 425 |
+
page_chat()
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
if __name__ == "__main__":
|
| 429 |
+
main()
|
classification_results.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
contestin.ipynb
ADDED
|
@@ -0,0 +1,331 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "0bc42803",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import pandas as pd\n",
|
| 11 |
+
"\n",
|
| 12 |
+
"data = pd.read_csv('src/dataset/rbc/channel_rbc_news_posts.csv')"
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"cell_type": "code",
|
| 17 |
+
"execution_count": 21,
|
| 18 |
+
"id": "4400213e",
|
| 19 |
+
"metadata": {},
|
| 20 |
+
"outputs": [
|
| 21 |
+
{
|
| 22 |
+
"data": {
|
| 23 |
+
"text/plain": [
|
| 24 |
+
"count 4847\n",
|
| 25 |
+
"mean 2025-09-14 12:58:58.353620736\n",
|
| 26 |
+
"min 2025-04-15 00:00:00\n",
|
| 27 |
+
"25% 2025-08-08 00:00:00\n",
|
| 28 |
+
"50% 2025-09-23 00:00:00\n",
|
| 29 |
+
"75% 2025-10-28 00:00:00\n",
|
| 30 |
+
"max 2025-12-03 00:00:00\n",
|
| 31 |
+
"Name: message_dt, dtype: object"
|
| 32 |
+
]
|
| 33 |
+
},
|
| 34 |
+
"execution_count": 21,
|
| 35 |
+
"metadata": {},
|
| 36 |
+
"output_type": "execute_result"
|
| 37 |
+
}
|
| 38 |
+
],
|
| 39 |
+
"source": [
|
| 40 |
+
"data[\"message_dt\"] = pd.to_datetime(data[\"message_dt\"])\n",
|
| 41 |
+
"data = data.sort_values(\"message_dt\")\n",
|
| 42 |
+
"data[\"message_dt\"].describe()"
|
| 43 |
+
]
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"cell_type": "code",
|
| 47 |
+
"execution_count": 38,
|
| 48 |
+
"id": "ed3c14d3",
|
| 49 |
+
"metadata": {},
|
| 50 |
+
"outputs": [],
|
| 51 |
+
"source": [
|
| 52 |
+
"data = data.sort_values(\"views\", ascending=False)"
|
| 53 |
+
]
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"cell_type": "code",
|
| 57 |
+
"execution_count": 41,
|
| 58 |
+
"id": "85a0528d",
|
| 59 |
+
"metadata": {},
|
| 60 |
+
"outputs": [
|
| 61 |
+
{
|
| 62 |
+
"data": {
|
| 63 |
+
"text/plain": [
|
| 64 |
+
"'▪️Роскомнадзор сообщил об ограничении звонков через Telegram и *WhatsApp.\\n\\n«По данным правоохранительных органов и многочисленных обращений граждан, иностранные мессенджеры Telegram и WhatsApp стали основными голосовыми сервисами, используемыми для обмана и вымогательства денег, вовлечения в диверсионную и террористическую деятельность российских граждан», – пояснили РБК в пресс-службе ведомства. \\n\\nНикаких иных ограничений функционала в Telegram и WhatsApp не вводится, подчеркивают в Роскомнадзоре. \\n\\n*WhatsApp принадлежит Meta, деятельность которой признана в России и экстремистской и запрещена. \\n\\n🐚 Картина дня — в телеграм-канале РБК'"
|
| 65 |
+
]
|
| 66 |
+
},
|
| 67 |
+
"execution_count": 41,
|
| 68 |
+
"metadata": {},
|
| 69 |
+
"output_type": "execute_result"
|
| 70 |
+
}
|
| 71 |
+
],
|
| 72 |
+
"source": [
|
| 73 |
+
"data.iloc[0][\"content\"]"
|
| 74 |
+
]
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"cell_type": "code",
|
| 78 |
+
"execution_count": 34,
|
| 79 |
+
"id": "8458b416",
|
| 80 |
+
"metadata": {},
|
| 81 |
+
"outputs": [
|
| 82 |
+
{
|
| 83 |
+
"data": {
|
| 84 |
+
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAigAAAGdCAYAAAA44ojeAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAPgpJREFUeJzt3QmYFNW58PG32YZ9l01AUIgbiBqXuCTRyJUoUcnNpvEmfCZXo2KiMY8LuXEhJoFsXmJCQM01mEQlJhEXFJSgArLv+74OyzCsszLDMFPfc2roppfq6qruqu7TXf/f8/T0dHd11alTp6reOkt1yDAMQwAAADTSJNcJAAAAiEeAAgAAtEOAAgAAtEOAAgAAtEOAAgAAtEOAAgAAtEOAAgAAtEOAAgAAtNNMNNPQ0CD79u2Tdu3aSSgUynVyAACAA+q+rxUVFdKrVy9p0qRJ4QUoKjjp06dPrpMBAADSUFxcLL1795aCC1BUzUl4Bdu3b5/r5AAAAAfKy8vNCobwebzgApRws44KTghQAADIL151z6CTLAAA0A4BCgAA0A4BCgAA0A4BCgAA0A4BCgAA0A4BCgAA0A4BCgAA0A4BCgAA0A4BCgAAyP8AZc6cOXLLLbeYPwak7hb35ptvRj6rq6uTxx57TAYPHixt2rQxp/n2t79t/r4OAACAbwFKVVWVDBkyRCZMmJDwWXV1tSxfvlyeeOIJ8/mNN96QTZs2ya233up2MQAAIMBChvp95HS/HArJ1KlTZcSIEUmnWbJkiVxxxRWya9cu6du3r6MfG+rQoYOUlZXxWzwAAOQJr8/fvvdBUQlVgUzHjh0tP6+trTVXKvqRKyVlNTJp9jY5WnUiZ2kAAAA+Byg1NTVmn5Q77rgjaTQ1duxYM+IKP9RPNefKN19cKOOmb5Qfvr4yZ2kAAAA+Biiqw+zXv/51US1IEydOTDrd6NGjzVqW8KO4uDhn22X7oSrzec7mgzlLAwAAEGnmZ3Ci+p18+OGHtm1RRUVF5gMAAMC3ACUcnGzZskU++ugj6dKli9eLAAAABc51gFJZWSlbt26NvN6xY4esXLlSOnfuLD179pSvfvWr5hDjadOmSX19vZSUlJjTqc9btGjhbeoBAEBBch2gLF26VK6//vrI64cffth8HjlypDz99NPy9ttvm68vvvjimO+p2pTrrrsu8xQDAICC5zpAUUGG3a1TMritCgAAgInf4rFAiAUAQG4RoAAAAO0QoAAAAO0QoAAAAO0QoAAAAO0QoAAAAO0QoAAAAO0QoAAAAO0QoAAAAO0QoAAAAO0QoAAAAO0QoAAAAO0QoAAAAO0QoAAAAO0QoFgw+DljAAByigAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAABohwAFAADkf4AyZ84cueWWW6RXr14SCoXkzTffjPncMAx58sknpWfPntKqVSsZOnSobNmyxcs0AwCAAuc6QKmqqpIhQ4bIhAkTLD//1a9+Jc8995xMmjRJFi1aJG3atJFhw4ZJTU2NF+kFAAAB0MztF2666SbzYUXVnowfP15+8pOfyG233Wa+95e//EW6d+9u1rTcfvvtmacYAAAUPE/7oOzYsUNKSkrMZp2wDh06yJVXXikLFiyw/E5tba2Ul5fHPHRWWlEjk2Zvk8OVtUmn2VpaIS/O2S41dfVZTRsAAIGtQbGjghNF1ZhEU6/Dn8UbO3asjBkzRvLF/3tpiazfXy4fbSyVv3/vKstphj47x3wur6mTH914bpZTCABA/sv5KJ7Ro0dLWVlZ5FFcXCw6U8
GJsmjHkZTTrth9LAspAgCg8HgaoPTo0cN8PnDgQMz76nX4s3hFRUXSvn37mAcAAAg2TwOU/v37m4HIrFmzIu+pPiVqNM9VV1k3hwAAAGTcB6WyslK2bt0a0zF25cqV0rlzZ+nbt6889NBD8rOf/UwGDhxoBixPPPGEec+UESNGuF0UAAAIKNcBytKlS+X666+PvH744YfN55EjR8rkyZPl0UcfNe+Vcs8998ixY8fk2muvlRkzZkjLli29TTkAAChYrgOU6667zrzfSTLq7rI//elPzUfQGZI8nwAAgMajeAAAAOIRoAAAAO0QoAAAAO0QoAAAAO0QoAAAAO0QoAAAAO0QoAAAAO0QoAAAAO0QoAAAAO0QoAAAAO0QoAAAAO0QoPjI5ieLAACADQIUAACgHQIUAACgHQIUAACgHQIUAACgHQIUAACgHQIUAACgHQIUAACgHQIUAACgncAGKJW1J+X52dtk9+HqXCcFAADECWyA8rNp62Xs9I3yxd/NyXVSAABAnMAGKAu3Hzafq0/U5zopAAAgTmADlGzgt3gAAEgPAQoAANAOAQoAANAOAQoAANBOYAOUUCiU6yQAAIAkAhugAAAAfRGgAAAA7RCgAAAA7RCgAAAA7RCgAAAA7RCgAAAA7RCgAAAA7RCg+MgQw8/ZAwBQsAhQAACAdghQAACAdgIboHCjewAA9BXYAAUAAOiLAAUAAGiHAAUAAGiHAAUAAGiHAAUAAGiHAAUAABR+gFJfXy9PPPGE9O/fX1q1aiXnnHOOPPPMM2IY3FUVAAA400w89stf/lImTpwoL7/8slx44YWydOlSueuuu6RDhw7ygx/8QIKEmAwAAE0ClPnz58ttt90mw4cPN1/369dPXnvtNVm8eLHXiwIAAAXK8yaeq6++WmbNmiWbN282X69atUo++eQTuemmmyynr62tlfLy8piHTreSfX9diUxfsz/p5/UNhrz0yQ5Zu7cs7aQs2n5YXlu8O+3vAwBQaDyvQXn88cfNIOO8886Tpk2bmn1Sfv7zn8udd95pOf3YsWNlzJgxoqOaunr53l+Xmf+vevJG6dC6ecI0/1q+R346bb35/85xjbVGbn3jhYXm84BubeXyfp0zSjMAAIXA8xqU119/XV555RV59dVXZfny5WZflN/85jfms5XRo0dLWVlZ5FFcXCy6OFHfEPm/6sRJy2k27q/wbHnFR6o9mxcAAPnM8xqURx55xKxFuf32283XgwcPll27dpk1JSNHjkyYvqioyHwAAAD4VoNSXV0tTZrEzlY19TQ0nK6NAAAAyGoNyi233GL2Oenbt685zHjFihXy7LPPyne+8x2vFwUAAAqU5wHK73//e/NGbffff7+UlpZKr1695Hvf+548+eSTks+ycZs57psCAIBPAUq7du1k/Pjx5iPfORmJHHI4XBkAADjHb/EAAADtEKAAAADtEKD42G+En0cEACA9gQ1Q6DoCAIC+AhugeIVOsgAAeI8ABQAAaIcABQAAaIcAxSEjC3dRo1MtAACNCFBshBx0MKGzLQAA3iNAAQAA2iFAAQAA2iFAAQAA2iFAAQAA2glsgOKkAywAAMiNwAYoWcG4YQAA0kKAAgAAtEOAErCbwQEAkA8IUGxE91IhdgAAIHsIUDJEX1sAALxHgAIAALRDgJIFe48dl0mzt0nZ8bpsLA4AgLzXLNcJyHdO+qbc9od5cqiyVtbuLZM/fPPSbCQLAIC8Rg1KFqjgRJm/7XA2FgcAQN4jQMkQnWQBAPBeYAMUbnQPAIC+Ahug6HgjNm7TBgBAIwIUHxmEHAAApIUAJcP+JfwqMgAA3iNAAQAA2iFAAQAA2iFAyfAmbPwCMQAA3iNA8VGIwcwAAKSFACVDdp1kGcUDAEB6CFCyiPucAADgTGADFG5RDwCAvgIboGiJKhYAAEwEKB6O6AEAAN4gQMkQPzoIAID3CFB8RK0LAADpIUCx4aRVx8uWH4YlAwDQiAAFAABohwBFI9
x5FgCARgQoWewkS58UAACcIUDxETeDAwAgPQQoPnJbY0InWQAAfAxQ9u7dK//1X/8lXbp0kVatWsngwYNl6dKlks/9PQgeAADInmZez/Do0aNyzTXXyPXXXy/Tp0+XM844Q7Zs2SKdOnWSgsSd2gAA0D9A+eUvfyl9+vSRP//5z5H3+vfv7/ViAABAAfO8ieftt9+Wyy67TL72ta9Jt27d5JJLLpEXX3wx6fS1tbVSXl4e8wAAAMHmeYCyfft2mThxogwcOFDef/99ue++++QHP/iBvPzyy5bTjx07Vjp06BB5qNoXXRhOernyI4IAAOgfoDQ0NMill14qv/jFL8zak3vuuUfuvvtumTRpkuX0o0ePlrKyssijuLhYCoWRTsADAAC8D1B69uwpF1xwQcx7559/vuzevdty+qKiImnfvn3MI6/QSRYAAP0DFDWCZ9OmTTHvbd68Wc466yyvFwUAAAqU5wHKD3/4Q1m4cKHZxLN161Z59dVX5YUXXpBRo0Z5vaiCQwsQAAA+BSiXX365TJ06VV577TUZNGiQPPPMMzJ+/Hi58847vV4UAAAoUJ7fB0X50pe+ZD4K6Xdy0qndoHsKAADp4bd4fLxlPmN2AABIDwEKAADQDgGKDcPjHxGkRgUAAGcIUHzqswIAANJHgOIQQ4ABAMgeAhQfO8kCAID0EKD4iN/eAQAgPQQoGqETLQAAjQhQHCJ4AAAgewhQMsToHgAAvEeA4qMQ0QsAAGkhQMlwaLGr4ccppmUoMwAAjQhQfMQoHgAA0kOAohFahAAAaESAkiGCCgAAvEeAAgAAtEOAohE6yQIA0IgAxUfc3A0AgPQQoDjEiBwAALKHACVDbn7LmBoVAACcCWyAksu7vE5ZvFsWbj+cs+UDAKC7ZrlOgNac3EnW5SyX7jwij7+xxvx/57jh6aULAIACF9galGzUuFh9Y/eRak/SAwBAISNA8bGTLH1OAABIDwFKhnLXkwUAgMJFgJJldi1FBnUuAACYCFAAAIB2CFCyiJu9AQDgDAGKQ3R4BQAgewhQ/OxTQlQDAEBaCFAks06rBCEAAHgvsAFKKEfDiUMMTAYAIKXABihu0VoDAED2EKBkKIe/OQgAQMEKbICSjRoRghcAANIT2AAlG+I70KYKiuhwCwBAwAMUWmYAANBXYAMUt7yq3aDZBwCA1AhQMsSwYQAAvEeAkmGtiZtfIE7VrMRQZgAAGhGgZBEBCAAAzgQ2QHHbFySdviNuAxI67gIAEPAAxS2GAAMAkD0EKBmikywAAN4jQNGoFoY+KgAAZClAGTdunIRCIXnooYckaOhTAgCAhgHKkiVL5Pnnn5eLLrrIz8UAAIAC41uAUllZKXfeeae8+OKL0qlTJ8l/7htgaLIBAECzAGXUqFEyfPhwGTp0qO10tbW1Ul5eHvPQhR8BhmruSr5AZ0tcvOOIvLZ4t6NpD1bUyqTZ28xn+G/9vnL509ztcrK+wXa6PUer5fnZ26S8po7NAgAWmokPpkyZIsuXLzebeFIZO3asjBkzxo9kFKyvP7/AfD67axu58uwuttPe89elsmL3MXl/XYlMvf+aLKUwuG5+bq753LxpExl5db+k042YME8OVZ6QDfvLZfztl2QxhQAQ0BqU4uJiefDBB+WVV16Rli1bppx+9OjRUlZWFnmo7we2y6vLu8HtOlKdchoVnEQ/IzvW7Suz/VwFJ8q8bYezlCIACHgNyrJly6S0tFQuvfTSyHv19fUyZ84c+cMf/mA26TRt2jTyWVFRkfnQn4Y9SjRMEtzd2I8bAAJAlgKUG264QdasWRPz3l133SXnnXeePPbYYzHBSS6lc+v6TOfj5ocFAQAIMs8DlHbt2smgQYNi3mvTpo106dIl4f2C5/bymMtpAABM3Ek2y7h5GwAAORrFE+/jjz8W3VBZAQCAvqhByeOAhj4t+nJeXDQsWACggcAGKJ51ks3GQgAACJjABihOGA6qTbj+BQDAewQoDqVVGa
JjuxAAAHmAACWLVLxCqw8AAKkRoORxZYiOaUIj7iQLAJkJbIAS8uiOJF52gyXeAAAg4AEKAADQFwFKFtH/BAAAZwhQfBTfZEOfEaQqIwCARgQoaQYbue77onBy0xd3+QWAzBCgZDkAsDtxUcMCAEAjAhQAAKAdAhSHQh58x0jRxEMnWgAAGhGg5HF/D5qE8r/AOPm9JwAIIgKUTNlUe3DqAQAgPQQoWRRK0YzDxTQAAAEPUHLR34MaFQAAnAlsgAIAAPRFgOKQjs0v3AxMX06Li4bFCgC0ENgAxUnAke2gJNsjOlYWH5OX5+90tNy5Ww7KP5ftcTzv5buPyl8WWM97/b5y+dPc7XKyvsF2HrUn683pthyoSLm8mrp6eXHOdtlaWinpKK2okUmzt8mhytqU085YWyLT1+xPazmAlxbvOCKvLNpFpqIgNct1AgpZwrnZSO9+Kn4ZMWGe+XxGuyK5eXBP22m/9X+LzeeL+3SQAd3apZz3f/5xvvnco31LufHCHjGf3fzcXPO5RbMm8u2r+iWdx8SPt8n4f2+Rn727QXaOG267vN9/uEUmfLRNfv5e6mmt/PfLS2X1njKZteGA/OPeq5NOV33ipNz7t2Xm/6ufvlHat2zuelmAV77+/ALz+eyubeWqc7qQsSgoga1B0fGmaKEcJcpNrcOB8tQ1DNG2HaxK+pmqSbGzYvcxx8tZtuuoZEIFJ8qSnfbzqa07XetTc6I+o2UCXtl9JPl+BuSrwAYoOM1Ny5LbVqhC/u0hL5Kf73kAPVCOUIgIUHzskJpQIRLK/4NSNjvm6lnL5Ww67hALAJkhQMmQq3NoinM7JzUAABoRoMDXWhG7Wh4vl2v3I4x+oVoduqClEIUosAFKKEcnMB2bLXTlJquyla+5CIQAIIgCG6AAWtyojWoYALBEgOKQF+cRr5tSqNYFABQqAhQbQbmVvJ/DjLM1r1wISvmA/vJ9XwKsEKDAFSMAN66zpWGSAKAQEaBkER0sAQBwhgDFxyrU+CaAxtch7ep1/awVsesE6uVyc1HZYjuE2uHKUTMPT8oiJQkFiAAl6yfG7JywAzfMOEttLzq2OgFAISJAgSteDoulYx8AIBkCFB9ZX9VzCQ4AQCoEKPC1KqOQa0k8WbUCzh9kTyHvZwiu4AYoLjsTeNEJzeuDiFHww4wlb3G+AIDMBDdAycJZxiqosTvpchUEAEAjApQs0zEI8XWYsWTp14xzMsxYw42JQKIkohAFN0DJwckl/iQaf4LL5yYNf5AhABBUwQ1QckDFI3kfhBjBvuwrhH5EAJAPghuguO0kq+GZRMc0oRHNPwCQmeAGKB7J5A6m8QFGrgIOP5dbyEFUIa8b8gyFEQXI8wBl7Nixcvnll0u7du2kW7duMmLECNm0aZMEkdUxI/9beLJ3VtayOYygBADyM0CZPXu2jBo1ShYuXCgzZ86Uuro6ufHGG6WqqkqCTp3bOL/F5ocdHeMTAEB2NPN6hjNmzIh5PXnyZLMmZdmyZfK5z33O68XlNSMAtSLZqnEJaVbd4nSt6auCbJY3IJ/43gelrKzMfO7cubPl57W1tVJeXh7zyIZQFnZ6q3NmyMPlBf3ktnD7YZmyeHfKPPrLgp2ybNdRCapVxcdk8rwd0tBQ+OXlYEWtTJq9zXz2wmuLd8ui7YfFSzsOVckLc7ZJ9YmToqMZa0tkxtr9oqNj1SfM7bu/7Ljn835vzX55f12J5/OFRjUo0RoaGuShhx6Sa665RgYNGpS0z8qYMWMkCPS6xk+PlzFRpgHW7S8sNJ8Hdm+XdJp/byiVJ99aZ/6/c9xwyac+OF65bcI887lz2yK5dUgvKWR3/2WprCw+Jh+sK5E37r8mo3mpwGT0G2s8Kzth1//mY/P5QHmtPPGlC0QnVbUn5d6/LTP/XzdmmLQp8vUU4dqPXl8lszaWyquLdsucR6/3bL5l1XVy/yvLzf83/eyLUtSsqWfzhqY1KKovytq1a2XKlClJpxk9er
RZyxJ+FBcXS6GKP7Ul3Kgtq6kpHHuOVif9bGtpZVbTorPNJRVS6FRwoizf3ficiV1HkpcrLyzdeUR0U1NXH/n/eNT/upi9+aD5vNvjbVNRWxf5/2R9/l2EFCrfwuMHHnhApk2bJnPmzJHevXsnna6oqMh8BGYUj2Z9JQplmHGyXPUzu71YNw6F+tJvT/V/P9Px+BTNr+Tpvt5B5XmAomoFvv/978vUqVPl448/lv79+3u9iIKRjyenbHZ7cXrMyGaaHC8rHzcuskr3IhLULm4BXe1gBCiqWefVV1+Vt956y7wXSklJY6ejDh06SKtWrSRfZWNndd1JVgp9mHHmVzVcF0Xnd76XmOziqlo/jccE78sxx4mA9EGZOHGi2Zfkuuuuk549e0Yef//736UQuakZDPqoGz9RQwt4sB+RiSj0Jh44zSs9csrPZBgFXDOQbzUSupQ36DcirhDKt1c4h+mD3+LxUWHe6t7DeRn5V7PidZIJGvSV7/tqQdZE+tZJ1p/5IjMEKA5PHl5cTaRq03YbuXNy0zcfdLz61C9FwaZjGYmhefL8EtDV1hIBShbFByC6HKB8PcFnKXpIOszYx+tgHQMj5M9VtadNPBIMfm2S6OME+7U+AhugZKNKz+0y3I4ayEW1ZDbbZ/UcZhyUUwGCXu0fpJIe9G2tq8AGKIUg38+VRhYOGhx4Cqe8wH9+1jh6gf05WAhQcogTRuHes8LpttWlmQ8AdBPYAMVtcJBOMBH/HfXS7jzqupOs+yRl/SSZnWHGuZFvoQXBUCEPM/ZoRlHHp8BeQAV1vTUU2ADFK26v2/N9p/d2mHH+ZUb+pRi6NndoWZa0TJT/2yR6rgTy+ghsgKJZiwAyOBjlY6AD/QX9GBHUEzWHE30ENkDJhZCuB72C/jVjP4cZZ75yHAyDy8vAOh+ae73g2+4c3bTl0yLgHgEKXAn6CdVx51cd80nHNAEaiL0PCjuKLghQHF5NeFFk4+eR6X6Q7ztSytQ7vQ9K5rNIS77lfr6lFzm+e7aGBUbHCmj4hwDFR1b7t5ZNPAUQCOVjvgIozItReCOwAUquzmc6nuv9/TXjLKxwjvJUx22Z74EmdPo142BiN9FHYAMUHejeIc1afv2AiPe/ZhydaA86yWY8B+Trzf90PBFqmKTsbBOPfxgW3iBA8VHI4oCUj00RuTqQOs0qHQ8oOqYJ7uThruqpwNa4BXS1dUSA4lAh76x+rlv2si2U/U6yhVskENAgVvfjXDaCRr1zIFgIUDLkpkYkflrNjwWW8jHNnsrj9Q/8tnMpH2s7kZ6Yhts83scLDQFKtkfxBL7iODp/DE/am+0OKL7eqE3yS76lF9mn/YmaoDFQCFCySO3wOlbrpuJtt1AkzVwEipYBQEDF3P+FnVIbBCgi8uqi3bJ4xxHbQrt2X7nnmV8fd4SyO2BNX7Nf3l9XEvPeG8v3Jky3tbRCXpyzXWpP1jtOh1ruzkNV8sKcbVJVe/LUe4b8beEuWbozNl8mz9spa/aUJczjX8v2yNwtBxPnLSKvLy2WeVsPWS7770t2y4Jth83/1bN6HfbOqn2R/+sbkmdOsl+JVumcPH9n3Loa8tcFO2XZrqPma7V+aj2jFR+pjnm963CVPD97m/n+pNnbo+bV+LyppEL+NHe7nDjZEPOZeq3eV58nc6K+QeZvPWTmkXKyvkH+75Mdsm5fYx6rPFV5G620okYmnUqPSpdKX7zDlbXmNGraMDVvlZ4N+0+X5bLqOnO6fceOJ8zDzKuFu2TZrtgy0NBgyISPtkq/x9+V1xY3bq8lO4/IK4sa83HPUZVP26S8ps5ynRduPyxTTn0v3sz1B+Td1fvNcqjWTZXL6O0UXueKmjrLtMWLL1PRrLZbeP+pqauXQxW1Cd9R+5X6zuYDFeZD/R+/rx2tOmGu/4Hy03mvzN92SF5f0ridlS2llTH5/u/1B2Ta6n0pj0tqndW6R/cXiT52qDxTeVR9onFfjt4f/jxvh7n9wo
7EpTV6Pi/N22F+J1z+K2tPysvzd8rK4mMx8919uHGbhI8dysGKxvJ3qLIxD1VZ+O+Xl8iv399o5q3KYzXvn76zXn48dY3MWLs/YTtFl6mwiprTy4gv3/HrEhZdZpwct56fvd1clx2HquQnb66R8f/ebLlu6jls28FK8/ip1i16f7v9hQXy2w82RcqzystwGVPb95Mthyz37XC+1dTVy+P/Wi3ff22FPDdri7lt1XqEj0dbDsQe79W2Vdt49Z7YbaSoY/A/Th1n8kWzXCdAB2oHSeWJN9fKtz5zVsbLim7ieWPZHvl/1/RP+R11ErnvleXm/xuf+WLk/fX7y82dqH/XNpH3hj47x3yuPlEvDw4d6Dhd//G/s6Wu3pA9R4/LT28bJJ9sPSQ/eXOt+dm2X9wcmW7xziNyyx8+kZ3jhkfeUzvJj/6xyvw/+n1l3b5ymfjxNsvPVuw+Ju+tKYl8dseLC83/z+3RXs5oVxQzrTqB33FFX3FDpTPerA2l8sRb6yLL/OqkBQnT3PzcXFnz9LDI6xv/d47UnmyQsdM3xkwXvtIaNn6OZRClDvDjzO9sSFj3aN/80yLz+cJe7WX5rqPyzLT1kfR96/8Wm/8P6dNBBnRrZ/7/3clLZc3eslPzFvnNB5tky89PbyPlvr8tN7dV9AlPndTCSQyn59F/rZL31x2Qvy7YJfMe/0LMPGZvPmiW++jplXdW75Nfv7/J/H/0G2vM7fK1U/nYv0sb+cGUFXKo8oSs31cuz91xScL63v5C43Ye2L2tfPqszpH36+ob5O6/LDX/v2VILzNADa9b/HZ6ce52cxnxaYsXXaYu7tMx5rPwdlPl/r7rzonZf9TJ+HeztiTM74XZ2+W3MzeLvLsh8p4qG6OuHxB5rdZ/7pZD5slv5sOfj7z/zRcbt3M0lW8q39XJ7L9Prftnzu5iBj/h41L8+n1lYmNe9O7UynKdhz47W042GLK/rEaevvXChP2hQ6vm8p+X9jb/f/BUWt9Yvkc++OHptCp/nrfTfLRo1sQM4p6fs90MAuLTpI4dKg92HamWX3x5sPme2o4qkPlgXYm8cf818tRb6+TfG0rNhwoal++OPYGqk7Xyqe7t5JK+nSJ5Ey5TVw/omrCeqvyrC7e3HrjWdl2GPzdXymtOytbSSvn114ZE3levw8ethaNviLyvLmrUhUM4TUppRW3CuqkLxqn3X2O+d8NvZ5vPh6tOyOibzo/MZ+H2I+ZDHaenrd4vz87cbOZVtOi8DO/bH24slde/d5V5ITAlKqj9/Ydb5MuXnCmvL90jf/hoayRgUwHKA18YKG+t2itj3jl9/Ih256njzODeHeS8Hu0lH1CDkkMb466sk3WXqKitizmIRyuNu1oIW1HcWEPglDpIK+ErNrVDOVWSJA2K1ZV5mDqAWlFX4EdOnXzCoq/6M6kqV1c6qURfpSnxB5RkVsfVLFldxdjZd6zGDOislJSdvlpTBzCrbRdNBSfK2r2n52dVCTVnc+MV3F6L7bT9oHUZUCfPZHYero4EDslqzcJ2x9VURQd4szYcSLpuSngZTqkylYzVdloRV0sQFl97YPWeOkmGa0hSCed7dG1q+fE62XU4eXpTbR8VnCiLLGpflOgavXBaNx9IntZwDVM4OIkX3j9UzVh8noQDEXXBExYfnEQrPnrcskwlsypqn0u2Lio4UeafqqkNi65piW/WiV6XZOumLrDiLdt51LIMqYDDybEkvG+Hj8PLTtX0hqn9Ibwe0cepcJrizylW9h9LfrzWDQFKDjk9sUZ39Ez4PZ8M5x0/j/D3Qi6GHtp1/E2nT5vV/WJSrY/T5XjZZzY+TfEHObcdolU+J0ufX+3idvmR7DO79XJzy/Bs9sGwW5bVZ8nKvFWeeLEeMT9W57A8xzTxWOR2snWwS25G5cyus7rTWVik2beyb5uQFK8dfK2JBwebUMhdGXRyzMmnPjaBDVD8vktk0gNE1GIbHB7ZYoOF+GUkWbajOVt9L1
zSQ76e5OzTYJOuDGVrFJWZWpeLalzDUFZP5qE0PrPbpq4C42wGKLafuUmI1cp7cUfh2P4kfh6e/LrXid1cM1kfr5KbsN4hu4sN+9eZHredzyPk6D3DRT7nU+fswAYoORPTW9xZwYkpdCmu2k/PK71SaFWDktlJzv2RyaomwfYK2MXhw9calOTHP8fzS16Dkv1APdlnIce1cfapzuZx0i4t1jUo1t/3qwYlbo7OroRTLDfpxYt9tOZLHjs9Dlhui/STZDsf25rAhAEMzlIRPV0TD441Icc1KHaXN/HTSt4gQMlhTYzjJp6Y6t8UoX6GrCLxlIvw4Wov/uCRD/tUfI2Y+7KhTkxJPvHpqGIbXGZYhWIz8OrUpHpsVSc1dnaBu+H1MFeHbTypAvOkFy8i2tagWKbZo3KScAFhc4xLtwYlusxH7/9eXhiFLN5zVYMi+SOwAUo2Do6WJ6iYakWHTTwuqiKTTef84BiOxF008dj1QQn53wfFzbp62bSX8kZzeVCDYpfIpPGJzezcpNNpDWJu+qDEvXZw9RqWaRFz3gfF+n+79+zeDy87XbY3TMxgHt7VoMRtJxfpyLRspt3EEwo5es9VHxRNLgycCGyAkk9CeXYL/6Cyu0LTlb+/VeTuQKhz5710And3889+H7ls0nF9/EiTmyZOJ0IO3ytUgQ1QnBROJ8Urk0KYWI1oeHbVme7BPtNdyoudUqU9cfPY9CFwEQx4uXPb1Walkw2NV87ZrkKxkawPisNOsimT7PFVqv2i3HW8cFpTafVZyINOss6+k97nfgWCXszXTU2QF/MOi755nWUTn+NhPNbT+R2fGS6Wo+9lQKLABijZYHWyji4/Tkfx2I38Sd5J1mEi4+dh0dbuZohvqn4HTjQ0WPRBSZkGZ0eAbF7IedvEY2S/k2zS9+O3TfTJ1Xk6E08C/lFlKmk6LBacsJ+F9wsHnWTTuTKPDeyS90VK/h2rz1MfG7z8AVMvAgk36xEvVbbHH5tsm86z2MRj27k4ZPVm8nnQSRa+dZJ1cpJ1HNRkyM0Y/iZR3dUz7yga/l5aX0s9X/FP/KZJ5z4I2a6+bZLOfVCcntQyKKq6VmM72aZepL1Jli4dvbhXhy7r43Zdosu+V8fVZHOxS5rdRV0Tiy/arqeGTWmZoAYlhxzvEjFtB5l3hHO0KDd9UGyWG0qjM6FVJ0H7oZFGbjrJJizWZuUdN21l+T4otjUozoYZJyueKZsgEsqyf8G3uwYem+ZXq6tXj+/9Y9akeXDDLSe5mbgtDZ86yTodZmzR3OZw+e4b7ZNfWKVbgxJbO5X+Oiem0OHIO3Eifxp5CFByyGnUbthE28nmke4VQaZXEnbfd9r8Y3UfFK+ucPy9AVZm37fLHy+aztxyWoMSvW2i05kq4IhfJzfBjVt2abEsWy5OUAmBd5r3/rFNj+V30vvcr1pYT/qfWTa3ZTzbpPNOtoy074OSRsn1pFncONXE46Do5eJYkq7ABihaVIQlHKBTtxkn9kFxNGv7ZMScVCxqPlINp7Vpy439LP5AniQ9Fp+6vQLOSidZ2z4Uzm625fhW9z6dVNK5Gou/Mowuk7G3X7eX7F4jfnBbAZeszFvegyL5hXn6NShOOjvG5Lv957Hv+9QHxaf7oHjVB8WupivVcdVptqRTo20XMIb8uNU9AQqSF7iQ62g7erps9UFx1xSSvKrUvpNaVF64Gv6hJ8/vh5GFbe1FjZJnzYx+rq7LeSer4vdruGyuThrZ+umHkAbrYneB5FkfFBc1iFbfSRSyeEeLy+usCGwNSq6kuuqx/o71/41vJPtSGonLsA3a7VVKzPdi4hOXt7p389slvvZByVzMXYNzHLM5vupNt6w5rEH0gt28rT4xXH3fWc2grTRWPWUTj0d54VV6sjWPpPN2McrL61E8tkPzPRyeHXI0zDh/rgADG6DosIkcByg230l+O+v01tCyiSfFrJwGIXZimwncn7xych+UFK8zHWZst929k0YnWZ
s+KPGBpqv88/PkZBvgpm5WsNovks07vU6ysXno9j5N1k0jSb5n+LND2G1vxx1Grd5zWi5SNfHY1OCmun2D89puZ8t22ick5LSJx7Jh3H1adBPYACWfmnhyMszYRcmIHvZmNCQ/ySUMQc5yxy27Ph7ezD/2/3wfZpwsMfGBS7IymUlR1bUS28kPwHlRBR+9HD+b+rz4QTsnsjH61e26xOaxN2lIZ1vZHdObWAYoNhcVuu44aQpsgOIs0vT3rOm0GtGu46Fn7f9x33P3Wzw21d0xtStx34upLYhfx4QqFJs0O19ZL9tvE66y45eV1jDjZPP3pyza3lLB4XeSXcmnTLHDDt9eyLQ7jOHgd1Ay/f2p08uK7WCd7j7u5McC7W6655bdV51miZsOy5n3QfF+mHHM/B0PM7aZh1j1QUk+D6/ukK6LwAYoOnA+7Db6O4ajeaRb05JpUGa3Tk5/8t6qiceLmiNXfVVykXcN8TVsUZ/l4KiS7GCXcO+MhiTpTJFmu6GdXq+u21EUiSeo5DOwqxl0nL6Y+SX/zG65Tu+e69eFl5fDZb2er2I3m1TH1XSO1bHLtjv2eXNss5Pu3Z5zjQAly2L7diSvbUjG6fh8N0XQ6sTgpl+J3bR2zTjJrmDUzpzsCtYy/bapcz+deDQ/9zUo9kGbG8775Di9ynN2gnbVB8WmBsXrY6j9CcJqeuvXllevmSXN8gTiZP9LN7/8GmYsOe6DknIRroJUN0eg6KkMT+9xFHLZByVpujI4luQSAUqWpVM2YmtQJOtSLdOuhsf5SS52fn7sROaBX7Ipsz4omfS292JQU7KO0n7dRM9NcBP7vdTTuk5isip/J31Q0vktnvh5ZKnfWbb2h+wMM7aXGHIYvh5XndZUeHEflFSf5VFMEoMAJYfsrphjp7O+QrWaR6p5pUyT1dVkipkZtkdam7b0JP0tDJc38TKbbhze6NruxnFu2bVTpxeImiuSZH7ZP8QkG1GU0G/BYVl1c8JIt0bA6bJSfT/hLRdVeGk18diUnaTrl2q/9KHzsv3y/JmJV2U/2cgsy18zttm37ZeR7IM0vuNh38HY42v+hCsEKFmWzpWRXQ1F8qDGOasTTMwQPJtfgm1Mg82VSMxnzmpQ1HecBmJueNHxLW4Oca8MT5t4Mkmr06t4p0m033bWgUWqJNvVqrtZXUf7kV3NnoMhuqeHcVp0knWQ2tRBfnQexneWTvadFMt0sCxPf5vKkzvJWrznsDCkWhcjg34/TstjsunSrVkOObxRm5uyQBOPiEyYMEH69esnLVu2lCuvvFIWL14sOvFqv8z2sC631b3ppi/dX/p0dSfZqP+z1XHL7t4Hni4nzSGcXv26rNNl2x3UmyTJK2/7LWQum82ejoYZhzJfn6yV01Dh/GpyJovIZcdRt31QmoRyP2w8r2tQ/v73v8vDDz8sTz31lCxfvlyGDBkiw4YNk9LSUj8Wl7cc9wyP/t9p3XOaO5xVVaGr6Nx2uuQ1DPFX3m5GUZifOOlgbA7fdJZWL5p4XA9pjm3hyajJyOmynbZnx/ZBiR+aav19V02DGWwPRzUYrpt4rGsq7X4HRVwMRbZLYFxLX9rNuMm3i82ILMdNGYYvJ/lMZpGyD4rNMcWzHwuMCeSddgy2qUEJOeuEEl5u8t/ysv4/kAHKs88+K3fffbfcddddcsEFF8ikSZOkdevW8tJLL/mxuLyVTqGP/0ryYcbppcnqe6mu4OzacuM7vyZTH32waEgcx+NlO+3ptHm7p3r9S9Dx1f65ZLf4mF8zjtrgqTtXJ9/GbtbXSYAUXy5TfT/h1ucZbnc3QX7ikFcjvWHGSb8nGbPMM7+GGXtURWb3g4B+DzP24lfe7aTc12I6BOdPhNLM6xmeOHFCli1bJqNHj46816RJExk6dKgsWLAgYfra2lrzEVZeXi5+OFRZKxM+2hp5vWTnUdvpx7yzTiprTia8F/96/tbDST/fWFKR8N7fFu6K/D93y6GYz6eu2C
sl5TUJaSk7Xhf5/4W522M+e23xblmy80jCd9bsLTPnHV0W49MSNn1tSUw+qem2llZG3vvN+5sSvhM9r9KK09vvd7O2SLuWp4vV2n1lkf9/+8EmaVN0+rOaugbLZfxr+V5ZUXwsZnn/3nAgafrfWL5Hth08nd5k0/3ivQ3mNgn7+bsbLKezm0e0l+btkHfX7I+8Xrj99Hb4aFOpdG1b5Gp+/1i2x8z/sLHvbYz8P2VxsayMyxO79J6ot+84FJ5+z9HjSeexKTqv3lsvzZs2Xs8sjdt3/nfm5sj/b63aZ5uuaG8s3yt7j51efnVtfeT/unrDcd6p7disaeKl45h31seUqV1Hqi2/v3jnkYRlrN8fexz69fubpGXzJjJz/YGE7y/ffSzm+9H7a/h9u3OCmiZ63SfP2xmz/cZN3yhFzRKvJaetPl323lyxN6bshPdLq7z7YH2JjHmncT8sjzrGqWkr4o55yfx02vqEq3W13lbLU+9F73d2VJnYHbedVJk6Un0i6XfCy4xfl3i1JxuSHrf+GnVcDq+L03WLtv1QVeS9aav3OQogxs/cLG2jjpnR8561IbG8rbI4Dqhjg5p+UdQxKDpt9VEJ+MfSPWb5VsenUdcPEJ2FDI8vzfbt2ydnnnmmzJ8/X6666qrI+48++qjMnj1bFi1aFDP9008/LWPGjEmYT1lZmbRv396zdG0/WClf+O1sz+YHAEC+OvuMNvLhj67zdJ6qgqFDhw6enb89r0FxS9W0qP4q0SvYp08fz5fTsXULGXX9OZHXKix7b81+ueqcLrL5QKV0btNC+nZuLe+vK5FhF/Ywr5aUDfsr5MONpXLrkF7Sp3MrOXGywaxx+I8LukvrFk3NaT7edFDOPqOt9O3cynx9rLrOrB350kU9zauM6hP15pXXzYN7SvOmIXln1X7zKiGcnt1HjpsB1HXnnpE0/Wv3lktdfYNc0rejOf3M9SVyfs/2cvU5XWKmO1pdJ/O2HpLhgxuXra5Ep6/dLzec113aFDWmN2x/WY2s3Vtmrou6clJX/SqNzU71tFqw7bCc0a5IBnRrK9sPVsm+sho5Wd8Qs65hy3YdlZbNm8qFvRoL5f5jNWaUfsP53czaqrZFzeT8nu3Mzw5VnDBrfW4a3EMW7zgiHVo1l3N7tJNdh6vNx+c+1TVyVTBv62HzuyOvPishT9S0Ow9Xyec/1ZhvM9aWyKV9O0m39o01FwfKa815qFqbXh1amjuk8snWw5HXqgZBbf/PDuxqlge1fW84v7t0btM8shxVk/bhplIZdkEPmbGuRHp1aGXm7WX9OpmfH6k6YeaVyrvDlafXTfn3+lIzT3p2bBmT9qraenl5wU4Z2K2tDOzWToqPVptpUD7ceFA+1b2t9O7Uysx3tZ2uGXB6O4e36dDzu5tl78r+naVL2xYx8z9Zb8h7p7a72rZr9h4zt/OsDaVyXo92cmanxu2nLqzeXb3fzHO1HeLN33ZYurdrKed0a8y7sDmbD5m1dEP6dJRrB3SR4iPHzVq36887wyzvqsbrpkGN5T2eKr87DlVGtlu0jfsrzPSqvFX7WXjd9h49LpsOVJqvZ208IDcP6imLdhyRHu1j06byRm1HlTeqvMeXqWhHqupkwbZD5nYL1wZE77tqf3tx7g5zu1zUu0Pke2pfHnxmR2nRrInM33Z6X4vkfUNjGr5wbreYK+Odh6tl9+FqubhPR/OYonzhvG7SvlWzSI1V+fGTcnn/TpEauS5tWsjA7m1j0q3yQtVIqH0rOt/Dwvvy8MG95FSlV8z+MOzC7rZpXbevXGrq6qWy9qT079pWOrdubpZ/lU+qnPfp1Fr6dW2dsH9EHzvit4M63k6avU16dmglIy7pFdlXP9l6SMqP18nVA7rK/rLjMWXCat3Ue2+v2id3XNFX5m45GJm/Xb6rfS1cZuJr2qKPW+oYe7T6hPlQ63L8RL1ZS9WqeVP50pDYdYs/pqoaFrVPDL+oZ0xH1QkfbZMzO7aS2y
7uFTnfqA7Dqoz17NDSPIdccOqYGZ1v0eeX15fukYqaOrOsf+bsLua5StXcXn1OV7PcRR/vldmbD0q/Lm3krC6nt5Gy41CVWXauPXWc6dQ69pgRiBoU1cSj+pv885//lBEjRkTeHzlypBw7dkzeeuutrEZgAADAf16fvz3vJNuiRQv59Kc/LbNmzYq819DQYL6ObvIBAADIahOParJRNSaXXXaZXHHFFTJ+/HipqqoyR/UAAADkJED5xje+IQcPHpQnn3xSSkpK5OKLL5YZM2ZI9+6n2z4BAACy1gclU/RBAQAg/2jfBwUAACBTBCgAAEA7BCgAAEA7BCgAAEA7BCgAAEA7BCgAAEA7BCgAAEA7BCgAAEA7BCgAACAYt7rPRPjGtuqOdAAAID+Ez9te3aBeuwCloqLCfO7Tp0+ukwIAANI4j6tb3hfcb/E0NDTIvn37pF27dhIKhTyP7lTgU1xc7MnvBIA81xHlnDwPAsq5fnmuwgkVnPTq1UuaNGlSeDUoaqV69+7t6zJUxhKgZBd5nn3kOXkeBJRzvfLci5qTMDrJAgAA7RCgAAAA7QQqQCkqKpKnnnrKfAZ5Xqgo5+R5EFDOCz/PteskCwAAEKgaFAAAkB8IUAAAgHYIUAAAgHYIUAAAgHYCE6BMmDBB+vXrJy1btpQrr7xSFi9enOsk5Y05c+bILbfcYt4dUN3d980334z5XPWzfvLJJ6Vnz57SqlUrGTp0qGzZsiVmmiNHjsidd95p3tynY8eO8t3vflcqKytjplm9erV89rOfNbeRulvhr371KwmisWPHyuWXX27eTblbt24yYsQI2bRpU8w0NTU1MmrUKOnSpYu0bdtWvvKVr8iBAwdiptm9e7cMHz5cWrdubc7nkUcekZMnT8ZM8/HHH8ull15q9sofMGCATJ48WYJq4sSJctFFF0VuQnXVVVfJ9OnTI5+T5/4aN26ceXx56KGHyHMfPf3002Y+Rz/OO+88PfPcCIApU6YYLVq0MF566SVj3bp1xt1332107NjROHDgQK6Tlhfee+8943/+53+MN954Q434MqZOnRrz+bhx44wOHToYb775prFq1Srj1ltvNfr3728cP348Ms0Xv/hFY8iQIcbChQuNuXPnGgMGDDDuuOOOyOdlZWVG9+7djTvvvNNYu3at8dprrxmtWrUynn/+eSNohg0bZvz5z38282HlypXGzTffbPTt29eorKyMTHPvvfcaffr0MWbNmmUsXbrU+MxnPmNcffXVkc9PnjxpDBo0yBg6dKixYsUKcxt27drVGD16dGSa7du3G61btzYefvhhY/369cbvf/97o2nTpsaMGTOMIHr77beNd99919i8ebOxadMm48c//rHRvHlzczso5Ll/Fi9ebPTr18+46KKLjAcffDDyPnnuvaeeesq48MILjf3790ceBw8e1DLPAxGgXHHFFcaoUaMir+vr641evXoZY8eOzWm68lF8gNLQ0GD06NHD+PWvfx1579ixY0ZRUZEZZCiqgKrvLVmyJDLN9OnTjVAoZOzdu9d8/cc//tHo1KmTUVtbG5nmscceM84991wj6EpLS838mz17diR/1YnzH//4R2SaDRs2mNMsWLDAfK0OGk2aNDFKSkoi00ycONFo3759JI8fffRR80AV7Rvf+IYZIKGRKpN/+tOfyHMfVVRUGAMHDjRmzpxpfP7zn48EKJRz/wIUdbFoRbc8L/gmnhMnTsiyZcvMZofo3/tRrxcsWJDTtBWCHTt2SElJSUz+qt9iUM1o4fxVz6pZ57LLLotMo6ZX22HRokWRaT73uc9JixYtItMMGzbMbNo4evSoBFlZWZn53LlzZ/NZlee6urqYPFdVtH379o3J88GDB0v37t1j8lP92Ne6desi00TPIzwN+4VIfX29TJkyRaqqqsymHvLcP6o5QTUXxJdF8tw/qgleNdmfffbZZtO7arLRMc8LPkA5dOiQebCJzkxFvVYnVmQmnId2+aueVT
tltGbNmpkn3OhprOYRvYwgUr/urdrkr7nmGhk0aFAkP1Qgp4I+uzxPlZ/JplEHmuPHj0sQrVmzxmx3V+3m9957r0ydOlUuuOAC8twnKghcvny52e8qHuXcH+riUfUHmTFjhtnvSl1kqr5/6leIdctz7X7NGEDs1eXatWvlk08+IVuy4Nxzz5WVK1eatVb//Oc/ZeTIkTJ79mzy3gfFxcXy4IMPysyZM82O8ciOm266KfK/6hSuApazzjpLXn/9dXOQg04Kvgala9eu0rRp04ReyOp1jx49cpauQhHOQ7v8Vc+lpaUxn6se32pkT/Q0VvOIXkbQPPDAAzJt2jT56KOPpHfv3pH3VX6opstjx47Z5nmq/Ew2jRrBotuBKlvU1aMacfDpT3/avKofMmSI/O53vyPPfaCaE9RxQY30UDWq6qGCweeee878X11xU879p2pLPvWpT8nWrVu1K+dNgnDAUQebWbNmxVSbq9eqbRmZ6d+/v1kYo/NXVeOpviXh/FXPqsCrA1LYhx9+aG4HFb2Hp1HDmVX7Z5i6slJXtJ06dQrUZlJ9kVVwopoXVD6pPI6mynPz5s1j8lz11VHtyNF5rporogNDlZ/qAKGaLMLTRM8jPA37xWmqjNbW1pLnPrjhhhvMMqpqrMIP1U9N9YkI/08595+63cO2bdvM20Rod2wxAjLMWI0qmTx5sjmi5J577jGHGUf3QoZ9L3s1nEw9VJF59tlnzf937doVGWas8vOtt94yVq9ebdx2222Ww4wvueQSY9GiRcYnn3xi9tqPHmaseo+rYcbf+ta3zGGdapupYWpBHGZ83333mcO2P/7445ihgNXV1TFDAdXQ4w8//NAcCnjVVVeZj/ihgDfeeKM5VFkN7zvjjDMshwI+8sgjZk/9CRMmBHqY8eOPP26OlNqxY4dZjtVrNdLsgw8+MD8nz/0XPYqHPPfHj370I/PYosr5vHnzzOHCapiwGi2oWzkPRICiqHHYKtPV/VDUsGN1Pw4489FHH5mBSfxj5MiRkaHGTzzxhBlgqEDwhhtuMO8jEe3w4cNmQNK2bVtzONpdd91lBj7R1D1Urr32WnMeZ555phn4BJFVXquHujdKmAr+7r//fnMYrDoQfPnLXzaDmGg7d+40brrpJvN+MuoApA5MdXV1Cdv24osvNveLs88+O2YZQfOd73zHOOuss8y8UAdcVY7DwYlCnmc/QCHPvaeG+/bs2dMs5+o4q15v3bpVyzwPqT+ZVRABAAB4q+D7oAAAgPxDgAIAALRDgAIAALRDgAIAALRDgAIAALRDgAIAALRDgAIAALRDgAIAALRDgAIAALRDgAIAALRDgAIAALRDgAIAAEQ3/x+DCCICMmBDCgAAAABJRU5ErkJggg==",
|
| 85 |
+
"text/plain": [
|
| 86 |
+
"<Figure size 640x480 with 1 Axes>"
|
| 87 |
+
]
|
| 88 |
+
},
|
| 89 |
+
"metadata": {},
|
| 90 |
+
"output_type": "display_data"
|
| 91 |
+
}
|
| 92 |
+
],
|
| 93 |
+
"source": [
|
| 94 |
+
"import matplotlib.pyplot as plt\n",
|
| 95 |
+
"\n",
|
| 96 |
+
"plt.plot(data[\"message_dt\"].diff().dt.days.values);"
|
| 97 |
+
]
|
| 98 |
+
},
|
| 99 |
+
{
|
| 100 |
+
"cell_type": "code",
|
| 101 |
+
"execution_count": 25,
|
| 102 |
+
"id": "eaf90115",
|
| 103 |
+
"metadata": {},
|
| 104 |
+
"outputs": [
|
| 105 |
+
{
|
| 106 |
+
"data": {
|
| 107 |
+
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAioAAAGdCAYAAAA8F1jjAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAJmtJREFUeJzt3QtUVWXex/E/BIKhQFCAFITTssQyLS9IuUqTCS/jJeliY+U0jHbxktpospZamYWWU6ahVGOajWQ5k2Y6Q2OYOiV5wexihlqYjIY0GaA0IMZ+1/O87zkvB0EBD/Kcfb6ftfaCs/c++zwPHuF3/vt59vaxLMsSAAAAA/m2dAMAAADqQ1ABAADGIqgAAABjEVQAAICxCCoAAMBYBBUAAGAsggoAADAWQQUAABjLTzxQdXW1HDlyRNq2bSs+Pj4t3RwAANAA6hqzx48fl+joaPH19bVvUFEhJSYmpqWbAQAAmqCwsFAuu+wy+wYVVUlxdDQ4OLilmwMAABqgrKxMFxocf8dtG1Qcp3tUSCGoAADgWRozbIPBtAAAwFgEFQAAYCyCCgAAsE9Q2bJliwwePFhPLVLnmNasWXPaPnv37pUhQ4ZISEiIBAUFSY8ePeTQoUPO7RUVFTJ27FgJDw+XNm3aSEpKihw9evTcewMAALw7qJSXl0uXLl0kIyOjzu3ffPON9O7dWzp27CibNm2Szz//XGbMmCGBgYHOfSZNmiTvvfeerFq1SjZv3qynGw8fPvzcegIAAGzHx1JXX2nqk318ZPXq1TJs2DDnuhEjRoi/v7+88cYbdT6ntLRULrnkEsnKypLbb79dr/v6668lPj5ecnNzpVevXg2a3qSqNepYzPoBAMAzNOXvt6+7rxi7fv16ufLKKyU5OVkiIiIkISHB5fRQXl6eVFVVSVJSknOdqr7ExsbqoAIAANAsQaW4uFhOnDghc+bMkf79+8s///lPue222/RpHXWKRykqKpJWrVpJaGioy3MjIyP1trpUVlbqFFZzAQAA9ufn7oqKMnToUD0ORenatats3bpVMjMz5eabb27ScdPT0+XJJ590Z1MBAIC3VVQuvvhi8fPzk06dOrmsV+NPHLN+oqKi5OTJk1JSUuKyj5r1o7bVJS0tTZ/Pcizq0vkAAMD+3BpU1CkdNRU5Pz/fZf2+ffvk8ssv199369ZND7bNyclxblf7qyCTmJhY53EDAgKcl8vnsvkAAHiPRp/6UWNQDhw44HxcUFAgu3fvlrCwMD0gdsqUKXLXXXfJTTfdJH379pXs7Gw9FVlNVVbUaN/U1FSZPHmyfo4KHuPHj9chpSEzfgAAgPdo9PRkFThUAKlt1KhRsmzZMv39a6+9pseV/Pvf/5arrrpKjy9R41ZqXvDt0UcflTfffFMPlFUzhBYtWlTvqZ/amJ4MAIDnacrf73O6jkpLIagAAOB5mvL3262zfgAA5oqbtt7l8cE5g1qsLUBDcVNCAABgLIIKAAAwFkEFAAAYi6ACAACMRVABAADGIqgAAABjEVQAAICxCCoAAMBYBBUAAGAsggoAADAWQQUAABiLoAIAAIxFUAEAAMYiqAAAAGMRVAAAgLEIKgAAwFgEFQAAYCyCCgAAMBZBBQAAGIugAgAAjEVQAQAAxiKoAAAAYxFUAACAsQgqAADAWAQVAABgLIIKAAAwFkEFAAAYi6ACAACMRVABAADGIqgAAABjEVQAAICxCCoAAMBYBBUAAGCfoLJlyxYZPHiwREdHi4+Pj6xZs6befR988EG9z/z5813WHzt2TEaOHCnBwcESGhoqqampcuLEiab1AAC8WNy09S4LIN4eVMrLy6VLly6SkZFxxv1Wr14tn3zyiQ40tamQsmfPHtmwYYOsW7dOh58xY8Y0tikAAMDm/Br7hAEDBujlTA4fPizjx4+X999/XwYNGuSybe/evZKdnS07duyQ7t2763ULFy6UgQMHyrx58+oMNgAAwDu5fYxKdX
W13HvvvTJlyhS5+uqrT9uem5urT/c4QoqSlJQkvr6+sm3btjqPWVlZKWVlZS4LAACwP7cHlblz54qfn59MmDChzu1FRUUSERHhsk7tHxYWprfVJT09XUJCQpxLTEyMu5sNAADsHlTy8vLkxRdflGXLlulBtO6SlpYmpaWlzqWwsNBtxwYAAF4SVP71r39JcXGxxMbG6iqJWr777jt59NFHJS4uTu8TFRWl96np1KlTeiaQ2laXgIAAPUOo5gIAAOyv0YNpz0SNTVHjTWpKTk7W6++//379ODExUUpKSnT1pVu3bnrdxo0b9diWhIQEdzYHAAB4W1BR1zs5cOCA83FBQYHs3r1bjzFRlZTw8HCX/f39/XWl5KqrrtKP4+PjpX///jJ69GjJzMyUqqoqGTdunIwYMYIZPwAA4NxO/ezcuVOuu+46vSiTJ0/W38+cObPBx1ixYoV07NhR+vXrp6cl9+7dW1555ZXGNgUAANhcoysqffr0EcuyGrz/wYMHT1unqi9ZWVmNfWkAAOBluNcPAAAwFkEFAAAYi6ACAACMRVABAADGIqgAAABjEVQAAICxCCoAAMBYBBUAAGAsggoAAPCOmxICAID6xU1b7/z+4JxB/KgagIoKAAAwFkEFAAAYi6ACAACMRVABAADGYjAtAKDJA0IVBoWiOVFRAQAAxiKoAAAAYxFUAACAsRijArQQLvwEAGdHRQUAABiLoAIAAIxFUAEAAMYiqAAAAGMxmBYAcF5woTg0BRUVAABgLIIKAAAwFkEFAAAYi6ACAACMRVABAADGIqgAAABjEVQAAICxCCoAAMBYBBUAAGAsggoAALBPUNmyZYsMHjxYoqOjxcfHR9asWePcVlVVJY899ph07txZgoKC9D733XefHDlyxOUYx44dk5EjR0pwcLCEhoZKamqqnDhxwj09AgAA3htUysvLpUuXLpKRkXHatp9//ll27dolM2bM0F/feecdyc/PlyFDhrjsp0LKnj17ZMOGDbJu3TodfsaMGXNuPQEAALbT6JsSDhgwQC91CQkJ0eGjppdeekl69uwphw4dktjYWNm7d69kZ2fLjh07pHv37nqfhQsXysCBA2XevHm6CgMAAHBexqiUlpbqU0TqFI+Sm5urv3eEFCUpKUl8fX1l27ZtdR6jsrJSysrKXBYAAGB/zRpUKioq9JiVu+++W49HUYqKiiQiIsJlPz8/PwkLC9Pb6pKenq6rNY4lJiamOZsNAADsHlTUwNo777xTLMuSxYsXn9Ox0tLSdGXGsRQWFrqtnQAAwEZjVBoTUr777jvZuHGjs5qiREVFSXFxscv+p06d0jOB1La6BAQE6AUAgNripq13fn9wziB+QDbj21whZf/+/fLBBx9IeHi4y/bExEQpKSmRvLw85zoVZqqrqyUhIcHdzQEAAN5UUVHXOzlw4IDzcUFBgezevVuPMWnXrp3cfvvtemqymnb8yy+/OMedqO2tWrWS+Ph46d+/v4wePVoyMzN1sBk3bpyMGDGCGT8AAODcgsrOnTulb9++zseTJ0/WX0eNGiVPPPGErF27Vj/u2rWry/M+/PBD6dOnj/5+xYoVOpz069dPz/ZJSUmRBQsWNLYpAADA5hodVFTYUANk63OmbQ6qupKVldXYlwYAAF6Ge/0AAABjEVQAAICxCCoAAMBYBBUAAOBdF3wDAMCTLhSncLE4M1FRAQAAxiKoAAAAYxFUAACAsQgqAADAWAQVAABgLIIKAAAwFtOTAZyGaZsATEFFBQAAGIugAgAAjEVQAQAAxiKoAAAAYzGYFgAAw8V58X2JqKgAAABjUVEBABjHmysIcEVFBQAAGIugAgAAjEVQAQAAxiKoAAAAYxFUAACAsQgqAADAWAQVAABgLIIKAAAwFhd8AwDYBheKsx8qKgAAwFgEFQAAYCyCCgAAMBZBBQAAGIugAgAAjEVQAQAA9gkqW7ZskcGDB0t0dLT4+PjImjVrXLZbliUzZ86Udu3aSe
vWrSUpKUn279/vss+xY8dk5MiREhwcLKGhoZKamionTpw4994AAADvDirl5eXSpUsXycjIqHP7s88+KwsWLJDMzEzZtm2bBAUFSXJyslRUVDj3USFlz549smHDBlm3bp0OP2PGjDm3ngAAANtp9AXfBgwYoJe6qGrK/PnzZfr06TJ06FC9bvny5RIZGakrLyNGjJC9e/dKdna27NixQ7p37673WbhwoQwcOFDmzZunKzUAAABuH6NSUFAgRUVF+nSPQ0hIiCQkJEhubq5+rL6q0z2OkKKo/X19fXUFpi6VlZVSVlbmsgAAAPtza1BRIUVRFZSa1GPHNvU1IiLCZbufn5+EhYU596ktPT1dBx7HEhMT485mAwAAQ3nErJ+0tDQpLS11LoWFhS3dJAAA4GlBJSoqSn89evSoy3r12LFNfS0uLnbZfurUKT0TyLFPbQEBAXqGUM0FAADYn1uDSvv27XXYyMnJca5T40nU2JPExET9WH0tKSmRvLw85z4bN26U6upqPZYFAACgybN+1PVODhw44DKAdvfu3XqMSWxsrEycOFFmz54tHTp00MFlxowZeibPsGHD9P7x8fHSv39/GT16tJ7CXFVVJePGjdMzgpjxAwAAzimo7Ny5U/r27et8PHnyZP111KhRsmzZMpk6daq+1oq6LoqqnPTu3VtPRw4MDHQ+Z8WKFTqc9OvXT8/2SUlJ0ddeAQAALSNu2nqXxwfnDPLMoNKnTx99vZT6qKvVzpo1Sy/1UdWXrKysxr40AADwMh4x6wcAAHgnggoAADAWQQUAABiLoAIAAIxFUAEAAMYiqAAAAGMRVAAAgLEIKgAAwFgEFQAAYCyCCgAAMBZBBQAAGIugAgAAjEVQAQAAxiKoAAAAYxFUAACAsQgqAADAWAQVAABgLIIKAAAwFkEFAAAYi6ACAACMRVABAADGIqgAAABj+bV0AwAA5ombtt75/cE5g1q0LfBuVFQAAICxCCoAAMBYBBUAAGAsggoAADAWg2kBiLcPFlUYMAqYiYoKAAAwFkEFAAAYi6ACAACMRVABAADGIqgAAABjMesH8BLMcgHgidxeUfnll19kxowZ0r59e2ndurVcccUV8tRTT4llWc591PczZ86Udu3a6X2SkpJk//797m4KAADwcG4PKnPnzpXFixfLSy+9JHv37tWPn332WVm4cKFzH/V4wYIFkpmZKdu2bZOgoCBJTk6WiooKdzcHAAB4MLef+tm6dasMHTpUBg3637ttxsXFyZtvvinbt293VlPmz58v06dP1/spy5cvl8jISFmzZo2MGDHC3U0CAONxag44TxWVG264QXJycmTfvn368WeffSYfffSRDBgwQD8uKCiQoqIifbrHISQkRBISEiQ3N7fOY1ZWVkpZWZnLAgAA7M/tFZVp06bpINGxY0e54IIL9JiVp59+WkaOHKm3q5CiqApKTeqxY1tt6enp8uSTT7q7qQCA81wpAlq8ovL222/LihUrJCsrS3bt2iWvv/66zJs3T39tqrS0NCktLXUuhYWFbm0zAADwkorKlClTdFXFMdakc+fO8t133+mqyKhRoyQqKkqvP3r0qJ7146Aed+3atc5jBgQE6AUAQJUC3sXtFZWff/5ZfH1dD6tOAVVXV+vv1bRlFVbUOBYHdapIzf5JTEx0d3MAAIAHc3tFZfDgwXpMSmxsrFx99dXy6aefyvPPPy+///3v9XYfHx+ZOHGizJ49Wzp06KCDi7ruSnR0tAwbNszdzQEAAB7M7UFFXS9FBY+HH35YiouLdQB54IEH9AXeHKZOnSrl5eUyZswYKSkpkd69e0t2drYEBga6uzkAAMCDuT2otG3bVl8nRS31UVWVWbNm6QUAAKA+3OsHMBwXAgPQ1N8ZB+cMavC+puLuyQAAwFhUVAAY8ckOAOpCRQUAABiLigoAtABPGBsAmICKCgAAMBYVFQBGYDwLPw+gLlRUAACAsQgqAADAWAQVAABgLIIKAAAwFo
NpARvhcvstj0HB9sf/s/OLigoAADAWQQUAABiLoAIAAIzFGBUAsDEu1Q9PR0UFAAAYi6ACAACMRVABAADGIqgAAABjEVQAAICxCCoAAMBYBBUAAGAsggoAADAWF3wD4FG4IRy8ARfq+39UVAAAgLEIKgAAwFgEFQAAYCyCCgAAMBZBBQAAGIugAgAAjEVQAQAAxiKoAAAAY3HBNwAAFxiDd1VUDh8+LPfcc4+Eh4dL69atpXPnzrJz507ndsuyZObMmdKuXTu9PSkpSfbv398cTQEAAB7M7UHlp59+khtvvFH8/f3lH//4h3z11Vfypz/9SS666CLnPs8++6wsWLBAMjMzZdu2bRIUFCTJyclSUVHh7uYAAAAP5vZTP3PnzpWYmBhZunSpc1379u1dqinz58+X6dOny9ChQ/W65cuXS2RkpKxZs0ZGjBjh7iYB8NJ7pBycM6hF24KG4942OG8VlbVr10r37t3ljjvukIiICLnuuuvk1VdfdW4vKCiQoqIifbrHISQkRBISEiQ3N7fOY1ZWVkpZWZnLAgAA7M/tQeXbb7+VxYsXS4cOHeT999+Xhx56SCZMmCCvv/663q5CiqIqKDWpx45ttaWnp+sw41hUxQYAANif24NKdXW1XH/99fLMM8/oasqYMWNk9OjRejxKU6WlpUlpaalzKSwsdGubAQCAlwQVNZOnU6dOLuvi4+Pl0KFD+vuoqCj99ejRoy77qMeObbUFBARIcHCwywIAAOzP7UFFzfjJz893Wbdv3z65/PLLnQNrVSDJyclxbldjTtTsn8TERHc3BwAAeDC3z/qZNGmS3HDDDfrUz5133inbt2+XV155RS+Kj4+PTJw4UWbPnq3HsajgMmPGDImOjpZhw4a5uzkAAJzXGUvMNjM8qPTo0UNWr16tx5XMmjVLBxE1HXnkyJHOfaZOnSrl5eV6/EpJSYn07t1bsrOzJTAw0N3NAQAAHqxZLqH/m9/8Ri/1UVUVFWLUAgAAmi6uVkXHbrgpIQAAMBZBBQAAGIugAgAAjEVQAQAAxiKoAAAAYxFUAACAd01PBgDAROd7Kq/dpw6fD1RUAACAsQgqAADAWAQVAABgLIIKAAAwFoNpAQCwqTgb3NmZigoAADAWQQUAABiLoAIAAIxFUAEAAMYiqAAAAGMx68fD2GEE99n6ZZc+AQDOHRUVAABgLIIKAAAwFqd+vJRdTyEBAOyFigoAADAWFRUA54SB0EDzVLrxv6ioAAAAY1FRgdsw7gV2wXsZdhXngVUbKioAAMBYVFQAD8OnfQDehIoKAAAwFkEFAAAYi1M/sC1OkQBo6u8MLoJpDioqAADAWFRUAHg0T5xuCaDhqKgAAADvDSpz5swRHx8fmThxonNdRUWFjB07VsLDw6VNmzaSkpIiR48ebe6mAM3yab7mAgDwoKCyY8cOefnll+Xaa691WT9p0iR57733ZNWqVbJ582Y5cuSIDB8+vDmbAgAAPFCzjVE5ceKEjBw5Ul599VWZPXu2c31paaksWbJEsrKy5JZbbtHrli5dKvHx8fLJJ59Ir169pKUx8hsAAJtXVNSpnUGDBklSUpLL+ry8PKmqqnJZ37FjR4mNjZXc3Nzmag4AAPBAzVJRWblypezatUuf+qmtqKhIWrVqJaGhoS7rIyMj9ba6VFZW6sWhrKysGVoNAABsX1EpLCyURx55RFasWCGBgYFuOWZ6erqEhIQ4l5iYGLccFwAAeFlQUad2iouL5frrrxc/Pz+9qAGzCxYs0N+rysnJkyelpKTE5Xlq1k9UVFSdx0xLS9NjWxyLCkMAAMD+3H7qp1+/fvLFF1+4rLv//vv1OJTHHntMV0P8/f0lJydHT0tW8vPz5dChQ5KYmFjnMQMCAvQCADj/mHoPWwWVtm3byjXXXOOyLigoSF8zxbE+NTVVJk+eLGFhYRIcHCzjx4/XIcWEGT8AAMDLL6H/wgsviK+vr66oqEGyycnJsmjRopZoCgADcUNJz8
IlHeDxQWXTpk0uj9Ug24yMDL0AAADUh5sSAnyCxzlg/AbQvLgpIQAAMBZBBQAAGIugAgAAjEVQAQAAxmIwLeClmFIKwBNQUQEAAMYiqAAAAGMRVAAAgLEYowIAQAvcxoGLBTYMFRUAAGAsggoAADAWp34A4Cy4m3PLYzq996KiAgAAjEVQAQAAxiKoAAAAYzFGBQCAs2AqccuhogIAAIxFRQVAo/DJ8vz/7PiZw5tRUQEAAMYiqAAAAGNx6qcZcYEieBtOUXgW/r3gCaioAAAAYxFUAACAsQgqAADAWAQVAABgLIIKAAAwFkEFAAAYi6ACAACMxXVU3IhrEgAAvzPhXlRUAACAsQgqAADAWAQVAABgLIIKAADwnqCSnp4uPXr0kLZt20pERIQMGzZM8vPzXfapqKiQsWPHSnh4uLRp00ZSUlLk6NGj7m4KAADwcG4PKps3b9Yh5JNPPpENGzZIVVWV3HrrrVJeXu7cZ9KkSfLee+/JqlWr9P5HjhyR4cOHu7spAADAw7l9enJ2drbL42XLlunKSl5entx0001SWloqS5YskaysLLnlllv0PkuXLpX4+Hgdbnr16uXuJgEAcN5wqQoPG6OigokSFhamv6rAoqosSUlJzn06duwosbGxkpubW+cxKisrpayszGUBAAD216xBpbq6WiZOnCg33nijXHPNNXpdUVGRtGrVSkJDQ132jYyM1NvqG/cSEhLiXGJiYpqz2QAAwBuCihqr8uWXX8rKlSvP6ThpaWm6MuNYCgsL3dZGAADghZfQHzdunKxbt062bNkil112mXN9VFSUnDx5UkpKSlyqKmrWj9pWl4CAAL0AAADv4vagYlmWjB8/XlavXi2bNm2S9u3bu2zv1q2b+Pv7S05Ojp6WrKjpy4cOHZLExER3NwcA4MUY2Or5/JrjdI+a0fPuu+/qa6k4xp2osSWtW7fWX1NTU2Xy5Ml6gG1wcLAONiqkMOMHAAA0a1BZvHix/tqnTx+X9WoK8u9+9zv9/QsvvCC+vr66oqJm9CQnJ8uiRYvc3RQADcSnTgBedernbAIDAyUjI0MvAAAA9eFePwAAwFgEFQAA4H3TkwHAkzBOBzATFRUAAGAsggoAADAWQQUAABiLoAIAAIxFUAEAAMYiqAAAAGMRVAAAgLEIKgAAwFhc8A2A1/CGi7p5Qx/hXaioAAAAYxFUAACAsQgqAADAWAQVAABgLIIKAAAwFkEFAAAYi6ACAACMRVABAADG4oJvAFrkQmQH5wziJw/grKioAAAAYxFUAACAsQgqAADAWAQVAABgLAbTAmgR3OUXQENQUQEAAMYiqAAAAGMRVAAAgLEYowI0EmMrAOD8oaICAACMRVABAADG4tQPAAAeJq7WvbPsjIoKAAAwVosGlYyMDImLi5PAwEBJSEiQ7du3t2RzAACAYVosqLz11lsyefJkefzxx2XXrl3SpUsXSU5OluLi4pZqEgAAMEyLBZXnn39eRo8eLffff7906tRJMjMz5cILL5TXXnutpZoEAAAM0yKDaU+ePCl5eXmSlpbmXOfr6ytJSUmSm5t72v6VlZV6cSgtLdVfy8rKmqV91ZU/O79vzGvUfF5t7mpr7ddo6nHddRx3HbOpP3N3tedc9m3o8850nKa+Xu3nnqkfZztOU1/jfGjq69f+ubqr7Q1tT3O9vsnOR5+98efaEprjb6zjmJZlNfxJVgs4fPiwaqG1detWl/VTpkyxevbsedr+jz/+uN6fhZ8B7wHeA7wHeA/wHhCP/xkUFhY2ODN4xPRkVXlR41kcqqur5dixYxIeHi4+Pj5iFyppxsTESGFhoQQHB4vdeUt/vaWf3txnb+mvt/TTG/tbdp76qiopx48fl+jo6AY/p0WCysUXXywXXHCBHD161GW9ehwVFXXa/gEBAXqpKTQ0VOxKvUns/p/CG/vrLf305j57S3+9pZ/e2N/g89DXkJAQ8wfTtmrVSrp16yY5OTkuVRL1ODExsSWaBA
AADNRip37UqZxRo0ZJ9+7dpWfPnjJ//nwpLy/Xs4AAAABaNKjcdddd8sMPP8jMmTOlqKhIunbtKtnZ2RIZGem1/zLq9Ja6rkzt01x25S399ZZ+enOfvaW/3tJPb+xvgMF99VEjalu6EQAAAHXhXj8AAMBYBBUAAGAsggoAADAWQQUAABiLoHIW6enp0qNHD2nbtq1ERETIsGHDJD8/32WfiooKGTt2rL5Sbps2bSQlJcXlYnafffaZ3H333fqqf61bt5b4+Hh58cUXXY6xadMmfZXd2ouaEXUmaiy0mjnVrl07fWx1v6T9+/e77BMXF3facefMmWPb/irr16+XhIQEvc9FF12k+2G3vtZ3XLXs2LHDdv1V9u3bJ0OHDtUXjVQXperdu7d8+OGHdR7PDv1Vd5b/9a9/rS9wqdo4ZswYOXHihEf185133pFbb73VeSXx3bt3n7bP2dpXmx36/Morr0ifPn30+1jtU1JSYrt+Hjt2TMaPHy9XXXWVfu3Y2FiZMGGC8359DeaOe/fYWXJysrV06VLryy+/tHbv3m0NHDjQio2NtU6cOOHc58EHH7RiYmKsnJwca+fOnVavXr2sG264wbl9yZIl1oQJE6xNmzZZ33zzjfXGG29YrVu3thYuXOjc58MPP9T3P8jPz7e+//575/LLL7+csX1z5syxQkJCrDVr1lifffaZNWTIEKt9+/bWf//7X+c+l19+uTVr1iyX49Zsv936+9e//tW66KKLrMWLF+vj79mzx3rrrbds19fKykqX46nlD3/4g96nurradv1VOnTooNuttu/bt896+OGHrQsvvFAf3279VfdEU+9j1cavv/7a2r59u25bSkqKR/Vz+fLl1pNPPmm9+uqr+vmffvrpafucrX212aHPL7zwgpWenq4Xtc9PP/1ku35+8cUX1vDhw621a9daBw4c0G1U/4drv4fPhqDSSMXFxfofZPPmzfpxSUmJ5e/vb61atcq5z969e/U+ubm59R5H/YLt27fvaW+Uut6s9VF/jKKioqznnnvOuU61JyAgwHrzzTddgor6T+EN/a2qqrIuvfRS689//rPt+1rbyZMnrUsuuUSHUjv294cfftDH3bJli3OfsrIyvW7Dhg226+/LL79sRUREuPyx+Pzzz/Vr7d+/3yP6WVNBQUGdf8ya2j5P7nNNjXmNYg/up8Pbb79ttWrVSv+ubihO/TSSo2QVFhamv+bl5UlVVZUu2zp07NhRl7hyc3PPeBzHMWpSF75TpWBV7v3444/P2JaCggJdmqv52uoeCuqUR+3XVqd6VHnuuuuuk+eee05OnTply/6qUvnhw4fF19dX91Ude8CAAfLll1/arq+1rV27Vn788ccGX93Z0/qr3r+qhLx8+XJ9FWv1Hn755Zd1SVzdksNu/a2srNS3G1HvZQdVPlc++ugjj+hnQzS1fbXb6kl9bqpSG/RTvbY63eXn1/DrzRJUGkHdj2jixIly4403yjXXXKPXqV826pdJ7Zskqivs1nd+b+vWrfLWW2/p880O6s2RmZkpf/vb3/Sizieq85fqD299HMevfTXf2q+tzgmuXLlSn8t/4IEH5JlnnpGpU6fasr/ffvut/vrEE0/I9OnTZd26dXqMijq2Ol9qp77WtmTJEklOTpbLLrus3uN6cn/VOfAPPvhAPv30U33OPjAwUJ5//nl9RWv1b2y3/t5yyy36e/XB4uTJk/LTTz/JtGnT9Lbvv//eI/rZEE1pn6f3uSmqbdDP//znP/LUU0+5vHaDNKnO46XUuUB1GqWwsNC5bsWKFbqMVVuPHj2sqVOnnrZenbO7+OKLraeeeuqsr3fTTTdZ99xzj/7+L3/5ixUUFORcVPn7448/1uW2I0eOuDzvjjvusO688856j6vOWfr5+VkVFRW2669qn9pHlc0dVD9VGzIzM23V15pUu319ffX4nIbwxP6q0yVqHMeAAQOsjz76yMrLy7Meeughfaqv9vPs0F9HGyMjI6
0LLrhAt/WPf/yjfqzGt3hCPxtyeqCx7bNDn2tq6GmXBz28n6WlpVbPnj2t/v3769PUjdFi9/rxNOPGjdOfzrds2eLyiTUqKkp/2lEjtmumWjXqWm2r6auvvpJ+/frpNKk+7Z+Nulmjo8Q7ZMgQXRZ2uPTSS52fqtRrqURc87VVCa8+6jiqbH7w4EFdSrdTfx3rO3Xq5Nyu7l3xq1/9Sg4dOmSrvta0dOlSfWpEHetsPLW/Gzdu1O1WlQXHbegXLVokGzZskNdff91ZbbBLf5Xf/va3elHrg4KCdFVJVZHU+9kT+tkQjWmfXfrcWOM8vJ/Hjx+X/v3760ro6tWrxd/fv1HPp6JyFupT3NixY63o6Gg9y6A2x2Cmmp9k1Qj92oOZ1KhtNTBuypQpDU6RSUlJ1m233XbWAXnz5s1zSa1nGnDpSMfq0/exY8ds11/H45qDaVV6V22pWWWxQ19r7qtmizz66KNnfE1P76+aOaDet8ePH3d57pVXXmk9/fTTtutvfdVQNcup5qdvk/vZmMG0Z2tfTZ7e54ZWVOzQz9LSUj0T6eabb7bKy8utpiConIUqLasphGpqV81pWz///LNLSU5NGdu4caOeHpaYmKiXmuU2NRtDldFqHkON4HZQs3LUNEU1ml/t/8gjj+hfyh988MEZ26dKwKGhoda7776rZwQMHTrUZYrj1q1b9bHV1DY1NU2FFNWW++67z5b9VdSx1OmA999/X/+nTU1N1f9JawczO/RVUcdRvyTUaP8z8fT+qlk/4eHherqjej+rqZTqVIj6Ra0e262/ippCqk5xqb6+9NJLelrpiy++6FH9/PHHH/UfsPXr1+v36cqVK/XjmlPKz9a+2uzQZ/W9WueY2qtOp6jH6rl26acKKQkJCVbnzp319OSar3/q1CmroQgqZ/sBidS5qLntDuoXi5rupa55oD7tqBRa8w35+OOP13kMdb7RYe7cudYVV1xhBQYGWmFhYVafPn30G+9sVOKeMWOGPm+tPo3169dP/1JzUL/k1BtFvdnVsePj461nnnmm3vEpnt5fRwVFVRdUOGnbtq3+ZKA+Udixr8rdd999xmtO2Km/O3bssG699VZ9XPVvqz6p/f3vf7dtf++99159TDUO4dprr9XXrfC0fqp21HVs9ZoNbZ8d+1zf69fsg6f388P/qxbVtagqTEP5/N8PAwAAwDhMTwYAAMYiqAAAAGMRVAAAgLEIKgAAwFgEFQAAYCyCCgAAMBZBBQAAGIugAgAAjEVQAQAAxiKoAAAAYxFUAACAsQgqAABATPU/H9NeMT76UNYAAAAASUVORK5CYII=",
|
| 108 |
+
"text/plain": [
|
| 109 |
+
"<Figure size 640x480 with 1 Axes>"
|
| 110 |
+
]
|
| 111 |
+
},
|
| 112 |
+
"metadata": {},
|
| 113 |
+
"output_type": "display_data"
|
| 114 |
+
}
|
| 115 |
+
],
|
| 116 |
+
"source": [
|
| 117 |
+
"import matplotlib.pyplot as plt\n",
|
| 118 |
+
"\n",
|
| 119 |
+
"plt.hist(data[\"message_dt\"], bins=data[\"message_dt\"].nunique());"
|
| 120 |
+
]
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"cell_type": "code",
|
| 124 |
+
"execution_count": 18,
|
| 125 |
+
"id": "b144db75",
|
| 126 |
+
"metadata": {},
|
| 127 |
+
"outputs": [
|
| 128 |
+
{
|
| 129 |
+
"data": {
|
| 130 |
+
"text/html": [
|
| 131 |
+
"<div>\n",
|
| 132 |
+
"<style scoped>\n",
|
| 133 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 134 |
+
" vertical-align: middle;\n",
|
| 135 |
+
" }\n",
|
| 136 |
+
"\n",
|
| 137 |
+
" .dataframe tbody tr th {\n",
|
| 138 |
+
" vertical-align: top;\n",
|
| 139 |
+
" }\n",
|
| 140 |
+
"\n",
|
| 141 |
+
" .dataframe thead th {\n",
|
| 142 |
+
" text-align: right;\n",
|
| 143 |
+
" }\n",
|
| 144 |
+
"</style>\n",
|
| 145 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 146 |
+
" <thead>\n",
|
| 147 |
+
" <tr style=\"text-align: right;\">\n",
|
| 148 |
+
" <th></th>\n",
|
| 149 |
+
" <th>message_dt</th>\n",
|
| 150 |
+
" <th>message_id</th>\n",
|
| 151 |
+
" <th>channel_id</th>\n",
|
| 152 |
+
" <th>content</th>\n",
|
| 153 |
+
" <th>views</th>\n",
|
| 154 |
+
" <th>original_author</th>\n",
|
| 155 |
+
" </tr>\n",
|
| 156 |
+
" </thead>\n",
|
| 157 |
+
" <tbody>\n",
|
| 158 |
+
" </tbody>\n",
|
| 159 |
+
"</table>\n",
|
| 160 |
+
"</div>"
|
| 161 |
+
],
|
| 162 |
+
"text/plain": [
|
| 163 |
+
"Empty DataFrame\n",
|
| 164 |
+
"Columns: [message_dt, message_id, channel_id, content, views, original_author]\n",
|
| 165 |
+
"Index: []"
|
| 166 |
+
]
|
| 167 |
+
},
|
| 168 |
+
"execution_count": 18,
|
| 169 |
+
"metadata": {},
|
| 170 |
+
"output_type": "execute_result"
|
| 171 |
+
}
|
| 172 |
+
],
|
| 173 |
+
"source": [
|
| 174 |
+
"data.loc[data[\"message_dt\"] == pd.to_datetime(\"2025-04-20\")]"
|
| 175 |
+
]
|
| 176 |
+
},
|
| 177 |
+
{
|
| 178 |
+
"cell_type": "code",
|
| 179 |
+
"execution_count": null,
|
| 180 |
+
"id": "396744bd",
|
| 181 |
+
"metadata": {},
|
| 182 |
+
"outputs": [],
|
| 183 |
+
"source": [
|
| 184 |
+
"questions = [\n",
|
| 185 |
+
" \"Как повел себя российский рынок 13 марта 2025 года?\",\n",
|
| 186 |
+
" \"Как повел себя российский рынок после слов Путина о соглашении прекратить боевые действия на Украине?\",\n",
|
| 187 |
+
"]\n",
|
| 188 |
+
"\n",
|
| 189 |
+
"answers = [\n",
|
| 190 |
+
" \"\"\n",
|
| 191 |
+
"]\n",
|
| 192 |
+
"\n",
|
| 193 |
+
"links_to_answers = [\n",
|
| 194 |
+
" \"https://t.me/rbc_news/113818\",\n",
|
| 195 |
+
" \"https://t.me/rbc_news/113818\",\n",
|
| 196 |
+
" \n",
|
| 197 |
+
"]"
|
| 198 |
+
]
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
"cell_type": "code",
|
| 202 |
+
"execution_count": null,
|
| 203 |
+
"id": "f15ce769",
|
| 204 |
+
"metadata": {},
|
| 205 |
+
"outputs": [],
|
| 206 |
+
"source": [
|
| 207 |
+
"# Установка зависимостей (если нужно)\n",
|
| 208 |
+
"# !pip install openai pydantic python-dotenv\n"
|
| 209 |
+
]
|
| 210 |
+
},
|
| 211 |
+
{
|
| 212 |
+
"cell_type": "code",
|
| 213 |
+
"execution_count": null,
|
| 214 |
+
"id": "98abb469",
|
| 215 |
+
"metadata": {},
|
| 216 |
+
"outputs": [],
|
| 217 |
+
"source": [
|
| 218 |
+
"import os\n",
|
| 219 |
+
"from dotenv import load_dotenv\n",
|
| 220 |
+
"from openai import OpenAI\n",
|
| 221 |
+
"from pydantic import BaseModel, Field\n",
|
| 222 |
+
"from typing import Literal\n",
|
| 223 |
+
"import json\n",
|
| 224 |
+
"\n",
|
| 225 |
+
"load_dotenv()\n",
|
| 226 |
+
"\n",
|
| 227 |
+
"# Инициализация клиента OpenRouter\n",
|
| 228 |
+
"# Получите API ключ на https://openrouter.ai/\n",
|
| 229 |
+
"OPENROUTER_API_KEY = os.getenv(\"OPENROUTER_API_KEY\")\n",
|
| 230 |
+
"OPENROUTER_BASE_URL = \"https://openrouter.ai/api/v1\"\n",
|
| 231 |
+
"\n",
|
| 232 |
+
"client = OpenAI(\n",
|
| 233 |
+
" base_url=OPENROUTER_BASE_URL,\n",
|
| 234 |
+
" api_key=OPENROUTER_API_KEY,\n",
|
| 235 |
+
")\n"
|
| 236 |
+
]
|
| 237 |
+
},
|
| 238 |
+
{
|
| 239 |
+
"cell_type": "code",
|
| 240 |
+
"execution_count": null,
|
| 241 |
+
"id": "59c6a4e5",
|
| 242 |
+
"metadata": {},
|
| 243 |
+
"outputs": [],
|
| 244 |
+
"source": [
|
| 245 |
+
"# Определение структурированных моделей для вывода\n",
|
| 246 |
+
"\n",
|
| 247 |
+
"class MainMessage(BaseModel):\n",
|
| 248 |
+
" \"\"\"Основная мысль/сообщение новостного поста\"\"\"\n",
|
| 249 |
+
" main_topic: str = Field(\n",
|
| 250 |
+
" description=\"Основная тема или предмет новостного поста (например: 'Выпуск iPhone 17', 'Высказывание политика А о политике Б')\"\n",
|
| 251 |
+
" )\n",
|
| 252 |
+
" key_entities: list[str] = Field(\n",
|
| 253 |
+
" description=\"Ключевые сущности, упомянутые в посте (люди, организации, события, даты)\"\n",
|
| 254 |
+
" )\n",
|
| 255 |
+
" main_fact_or_statement: str = Field(\n",
|
| 256 |
+
" description=\"Основной факт или утверждение, содержащееся в посте\"\n",
|
| 257 |
+
" )\n",
|
| 258 |
+
" context: str = Field(\n",
|
| 259 |
+
" description=\"Дополнительный контекст, необходимый для понимания основной мысли\"\n",
|
| 260 |
+
" )\n",
|
| 261 |
+
"\n",
|
| 262 |
+
"\n",
|
| 263 |
+
"class ClassificationResult(BaseModel):\n",
|
| 264 |
+
" \"\"\"Результат классификации новостного поста\"\"\"\n",
|
| 265 |
+
" is_unambiguous: bool = Field(\n",
|
| 266 |
+
" description=\"Является ли основная тема поста однозначной при поиске. True - однозначная (факт), False - неоднозначная (могут быть противоречивые ответы)\"\n",
|
| 267 |
+
" )\n",
|
| 268 |
+
" confidence: float = Field(\n",
|
| 269 |
+
" description=\"Уверенность в классификации от 0.0 до 1.0\",\n",
|
| 270 |
+
" ge=0.0,\n",
|
| 271 |
+
" le=1.0\n",
|
| 272 |
+
" )\n",
|
| 273 |
+
" reasoning: str = Field(\n",
|
| 274 |
+
" description=\"Обоснование классификации: почему пост считается однозначным или неоднозначным\"\n",
|
| 275 |
+
" )\n",
|
| 276 |
+
" search_difficulty: Literal[\"easy\", \"medium\", \"hard\"] = Field(\n",
|
| 277 |
+
" description=\"Сложность поиска: easy - простой факт, medium - требует контекста, hard - неоднозначный, может иметь противоречивые ответы\"\n",
|
| 278 |
+
" )\n"
|
| 279 |
+
]
|
| 280 |
+
},
|
| 281 |
+
{
|
| 282 |
+
"cell_type": "code",
|
| 283 |
+
"execution_count": null,
|
| 284 |
+
"id": "70a3b05e",
|
| 285 |
+
"metadata": {},
|
| 286 |
+
"outputs": [],
|
| 287 |
+
"source": [
|
| 288 |
+
"\n"
|
| 289 |
+
]
|
| 290 |
+
},
|
| 291 |
+
{
|
| 292 |
+
"cell_type": "code",
|
| 293 |
+
"execution_count": null,
|
| 294 |
+
"id": "bb6a49ed",
|
| 295 |
+
"metadata": {},
|
| 296 |
+
"outputs": [],
|
| 297 |
+
"source": [
|
| 298 |
+
"\n"
|
| 299 |
+
]
|
| 300 |
+
},
|
| 301 |
+
{
|
| 302 |
+
"cell_type": "code",
|
| 303 |
+
"execution_count": null,
|
| 304 |
+
"id": "f972cb96",
|
| 305 |
+
"metadata": {},
|
| 306 |
+
"outputs": [],
|
| 307 |
+
"source": []
|
| 308 |
+
}
|
| 309 |
+
],
|
| 310 |
+
"metadata": {
|
| 311 |
+
"kernelspec": {
|
| 312 |
+
"display_name": "venv",
|
| 313 |
+
"language": "python",
|
| 314 |
+
"name": "python3"
|
| 315 |
+
},
|
| 316 |
+
"language_info": {
|
| 317 |
+
"codemirror_mode": {
|
| 318 |
+
"name": "ipython",
|
| 319 |
+
"version": 3
|
| 320 |
+
},
|
| 321 |
+
"file_extension": ".py",
|
| 322 |
+
"mimetype": "text/x-python",
|
| 323 |
+
"name": "python",
|
| 324 |
+
"nbconvert_exporter": "python",
|
| 325 |
+
"pygments_lexer": "ipython3",
|
| 326 |
+
"version": "3.13.3"
|
| 327 |
+
}
|
| 328 |
+
},
|
| 329 |
+
"nbformat": 4,
|
| 330 |
+
"nbformat_minor": 5
|
| 331 |
+
}
|
env.example.txt
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Telegram API credentials (для парсинга новостей)
|
| 2 |
+
TELEGRAM_API_ID=your_api_id_here
|
| 3 |
+
TELEGRAM_API_HASH=your_api_hash_here
|
| 4 |
+
|
| 5 |
+
# PostgreSQL Database credentials
|
| 6 |
+
DB_USER=your_db_user
|
| 7 |
+
DB_PASS=your_db_password
|
| 8 |
+
DB_HOST=your_db_host
|
| 9 |
+
DB_PORT=5432
|
| 10 |
+
DB_NAME=your_db_name
|
| 11 |
+
PATH_TO_CERT=/path/to/ssl/cert.pem
|
| 12 |
+
|
| 13 |
+
# Qdrant Vector Database
|
| 14 |
+
QDRANT_URL=http://localhost:6333
|
| 15 |
+
|
| 16 |
+
# LLM API Key (OpenRouter)
|
| 17 |
+
OPENROUTER_API_KEY=your_openrouter_api_key_here
|
| 18 |
+
|
| 19 |
+
# Chat settings
|
| 20 |
+
CHAT_HISTORY_LENGTH=3 # Количество предыдущих пар (вопрос, ответ) для контекста диалога
|
| 21 |
+
|
| 22 |
+
# Backend URL (для frontend)
|
| 23 |
+
BACKEND_URL=http://localhost:8000
|
frontend.py
ADDED
|
@@ -0,0 +1,404 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Streamlit Frontend для RAG вопросно-ответной системы
|
| 3 |
+
Чат-интерфейс с поддержкой нескольких диалогов
|
| 4 |
+
"""
|
| 5 |
+
import streamlit as st
|
| 6 |
+
from datetime import datetime, timedelta
|
| 7 |
+
from typing import List, Dict, Optional
|
| 8 |
+
import uuid
|
| 9 |
+
|
| 10 |
+
from src import RAG
|
| 11 |
+
from src.db_utils.history_utils import (
|
| 12 |
+
init_history_table,
|
| 13 |
+
log_query,
|
| 14 |
+
get_all_history,
|
| 15 |
+
get_history_by_dialogue,
|
| 16 |
+
search_history,
|
| 17 |
+
get_history_stats,
|
| 18 |
+
delete_history,
|
| 19 |
+
get_recent_dialogues
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# --- Инициализация RAG ---
|
| 24 |
+
@st.cache_resource(show_spinner=False)
def get_rag():
    """Build the RAG pipeline object.

    The function is wrapped in ``st.cache_resource``, so Streamlit reuses the
    returned instance instead of constructing it again on each call.

    Returns:
        The ``RAG`` instance configured with the embedding model and index
        names listed below.
    """
    rag_settings = {
        "embed_model_name": "Qwen/Qwen3-Embedding-0.6B",
        "embed_index_name": "recursive_Qwen3-Embedding-0.6B",
    }
    return RAG(**rag_settings)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# --- Session State Management ---
|
| 34 |
+
def init_session_state():
    """Seed ``st.session_state`` with the chat-related keys that are missing.

    Keys that already exist keep their current value; only absent keys are
    initialised.
    """
    # Factories rather than shared instances, so every initialisation gets
    # its own fresh list / dict.
    initial_values = (
        ("current_dialogue_id", lambda: None),
        ("chat_list", list),
        ("current_chat_messages", list),
        ("chat_names", dict),  # {dialogue_id: custom_name}
        ("chats_loaded", lambda: False),
    )
    for state_key, make_default in initial_values:
        if state_key not in st.session_state:
            st.session_state[state_key] = make_default()
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def generate_dialogue_id() -> str:
    """Return a new dialogue ID shaped like ``chat_<YYYYmmdd_HHMMSS>_<6 hex chars>``.

    The timestamp part comes from the local clock and the suffix from a
    random UUID4, so two IDs created within the same second still differ.
    """
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    random_suffix = uuid.uuid4().hex[:6]
    return "_".join(("chat", stamp, random_suffix))
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def get_chat_display_name(dialogue_id: str, first_query: Optional[str] = None) -> str:
    """Return the label shown for a chat.

    Resolution order:
      1. a name already stored in ``st.session_state.chat_names``;
      2. otherwise, a name derived from ``first_query`` (stored for next time);
      3. otherwise, the generic "new dialogue" placeholder.

    Args:
        dialogue_id: Identifier of the chat.
        first_query: Text of the chat's first user query, if known.

    Returns:
        The display name for the chat.
    """
    if dialogue_id in st.session_state.chat_names:
        return st.session_state.chat_names[dialogue_id]

    if first_query:
        # Use the first 40 characters of the first query as the name,
        # marking truncation with an ellipsis.
        name = first_query[:40] + "..." if len(first_query) > 40 else first_query
        st.session_state.chat_names[dialogue_id] = name
        return name

    return "Новый диалог"
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# --- Chat Management Functions ---
|
| 68 |
+
|
| 69 |
+
def load_chats_list():
    """Refresh the chat list kept in session state.

    Stores the result of ``get_recent_dialogues(limit=50)`` and flags the list
    as loaded. When no chat is currently selected and at least one dialogue
    was returned, the first returned dialogue is opened. On any error an
    error message is shown and the list is reset to empty.
    """
    try:
        recent = get_recent_dialogues(limit=50)
        st.session_state.chat_list = recent
        st.session_state.chats_loaded = True

        # Nothing selected yet: fall back to the first available chat.
        nothing_selected = not st.session_state.current_dialogue_id
        if nothing_selected and recent:
            switch_to_chat(recent[0]["dialogue_id"])
    except Exception as e:
        st.error(f"❌ Ошибка при загрузке чатов: {e}")
        st.session_state.chat_list = []
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def create_new_chat():
    """Start an empty chat and make it the current one.

    Returns:
        The newly generated dialogue ID.
    """
    fresh_id = generate_dialogue_id()
    st.session_state.current_chat_messages = []
    st.session_state.current_dialogue_id = fresh_id
    return fresh_id
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def switch_to_chat(dialogue_id: str):
    """Make ``dialogue_id`` the current chat and load its messages.

    Args:
        dialogue_id: Identifier of the chat to open.
    """
    st.session_state["current_dialogue_id"] = dialogue_id
    load_chat_messages(dialogue_id)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def load_chat_messages(dialogue_id: str):
    """Load the history of one chat into ``st.session_state.current_chat_messages``.

    If fetching the history fails, an error message is shown and the current
    message list is reset to empty.

    Args:
        dialogue_id: Identifier of the chat whose messages are loaded.
    """
    try:
        messages = get_history_by_dialogue(dialogue_id)
    except Exception as e:
        st.error(f"❌ Ошибка при загрузке сообщений: {e}")
        messages = []
    st.session_state.current_chat_messages = messages
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def send_message(query: str) -> Optional[Dict]:
    """Send ``query`` in the current chat and record the exchange.

    A chat is created first if none is selected. The query is passed to the
    RAG object together with the current chat messages, the exchange is
    logged, and both the message list and the chat list are reloaded.

    Args:
        query: The user's question.

    Returns:
        The RAG result with an added ``"query_id"`` entry, or ``None`` if any
        step raised (an error message is shown in that case).
    """
    try:
        # Lazily open a chat when the user types without selecting one.
        if not st.session_state.current_dialogue_id:
            create_new_chat()

        # The whole current history is handed over; per the original note the
        # RAG side is expected to use only the last N messages for enrichment.
        prior_turns = st.session_state.current_chat_messages
        response = get_rag().invoke(query, history=prior_turns)

        # Persist the exchange and attach the ID of the logged row.
        active_id = st.session_state.current_dialogue_id
        response["query_id"] = log_query(
            query=query,
            answer=response.get("answer", ""),
            reason=response.get("reason", ""),
            dialogue_id=active_id,
        )

        # Refresh the visible messages, then the sidebar list.
        load_chat_messages(active_id)
        load_chats_list()

        return response
    except Exception as e:
        st.error(f"❌ Ошибка при отправке сообщения: {e}")
        return None
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def delete_chat(dialogue_id: str) -> bool:
    """Delete a chat's history and refresh the chat list.

    If the deleted chat was the current one, the current selection and its
    messages are cleared before the list is reloaded.

    Args:
        dialogue_id: Identifier of the chat to delete.

    Returns:
        ``True`` on success, ``False`` if any step raised (an error message
        is shown in that case).
    """
    try:
        delete_history(dialogue_id=dialogue_id)

        # Deleting the open chat leaves nothing selected.
        was_active = st.session_state.current_dialogue_id == dialogue_id
        if was_active:
            st.session_state.current_dialogue_id = None
            st.session_state.current_chat_messages = []

        load_chats_list()
    except Exception as e:
        st.error(f"❌ Ошибка при удалении чата: {e}")
        return False
    return True
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
# --- Page: Chat Interface ---
|
| 164 |
+
def page_chat():
    """Render the main chat page.

    Injects the page CSS/JS, then either shows a welcome screen (no chat
    selected) or the current chat: header with a delete button, the message
    history, and the input box. A submitted query is sent through
    ``send_message`` and the page is rerun on success.
    """

    # Custom CSS to pin the chat input to the bottom + keyboard shortcuts.
    # NOTE(review): the <script> part may not be executed by the browser when
    # injected through st.markdown — confirm it has any effect.
    st.markdown("""
    <style>
    /* Fix chat input at the bottom of main content area */
    section[data-testid="stSidebar"] ~ div .stChatInput {
        position: fixed;
        bottom: 0;
        background: white;
        padding: 1rem;
        z-index: 999;
        border-top: 1px solid #e6e6e6;
        margin-left: 0;
    }

    /* Add padding to main content to prevent overlap with fixed input */
    .main .block-container {
        padding-bottom: 100px;
    }

    /* Dark mode support */
    [data-testid="stAppViewContainer"][data-theme="dark"] section[data-testid="stSidebar"] ~ div .stChatInput {
        background: rgb(14, 17, 23);
        border-top: 1px solid #333;
    }

    /* Adjust width to account for sidebar */
    @media (min-width: 768px) {
        section[data-testid="stSidebar"] ~ div .stChatInput {
            left: var(--sidebar-width, 21rem);
            right: 0;
        }
    }

    /* When sidebar is collapsed */
    section[data-testid="stSidebar"][aria-expanded="false"] ~ div .stChatInput {
        left: 0;
    }
    </style>

    <script>
    // Add keyboard shortcuts support
    document.addEventListener('DOMContentLoaded', function() {
        // Find chat input field
        const observer = new MutationObserver(function(mutations) {
            const chatInput = document.querySelector('textarea[data-testid="stChatInput"]');
            if (chatInput && !chatInput.hasAttribute('data-shortcut-attached')) {
                chatInput.setAttribute('data-shortcut-attached', 'true');

                // Add keyboard event listener
                chatInput.addEventListener('keydown', function(e) {
                    // Enter (without Shift) - send message
                    if (e.key === 'Enter' && !e.shiftKey) {
                        e.preventDefault();
                        // Trigger the send button
                        const sendButton = document.querySelector('button[kind="primary"]');
                        if (sendButton) {
                            sendButton.click();
                        }
                    }
                    // Ctrl+Enter or Cmd+Enter - send message (alternative)
                    else if (e.key === 'Enter' && (e.ctrlKey || e.metaKey)) {
                        e.preventDefault();
                        const sendButton = document.querySelector('button[kind="primary"]');
                        if (sendButton) {
                            sendButton.click();
                        }
                    }
                    // Shift+Enter - new line (default behavior)
                });
            }
        });

        observer.observe(document.body, {
            childList: true,
            subtree: true
        });
    });
    </script>
    """, unsafe_allow_html=True)

    # No chat selected: show the welcome screen and stop.
    if not st.session_state.current_dialogue_id:
        st.title("💬 Чат с RAG системой")
        st.markdown("---")

        col1, col2, col3 = st.columns([1, 2, 1])
        with col2:
            st.info("👋 Добро пожаловать! Создайте новый чат или выберите существующий из списка слева.")

            if st.button("🆕 Начать новый чат", type="primary", use_container_width=True):
                create_new_chat()
                st.rerun()

        return

    # Chat header: the name is derived from the first query when one exists.
    if st.session_state.current_chat_messages:
        chat_name = get_chat_display_name(
            st.session_state.current_dialogue_id,
            st.session_state.current_chat_messages[0]["query"]
        )
    else:
        chat_name = "Новый диалог"

    col1, col2 = st.columns([4, 1])
    with col1:
        st.title(f"💬 {chat_name}")
    with col2:
        if st.button("🗑️ Удалить чат", use_container_width=True):
            if delete_chat(st.session_state.current_dialogue_id):
                st.success("✅ Чат удален")
                st.rerun()

    st.markdown("---")

    # Chat messages container
    if not st.session_state.current_chat_messages:
        st.info("📝 Начните диалог, задав первый вопрос ниже")
    else:
        for msg in st.session_state.current_chat_messages:
            # User message
            with st.chat_message("user"):
                st.markdown(msg["query"])
                timestamp_str = msg.get("timestamp", "")
                try:
                    dt = datetime.fromisoformat(timestamp_str)
                except (TypeError, ValueError):
                    # Missing, non-string or malformed timestamp: the time
                    # caption is optional, so just omit it.
                    pass
                else:
                    st.caption(f"🕐 {dt.strftime('%H:%M:%S')}")

            # Assistant message
            with st.chat_message("assistant"):
                st.markdown(msg["answer"])

                # Show reasoning in an expander when present
                if msg.get("reason"):
                    with st.expander("📝 Обоснование"):
                        st.markdown(msg["reason"])

    # Input area - pinned to the bottom via the CSS above
    query = st.chat_input(
        "Введите ваш вопрос...",
        key="chat_input"
    )

    if query:
        # Send message and get response
        with st.spinner("🤔 Думаю..."):
            result = send_message(query)

        # Rerun so the refreshed history is rendered from session state.
        if result:
            st.rerun()
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
# --- Main App ---
|
| 325 |
+
def main():
    """Entry point of the Streamlit app.

    Configures the page, initialises the history table and session state,
    loads the chat list once per session, renders the sidebar (new-chat
    button and the list of chats) and finally the chat page itself.
    """
    st.set_page_config(
        page_title="RAG Chat System",
        page_icon="💬",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    # Initialize history table on startup; a failure is reported but does not
    # stop the app.
    try:
        init_history_table()
    except Exception as e:
        st.error(f"⚠️ Не удалось инициализировать таблицу истории: {e}")

    # Initialize session state
    init_session_state()

    # Load chats list if not loaded
    if not st.session_state.chats_loaded:
        load_chats_list()

    # Sidebar
    with st.sidebar:
        st.title("💬 RAG Chat")

        # New chat button
        if st.button("➕ Новый чат", use_container_width=True, type="primary"):
            create_new_chat()
            st.rerun()

        st.markdown("---")

        # Chats list
        st.subheader("📝 Ваши чаты")

        if not st.session_state.chat_list:
            st.info("Нет чатов. Создайте новый!")
        else:
            for chat in st.session_state.chat_list:
                dialogue_id = chat["dialogue_id"]
                message_count = chat.get("message_count", 0)
                started_at = chat.get("started_at", "")

                # The first query is only needed to derive a name that is not
                # cached yet; get_chat_display_name returns a cached name
                # regardless of first_query, so skip the history lookup then.
                first_query = None
                if message_count > 0 and dialogue_id not in st.session_state.chat_names:
                    history = get_history_by_dialogue(dialogue_id)
                    first_query = history[0]["query"] if history else None
                chat_name = get_chat_display_name(dialogue_id, first_query)

                # Format time; a missing or malformed value yields no time.
                try:
                    time_str = datetime.fromisoformat(started_at).strftime('%d.%m %H:%M')
                except (TypeError, ValueError):
                    time_str = ""

                # Check if this is current chat
                is_current = dialogue_id == st.session_state.current_dialogue_id

                # Format button text with chat name and metadata
                button_text = f"{'📌' if is_current else '💬'} {chat_name}\n💬 {message_count} • {time_str}"

                if st.button(
                    button_text,
                    key=f"chat_{dialogue_id}",
                    use_container_width=True,
                    type="primary" if is_current else "secondary"
                ):
                    switch_to_chat(dialogue_id)
                    st.rerun()

    # Main content area
    page_chat()
|
| 401 |
+
|
| 402 |
+
|
| 403 |
+
if __name__ == "__main__":
|
| 404 |
+
main()
|
news_classification_langgraph.ipynb
ADDED
|
@@ -0,0 +1,697 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# Пайплайн классификации новостных постов (LangGraph)\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"Этот пайплайн реализован с использованием **LangGraph** и состоит из двух узлов:\n",
|
| 10 |
+
"1. **Узел извлечения** - вычленяет основную мысль/сообщение из новостного поста\n",
|
| 11 |
+
"2. **Узел классификации** - определяет, является ли основная тема однозначной при поиске\n",
|
| 12 |
+
"\n",
|
| 13 |
+
"LangGraph позволяет строить графовые структуры агентов с явным управлением состоянием и потоком данных.\n"
|
| 14 |
+
]
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"cell_type": "code",
|
| 18 |
+
"execution_count": null,
|
| 19 |
+
"metadata": {},
|
| 20 |
+
"outputs": [],
|
| 21 |
+
"source": [
|
| 22 |
+
"# Установка зависимостей (раскомментируйте при необходимости)\n",
|
| 23 |
+
"# !pip install langgraph langchain langchain-openai pydantic python-dotenv pandas tqdm\n"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"cell_type": "code",
|
| 28 |
+
"execution_count": null,
|
| 29 |
+
"metadata": {},
|
| 30 |
+
"outputs": [],
|
| 31 |
+
"source": [
|
| 32 |
+
"import os\n",
|
| 33 |
+
"from typing import Literal, Optional, TypedDict, Annotated\n",
|
| 34 |
+
"from dotenv import load_dotenv\n",
|
| 35 |
+
"from pydantic import BaseModel, Field\n",
|
| 36 |
+
"from langchain_openai import ChatOpenAI\n",
|
| 37 |
+
"from langchain_core.prompts import ChatPromptTemplate\n",
|
| 38 |
+
"from langchain_core.output_parsers import PydanticOutputParser\n",
|
| 39 |
+
"from langgraph.graph import StateGraph, END\n",
|
| 40 |
+
"import pandas as pd\n",
|
| 41 |
+
"from tqdm import tqdm\n",
|
| 42 |
+
"import operator\n",
|
| 43 |
+
"\n",
|
| 44 |
+
"load_dotenv()\n",
|
| 45 |
+
"\n",
|
| 46 |
+
"# Проверка наличия API ключа\n",
|
| 47 |
+
"OPENROUTER_API_KEY = os.getenv(\"OPENROUTER_API_KEY\")\n",
|
| 48 |
+
"if not OPENROUTER_API_KEY:\n",
|
| 49 |
+
" raise ValueError(\"Не найден OPENROUTER_API_KEY в переменных окружения\")\n",
|
| 50 |
+
"\n",
|
| 51 |
+
"print(\"✅ API ключ загружен\")\n"
|
| 52 |
+
]
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"cell_type": "markdown",
|
| 56 |
+
"metadata": {},
|
| 57 |
+
"source": [
|
| 58 |
+
"## Определение структурированных моделей вывода\n"
|
| 59 |
+
]
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"cell_type": "code",
|
| 63 |
+
"execution_count": null,
|
| 64 |
+
"metadata": {},
|
| 65 |
+
"outputs": [],
|
| 66 |
+
"source": [
|
| 67 |
+
"class MainMessage(BaseModel):\n",
|
| 68 |
+
" \"\"\"Основная мысль/сообщение новостного поста\"\"\"\n",
|
| 69 |
+
" \n",
|
| 70 |
+
" main_topic: str = Field(\n",
|
| 71 |
+
" description=\"Основная тема или предмет новостного поста (например: 'Выпуск iPhone 17', 'Высказывание политика А о политике Б')\"\n",
|
| 72 |
+
" )\n",
|
| 73 |
+
" key_entities: list[str] = Field(\n",
|
| 74 |
+
" description=\"Ключевые сущности, упомянутые в посте (люди, организации, события, даты, места)\"\n",
|
| 75 |
+
" )\n",
|
| 76 |
+
" main_fact_or_statement: str = Field(\n",
|
| 77 |
+
" description=\"Основной факт или утверждение, содержащееся в посте\"\n",
|
| 78 |
+
" )\n",
|
| 79 |
+
" temporal_context: Optional[str] = Field(\n",
|
| 80 |
+
" default=None,\n",
|
| 81 |
+
" description=\"Временной контекст события (конкретная дата, период, или 'текущий момент')\"\n",
|
| 82 |
+
" )\n",
|
| 83 |
+
" additional_context: str = Field(\n",
|
| 84 |
+
" description=\"Дополнительный контекст, необходимый для понимания основной мысли\"\n",
|
| 85 |
+
" )\n",
|
| 86 |
+
"\n",
|
| 87 |
+
"\n",
|
| 88 |
+
"class ClassificationResult(BaseModel):\n",
|
| 89 |
+
" \"\"\"Результат классификации новостного поста по однозначности поиска\"\"\"\n",
|
| 90 |
+
" \n",
|
| 91 |
+
" is_unambiguous: bool = Field(\n",
|
| 92 |
+
" description=\"Является ли основная тема поста однозначной при поиске. True - однозначная (конкретный факт), False - неоднозначная (могут быть противоречивые ответы)\"\n",
|
| 93 |
+
" )\n",
|
| 94 |
+
" confidence: float = Field(\n",
|
| 95 |
+
" description=\"Уверенность в классификации от 0.0 до 1.0\",\n",
|
| 96 |
+
" ge=0.0,\n",
|
| 97 |
+
" le=1.0\n",
|
| 98 |
+
" )\n",
|
| 99 |
+
" category: Literal[\"fact\", \"opinion\", \"statement\", \"event\", \"mixed\"] = Field(\n",
|
| 100 |
+
" description=\"Категория контента: fact - чистый факт, opinion - мнение, statement - высказывание/заявление, event - событие, mixed - смешанный\"\n",
|
| 101 |
+
" )\n",
|
| 102 |
+
" search_difficulty: Literal[\"easy\", \"medium\", \"hard\"] = Field(\n",
|
| 103 |
+
" description=\"Сложность поиска: easy - простой уникальный факт, medium - требует временного контекста, hard - неоднозначный, может иметь противоречивые ответы\"\n",
|
| 104 |
+
" )\n",
|
| 105 |
+
" ambiguity_reasons: list[str] = Field(\n",
|
| 106 |
+
" default_factory=list,\n",
|
| 107 |
+
" description=\"Причины неоднозначности (если есть): изменчивость позиции, множественные источники, субъективность и т.д.\"\n",
|
| 108 |
+
" )\n",
|
| 109 |
+
" reasoning: str = Field(\n",
|
| 110 |
+
" description=\"Подробное обоснование классификации\"\n",
|
| 111 |
+
" )\n",
|
| 112 |
+
" suggested_search_query: str = Field(\n",
|
| 113 |
+
" description=\"Предлагаемый поисковый запрос для нахождения этой информации\"\n",
|
| 114 |
+
" )\n"
|
| 115 |
+
]
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"cell_type": "markdown",
|
| 119 |
+
"metadata": {},
|
| 120 |
+
"source": [
|
| 121 |
+
"## Определение состояния графа (LangGraph State)\n"
|
| 122 |
+
]
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"cell_type": "code",
|
| 126 |
+
"execution_count": null,
|
| 127 |
+
"metadata": {},
|
| 128 |
+
"outputs": [],
|
| 129 |
+
"source": [
|
| 130 |
+
"# Определение состояния графа\n",
|
| 131 |
+
"# TypedDict используется для определения схемы состояния, которое передается между узлами\n",
|
| 132 |
+
"\n",
|
| 133 |
+
"class GraphState(TypedDict):\n",
|
| 134 |
+
" \"\"\"Состояние, передаваемое между узлами графа\"\"\"\n",
|
| 135 |
+
" \n",
|
| 136 |
+
" # Входные данные\n",
|
| 137 |
+
" original_text: str\n",
|
| 138 |
+
" \n",
|
| 139 |
+
" # Результат извлечения (заполняется узлом extraction)\n",
|
| 140 |
+
" main_message: Optional[MainMessage]\n",
|
| 141 |
+
" \n",
|
| 142 |
+
" # Результат классификации (заполняется узлом classification)\n",
|
| 143 |
+
" classification: Optional[ClassificationResult]\n",
|
| 144 |
+
" \n",
|
| 145 |
+
" # Статус обработки\n",
|
| 146 |
+
" error: Optional[str]\n"
|
| 147 |
+
]
|
| 148 |
+
},
|
| 149 |
+
{
|
| 150 |
+
"cell_type": "markdown",
|
| 151 |
+
"metadata": {},
|
| 152 |
+
"source": [
|
| 153 |
+
"## Настройка LLM через OpenRouter\n"
|
| 154 |
+
]
|
| 155 |
+
},
|
| 156 |
+
{
|
| 157 |
+
"cell_type": "code",
|
| 158 |
+
"execution_count": null,
|
| 159 |
+
"metadata": {},
|
| 160 |
+
"outputs": [],
|
| 161 |
+
"source": [
|
| 162 |
+
"# Настройка LLM через OpenRouter\n",
|
| 163 |
+
"\n",
|
| 164 |
+
"def create_llm(model: str = \"openai/gpt-4o-mini\", temperature: float = 0.0) -> ChatOpenAI:\n",
|
| 165 |
+
" \"\"\"Создает экземпляр LLM через OpenRouter\"\"\"\n",
|
| 166 |
+
" return ChatOpenAI(\n",
|
| 167 |
+
" model=model,\n",
|
| 168 |
+
" temperature=temperature,\n",
|
| 169 |
+
" openai_api_key=OPENROUTER_API_KEY,\n",
|
| 170 |
+
" openai_api_base=\"https://openrouter.ai/api/v1\",\n",
|
| 171 |
+
" )\n",
|
| 172 |
+
"\n",
|
| 173 |
+
"# Модель для использования\n",
|
| 174 |
+
"MODEL_NAME = \"openai/gpt-4o-mini\"\n",
|
| 175 |
+
"\n",
|
| 176 |
+
"# Создаем LLM\n",
|
| 177 |
+
"llm = create_llm(MODEL_NAME)\n",
|
| 178 |
+
"\n",
|
| 179 |
+
"print(f\"✅ Используемая модель: {MODEL_NAME}\")\n"
|
| 180 |
+
]
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"cell_type": "markdown",
|
| 184 |
+
"metadata": {},
|
| 185 |
+
"source": [
|
| 186 |
+
"## Определение узлов графа (Nodes)\n"
|
| 187 |
+
]
|
| 188 |
+
},
|
| 189 |
+
{
|
| 190 |
+
"cell_type": "code",
|
| 191 |
+
"execution_count": null,
|
| 192 |
+
"metadata": {},
|
| 193 |
+
"outputs": [],
|
| 194 |
+
"source": [
|
| 195 |
+
"# Узел 1: Извлечение основной мысли\n",
|
| 196 |
+
"\n",
|
| 197 |
+
"extraction_parser = PydanticOutputParser(pydantic_object=MainMessage)\n",
|
| 198 |
+
"\n",
|
| 199 |
+
"extraction_prompt = ChatPromptTemplate.from_messages([\n",
|
| 200 |
+
" (\"system\", \"\"\"Ты - эксперт по анализу новостного контента. Твоя задача - извлечь основную мысль и ключевую информацию из новостного поста.\n",
|
| 201 |
+
"\n",
|
| 202 |
+
"Анализируй текст внимательно и выдели:\n",
|
| 203 |
+
"1. Основную тему поста\n",
|
| 204 |
+
"2. Все ключевые сущности (люди, организации, места, даты, события)\n",
|
| 205 |
+
"3. Главный факт или утверждение\n",
|
| 206 |
+
"4. Временной контекст (когда это произошло/происходит)\n",
|
| 207 |
+
"5. Дополнительный контекст для понимания\n",
|
| 208 |
+
"\n",
|
| 209 |
+
"{format_instructions}\"\"\"),\n",
|
| 210 |
+
" (\"human\", \"Проанализируй следующий новостной пост и извлеки основную мысль:\\n\\n{text}\")\n",
|
| 211 |
+
"])\n",
|
| 212 |
+
"\n",
|
| 213 |
+
"extraction_chain = extraction_prompt | llm | extraction_parser\n",
|
| 214 |
+
"\n",
|
| 215 |
+
"\n",
|
| 216 |
+
"def extraction_node(state: GraphState) -> dict:\n",
|
| 217 |
+
" \"\"\"\n",
|
| 218 |
+
" Узел извлечения основной мысли.\n",
|
| 219 |
+
" Принимает состояние, извлекает основную мысль и возвращает обновление состояния.\n",
|
| 220 |
+
" \"\"\"\n",
|
| 221 |
+
" try:\n",
|
| 222 |
+
" result = extraction_chain.invoke({\n",
|
| 223 |
+
" \"text\": state[\"original_text\"],\n",
|
| 224 |
+
" \"format_instructions\": extraction_parser.get_format_instructions()\n",
|
| 225 |
+
" })\n",
|
| 226 |
+
" return {\"main_message\": result, \"error\": None}\n",
|
| 227 |
+
" except Exception as e:\n",
|
| 228 |
+
" return {\"main_message\": None, \"error\": f\"Ошибка извлечения: {str(e)}\"}\n",
|
| 229 |
+
"\n",
|
| 230 |
+
"\n",
|
| 231 |
+
"print(\"✅ Узел извлечения определен\")\n"
|
| 232 |
+
]
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"cell_type": "code",
|
| 236 |
+
"execution_count": null,
|
| 237 |
+
"metadata": {},
|
| 238 |
+
"outputs": [],
|
| 239 |
+
"source": [
|
| 240 |
+
"# Узел 2: Классификация по однозначности\n",
|
| 241 |
+
"\n",
|
| 242 |
+
"classification_parser = PydanticOutputParser(pydantic_object=ClassificationResult)\n",
|
| 243 |
+
"\n",
|
| 244 |
+
"classification_prompt = ChatPromptTemplate.from_messages([\n",
|
| 245 |
+
" (\"system\", \"\"\"Ты - эксперт по классификации новостного контента для поисковых систем.\n",
|
| 246 |
+
"\n",
|
| 247 |
+
"Твоя задача - определить, является ли новостной пост ОДНОЗНАЧНЫМ или НЕОДНОЗНАЧНЫМ для поиска.\n",
|
| 248 |
+
"\n",
|
| 249 |
+
"## Критерии ОДНОЗНАЧНОГО контента (is_unambiguous=True):\n",
|
| 250 |
+
"- Конкретные факты с точными датами и цифрами (\"Apple выпустила iPhone 17 15 сентября 2025\")\n",
|
| 251 |
+
"- Уникальные события, которые произошли один раз\n",
|
| 252 |
+
"- Официальные решения, законы, назначения\n",
|
| 253 |
+
"- Результаты спортивных событий, выборов\n",
|
| 254 |
+
"- Финансовые показатели за конкретный период\n",
|
| 255 |
+
"\n",
|
| 256 |
+
"## Критерии НЕОДНОЗНАЧНОГО контента (is_unambiguous=False):\n",
|
| 257 |
+
"- Высказывания и мнения, которые могут меняться со временем\n",
|
| 258 |
+
"- Позиции политиков/персон по вопросам (\"политик А заявил о политике Б\")\n",
|
| 259 |
+
"- Прогнозы и ожидания\n",
|
| 260 |
+
"- Оценочные суждения\n",
|
| 261 |
+
"- События без точной привязки ко времени\n",
|
| 262 |
+
"- Темы, где возможны противоречивые источники\n",
|
| 263 |
+
"\n",
|
| 264 |
+
"## Сложность поиска:\n",
|
| 265 |
+
"- easy: Уникальный факт, легко найти один правильный ответ\n",
|
| 266 |
+
"- medium: Требует временного/контекстного уточнения\n",
|
| 267 |
+
"- hard: Высокая вероятность найти противоречивые ответы\n",
|
| 268 |
+
"\n",
|
| 269 |
+
"{format_instructions}\"\"\"),\n",
|
| 270 |
+
" (\"human\", \"\"\"Проклассифицируй следующий новостной контент:\n",
|
| 271 |
+
"\n",
|
| 272 |
+
"## Оригинальный текст:\n",
|
| 273 |
+
"{original_text}\n",
|
| 274 |
+
"\n",
|
| 275 |
+
"## Извлечённая основная мысль:\n",
|
| 276 |
+
"- Тема: {main_topic}\n",
|
| 277 |
+
"- Ключевые сущности: {key_entities}\n",
|
| 278 |
+
"- Основной факт/утверждение: {main_fact}\n",
|
| 279 |
+
"- Временной контекст: {temporal_context}\n",
|
| 280 |
+
"- Дополнительный контекст: {additional_context}\n",
|
| 281 |
+
"\n",
|
| 282 |
+
"Определи, является ли этот контент однозначным для поиска.\"\"\")\n",
|
| 283 |
+
"])\n",
|
| 284 |
+
"\n",
|
| 285 |
+
"classification_chain = classification_prompt | llm | classification_parser\n",
|
| 286 |
+
"\n",
|
| 287 |
+
"\n",
|
| 288 |
+
"def classification_node(state: GraphState) -> dict:\n",
|
| 289 |
+
" \"\"\"\n",
|
| 290 |
+
" Узел классификации контента.\n",
|
| 291 |
+
" Принимает состояние с извлеченной мыслью и классифицирует её.\n",
|
| 292 |
+
" \"\"\"\n",
|
| 293 |
+
" # Проверяем, есть ли ошибка на предыдущем шаге\n",
|
| 294 |
+
" if state.get(\"error\"):\n",
|
| 295 |
+
" return {\"classification\": None}\n",
|
| 296 |
+
" \n",
|
| 297 |
+
" main_message = state.get(\"main_message\")\n",
|
| 298 |
+
" if not main_message:\n",
|
| 299 |
+
" return {\"classification\": None, \"error\": \"Отсутствует main_message для классификации\"}\n",
|
| 300 |
+
" \n",
|
| 301 |
+
" try:\n",
|
| 302 |
+
" result = classification_chain.invoke({\n",
|
| 303 |
+
" \"original_text\": state[\"original_text\"],\n",
|
| 304 |
+
" \"main_topic\": main_message.main_topic,\n",
|
| 305 |
+
" \"key_entities\": \", \".join(main_message.key_entities),\n",
|
| 306 |
+
" \"main_fact\": main_message.main_fact_or_statement,\n",
|
| 307 |
+
" \"temporal_context\": main_message.temporal_context or \"не указан\",\n",
|
| 308 |
+
" \"additional_context\": main_message.additional_context,\n",
|
| 309 |
+
" \"format_instructions\": classification_parser.get_format_instructions()\n",
|
| 310 |
+
" })\n",
|
| 311 |
+
" return {\"classification\": result}\n",
|
| 312 |
+
" except Exception as e:\n",
|
| 313 |
+
" return {\"classification\": None, \"error\": f\"Ошибка классификации: {str(e)}\"}\n",
|
| 314 |
+
"\n",
|
| 315 |
+
"\n",
|
| 316 |
+
"print(\"✅ Узел классификации определен\")\n"
|
| 317 |
+
]
|
| 318 |
+
},
|
| 319 |
+
{
|
| 320 |
+
"cell_type": "markdown",
|
| 321 |
+
"metadata": {},
|
| 322 |
+
"source": [
|
| 323 |
+
"## Построение графа LangGraph\n"
|
| 324 |
+
]
|
| 325 |
+
},
|
| 326 |
+
{
|
| 327 |
+
"cell_type": "code",
|
| 328 |
+
"execution_count": null,
|
| 329 |
+
"metadata": {},
|
| 330 |
+
"outputs": [],
|
| 331 |
+
"source": [
|
| 332 |
+
"# Создание графа\n",
|
| 333 |
+
"workflow = StateGraph(GraphState)\n",
|
| 334 |
+
"\n",
|
| 335 |
+
"# Добавляем узлы\n",
|
| 336 |
+
"workflow.add_node(\"extraction\", extraction_node)\n",
|
| 337 |
+
"workflow.add_node(\"classification\", classification_node)\n",
|
| 338 |
+
"\n",
|
| 339 |
+
"# Определяем входную точку\n",
|
| 340 |
+
"workflow.set_entry_point(\"extraction\")\n",
|
| 341 |
+
"\n",
|
| 342 |
+
"# Добавляем рёбра (переходы между узлами)\n",
|
| 343 |
+
"workflow.add_edge(\"extraction\", \"classification\")\n",
|
| 344 |
+
"workflow.add_edge(\"classification\", END)\n",
|
| 345 |
+
"\n",
|
| 346 |
+
"# Компилируем граф\n",
|
| 347 |
+
"graph = workflow.compile()\n",
|
| 348 |
+
"\n",
|
| 349 |
+
"print(\"✅ Граф LangGraph скомпилирован\")\n"
|
| 350 |
+
]
|
| 351 |
+
},
|
| 352 |
+
{
|
| 353 |
+
"cell_type": "code",
|
| 354 |
+
"execution_count": null,
|
| 355 |
+
"metadata": {},
|
| 356 |
+
"outputs": [],
|
| 357 |
+
"source": [
|
| 358 |
+
"# Визуализация графа (опционально)\n",
|
| 359 |
+
"try:\n",
|
| 360 |
+
" from IPython.display import Image, display\n",
|
| 361 |
+
" display(Image(graph.get_graph().draw_mermaid_png()))\n",
|
| 362 |
+
"except Exception as e:\n",
|
| 363 |
+
" print(f\"Визуализация недоступна: {e}\")\n",
|
| 364 |
+
" print(\"\\nСтруктура графа:\")\n",
|
| 365 |
+
" print(\" [START] → extraction → classification → [END]\")\n"
|
| 366 |
+
]
|
| 367 |
+
},
|
| 368 |
+
{
|
| 369 |
+
"cell_type": "markdown",
|
| 370 |
+
"metadata": {},
|
| 371 |
+
"source": [
|
| 372 |
+
"## Обёртка пайплайна\n"
|
| 373 |
+
]
|
| 374 |
+
},
|
| 375 |
+
{
|
| 376 |
+
"cell_type": "code",
|
| 377 |
+
"execution_count": null,
|
| 378 |
+
"metadata": {},
|
| 379 |
+
"outputs": [],
|
| 380 |
+
"source": [
|
| 381 |
+
"class PipelineResult(BaseModel):\n",
|
| 382 |
+
" \"\"\"Полный результат работы пайплайна\"\"\"\n",
|
| 383 |
+
" original_text: str\n",
|
| 384 |
+
" main_message: Optional[MainMessage]\n",
|
| 385 |
+
" classification: Optional[ClassificationResult]\n",
|
| 386 |
+
" error: Optional[str] = None\n",
|
| 387 |
+
"\n",
|
| 388 |
+
"\n",
|
| 389 |
+
"class NewsClassificationPipeline:\n",
|
| 390 |
+
" \"\"\"Обёртка над графом LangGraph для удобного использования\"\"\"\n",
|
| 391 |
+
" \n",
|
| 392 |
+
" def __init__(self, compiled_graph=None):\n",
|
| 393 |
+
" self.graph = compiled_graph or graph\n",
|
| 394 |
+
" \n",
|
| 395 |
+
" def process(self, text: str) -> PipelineResult:\n",
|
| 396 |
+
" \"\"\"Обрабатывает один новостной пост через граф\"\"\"\n",
|
| 397 |
+
" initial_state = {\n",
|
| 398 |
+
" \"original_text\": text,\n",
|
| 399 |
+
" \"main_message\": None,\n",
|
| 400 |
+
" \"classification\": None,\n",
|
| 401 |
+
" \"error\": None\n",
|
| 402 |
+
" }\n",
|
| 403 |
+
" \n",
|
| 404 |
+
" # Запускаем граф\n",
|
| 405 |
+
" final_state = self.graph.invoke(initial_state)\n",
|
| 406 |
+
" \n",
|
| 407 |
+
" return PipelineResult(\n",
|
| 408 |
+
" original_text=text,\n",
|
| 409 |
+
" main_message=final_state.get(\"main_message\"),\n",
|
| 410 |
+
" classification=final_state.get(\"classification\"),\n",
|
| 411 |
+
" error=final_state.get(\"error\")\n",
|
| 412 |
+
" )\n",
|
| 413 |
+
" \n",
|
| 414 |
+
" def process_batch(self, texts: list[str], show_progress: bool = True) -> list[PipelineResult]:\n",
|
| 415 |
+
" \"\"\"Обрабатывает список постов\"\"\"\n",
|
| 416 |
+
" results = []\n",
|
| 417 |
+
" iterator = tqdm(texts, desc=\"Обработка постов\") if show_progress else texts\n",
|
| 418 |
+
" \n",
|
| 419 |
+
" for text in iterator:\n",
|
| 420 |
+
" result = self.process(text)\n",
|
| 421 |
+
" results.append(result)\n",
|
| 422 |
+
" \n",
|
| 423 |
+
" return results\n",
|
| 424 |
+
" \n",
|
| 425 |
+
" def stream(self, text: str):\n",
|
| 426 |
+
" \"\"\"Потоковая обработка с выводом промежуточных состояний\"\"\"\n",
|
| 427 |
+
" initial_state = {\n",
|
| 428 |
+
" \"original_text\": text,\n",
|
| 429 |
+
" \"main_message\": None,\n",
|
| 430 |
+
" \"classification\": None,\n",
|
| 431 |
+
" \"error\": None\n",
|
| 432 |
+
" }\n",
|
| 433 |
+
" \n",
|
| 434 |
+
" for event in self.graph.stream(initial_state):\n",
|
| 435 |
+
" yield event\n",
|
| 436 |
+
"\n",
|
| 437 |
+
"\n",
|
| 438 |
+
"# Создаем экземпляр пайплайна\n",
|
| 439 |
+
"pipeline = NewsClassificationPipeline()\n",
|
| 440 |
+
"print(\"✅ Пайплайн LangGraph готов к работе\")\n"
|
| 441 |
+
]
|
| 442 |
+
},
|
| 443 |
+
{
|
| 444 |
+
"cell_type": "markdown",
|
| 445 |
+
"metadata": {},
|
| 446 |
+
"source": [
|
| 447 |
+
"## Демонстрация работы пайплайна\n"
|
| 448 |
+
]
|
| 449 |
+
},
|
| 450 |
+
{
|
| 451 |
+
"cell_type": "code",
|
| 452 |
+
"execution_count": null,
|
| 453 |
+
"metadata": {},
|
| 454 |
+
"outputs": [],
|
| 455 |
+
"source": [
|
| 456 |
+
"# Примеры для тестирования\n",
|
| 457 |
+
"test_posts = [\n",
|
| 458 |
+
" \"\"\"▪️Apple представила iPhone 17 на презентации 10 сентября 2025 года. \n",
|
| 459 |
+
" Новый смартфон получил процессор A19 Bionic и камеру на 200 мегапикселей. \n",
|
| 460 |
+
" Цена в России начинается от 129 990 рублей.\"\"\",\n",
|
| 461 |
+
" \n",
|
| 462 |
+
" \"\"\"▪️Путин заявил о готовности к переговорам по Украине.\n",
|
| 463 |
+
" «Мы всегда открыты к диалогу», – подчеркнул президент на встрече с журналистами.\n",
|
| 464 |
+
" При этом он отметил, что условия для переговоров должны учитывать интересы России.\"\"\",\n",
|
| 465 |
+
" \n",
|
| 466 |
+
" \"\"\"▪️Роскомнадзор сообщил об ограничении звонков через Telegram и WhatsApp.\n",
|
| 467 |
+
" «По данным правоохранительных органов, иностранные мессенджеры стали основными \n",
|
| 468 |
+
" голосовыми сервисами для обмана граждан», – пояснили в пресс-службе ведомства.\"\"\",\n",
|
| 469 |
+
" \n",
|
| 470 |
+
" \"\"\"▪️Индекс Мосбиржи упал на 3,2% по итогам торгов 13 марта 2025 года.\n",
|
| 471 |
+
" Основными аутсайдерами стали акции Сбербанка (-4,5%) и Газпрома (-3,8%).\n",
|
| 472 |
+
" Аналитики связывают падение с геополитической напряжённостью.\"\"\"\n",
|
| 473 |
+
"]\n"
|
| 474 |
+
]
|
| 475 |
+
},
|
| 476 |
+
{
|
| 477 |
+
"cell_type": "code",
|
| 478 |
+
"execution_count": null,
|
| 479 |
+
"metadata": {},
|
| 480 |
+
"outputs": [],
|
| 481 |
+
"source": [
|
| 482 |
+
"# Демонстрация потоковой обработки (streaming) - уникальная возможность LangGraph\n",
|
| 483 |
+
"print(\"🔄 Потоковая обработка первого поста:\\n\")\n",
|
| 484 |
+
"print(f\"Текст: {test_posts[0][:100]}...\\n\")\n",
|
| 485 |
+
"\n",
|
| 486 |
+
"for step in pipeline.stream(test_posts[0]):\n",
|
| 487 |
+
" node_name = list(step.keys())[0]\n",
|
| 488 |
+
" print(f\"📍 Узел: {node_name}\")\n",
|
| 489 |
+
" \n",
|
| 490 |
+
" if node_name == \"extraction\" and step[node_name].get(\"main_message\"):\n",
|
| 491 |
+
" msg = step[node_name][\"main_message\"]\n",
|
| 492 |
+
" print(f\" Тема: {msg.main_topic}\")\n",
|
| 493 |
+
" print(f\" Сущности: {', '.join(msg.key_entities)}\")\n",
|
| 494 |
+
" \n",
|
| 495 |
+
" if node_name == \"classification\" and step[node_name].get(\"classification\"):\n",
|
| 496 |
+
" cls = step[node_name][\"classification\"]\n",
|
| 497 |
+
" status = \"✅ ОДНОЗНАЧНЫЙ\" if cls.is_unambiguous else \"⚠️ НЕОДНОЗНАЧНЫЙ\"\n",
|
| 498 |
+
" print(f\" Статус: {status}\")\n",
|
| 499 |
+
" print(f\" Сложность: {cls.search_difficulty}\")\n",
|
| 500 |
+
" print()\n"
|
| 501 |
+
]
|
| 502 |
+
},
|
| 503 |
+
{
|
| 504 |
+
"cell_type": "code",
|
| 505 |
+
"execution_count": null,
|
| 506 |
+
"metadata": {},
|
| 507 |
+
"outputs": [],
|
| 508 |
+
"source": [
|
| 509 |
+
"# Обработка всех тестовых примеров\n",
|
| 510 |
+
"results = []\n",
|
| 511 |
+
"\n",
|
| 512 |
+
"for i, post in enumerate(test_posts, 1):\n",
|
| 513 |
+
" print(f\"\\n{'='*80}\")\n",
|
| 514 |
+
" print(f\"📰 ПОСТ #{i}\")\n",
|
| 515 |
+
" print(f\"{'='*80}\")\n",
|
| 516 |
+
" print(post[:200] + \"...\" if len(post) > 200 else post)\n",
|
| 517 |
+
" \n",
|
| 518 |
+
" result = pipeline.process(post)\n",
|
| 519 |
+
" results.append(result)\n",
|
| 520 |
+
" \n",
|
| 521 |
+
" if result.error:\n",
|
| 522 |
+
" print(f\"\\n❌ Ошибка: {result.error}\")\n",
|
| 523 |
+
" continue\n",
|
| 524 |
+
" \n",
|
| 525 |
+
" print(f\"\\n📋 ОСНОВНАЯ МЫСЛЬ:\")\n",
|
| 526 |
+
" print(f\" Тема: {result.main_message.main_topic}\")\n",
|
| 527 |
+
" print(f\" Сущности: {', '.join(result.main_message.key_entities)}\")\n",
|
| 528 |
+
" print(f\" Факт: {result.main_message.main_fact_or_statement}\")\n",
|
| 529 |
+
" print(f\" Время: {result.main_message.temporal_context}\")\n",
|
| 530 |
+
" \n",
|
| 531 |
+
" print(f\"\\n🎯 КЛАССИФИКАЦИЯ:\")\n",
|
| 532 |
+
" status = \"✅ ОДНОЗНАЧНЫЙ\" if result.classification.is_unambiguous else \"⚠️ НЕОДНОЗНАЧНЫЙ\"\n",
|
| 533 |
+
" print(f\" Статус: {status}\")\n",
|
| 534 |
+
" print(f\" Уверенность: {result.classification.confidence:.0%}\")\n",
|
| 535 |
+
" print(f\" Категория: {result.classification.category}\")\n",
|
| 536 |
+
" print(f\" Сложность поиска: {result.classification.search_difficulty}\")\n",
|
| 537 |
+
" if result.classification.ambiguity_reasons:\n",
|
| 538 |
+
" print(f\" Причины неоднозначности: {', '.join(result.classification.ambiguity_reasons)}\")\n",
|
| 539 |
+
" print(f\" Поисковый запрос: {result.classification.suggested_search_query}\")\n"
|
| 540 |
+
]
|
| 541 |
+
},
|
| 542 |
+
{
|
| 543 |
+
"cell_type": "markdown",
|
| 544 |
+
"metadata": {},
|
| 545 |
+
"source": [
|
| 546 |
+
"## Преобразование результатов в DataFrame\n"
|
| 547 |
+
]
|
| 548 |
+
},
|
| 549 |
+
{
|
| 550 |
+
"cell_type": "code",
|
| 551 |
+
"execution_count": null,
|
| 552 |
+
"metadata": {},
|
| 553 |
+
"outputs": [],
|
| 554 |
+
"source": [
|
| 555 |
+
"def results_to_dataframe(results: list[PipelineResult]) -> pd.DataFrame:\n",
|
| 556 |
+
" \"\"\"Преобразует результаты в pandas DataFrame\"\"\"\n",
|
| 557 |
+
" rows = []\n",
|
| 558 |
+
" \n",
|
| 559 |
+
" for r in results:\n",
|
| 560 |
+
" if r.error or not r.main_message or not r.classification:\n",
|
| 561 |
+
" continue\n",
|
| 562 |
+
" \n",
|
| 563 |
+
" rows.append({\n",
|
| 564 |
+
" \"original_text\": r.original_text[:100] + \"...\",\n",
|
| 565 |
+
" \"main_topic\": r.main_message.main_topic,\n",
|
| 566 |
+
" \"key_entities\": \", \".join(r.main_message.key_entities),\n",
|
| 567 |
+
" \"main_fact\": r.main_message.main_fact_or_statement,\n",
|
| 568 |
+
" \"temporal_context\": r.main_message.temporal_context,\n",
|
| 569 |
+
" \"is_unambiguous\": r.classification.is_unambiguous,\n",
|
| 570 |
+
" \"confidence\": r.classification.confidence,\n",
|
| 571 |
+
" \"category\": r.classification.category,\n",
|
| 572 |
+
" \"search_difficulty\": r.classification.search_difficulty,\n",
|
| 573 |
+
" \"ambiguity_reasons\": \", \".join(r.classification.ambiguity_reasons),\n",
|
| 574 |
+
" \"suggested_query\": r.classification.suggested_search_query\n",
|
| 575 |
+
" })\n",
|
| 576 |
+
" \n",
|
| 577 |
+
" return pd.DataFrame(rows)\n",
|
| 578 |
+
"\n",
|
| 579 |
+
"\n",
|
| 580 |
+
"df_results = results_to_dataframe(results)\n",
|
| 581 |
+
"df_results\n"
|
| 582 |
+
]
|
| 583 |
+
},
|
| 584 |
+
{
|
| 585 |
+
"cell_type": "markdown",
|
| 586 |
+
"metadata": {},
|
| 587 |
+
"source": [
|
| 588 |
+
"## Применение к реальным данным\n"
|
| 589 |
+
]
|
| 590 |
+
},
|
| 591 |
+
{
|
| 592 |
+
"cell_type": "code",
|
| 593 |
+
"execution_count": null,
|
| 594 |
+
"metadata": {},
|
| 595 |
+
"outputs": [],
|
| 596 |
+
"source": [
|
| 597 |
+
"# Загрузка реальных данных\n",
|
| 598 |
+
"data = pd.read_csv('src/dataset/rbc/channel_rbc_news_posts.csv')\n",
|
| 599 |
+
"data[\"message_dt\"] = pd.to_datetime(data[\"message_dt\"])\n",
|
| 600 |
+
"data = data.sort_values(\"message_dt\")\n",
|
| 601 |
+
"\n",
|
| 602 |
+
"print(f\"Загружено {len(data)} постов\")\n",
|
| 603 |
+
"print(f\"Период: {data['message_dt'].min()} - {data['message_dt'].max()}\")\n",
|
| 604 |
+
"data.head()\n"
|
| 605 |
+
]
|
| 606 |
+
},
|
| 607 |
+
{
|
| 608 |
+
"cell_type": "code",
|
| 609 |
+
"execution_count": null,
|
| 610 |
+
"metadata": {},
|
| 611 |
+
"outputs": [],
|
| 612 |
+
"source": [
|
| 613 |
+
"# Обработка выборки постов\n",
|
| 614 |
+
"SAMPLE_SIZE = 10\n",
|
| 615 |
+
"\n",
|
| 616 |
+
"sample_data = data.sample(n=min(SAMPLE_SIZE, len(data)), random_state=42)\n",
|
| 617 |
+
"sample_texts = sample_data[\"content\"].dropna().tolist()\n",
|
| 618 |
+
"\n",
|
| 619 |
+
"print(f\"Обрабатываем {len(sample_texts)} постов...\")\n",
|
| 620 |
+
"\n",
|
| 621 |
+
"sample_results = pipeline.process_batch(sample_texts)\n"
|
| 622 |
+
]
|
| 623 |
+
},
|
| 624 |
+
{
|
| 625 |
+
"cell_type": "code",
|
| 626 |
+
"execution_count": null,
|
| 627 |
+
"metadata": {},
|
| 628 |
+
"outputs": [],
|
| 629 |
+
"source": [
|
| 630 |
+
"# Анализ результатов\n",
|
| 631 |
+
"df_sample_results = results_to_dataframe(sample_results)\n",
|
| 632 |
+
"\n",
|
| 633 |
+
"if len(df_sample_results) > 0:\n",
|
| 634 |
+
" print(\"\\n📊 СТАТИСТИКА КЛАССИФИКАЦИИ:\")\n",
|
| 635 |
+
" print(f\" Однозначных постов: {df_sample_results['is_unambiguous'].sum()} ({df_sample_results['is_unambiguous'].mean():.0%})\")\n",
|
| 636 |
+
" print(f\" Неоднозначных постов: {(~df_sample_results['is_unambiguous']).sum()} ({(~df_sample_results['is_unambiguous']).mean():.0%})\")\n",
|
| 637 |
+
"\n",
|
| 638 |
+
" print(\"\\n📈 РАСПРЕДЕЛЕНИЕ ПО СЛОЖНОСТИ:\")\n",
|
| 639 |
+
" print(df_sample_results['search_difficulty'].value_counts())\n",
|
| 640 |
+
"\n",
|
| 641 |
+
" print(\"\\n📂 РАСПРЕДЕЛЕНИЕ ПО КАТЕГОРИЯМ:\")\n",
|
| 642 |
+
" print(df_sample_results['category'].value_counts())\n",
|
| 643 |
+
"else:\n",
|
| 644 |
+
" print(\"Нет успешно обработанных результатов\")\n"
|
| 645 |
+
]
|
| 646 |
+
},
|
| 647 |
+
{
|
| 648 |
+
"cell_type": "code",
|
| 649 |
+
"execution_count": null,
|
| 650 |
+
"metadata": {},
|
| 651 |
+
"outputs": [],
|
| 652 |
+
"source": [
|
| 653 |
+
"# Сохранение результатов\n",
|
| 654 |
+
"if len(df_sample_results) > 0:\n",
|
| 655 |
+
" df_sample_results.to_csv('classification_results_langgraph.csv', index=False)\n",
|
| 656 |
+
" print(\"✅ Результаты сохранены в classification_results_langgraph.csv\")\n"
|
| 657 |
+
]
|
| 658 |
+
},
|
| 659 |
+
{
|
| 660 |
+
"cell_type": "markdown",
|
| 661 |
+
"metadata": {},
|
| 662 |
+
"source": [
|
| 663 |
+
"## Преимущества LangGraph\n",
|
| 664 |
+
"\n",
|
| 665 |
+
"**LangGraph** предоставляет ряд преимуществ по сравнению с обычными цепочками LangChain:\n",
|
| 666 |
+
"\n",
|
| 667 |
+
"1. **Явное управление состоянием** - `GraphState` определяет схему данных, передаваемых между узлами\n",
|
| 668 |
+
"2. **Потоковая обработка (Streaming)** - возможность отслеживать промежуточные результаты через `graph.stream()`\n",
|
| 669 |
+
"3. **Визуализация** - граф можно визуализировать для понимания потока данных\n",
|
| 670 |
+
"4. **Условные переходы** - можно добавить условную логику для разветвления графа\n",
|
| 671 |
+
"5. **Циклы** - поддержка циклических графов для итеративных агентов\n",
|
| 672 |
+
"6. **Checkpointing** - сохранение состояния для возобновления обработки\n",
|
| 673 |
+
"\n",
|
| 674 |
+
"### Структура графа\n",
|
| 675 |
+
"\n",
|
| 676 |
+
"```\n",
|
| 677 |
+
"[START] → extraction → classification → [END]\n",
|
| 678 |
+
"```\n",
|
| 679 |
+
"\n",
|
| 680 |
+
"- **extraction**: Извлекает основную мысль из текста\n",
|
| 681 |
+
"- **classification**: Классифицирует контент по однозначности для поиска\n"
|
| 682 |
+
]
|
| 683 |
+
},
|
| 684 |
+
{
|
| 685 |
+
"cell_type": "markdown",
|
| 686 |
+
"metadata": {},
|
| 687 |
+
"source": []
|
| 688 |
+
}
|
| 689 |
+
],
|
| 690 |
+
"metadata": {
|
| 691 |
+
"language_info": {
|
| 692 |
+
"name": "python"
|
| 693 |
+
}
|
| 694 |
+
},
|
| 695 |
+
"nbformat": 4,
|
| 696 |
+
"nbformat_minor": 2
|
| 697 |
+
}
|
news_classification_pipeline.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
qa_evaluation_example.ipynb
ADDED
|
@@ -0,0 +1,465 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [
|
| 8 |
+
{
|
| 9 |
+
"name": "stdout",
|
| 10 |
+
"output_type": "stream",
|
| 11 |
+
"text": [
|
| 12 |
+
"Загружено 167 записей\n"
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"data": {
|
| 17 |
+
"text/html": [
|
| 18 |
+
"<div>\n",
|
| 19 |
+
"<style scoped>\n",
|
| 20 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 21 |
+
" vertical-align: middle;\n",
|
| 22 |
+
" }\n",
|
| 23 |
+
"\n",
|
| 24 |
+
" .dataframe tbody tr th {\n",
|
| 25 |
+
" vertical-align: top;\n",
|
| 26 |
+
" }\n",
|
| 27 |
+
"\n",
|
| 28 |
+
" .dataframe thead th {\n",
|
| 29 |
+
" text-align: right;\n",
|
| 30 |
+
" }\n",
|
| 31 |
+
"</style>\n",
|
| 32 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 33 |
+
" <thead>\n",
|
| 34 |
+
" <tr style=\"text-align: right;\">\n",
|
| 35 |
+
" <th></th>\n",
|
| 36 |
+
" <th>message_id</th>\n",
|
| 37 |
+
" <th>original_text</th>\n",
|
| 38 |
+
" <th>strict_question</th>\n",
|
| 39 |
+
" <th>real_question</th>\n",
|
| 40 |
+
" </tr>\n",
|
| 41 |
+
" </thead>\n",
|
| 42 |
+
" <tbody>\n",
|
| 43 |
+
" <tr>\n",
|
| 44 |
+
" <th>0</th>\n",
|
| 45 |
+
" <td>130738</td>\n",
|
| 46 |
+
" <td>Итальянский суд принял решение экстрадировать ...</td>\n",
|
| 47 |
+
" <td>Какое решение приняло итальянское судопроизвод...</td>\n",
|
| 48 |
+
" <td>Что там с Кузнецовым — его в Германию выдадут ...</td>\n",
|
| 49 |
+
" </tr>\n",
|
| 50 |
+
" <tr>\n",
|
| 51 |
+
" <th>1</th>\n",
|
| 52 |
+
" <td>129361</td>\n",
|
| 53 |
+
" <td>Пять пассажиров автобуса №793 пострадали в ДТП...</td>\n",
|
| 54 |
+
" <td>Сколько пассажиров автобуса №793 пострадали в ...</td>\n",
|
| 55 |
+
" <td>Сколько человек в автобусе 793 пострадали, ког...</td>\n",
|
| 56 |
+
" </tr>\n",
|
| 57 |
+
" <tr>\n",
|
| 58 |
+
" <th>2</th>\n",
|
| 59 |
+
" <td>133468</td>\n",
|
| 60 |
+
" <td>Владимир Путин утвердил концепцию государствен...</td>\n",
|
| 61 |
+
" <td>Кто утвердил концепцию государственной миграци...</td>\n",
|
| 62 |
+
" <td>Кто там утвердил новую миграционную концепцию ...</td>\n",
|
| 63 |
+
" </tr>\n",
|
| 64 |
+
" <tr>\n",
|
| 65 |
+
" <th>3</th>\n",
|
| 66 |
+
" <td>123139</td>\n",
|
| 67 |
+
" <td>Генпрокуратура и Минюст подали в Верховный суд...</td>\n",
|
| 68 |
+
" <td>Какое юридическое действие предприняли Генерал...</td>\n",
|
| 69 |
+
" <td>Что Генпрокуратура и Минюст сделали с сатанист...</td>\n",
|
| 70 |
+
" </tr>\n",
|
| 71 |
+
" <tr>\n",
|
| 72 |
+
" <th>4</th>\n",
|
| 73 |
+
" <td>129894</td>\n",
|
| 74 |
+
" <td>Обломки дрона обнаружили польские пограничники...</td>\n",
|
| 75 |
+
" <td>Где и кем был обнаружен непилотируемый летател...</td>\n",
|
| 76 |
+
" <td>Что там польские пограничники нашли рядом с Бе...</td>\n",
|
| 77 |
+
" </tr>\n",
|
| 78 |
+
" </tbody>\n",
|
| 79 |
+
"</table>\n",
|
| 80 |
+
"</div>"
|
| 81 |
+
],
|
| 82 |
+
"text/plain": [
|
| 83 |
+
" message_id original_text \\\n",
|
| 84 |
+
"0 130738 Итальянский суд принял решение экстрадировать ... \n",
|
| 85 |
+
"1 129361 Пять пассажиров автобуса №793 пострадали в ДТП... \n",
|
| 86 |
+
"2 133468 Владимир Путин утвердил концепцию государствен... \n",
|
| 87 |
+
"3 123139 Генпрокуратура и Минюст подали в Верховный суд... \n",
|
| 88 |
+
"4 129894 Обломки дрона обнаружили польские пограничники... \n",
|
| 89 |
+
"\n",
|
| 90 |
+
" strict_question \\\n",
|
| 91 |
+
"0 Какое решение приняло итальянское судопроизвод... \n",
|
| 92 |
+
"1 Сколько пассажиров автобуса №793 пострадали в ... \n",
|
| 93 |
+
"2 Кто утвердил концепцию государственной миграци... \n",
|
| 94 |
+
"3 Какое юридическое действие предприняли Генерал... \n",
|
| 95 |
+
"4 Где и кем был обнаружен непилотируемый летател... \n",
|
| 96 |
+
"\n",
|
| 97 |
+
" real_question \n",
|
| 98 |
+
"0 Что там с Кузнецовым — его в Германию выдадут ... \n",
|
| 99 |
+
"1 Сколько человек в автобусе 793 пострадали, ког... \n",
|
| 100 |
+
"2 Кто там утвердил новую миграционную концепцию ... \n",
|
| 101 |
+
"3 Что Генпрокуратура и Минюст сделали с сатанист... \n",
|
| 102 |
+
"4 Что там польские пограничники нашли рядом с Бе... "
|
| 103 |
+
]
|
| 104 |
+
},
|
| 105 |
+
"execution_count": 5,
|
| 106 |
+
"metadata": {},
|
| 107 |
+
"output_type": "execute_result"
|
| 108 |
+
}
|
| 109 |
+
],
|
| 110 |
+
"source": [
|
| 111 |
+
"from src.evaluation import QAEvaluator\n",
|
| 112 |
+
"from dotenv import load_dotenv\n",
|
| 113 |
+
"import pandas as pd\n",
|
| 114 |
+
"import os\n",
|
| 115 |
+
"\n",
|
| 116 |
+
"load_dotenv()\n",
|
| 117 |
+
"\n",
|
| 118 |
+
"qa_df = pd.read_csv(\"generated_qa.csv\")\n",
|
| 119 |
+
"\n",
|
| 120 |
+
"qa_df.head()"
|
| 121 |
+
]
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"cell_type": "code",
|
| 125 |
+
"execution_count": null,
|
| 126 |
+
"metadata": {},
|
| 127 |
+
"outputs": [],
|
| 128 |
+
"source": [
|
| 129 |
+
"evaluator = QAEvaluator(\n",
|
| 130 |
+
" df=qa_df,\n",
|
| 131 |
+
" text_column=\"original_text\",\n",
|
| 132 |
+
" temperature=0.0,\n",
|
| 133 |
+
" api_key=os.getenv(\"OPENROUTER_API_KEY\")\n",
|
| 134 |
+
")\n"
|
| 135 |
+
]
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"cell_type": "code",
|
| 139 |
+
"execution_count": null,
|
| 140 |
+
"metadata": {},
|
| 141 |
+
"outputs": [
|
| 142 |
+
{
|
| 143 |
+
"name": "stdout",
|
| 144 |
+
"output_type": "stream",
|
| 145 |
+
"text": [
|
| 146 |
+
"Всего батчей: 34\n"
|
| 147 |
+
]
|
| 148 |
+
}
|
| 149 |
+
],
|
| 150 |
+
"source": [
|
| 151 |
+
"batch_size = 5\n",
|
| 152 |
+
"question_iterator = evaluator.get_questions(\n",
|
| 153 |
+
" question_column=\"strict_question\", # или \"real_question\"\n",
|
| 154 |
+
" batch_size=batch_size\n",
|
| 155 |
+
")\n",
|
| 156 |
+
"\n",
|
| 157 |
+
"print(f\"Всего батчей: {len(question_iterator)}\")"
|
| 158 |
+
]
|
| 159 |
+
},
|
| 160 |
+
{
|
| 161 |
+
"cell_type": "markdown",
|
| 162 |
+
"metadata": {},
|
| 163 |
+
"source": [
|
| 164 |
+
"## Демонстрация оценки ответов\n",
|
| 165 |
+
"\n",
|
| 166 |
+
"Ниже показан пример оценки ответов от RAG-системы.\n"
|
| 167 |
+
]
|
| 168 |
+
},
|
| 169 |
+
{
|
| 170 |
+
"cell_type": "code",
|
| 171 |
+
"execution_count": null,
|
| 172 |
+
"metadata": {},
|
| 173 |
+
"outputs": [
|
| 174 |
+
{
|
| 175 |
+
"name": "stdout",
|
| 176 |
+
"output_type": "stream",
|
| 177 |
+
"text": [
|
| 178 |
+
"Демо-вопросов: 5\n",
|
| 179 |
+
"Демо-ответов: 5\n"
|
| 180 |
+
]
|
| 181 |
+
}
|
| 182 |
+
],
|
| 183 |
+
"source": [
|
| 184 |
+
"N_DEMO = 5\n",
|
| 185 |
+
"\n",
|
| 186 |
+
"evaluator_demo = QAEvaluator(\n",
|
| 187 |
+
" df=qa_df.head(N_DEMO),\n",
|
| 188 |
+
" text_column=\"original_text\",\n",
|
| 189 |
+
" api_key=os.getenv(\"OPENROUTER_API_KEY\")\n",
|
| 190 |
+
")\n",
|
| 191 |
+
"\n",
|
| 192 |
+
"question_iterator = evaluator_demo.get_questions(\n",
|
| 193 |
+
" question_column=\"strict_question\",\n",
|
| 194 |
+
" batch_size=N_DEMO\n",
|
| 195 |
+
")\n",
|
| 196 |
+
"\n",
|
| 197 |
+
"demo_questions = next(iter(question_iterator))\n",
|
| 198 |
+
"\n",
|
| 199 |
+
"demo_answers = qa_df[\"original_text\"].head(N_DEMO).tolist()\n",
|
| 200 |
+
"\n",
|
| 201 |
+
"print(f\"Демо-вопросов: {len(demo_questions)}\")\n",
|
| 202 |
+
"print(f\"Демо-ответов: {len(demo_answers)}\")"
|
| 203 |
+
]
|
| 204 |
+
},
|
| 205 |
+
{
|
| 206 |
+
"cell_type": "code",
|
| 207 |
+
"execution_count": null,
|
| 208 |
+
"metadata": {},
|
| 209 |
+
"outputs": [
|
| 210 |
+
{
|
| 211 |
+
"name": "stderr",
|
| 212 |
+
"output_type": "stream",
|
| 213 |
+
"text": [
|
| 214 |
+
"Оценка ответов: 100%|██████████| 5/5 [00:05<00:00, 1.07s/it]\n"
|
| 215 |
+
]
|
| 216 |
+
}
|
| 217 |
+
],
|
| 218 |
+
"source": [
|
| 219 |
+
"_ = evaluator_demo.get_questions(\n",
|
| 220 |
+
" question_column=\"strict_question\",\n",
|
| 221 |
+
" batch_size=N_DEMO\n",
|
| 222 |
+
")\n",
|
| 223 |
+
"\n",
|
| 224 |
+
"metrics = evaluator_demo.evaluate_answers(demo_answers, show_progress=True)"
|
| 225 |
+
]
|
| 226 |
+
},
|
| 227 |
+
{
|
| 228 |
+
"cell_type": "code",
|
| 229 |
+
"execution_count": 13,
|
| 230 |
+
"metadata": {},
|
| 231 |
+
"outputs": [
|
| 232 |
+
{
|
| 233 |
+
"data": {
|
| 234 |
+
"text/plain": [
|
| 235 |
+
"['Итальянский суд принял решение экстрадировать в Германию задержанного по подозрению в подрыве «Северных потоков» Сергея Кузнецова, пишет Reuters.\\n\\n🐚 Следить за новостями РБК в Telegram',\n",
|
| 236 |
+
" 'Пять пассажиров автобуса №793 пострадали в ДТП с участием автомобиля, которое произошло на пересечении Мичуринского проспекта и проезда Олимпийской Деревни в Москве, сообщил «Мосгортранс». \\n\\nПо предварительной информации, водитель автомобиля внезапно перестроился в полосу движения автобуса, из-за чего произошло столкновение. Всем пострадавшим оказывается медицинская помощь.\\n\\n🐚 Другие видео этого дня — в телеграм-канале РБК',\n",
|
| 237 |
+
" 'Владимир Путин утвердил концепцию государственной миграционной политики России на 2026-2030 годы.\\n\\nРезультатом политики должно стать снижение числа нелегальных мигрантов и преступлений.\\n\\n🐚 Следить за новостями РБК в Telegram',\n",
|
| 238 |
+
" 'Генпрокуратура и Минюст подали в Верховный суд иск с требованием признать экстремистской организацией «Международное движение сатанизма» и запретить его деятельность на территории России, передает РАПСИ. \\n\\nДепутаты, военные, священники и общественники в России регулярно заявляют о необходимости бороться с «сатанизмом». Круглый стол на эту тему, например, проходил 8 апреля в Госдуме. Депутат от «Справедливой России — За правду» актер Николай Бурляев тогда рассказал, что в Госдуму поступают обращения граждан, которые обеспокоены деструктивными явлениями в книгах, фильмах, аниме, а также сообщениями о «сатанинских секс-оргиях» в Москве и других городах.\\n\\nФото: Getty',\n",
|
| 239 |
+
" 'Обломки дрона обнаружили польские пограничники в деревне возле границы с Белоруссией, сообщает Reuters.\\n\\n«Этот дрон рухнул у границы, примерно в 300 метрах от пограничного перехода, в деревне Полатыче. Дрон не вооружен, на корпусе имеются надписи на кириллице», — рассказала Агнешка Кепка из прокуратуры города Люблин на пресс-конференции.\\n\\nВоенные полицейские опрашивают свидетелей и проверяют видеозаписи, чтобы установить траекторию полета дрона, добавила она. Никто не пострадал, подчеркнули в полиции.\\n\\nФото: Kuba Stezycki / Reuters\\n\\n🐚 Следить за новостями РБК в Telegram']"
|
| 240 |
+
]
|
| 241 |
+
},
|
| 242 |
+
"execution_count": 13,
|
| 243 |
+
"metadata": {},
|
| 244 |
+
"output_type": "execute_result"
|
| 245 |
+
}
|
| 246 |
+
],
|
| 247 |
+
"source": [
|
| 248 |
+
"demo_answers"
|
| 249 |
+
]
|
| 250 |
+
},
|
| 251 |
+
{
|
| 252 |
+
"cell_type": "code",
|
| 253 |
+
"execution_count": 12,
|
| 254 |
+
"metadata": {},
|
| 255 |
+
"outputs": [
|
| 256 |
+
{
|
| 257 |
+
"data": {
|
| 258 |
+
"text/plain": [
|
| 259 |
+
"{'total_questions': 5,\n",
|
| 260 |
+
" 'valid_answers': 4,\n",
|
| 261 |
+
" 'invalid_answers': 1,\n",
|
| 262 |
+
" 'accuracy': 0.8,\n",
|
| 263 |
+
" 'avg_relevance': 0.9,\n",
|
| 264 |
+
" 'avg_completeness': 0.9,\n",
|
| 265 |
+
" 'avg_factual_accuracy': 1.0,\n",
|
| 266 |
+
" 'combined_score': 0.9333333333333332,\n",
|
| 267 |
+
" 'detailed_results': [{'index': 0,\n",
|
| 268 |
+
" 'question': 'Какое решение приняло итальянское судопроизводство в отношении экстрадиции Сергея Кузнецова в связи с подозрениями в причастности к подрыву газопроводов «Северные потоки»?',\n",
|
| 269 |
+
" 'answer': 'Итальянский суд принял решение экстрадировать в Германию задержанного по подозрению в подрыве «Северных потоков» Сергея Кузнецова, пишет Reuters.\\n\\n🐚 Следить за новостями РБК в Telegram',\n",
|
| 270 |
+
" 'is_valid': True,\n",
|
| 271 |
+
" 'relevance_score': 1.0,\n",
|
| 272 |
+
" 'completeness_score': 1.0,\n",
|
| 273 |
+
" 'factual_accuracy_score': 1.0},\n",
|
| 274 |
+
" {'index': 1,\n",
|
| 275 |
+
" 'question': 'Сколько пассажиров автобуса №793 пострадали в дорожно-транспортном происшествии на пересечении Мичуринского проспекта и проезда Олимпийской Деревни в Москве, вызванном внезапным перестроением автомобиля?',\n",
|
| 276 |
+
" 'answer': 'Пять пассажиров автобуса №793 пострадали в ДТП с участием автомобиля, которое произошло на пересечении Мичуринского проспекта и проезда Олимпийской Деревни в Москве, сообщил «Мосгортранс». \\n\\nПо предварительной информации, водитель автомобиля внезапно перестроился в полосу движения автобуса, из-за чего произошло столкновение. Всем пострадавшим оказывается медицинская помощь.\\n\\n🐚 Другие видео этого дня — в телеграм-канале РБК',\n",
|
| 277 |
+
" 'is_valid': True,\n",
|
| 278 |
+
" 'relevance_score': 1.0,\n",
|
| 279 |
+
" 'completeness_score': 1.0,\n",
|
| 280 |
+
" 'factual_accuracy_score': 1.0},\n",
|
| 281 |
+
" {'index': 2,\n",
|
| 282 |
+
" 'question': 'Кто утвердил концепцию государственной миграционной политики Российской Федерации на 2026–2030 годы с целью снижения числа нелегальных мигрантов и преступлений?',\n",
|
| 283 |
+
" 'answer': 'Владимир Путин утвердил концепцию государственной миграционной политики России на 2026-2030 годы.\\n\\nРезультатом политики должно стать снижение числа нелегальных мигрантов и преступлений.\\n\\n🐚 Следить за новостями РБК в Telegram',\n",
|
| 284 |
+
" 'is_valid': True,\n",
|
| 285 |
+
" 'relevance_score': 1.0,\n",
|
| 286 |
+
" 'completeness_score': 1.0,\n",
|
| 287 |
+
" 'factual_accuracy_score': 1.0},\n",
|
| 288 |
+
" {'index': 3,\n",
|
| 289 |
+
" 'question': 'Какое юридическое действие предприняли Генеральная прокуратура Российской Федерации и Министерство юстиции Российской Федерации в отношении «Международного движения сатанизма»?',\n",
|
| 290 |
+
" 'answer': 'Генпрокуратура и Минюст подали в Верховный суд иск с требованием признать экстремистской организацией «Международное движение сатанизма» и запретить его деятельность на территории России, передает РАПСИ. \\n\\nДепутаты, военные, священники и общественники в России регулярно заявляют о необходимости бороться с «сатанизмом». Круглый стол на эту тему, например, проходил 8 апреля в Госдуме. Депутат от «Справедливой России — За правду» актер Николай Бурляев тогда рассказал, что в Госдуму поступают обращения граждан, которые обеспокоены деструктивными явлениями в книгах, фильмах, аниме, а также сообщениями о «сатанинских секс-оргиях» в Москве и других городах.\\n\\nФото: Getty',\n",
|
| 291 |
+
" 'is_valid': False,\n",
|
| 292 |
+
" 'relevance_score': 0.5,\n",
|
| 293 |
+
" 'completeness_score': 0.5,\n",
|
| 294 |
+
" 'factual_accuracy_score': 1.0},\n",
|
| 295 |
+
" {'index': 4,\n",
|
| 296 |
+
" 'question': 'Где и кем был обнаружен непилотируемый летательный аппарат без вооружения с надписями на кириллице, и в каком именно месте произошло это событие?',\n",
|
| 297 |
+
" 'answer': 'Обломки дрона обнаружили польские пограничники в деревне возле границы с Белоруссией, сообщает Reuters.\\n\\n«Этот дрон рухнул у границы, примерно в 300 метрах от пограничного перехода, в деревне Полатыче. Дрон не вооружен, на корпусе имеются надписи на кириллице», — рассказала Агнешка Кепка из прокуратуры города Люблин на пресс-конференции.\\n\\nВоенные полицейские опрашивают свидетелей и проверяют видеозаписи, чтобы установить траекторию полета дрона, добавила она. Никто не пострадал, подчеркнули в полиции.\\n\\nФото: Kuba Stezycki / Reuters\\n\\n🐚 Следить за новостями РБК в Telegram',\n",
|
| 298 |
+
" 'is_valid': True,\n",
|
| 299 |
+
" 'relevance_score': 1.0,\n",
|
| 300 |
+
" 'completeness_score': 1.0,\n",
|
| 301 |
+
" 'factual_accuracy_score': 1.0}]}"
|
| 302 |
+
]
|
| 303 |
+
},
|
| 304 |
+
"execution_count": 12,
|
| 305 |
+
"metadata": {},
|
| 306 |
+
"output_type": "execute_result"
|
| 307 |
+
}
|
| 308 |
+
],
|
| 309 |
+
"source": [
|
| 310 |
+
"metrics"
|
| 311 |
+
]
|
| 312 |
+
},
|
| 313 |
+
{
|
| 314 |
+
"cell_type": "code",
|
| 315 |
+
"execution_count": null,
|
| 316 |
+
"metadata": {},
|
| 317 |
+
"outputs": [
|
| 318 |
+
{
|
| 319 |
+
"data": {
|
| 320 |
+
"text/html": [
|
| 321 |
+
"<div>\n",
|
| 322 |
+
"<style scoped>\n",
|
| 323 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 324 |
+
" vertical-align: middle;\n",
|
| 325 |
+
" }\n",
|
| 326 |
+
"\n",
|
| 327 |
+
" .dataframe tbody tr th {\n",
|
| 328 |
+
" vertical-align: top;\n",
|
| 329 |
+
" }\n",
|
| 330 |
+
"\n",
|
| 331 |
+
" .dataframe thead th {\n",
|
| 332 |
+
" text-align: right;\n",
|
| 333 |
+
" }\n",
|
| 334 |
+
"</style>\n",
|
| 335 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 336 |
+
" <thead>\n",
|
| 337 |
+
" <tr style=\"text-align: right;\">\n",
|
| 338 |
+
" <th></th>\n",
|
| 339 |
+
" <th>index</th>\n",
|
| 340 |
+
" <th>question</th>\n",
|
| 341 |
+
" <th>answer</th>\n",
|
| 342 |
+
" <th>is_valid</th>\n",
|
| 343 |
+
" <th>relevance_score</th>\n",
|
| 344 |
+
" <th>completeness_score</th>\n",
|
| 345 |
+
" <th>factual_accuracy_score</th>\n",
|
| 346 |
+
" </tr>\n",
|
| 347 |
+
" </thead>\n",
|
| 348 |
+
" <tbody>\n",
|
| 349 |
+
" <tr>\n",
|
| 350 |
+
" <th>0</th>\n",
|
| 351 |
+
" <td>0</td>\n",
|
| 352 |
+
" <td>Какое решение приняло итальянское судопроизвод...</td>\n",
|
| 353 |
+
" <td>Итальянский суд принял решение экстрадировать ...</td>\n",
|
| 354 |
+
" <td>True</td>\n",
|
| 355 |
+
" <td>1.0</td>\n",
|
| 356 |
+
" <td>1.0</td>\n",
|
| 357 |
+
" <td>1.0</td>\n",
|
| 358 |
+
" </tr>\n",
|
| 359 |
+
" <tr>\n",
|
| 360 |
+
" <th>1</th>\n",
|
| 361 |
+
" <td>1</td>\n",
|
| 362 |
+
" <td>Сколько пассажиров автобуса №793 пострадали в ...</td>\n",
|
| 363 |
+
" <td>Пять пассажиров автобуса №793 пострадали в ДТП...</td>\n",
|
| 364 |
+
" <td>True</td>\n",
|
| 365 |
+
" <td>1.0</td>\n",
|
| 366 |
+
" <td>1.0</td>\n",
|
| 367 |
+
" <td>1.0</td>\n",
|
| 368 |
+
" </tr>\n",
|
| 369 |
+
" <tr>\n",
|
| 370 |
+
" <th>2</th>\n",
|
| 371 |
+
" <td>2</td>\n",
|
| 372 |
+
" <td>Кто утвердил концепцию государственной миграци...</td>\n",
|
| 373 |
+
" <td>Владимир Путин утвердил концепцию государствен...</td>\n",
|
| 374 |
+
" <td>True</td>\n",
|
| 375 |
+
" <td>1.0</td>\n",
|
| 376 |
+
" <td>1.0</td>\n",
|
| 377 |
+
" <td>1.0</td>\n",
|
| 378 |
+
" </tr>\n",
|
| 379 |
+
" <tr>\n",
|
| 380 |
+
" <th>3</th>\n",
|
| 381 |
+
" <td>3</td>\n",
|
| 382 |
+
" <td>Какое юридическое действие предприняли Генерал...</td>\n",
|
| 383 |
+
" <td>Генпрокуратура и Минюст подали в Верховный суд...</td>\n",
|
| 384 |
+
" <td>False</td>\n",
|
| 385 |
+
" <td>0.5</td>\n",
|
| 386 |
+
" <td>0.5</td>\n",
|
| 387 |
+
" <td>1.0</td>\n",
|
| 388 |
+
" </tr>\n",
|
| 389 |
+
" <tr>\n",
|
| 390 |
+
" <th>4</th>\n",
|
| 391 |
+
" <td>4</td>\n",
|
| 392 |
+
" <td>Где и кем был обнаружен непилотируемый летател...</td>\n",
|
| 393 |
+
" <td>Обломки дрона обнаружили польские пограничники...</td>\n",
|
| 394 |
+
" <td>True</td>\n",
|
| 395 |
+
" <td>1.0</td>\n",
|
| 396 |
+
" <td>1.0</td>\n",
|
| 397 |
+
" <td>1.0</td>\n",
|
| 398 |
+
" </tr>\n",
|
| 399 |
+
" </tbody>\n",
|
| 400 |
+
"</table>\n",
|
| 401 |
+
"</div>"
|
| 402 |
+
],
|
| 403 |
+
"text/plain": [
|
| 404 |
+
" index question \\\n",
|
| 405 |
+
"0 0 Какое решение приняло итальянское судопроизвод... \n",
|
| 406 |
+
"1 1 Сколько пассажиров автобуса №793 пострадали в ... \n",
|
| 407 |
+
"2 2 Кто утвердил концепцию государственной миграци... \n",
|
| 408 |
+
"3 3 Какое юридическое действие предприняли Генерал... \n",
|
| 409 |
+
"4 4 Где и кем был обнаружен непилотируемый летател... \n",
|
| 410 |
+
"\n",
|
| 411 |
+
" answer is_valid \\\n",
|
| 412 |
+
"0 Итальянский суд принял решение экстрадировать ... True \n",
|
| 413 |
+
"1 Пять пассажиров автобуса №793 пострадали в ДТП... True \n",
|
| 414 |
+
"2 Владимир Путин утвердил концепцию государствен... True \n",
|
| 415 |
+
"3 Генпрокуратура и Минюст подали в Верховный суд... False \n",
|
| 416 |
+
"4 Обломки дрона обнаружили польские пограничники... True \n",
|
| 417 |
+
"\n",
|
| 418 |
+
" relevance_score completeness_score factual_accuracy_score \n",
|
| 419 |
+
"0 1.0 1.0 1.0 \n",
|
| 420 |
+
"1 1.0 1.0 1.0 \n",
|
| 421 |
+
"2 1.0 1.0 1.0 \n",
|
| 422 |
+
"3 0.5 0.5 1.0 \n",
|
| 423 |
+
"4 1.0 1.0 1.0 "
|
| 424 |
+
]
|
| 425 |
+
},
|
| 426 |
+
"execution_count": 14,
|
| 427 |
+
"metadata": {},
|
| 428 |
+
"output_type": "execute_result"
|
| 429 |
+
}
|
| 430 |
+
],
|
| 431 |
+
"source": [
|
| 432 |
+
"results_df = evaluator_demo.get_detailed_results_df(metrics)\n",
|
| 433 |
+
"results_df"
|
| 434 |
+
]
|
| 435 |
+
},
|
| 436 |
+
{
|
| 437 |
+
"cell_type": "code",
|
| 438 |
+
"execution_count": null,
|
| 439 |
+
"metadata": {},
|
| 440 |
+
"outputs": [],
|
| 441 |
+
"source": []
|
| 442 |
+
}
|
| 443 |
+
],
|
| 444 |
+
"metadata": {
|
| 445 |
+
"kernelspec": {
|
| 446 |
+
"display_name": "venv",
|
| 447 |
+
"language": "python",
|
| 448 |
+
"name": "python3"
|
| 449 |
+
},
|
| 450 |
+
"language_info": {
|
| 451 |
+
"codemirror_mode": {
|
| 452 |
+
"name": "ipython",
|
| 453 |
+
"version": 3
|
| 454 |
+
},
|
| 455 |
+
"file_extension": ".py",
|
| 456 |
+
"mimetype": "text/x-python",
|
| 457 |
+
"name": "python",
|
| 458 |
+
"nbconvert_exporter": "python",
|
| 459 |
+
"pygments_lexer": "ipython3",
|
| 460 |
+
"version": "3.13.3"
|
| 461 |
+
}
|
| 462 |
+
},
|
| 463 |
+
"nbformat": 4,
|
| 464 |
+
"nbformat_minor": 2
|
| 465 |
+
}
|
question_generation.ipynb
ADDED
|
@@ -0,0 +1,591 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 54,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [
|
| 8 |
+
{
|
| 9 |
+
"name": "stdout",
|
| 10 |
+
"output_type": "stream",
|
| 11 |
+
"text": [
|
| 12 |
+
"✅ API ключ загружен\n"
|
| 13 |
+
]
|
| 14 |
+
}
|
| 15 |
+
],
|
| 16 |
+
"source": [
|
| 17 |
+
"import os\n",
|
| 18 |
+
"from typing import Literal, Optional\n",
|
| 19 |
+
"from dotenv import load_dotenv\n",
|
| 20 |
+
"from pydantic import BaseModel, Field\n",
|
| 21 |
+
"from langchain_openai import ChatOpenAI\n",
|
| 22 |
+
"from langchain_core.prompts import ChatPromptTemplate\n",
|
| 23 |
+
"from langchain_core.output_parsers import PydanticOutputParser\n",
|
| 24 |
+
"import pandas as pd\n",
|
| 25 |
+
"from tqdm import tqdm\n",
|
| 26 |
+
"\n",
|
| 27 |
+
"load_dotenv()\n",
|
| 28 |
+
"\n",
|
| 29 |
+
"OPENROUTER_API_KEY = os.getenv(\"OPENROUTER_API_KEY\")\n",
|
| 30 |
+
"if not OPENROUTER_API_KEY:\n",
|
| 31 |
+
" raise ValueError(\"Не найден OPENROUTER_API_KEY в переменных окружения\")\n",
|
| 32 |
+
"\n",
|
| 33 |
+
"print(\"✅ API ключ загружен\")\n"
|
| 34 |
+
]
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"cell_type": "markdown",
|
| 38 |
+
"metadata": {},
|
| 39 |
+
"source": [
|
| 40 |
+
"## Загрузка и фильтрация результатов классификации\n"
|
| 41 |
+
]
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"cell_type": "code",
|
| 45 |
+
"execution_count": 80,
|
| 46 |
+
"metadata": {},
|
| 47 |
+
"outputs": [],
|
| 48 |
+
"source": [
|
| 49 |
+
"df = pd.read_csv(\"classification_results.csv\")\n",
|
| 50 |
+
"filtered_df = df.loc[df[\"is_unambiguous\"] & df[\"category\"].isin([\"event\", \"statement\", \"fact\"])]"
|
| 51 |
+
]
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"cell_type": "markdown",
|
| 55 |
+
"metadata": {},
|
| 56 |
+
"source": [
|
| 57 |
+
"## Генерация вопросов с помощью LangChain\n"
|
| 58 |
+
]
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"cell_type": "code",
|
| 62 |
+
"execution_count": 81,
|
| 63 |
+
"metadata": {},
|
| 64 |
+
"outputs": [],
|
| 65 |
+
"source": [
|
| 66 |
+
"# Шаг 1: Модель для извлечения ответа\n",
|
| 67 |
+
"class ExtractedAnswer(BaseModel):\n",
|
| 68 |
+
" \"\"\"Извлечённый ответ из main_fact\"\"\"\n",
|
| 69 |
+
" \n",
|
| 70 |
+
" answer: str = Field(\n",
|
| 71 |
+
" description=\"Краткий, конкретный ответ, который можно дать на вопрос о main_fact\"\n",
|
| 72 |
+
" )\n",
|
| 73 |
+
" answer_type: Literal[\"entity\", \"number\", \"date\", \"action\", \"description\"] = Field(\n",
|
| 74 |
+
" description=\"Тип ответа: entity - сущность/название, number - число, date - дата, action - действие, description - описание\"\n",
|
| 75 |
+
" )\n",
|
| 76 |
+
" key_info: str = Field(\n",
|
| 77 |
+
" description=\"Ключевая информация, которая ОБЯЗАТЕЛЬНО должна присутствовать в любом корректном ответе\"\n",
|
| 78 |
+
" )\n",
|
| 79 |
+
"\n",
|
| 80 |
+
"\n",
|
| 81 |
+
"# Шаг 2: Модель для генерации вопросов к ответу\n",
|
| 82 |
+
"class QuestionPair(BaseModel):\n",
|
| 83 |
+
" \"\"\"Пара вопросов с одинаковым ответом\"\"\"\n",
|
| 84 |
+
" \n",
|
| 85 |
+
" strict_question: str = Field(\n",
|
| 86 |
+
" description=\"Формальный, точный вопрос. Конкретный и однозначный.\"\n",
|
| 87 |
+
" )\n",
|
| 88 |
+
" real_question: str = Field(\n",
|
| 89 |
+
" description=\"Разговорная, человечная формулировка того же вопроса. Как бы спросил обычный человек.\"\n",
|
| 90 |
+
" )\n",
|
| 91 |
+
" question_type: Literal[\"what\", \"when\", \"where\", \"who\", \"how_much\", \"how_many\", \"why\", \"how\"] = Field(\n",
|
| 92 |
+
" description=\"Тип вопроса\"\n",
|
| 93 |
+
" )\n",
|
| 94 |
+
"\n",
|
| 95 |
+
"\n",
|
| 96 |
+
"class QAResult(BaseModel):\n",
|
| 97 |
+
" \"\"\"Финальный результат: ответ + 2 вопроса\"\"\"\n",
|
| 98 |
+
" \n",
|
| 99 |
+
" answer: ExtractedAnswer\n",
|
| 100 |
+
" questions: QuestionPair\n"
|
| 101 |
+
]
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"cell_type": "code",
|
| 105 |
+
"execution_count": 62,
|
| 106 |
+
"metadata": {},
|
| 107 |
+
"outputs": [
|
| 108 |
+
{
|
| 109 |
+
"name": "stdout",
|
| 110 |
+
"output_type": "stream",
|
| 111 |
+
"text": [
|
| 112 |
+
"✅ Двухшаговый агент создан (модель: qwen/qwen3-next-80b-a3b-instruct)\n"
|
| 113 |
+
]
|
| 114 |
+
}
|
| 115 |
+
],
|
| 116 |
+
"source": [
|
| 117 |
+
"def create_llm(model: str = \"openai/gpt-4o-mini\", temperature: float = 0.0) -> ChatOpenAI:\n",
|
| 118 |
+
" \"\"\"Создает экземпляр LLM через OpenRouter\"\"\"\n",
|
| 119 |
+
" return ChatOpenAI(\n",
|
| 120 |
+
" model=model,\n",
|
| 121 |
+
" temperature=temperature,\n",
|
| 122 |
+
" openai_api_key=OPENROUTER_API_KEY,\n",
|
| 123 |
+
" openai_api_base=\"https://api.proxyapi.ru/openrouter/v1\",\n",
|
| 124 |
+
" )\n",
|
| 125 |
+
"\n",
|
| 126 |
+
"MODEL_NAME = \"qwen/qwen3-next-80b-a3b-instruct\"\n",
|
| 127 |
+
"llm = create_llm(model=MODEL_NAME, temperature=0.3)\n",
|
| 128 |
+
"\n",
|
| 129 |
+
"# === ШАГ 1: Извлечение ответа ===\n",
|
| 130 |
+
"answer_parser = PydanticOutputParser(pydantic_object=ExtractedAnswer)\n",
|
| 131 |
+
"\n",
|
| 132 |
+
"answer_prompt = ChatPromptTemplate.from_messages([\n",
|
| 133 |
+
" (\"system\", \"\"\"Ты - эксперт по извлечению ключевой информации из новостных текстов.\n",
|
| 134 |
+
"\n",
|
| 135 |
+
"Твоя задача - извлечь КРАТКИЙ ОТВЕТ из main_fact. Этот ответ будет использоваться как эталонный ответ на вопросы.\n",
|
| 136 |
+
"\n",
|
| 137 |
+
"## Правила:\n",
|
| 138 |
+
"1. Ответ должен быть КРАТКИМ и КОНКРЕТНЫМ (1-2 предложения максимум)\n",
|
| 139 |
+
"2. Ответ должен содержать ГЛАВНУЮ информацию из main_fact\n",
|
| 140 |
+
"3. Определи тип ответа: сущность, число, дата, действие или описание\n",
|
| 141 |
+
"4. Выдели key_info - минимальную информацию, без которой ответ будет неполным\n",
|
| 142 |
+
"\n",
|
| 143 |
+
"{format_instructions}\"\"\"),\n",
|
| 144 |
+
" (\"human\", \"\"\"Извлеки ответ из следующего факта:\n",
|
| 145 |
+
"\n",
|
| 146 |
+
"## main_fact: {main_fact}\"\"\")\n",
|
| 147 |
+
"])\n",
|
| 148 |
+
"\n",
|
| 149 |
+
"answer_chain = answer_prompt | llm | answer_parser\n",
|
| 150 |
+
"\n",
|
| 151 |
+
"# === ШАГ 2: Генерация вопросов к ответу ===\n",
|
| 152 |
+
"question_parser = PydanticOutputParser(pydantic_object=QuestionPair)\n",
|
| 153 |
+
"\n",
|
| 154 |
+
"question_prompt = ChatPromptTemplate.from_messages([\n",
|
| 155 |
+
" (\"system\", \"\"\"Ты - эксперт по созданию вопросов для систем вопрос-ответ (QA).\n",
|
| 156 |
+
"\n",
|
| 157 |
+
"Тебе дан ОТВЕТ. Твоя задача - сгенерировать 2 ВОПРОСА, на которые этот ответ будет ЕДИНСТВЕННО ВЕРНЫМ.\n",
|
| 158 |
+
"\n",
|
| 159 |
+
"## КРИТИЧЕСКИ ВАЖНО:\n",
|
| 160 |
+
"- Оба вопроса ДОЛЖНЫ иметь ОДИНАКОВЫЙ ответ = \"{answer}\"\n",
|
| 161 |
+
"- Вопросы отличаются ТОЛЬКО стилем формулировки, НЕ содержанием\n",
|
| 162 |
+
"\n",
|
| 163 |
+
"## strict_question (формальный):\n",
|
| 164 |
+
"- Точная, академическая формулировка\n",
|
| 165 |
+
"- Использует полные названия и термины\n",
|
| 166 |
+
"- Пример: \"Какое решение принял Центральный банк РФ относительно ключевой ставки?\"\n",
|
| 167 |
+
"\n",
|
| 168 |
+
"## real_question (разговорный):\n",
|
| 169 |
+
"- Как спросил бы обычный человек в разговоре\n",
|
| 170 |
+
"- Может опускать детали, которые понятны из контекста\n",
|
| 171 |
+
"- Пример: \"Что там ЦБ со ставкой сделал?\"\n",
|
| 172 |
+
"\n",
|
| 173 |
+
"{format_instructions}\"\"\"),\n",
|
| 174 |
+
" (\"human\", \"\"\"Сгенерируй 2 вопроса для следующего:\n",
|
| 175 |
+
"\n",
|
| 176 |
+
"## ОТВЕТ (должен быть одинаковым для обоих вопросов): {answer}\n",
|
| 177 |
+
"## Ключевая информация: {key_info}\n",
|
| 178 |
+
"## Контекст (main_fact): {main_fact}\n",
|
| 179 |
+
"\n",
|
| 180 |
+
"Помни: оба вопроса должны подразумевать ОДИН И ТОТ ЖЕ ответ!\"\"\")\n",
|
| 181 |
+
"])\n",
|
| 182 |
+
"\n",
|
| 183 |
+
"question_chain = question_prompt | llm | question_parser\n",
|
| 184 |
+
"\n",
|
| 185 |
+
"print(f\"✅ Двухшаговый агент создан (модель: {MODEL_NAME})\")\n"
|
| 186 |
+
]
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"cell_type": "code",
|
| 190 |
+
"execution_count": 63,
|
| 191 |
+
"metadata": {},
|
| 192 |
+
"outputs": [
|
| 193 |
+
{
|
| 194 |
+
"name": "stdout",
|
| 195 |
+
"output_type": "stream",
|
| 196 |
+
"text": [
|
| 197 |
+
"✅ QA-агент создан\n"
|
| 198 |
+
]
|
| 199 |
+
}
|
| 200 |
+
],
|
| 201 |
+
"source": [
|
| 202 |
+
"class QAAgent:\n",
|
| 203 |
+
" \"\"\"Двухшаговый агент: сначала ответ, потом вопросы\"\"\"\n",
|
| 204 |
+
" \n",
|
| 205 |
+
" def __init__(self):\n",
|
| 206 |
+
" self.answer_chain = answer_chain\n",
|
| 207 |
+
" self.question_chain = question_chain\n",
|
| 208 |
+
" self.answer_parser = answer_parser\n",
|
| 209 |
+
" self.question_parser = question_parser\n",
|
| 210 |
+
" \n",
|
| 211 |
+
" def generate(self, row: pd.Series) -> Optional[QAResult]:\n",
|
| 212 |
+
" \"\"\"Генерирует ответ и вопросы для одной записи\"\"\"\n",
|
| 213 |
+
" main_fact = row.get(\"main_fact\", \"\")\n",
|
| 214 |
+
" \n",
|
| 215 |
+
" # Шаг 1: Извлекаем ответ\n",
|
| 216 |
+
" try:\n",
|
| 217 |
+
" answer_result = self.answer_chain.invoke({\n",
|
| 218 |
+
" \"main_fact\": main_fact,\n",
|
| 219 |
+
" \"format_instructions\": self.answer_parser.get_format_instructions()\n",
|
| 220 |
+
" })\n",
|
| 221 |
+
" except Exception as e:\n",
|
| 222 |
+
" print(f\"Ошибка извлечения ответа: {e}\")\n",
|
| 223 |
+
" return None\n",
|
| 224 |
+
" \n",
|
| 225 |
+
" # Шаг 2: Генерируем вопросы к этому ответу\n",
|
| 226 |
+
" try:\n",
|
| 227 |
+
" questions_result = self.question_chain.invoke({\n",
|
| 228 |
+
" \"answer\": answer_result.answer,\n",
|
| 229 |
+
" \"key_info\": answer_result.key_info,\n",
|
| 230 |
+
" \"main_fact\": main_fact,\n",
|
| 231 |
+
" \"format_instructions\": self.question_parser.get_format_instructions()\n",
|
| 232 |
+
" })\n",
|
| 233 |
+
" except Exception as e:\n",
|
| 234 |
+
" print(f\"Ошибка генерации вопросов: {e}\")\n",
|
| 235 |
+
" return None\n",
|
| 236 |
+
" \n",
|
| 237 |
+
" return QAResult(answer=answer_result, questions=questions_result)\n",
|
| 238 |
+
" \n",
|
| 239 |
+
" def generate_batch(self, df: pd.DataFrame, show_progress: bool = True) -> list[dict]:\n",
|
| 240 |
+
" \"\"\"Генерирует QA-пары для всего DataFrame\"\"\"\n",
|
| 241 |
+
" results = []\n",
|
| 242 |
+
" iterator = tqdm(df.iterrows(), total=len(df), desc=\"Генерация QA\") if show_progress else df.iterrows()\n",
|
| 243 |
+
" \n",
|
| 244 |
+
" for idx, row in iterator:\n",
|
| 245 |
+
" try:\n",
|
| 246 |
+
" qa_result = self.generate(row)\n",
|
| 247 |
+
" except KeyboardInterrupt:\n",
|
| 248 |
+
" break\n",
|
| 249 |
+
" \n",
|
| 250 |
+
" if qa_result:\n",
|
| 251 |
+
" results.append({\n",
|
| 252 |
+
" \"index\": idx,\n",
|
| 253 |
+
" \"original_text\": row.get(\"original_text\", \"\"),\n",
|
| 254 |
+
" \"main_topic\": row.get(\"main_topic\", \"\"),\n",
|
| 255 |
+
" \"main_fact\": row.get(\"main_fact\", \"\"),\n",
|
| 256 |
+
" \"answer\": qa_result.answer.answer,\n",
|
| 257 |
+
" \"answer_type\": qa_result.answer.answer_type,\n",
|
| 258 |
+
" \"key_info\": qa_result.answer.key_info,\n",
|
| 259 |
+
" \"strict_question\": qa_result.questions.strict_question,\n",
|
| 260 |
+
" \"real_question\": qa_result.questions.real_question,\n",
|
| 261 |
+
" \"question_type\": qa_result.questions.question_type,\n",
|
| 262 |
+
" })\n",
|
| 263 |
+
" else:\n",
|
| 264 |
+
" results.append({\n",
|
| 265 |
+
" \"index\": idx,\n",
|
| 266 |
+
" \"original_text\": row.get(\"original_text\", \"\"),\n",
|
| 267 |
+
" \"main_topic\": row.get(\"main_topic\", \"\"),\n",
|
| 268 |
+
" \"main_fact\": row.get(\"main_fact\", \"\"),\n",
|
| 269 |
+
" \"answer\": None,\n",
|
| 270 |
+
" \"answer_type\": None,\n",
|
| 271 |
+
" \"key_info\": None,\n",
|
| 272 |
+
" \"strict_question\": None,\n",
|
| 273 |
+
" \"real_question\": None,\n",
|
| 274 |
+
" \"question_type\": None,\n",
|
| 275 |
+
" })\n",
|
| 276 |
+
" \n",
|
| 277 |
+
" return results\n",
|
| 278 |
+
"\n",
|
| 279 |
+
"\n",
|
| 280 |
+
"# Создаем агента\n",
|
| 281 |
+
"agent = QAAgent()\n",
|
| 282 |
+
"print(\"✅ QA-агент создан\")\n"
|
| 283 |
+
]
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"cell_type": "markdown",
|
| 287 |
+
"metadata": {},
|
| 288 |
+
"source": [
|
| 289 |
+
"## Генерация вопросов для отфильтрованных данных\n"
|
| 290 |
+
]
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"cell_type": "code",
|
| 294 |
+
"execution_count": 64,
|
| 295 |
+
"metadata": {},
|
| 296 |
+
"outputs": [
|
| 297 |
+
{
|
| 298 |
+
"name": "stderr",
|
| 299 |
+
"output_type": "stream",
|
| 300 |
+
"text": [
|
| 301 |
+
"Генерация QA: 100%|██████████| 167/167 [09:09<00:00, 3.29s/it]"
|
| 302 |
+
]
|
| 303 |
+
},
|
| 304 |
+
{
|
| 305 |
+
"name": "stdout",
|
| 306 |
+
"output_type": "stream",
|
| 307 |
+
"text": [
|
| 308 |
+
"\n",
|
| 309 |
+
"✅ Сгенерировано QA-пар: 167/167\n"
|
| 310 |
+
]
|
| 311 |
+
},
|
| 312 |
+
{
|
| 313 |
+
"name": "stderr",
|
| 314 |
+
"output_type": "stream",
|
| 315 |
+
"text": [
|
| 316 |
+
"\n"
|
| 317 |
+
]
|
| 318 |
+
}
|
| 319 |
+
],
|
| 320 |
+
"source": [
|
| 321 |
+
"# Генерация QA-пар для отфильтрованных данных\n",
|
| 322 |
+
"qa_results = agent.generate_batch(filtered_df)\n",
|
| 323 |
+
"\n",
|
| 324 |
+
"print(f\"\\n✅ Сгенерировано QA-пар: {sum(1 for r in qa_results if r['answer'])}/{len(qa_results)}\")\n"
|
| 325 |
+
]
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"cell_type": "code",
|
| 329 |
+
"execution_count": 85,
|
| 330 |
+
"metadata": {},
|
| 331 |
+
"outputs": [
|
| 332 |
+
{
|
| 333 |
+
"data": {
|
| 334 |
+
"text/plain": [
|
| 335 |
+
"message_id 130738\n",
|
| 336 |
+
"original_text Итальянский суд принял решение экстрадировать ...\n",
|
| 337 |
+
"main_topic Экстрадиция Сергея Кузнецова в Германию по под...\n",
|
| 338 |
+
"key_entities Итальянский суд, Германия, Сергей Кузнецов, Се...\n",
|
| 339 |
+
"main_fact Итальянский суд принял решение экстрадировать ...\n",
|
| 340 |
+
"is_unambiguous True\n",
|
| 341 |
+
"confidence 0.95\n",
|
| 342 |
+
"category event\n",
|
| 343 |
+
"search_difficulty easy\n",
|
| 344 |
+
"ambiguity_reasons NaN\n",
|
| 345 |
+
"Name: 6, dtype: object"
|
| 346 |
+
]
|
| 347 |
+
},
|
| 348 |
+
"execution_count": 85,
|
| 349 |
+
"metadata": {},
|
| 350 |
+
"output_type": "execute_result"
|
| 351 |
+
}
|
| 352 |
+
],
|
| 353 |
+
"source": [
|
| 354 |
+
"filtered_df.iloc[0]"
|
| 355 |
+
]
|
| 356 |
+
},
|
| 357 |
+
{
|
| 358 |
+
"cell_type": "code",
|
| 359 |
+
"execution_count": 84,
|
| 360 |
+
"metadata": {},
|
| 361 |
+
"outputs": [
|
| 362 |
+
{
|
| 363 |
+
"data": {
|
| 364 |
+
"text/plain": [
|
| 365 |
+
"{'index': 6,\n",
|
| 366 |
+
" 'original_text': 'Итальянский суд принял решение экстрадировать в Германию задержанного по подозрению в подрыве «Северных потоков» Сергея Кузнецова, пишет Reuters.\\n\\n🐚 Следить за новостями РБК в Telegram',\n",
|
| 367 |
+
" 'main_topic': 'Экстрадиция Сергея Кузнецова в Германию по подозрению в подрыве «Северных потоков»',\n",
|
| 368 |
+
" 'main_fact': 'Итальянский суд принял решение экстрадировать Сергея Кузнецова в Германию, где его подозревают в подрыве газопроводов «Северные потоки».',\n",
|
| 369 |
+
" 'answer': 'Итальянский суд решил экстрадировать Сергея Кузнецова в Германию, где его подозревают в подрыве газопроводов «Северные потоки».',\n",
|
| 370 |
+
" 'answer_type': 'action',\n",
|
| 371 |
+
" 'key_info': 'экстрадировать Сергея Кузнецова в Германию из-за подозрения в подрыве газопроводов «Северные потоки»',\n",
|
| 372 |
+
" 'strict_question': 'Какое решение приняло итальянское судопроизводство в отношении экстрадиции Сергея Кузнецова в связи с подозрениями в причастности к подрыву газопроводов «Северные потоки»?',\n",
|
| 373 |
+
" 'real_question': 'Что там с Кузнецовым — его в Германию выдадут за подрыв «Северных потоков»?',\n",
|
| 374 |
+
" 'question_type': 'what'}"
|
| 375 |
+
]
|
| 376 |
+
},
|
| 377 |
+
"execution_count": 84,
|
| 378 |
+
"metadata": {},
|
| 379 |
+
"output_type": "execute_result"
|
| 380 |
+
}
|
| 381 |
+
],
|
| 382 |
+
"source": [
|
| 383 |
+
"qa_results[0]"
|
| 384 |
+
]
|
| 385 |
+
},
|
| 386 |
+
{
|
| 387 |
+
"cell_type": "code",
|
| 388 |
+
"execution_count": 93,
|
| 389 |
+
"metadata": {},
|
| 390 |
+
"outputs": [],
|
| 391 |
+
"source": [
|
| 392 |
+
"# Преобразование в DataFrame и сохранение\n",
|
| 393 |
+
"qa_df = pd.DataFrame(qa_results)\n",
|
| 394 |
+
"qa_df[\"message_id\"] = filtered_df[\"message_id\"].values\n",
|
| 395 |
+
"qa_df = qa_df[[\"message_id\", \"original_text\", \"strict_question\", \"real_question\"]]\n",
|
| 396 |
+
"\n",
|
| 397 |
+
"# Сохраняем в CSV\n",
|
| 398 |
+
"output_file = \"generated_qa.csv\"\n",
|
| 399 |
+
"qa_df.to_csv(output_file, index=False)"
|
| 400 |
+
]
|
| 401 |
+
},
|
| 402 |
+
{
|
| 403 |
+
"cell_type": "code",
|
| 404 |
+
"execution_count": 94,
|
| 405 |
+
"metadata": {},
|
| 406 |
+
"outputs": [
|
| 407 |
+
{
|
| 408 |
+
"data": {
|
| 409 |
+
"text/html": [
|
| 410 |
+
"<div>\n",
|
| 411 |
+
"<style scoped>\n",
|
| 412 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 413 |
+
" vertical-align: middle;\n",
|
| 414 |
+
" }\n",
|
| 415 |
+
"\n",
|
| 416 |
+
" .dataframe tbody tr th {\n",
|
| 417 |
+
" vertical-align: top;\n",
|
| 418 |
+
" }\n",
|
| 419 |
+
"\n",
|
| 420 |
+
" .dataframe thead th {\n",
|
| 421 |
+
" text-align: right;\n",
|
| 422 |
+
" }\n",
|
| 423 |
+
"</style>\n",
|
| 424 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 425 |
+
" <thead>\n",
|
| 426 |
+
" <tr style=\"text-align: right;\">\n",
|
| 427 |
+
" <th></th>\n",
|
| 428 |
+
" <th>message_id</th>\n",
|
| 429 |
+
" <th>original_text</th>\n",
|
| 430 |
+
" <th>strict_question</th>\n",
|
| 431 |
+
" <th>real_question</th>\n",
|
| 432 |
+
" </tr>\n",
|
| 433 |
+
" </thead>\n",
|
| 434 |
+
" <tbody>\n",
|
| 435 |
+
" <tr>\n",
|
| 436 |
+
" <th>0</th>\n",
|
| 437 |
+
" <td>130738</td>\n",
|
| 438 |
+
" <td>Итальянский суд принял решение экстрадировать ...</td>\n",
|
| 439 |
+
" <td>Какое решение приняло итальянское судопроизвод...</td>\n",
|
| 440 |
+
" <td>Что там с Кузнецовым — его в Германию выдадут ...</td>\n",
|
| 441 |
+
" </tr>\n",
|
| 442 |
+
" <tr>\n",
|
| 443 |
+
" <th>1</th>\n",
|
| 444 |
+
" <td>129361</td>\n",
|
| 445 |
+
" <td>Пять пассажиров автобуса №793 пострадали в ДТП...</td>\n",
|
| 446 |
+
" <td>Сколько пассажиров автобуса №793 пострадали в ...</td>\n",
|
| 447 |
+
" <td>Сколько человек в автобусе 793 пострадали, ког...</td>\n",
|
| 448 |
+
" </tr>\n",
|
| 449 |
+
" <tr>\n",
|
| 450 |
+
" <th>2</th>\n",
|
| 451 |
+
" <td>133468</td>\n",
|
| 452 |
+
" <td>Владимир Путин утвердил концепцию государствен...</td>\n",
|
| 453 |
+
" <td>Кто утвердил концепцию государственной миграци...</td>\n",
|
| 454 |
+
" <td>Кто там утвердил новую миграционную концепцию ...</td>\n",
|
| 455 |
+
" </tr>\n",
|
| 456 |
+
" <tr>\n",
|
| 457 |
+
" <th>3</th>\n",
|
| 458 |
+
" <td>123139</td>\n",
|
| 459 |
+
" <td>Генпрокуратура и Минюст подали в Верховный суд...</td>\n",
|
| 460 |
+
" <td>Какое юридическое действие предприняли Генерал...</td>\n",
|
| 461 |
+
" <td>Что Генпрокуратура и Минюст сделали с сатанист...</td>\n",
|
| 462 |
+
" </tr>\n",
|
| 463 |
+
" <tr>\n",
|
| 464 |
+
" <th>4</th>\n",
|
| 465 |
+
" <td>129894</td>\n",
|
| 466 |
+
" <td>Обломки дрона обнаружили польские пограничники...</td>\n",
|
| 467 |
+
" <td>Где и кем был обнаружен непилотируемый летател...</td>\n",
|
| 468 |
+
" <td>Что там польские пограничники нашли рядом с Бе...</td>\n",
|
| 469 |
+
" </tr>\n",
|
| 470 |
+
" <tr>\n",
|
| 471 |
+
" <th>...</th>\n",
|
| 472 |
+
" <td>...</td>\n",
|
| 473 |
+
" <td>...</td>\n",
|
| 474 |
+
" <td>...</td>\n",
|
| 475 |
+
" <td>...</td>\n",
|
| 476 |
+
" </tr>\n",
|
| 477 |
+
" <tr>\n",
|
| 478 |
+
" <th>162</th>\n",
|
| 479 |
+
" <td>123802</td>\n",
|
| 480 |
+
" <td>Мальчика, которого в Шереметьево мужчина удари...</td>\n",
|
| 481 |
+
" <td>Каков исход медицинского случая двухлетнего ма...</td>\n",
|
| 482 |
+
" <td>Что случилось с мальчиком, которого бросили в ...</td>\n",
|
| 483 |
+
" </tr>\n",
|
| 484 |
+
" <tr>\n",
|
| 485 |
+
" <th>163</th>\n",
|
| 486 |
+
" <td>124166</td>\n",
|
| 487 |
+
" <td>Почти все виды американского оружия, которые с...</td>\n",
|
| 488 |
+
" <td>Каков текущий статус американского оружия, зап...</td>\n",
|
| 489 |
+
" <td>Уже есть всё это оружие для Украины, что НАТО ...</td>\n",
|
| 490 |
+
" </tr>\n",
|
| 491 |
+
" <tr>\n",
|
| 492 |
+
" <th>164</th>\n",
|
| 493 |
+
" <td>136058</td>\n",
|
| 494 |
+
" <td>По планам Банка России, массовое внедрение циф...</td>\n",
|
| 495 |
+
" <td>Когда начнется массовое внедрение цифрового ру...</td>\n",
|
| 496 |
+
" <td>Когда начнут все пользоваться цифровым рублем,...</td>\n",
|
| 497 |
+
" </tr>\n",
|
| 498 |
+
" <tr>\n",
|
| 499 |
+
" <th>165</th>\n",
|
| 500 |
+
" <td>134555</td>\n",
|
| 501 |
+
" <td>В Турции в Гебзе обрушился многоэтажный дом. П...</td>\n",
|
| 502 |
+
" <td>В каком городе Турции обрушился семиэтажный до...</td>\n",
|
| 503 |
+
" <td>Что там в Гебзе с домом обрушился? Пять челове...</td>\n",
|
| 504 |
+
" </tr>\n",
|
| 505 |
+
" <tr>\n",
|
| 506 |
+
" <th>166</th>\n",
|
| 507 |
+
" <td>123088</td>\n",
|
| 508 |
+
" <td>Современный городской квартал сегодня уже дале...</td>\n",
|
| 509 |
+
" <td>Какой девелопер осуществляет строительство жил...</td>\n",
|
| 510 |
+
" <td>Кто строит тот самый квартал Soul рядом с метр...</td>\n",
|
| 511 |
+
" </tr>\n",
|
| 512 |
+
" </tbody>\n",
|
| 513 |
+
"</table>\n",
|
| 514 |
+
"<p>167 rows × 4 columns</p>\n",
|
| 515 |
+
"</div>"
|
| 516 |
+
],
|
| 517 |
+
"text/plain": [
|
| 518 |
+
" message_id original_text \\\n",
|
| 519 |
+
"0 130738 Итальянский суд принял решение экстрадировать ... \n",
|
| 520 |
+
"1 129361 Пять пассажиров автобуса №793 пострадали в ДТП... \n",
|
| 521 |
+
"2 133468 Владимир Путин утвердил концепцию государствен... \n",
|
| 522 |
+
"3 123139 Генпрокуратура и Минюст подали в Верховный суд... \n",
|
| 523 |
+
"4 129894 Обломки дрона обнаружили польские пограничники... \n",
|
| 524 |
+
".. ... ... \n",
|
| 525 |
+
"162 123802 Мальчика, которого в Шереметьево мужчина удари... \n",
|
| 526 |
+
"163 124166 Почти все виды американского оружия, которые с... \n",
|
| 527 |
+
"164 136058 По планам Банка России, массовое внедрение циф... \n",
|
| 528 |
+
"165 134555 В Турции в Гебзе обрушился многоэтажный дом. П... \n",
|
| 529 |
+
"166 123088 Современный городской квартал сегодня уже дале... \n",
|
| 530 |
+
"\n",
|
| 531 |
+
" strict_question \\\n",
|
| 532 |
+
"0 Какое решение приняло итальянское судопроизвод... \n",
|
| 533 |
+
"1 Сколько пассажиров автобуса №793 пострадали в ... \n",
|
| 534 |
+
"2 Кто утвердил концепцию государственной миграци... \n",
|
| 535 |
+
"3 Какое юридическое действие предприняли Генерал... \n",
|
| 536 |
+
"4 Где и кем был обнаружен непилотируемый летател... \n",
|
| 537 |
+
".. ... \n",
|
| 538 |
+
"162 Каков исход медицинского случая двухлетнего ма... \n",
|
| 539 |
+
"163 Каков текущий статус американского оружия, зап... \n",
|
| 540 |
+
"164 Когда начнется массовое внедрение цифрового ру... \n",
|
| 541 |
+
"165 В каком городе Турции обрушился семиэтажный до... \n",
|
| 542 |
+
"166 Какой девелопер осуществляет строительство жил... \n",
|
| 543 |
+
"\n",
|
| 544 |
+
" real_question \n",
|
| 545 |
+
"0 Что там с Кузнецовым — его в Германию выдадут ... \n",
|
| 546 |
+
"1 Сколько человек в автобусе 793 пострадали, ког... \n",
|
| 547 |
+
"2 Кто там утвердил новую миграционную концепцию ... \n",
|
| 548 |
+
"3 Что Генпрокуратура и Минюст сделали с сатанист... \n",
|
| 549 |
+
"4 Что там польские пограничники нашли рядом с Бе... \n",
|
| 550 |
+
".. ... \n",
|
| 551 |
+
"162 Что случилось с мальчиком, которого бросили в ... \n",
|
| 552 |
+
"163 Уже есть всё это оружие для Украины, что НАТО ... \n",
|
| 553 |
+
"164 Когда начнут все пользоваться цифровым рублем,... \n",
|
| 554 |
+
"165 Что там в Гебзе с домом обрушился? Пять челове... \n",
|
| 555 |
+
"166 Кто строит тот самый квартал Soul рядом с метр... \n",
|
| 556 |
+
"\n",
|
| 557 |
+
"[167 rows x 4 columns]"
|
| 558 |
+
]
|
| 559 |
+
},
|
| 560 |
+
"execution_count": 94,
|
| 561 |
+
"metadata": {},
|
| 562 |
+
"output_type": "execute_result"
|
| 563 |
+
}
|
| 564 |
+
],
|
| 565 |
+
"source": [
|
| 566 |
+
"qa_df"
|
| 567 |
+
]
|
| 568 |
+
}
|
| 569 |
+
],
|
| 570 |
+
"metadata": {
|
| 571 |
+
"kernelspec": {
|
| 572 |
+
"display_name": "venv",
|
| 573 |
+
"language": "python",
|
| 574 |
+
"name": "python3"
|
| 575 |
+
},
|
| 576 |
+
"language_info": {
|
| 577 |
+
"codemirror_mode": {
|
| 578 |
+
"name": "ipython",
|
| 579 |
+
"version": 3
|
| 580 |
+
},
|
| 581 |
+
"file_extension": ".py",
|
| 582 |
+
"mimetype": "text/x-python",
|
| 583 |
+
"name": "python",
|
| 584 |
+
"nbconvert_exporter": "python",
|
| 585 |
+
"pygments_lexer": "ipython3",
|
| 586 |
+
"version": "3.13.3"
|
| 587 |
+
}
|
| 588 |
+
},
|
| 589 |
+
"nbformat": 4,
|
| 590 |
+
"nbformat_minor": 2
|
| 591 |
+
}
|
requirements.txt
CHANGED
|
@@ -1,3 +1,34 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas==2.3.3
|
| 2 |
+
python-dotenv==1.2.1
|
| 3 |
+
pyaes==1.6.1
|
| 4 |
+
Pyrogram==2.0.106
|
| 5 |
+
PySocks==1.7.1
|
| 6 |
+
python-dateutil==2.9.0.post0
|
| 7 |
+
pytz==2025.2
|
| 8 |
+
six==1.17.0
|
| 9 |
+
structlog==25.5.0
|
| 10 |
+
TgCrypto==1.2.5
|
| 11 |
+
typing_extensions==4.15.0
|
| 12 |
+
tzdata==2025.2
|
| 13 |
+
torch[cuda129]==2.9.1
|
| 14 |
+
tokenizers==0.22.1
|
| 15 |
+
transformers==4.57.3
|
| 16 |
+
sentence-transformers==5.1.2
|
| 17 |
+
nltk==3.9.2
|
| 18 |
+
langchain==1.1.0
|
| 19 |
+
langchain-core==1.1.0
|
| 20 |
+
langchain_text_splitters==1.0.0
|
| 21 |
+
langchain-experimental==0.4.0
|
| 22 |
+
langchain_huggingface==1.1.0
|
| 23 |
+
langchain_mistralai==1.1.1
|
| 24 |
+
langchain-openai==1.0.0
|
| 25 |
+
sqlalchemy==2.0.44
|
| 26 |
+
psycopg2-binary==2.9.11
|
| 27 |
+
qdrant-client==1.16.2
|
| 28 |
+
fastapi==0.124.4
|
| 29 |
+
uvicorn==0.38.0
|
| 30 |
+
openai==1.109.1
|
| 31 |
+
pydantic==2.9.2
|
| 32 |
+
tenacity==9.0.0
|
| 33 |
+
tqdm==4.67.1
|
| 34 |
+
streamlit==1.40.2
|
root.crt
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-----BEGIN CERTIFICATE-----
|
| 2 |
+
MIIE3TCCAsWgAwIBAgIKPxb5sAAAAAAAFzANBgkqhkiG9w0BAQ0FADAfMR0wGwYD
|
| 3 |
+
VQQDExRZYW5kZXhJbnRlcm5hbFJvb3RDQTAeFw0xNzA2MjAxNjQ0MzdaFw0yNzA2
|
| 4 |
+
MjAxNjU0MzdaMFUxEjAQBgoJkiaJk/IsZAEZFgJydTEWMBQGCgmSJomT8ixkARkW
|
| 5 |
+
BnlhbmRleDESMBAGCgmSJomT8ixkARkWAmxkMRMwEQYDVQQDEwpZYW5kZXhDTENB
|
| 6 |
+
MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAqgNnjk0JKPcbsk1+KG2t
|
| 7 |
+
eM1AfMnEe5RkAJuBBuwVV49snhcvO1jhKBx/pCnjr6biICc1/oAFDVgU8yVYYPwp
|
| 8 |
+
WZ2vH3ZtscjJ/RAT/NS9OKKG7kKknhFhVYxua5xhoIQmm6usBNYYiTcWoFm1eHC8
|
| 9 |
+
I9oddOLSscZYbh3unVRvt+3V+drVmUx9oSUKpqMgfysiv1MN6zB3vq9TFkbhz53E
|
| 10 |
+
k0tEcV+W2NnDaeFhLKy284FDKLvOdTDj1EDsSAihxl7sNEKpupNuhgyy2siOqUb+
|
| 11 |
+
d5mO/CRfaAKGg3E6hDM3pEi48E506dJdjPXWfHKSvuguMLRlb2RWdVocRZuyWxOh
|
| 12 |
+
0QIDAQABo4HkMIHhMBAGCSsGAQQBgjcVAQQDAgEAMB0GA1UdDgQWBBRMU5uItjx+
|
| 13 |
+
TOicX1+ovC1Xq2PSnzAZBgkrBgEEAYI3FAIEDB4KAFMAdQBiAEMAQTALBgNVHQ8E
|
| 14 |
+
BAMCAYYwDwYDVR0TAQH/BAUwAwEB/zAfBgNVHSMEGDAWgBSrucX/oe/mUx0zOSKE
|
| 15 |
+
0XbUN04tajBUBgNVHR8ETTBLMEmgR6BFhkNodHRwOi8vY3Jscy55YW5kZXgucnUv
|
| 16 |
+
WWFuZGV4SW50ZXJuYWxSb290Q0EvWWFuZGV4SW50ZXJuYWxSb290Q0EuY3JsMA0G
|
| 17 |
+
CSqGSIb3DQEBDQUAA4ICAQAsR5Lb4Pv2FD0Kk+4oc1GEOnehxKLsQtdV81nrU+IV
|
| 18 |
+
l9pr2oNMdi8lwIolvHZRllLM4Ba5AcRH6YJ5fe7AjKm+5EdSkhqVWo2UOllRCbtS
|
| 19 |
+
wmL50+erOAkxstSlRkO6b8x1L0MOBKv54E5YcQ/Wwt27ldSb6RkEmJBGvmxObAaf
|
| 20 |
+
5zc51pqSqao9tnldYaCblEQ/Zmy43FliIpa2eUJoh8DqK8bVo2gcI3wbQ32tWs9u
|
| 21 |
+
wvKk8fo4lAdhCwhv+QHuqau1VAY9hPU106bsFIDUmijTMxjAobKBi6CkIX6EbNHU
|
| 22 |
+
Jv4DzYVLlDd2y0CADdn2F6I70xpCBn5cquSGuvFbqZjQDmIHwb7WQSxadkiGRWfc
|
| 23 |
+
zVTnmiHjJONJJIpE2t+FOV3hc+8o98OzOtNaH2QQ9j6dnKvtIGKGFeNSDp0vXPOi
|
| 24 |
+
QhHiIyuB7eWx+g2whktQ74UCpGDSXYnEW3s8w5wezVWIEmouq7q4rCEkTNvJ7Ico
|
| 25 |
+
43AgUdPzAFS2zYktw1C+cbUALM8smvXbXrXOBzMmscjIhtXvLMrpPeh23VfdJfQB
|
| 26 |
+
0rN2BmRCLUE8JOV+o0k98XMm83oN+lGkL1l+hyoj3ok1uI3JrsWOcDyjOds3ptcN
|
| 27 |
+
KimJLm27ndjcxDNo/iA6gefMJuCxFRaqI+eF4P0jSkMgnnQqZkvLGFuHCw8eRDhm
|
| 28 |
+
bw==
|
| 29 |
+
-----END CERTIFICATE-----
|
| 30 |
+
-----BEGIN CERTIFICATE-----
|
| 31 |
+
MIIFGTCCAwGgAwIBAgIQJMM7ZIy2SYxCBgK7WcFwnjANBgkqhkiG9w0BAQ0FADAf
|
| 32 |
+
MR0wGwYDVQQDExRZYW5kZXhJbnRlcm5hbFJvb3RDQTAeFw0xMzAyMTExMzQxNDNa
|
| 33 |
+
Fw0zMzAyMTExMzUxNDJaMB8xHTAbBgNVBAMTFFlhbmRleEludGVybmFsUm9vdENB
|
| 34 |
+
MIICIjANBgkqhkiG9w0BAQEFAAOCAg8AMIICCgKCAgEAgb4xoQjBQ7oEFk8EHVGy
|
| 35 |
+
1pDEmPWw0Wgw5nX9RM7LL2xQWyUuEq+Lf9Dgh+O725aZ9+SO2oEs47DHHt81/fne
|
| 36 |
+
5N6xOftRrCpy8hGtUR/A3bvjnQgjs+zdXvcO9cTuuzzPTFSts/iZATZsAruiepMx
|
| 37 |
+
SGj9S1fGwvYws/yiXWNoNBz4Tu1Tlp0g+5fp/ADjnxc6DqNk6w01mJRDbx+6rlBO
|
| 38 |
+
aIH2tQmJXDVoFdrhmBK9qOfjxWlIYGy83TnrvdXwi5mKTMtpEREMgyNLX75UjpvO
|
| 39 |
+
NkZgBvEXPQq+g91wBGsWIE2sYlguXiBniQgAJOyRuSdTxcJoG8tZkLDPRi5RouWY
|
| 40 |
+
gxXr13edn1TRDGco2hkdtSUBlajBMSvAq+H0hkslzWD/R+BXkn9dh0/DFnxVt4XU
|
| 41 |
+
5JbFyd/sKV/rF4Vygfw9ssh1ZIWdqkfZ2QXOZ2gH4AEeoN/9vEfUPwqPVzL0XEZK
|
| 42 |
+
r4s2WjU9mE5tHrVsQOZ80wnvYHYi2JHbl0hr5ghs4RIyJwx6LEEnj2tzMFec4f7o
|
| 43 |
+
dQeSsZpgRJmpvpAfRTxhIRjZBrKxnMytedAkUPguBQwjVCn7+EaKiJfpu42JG8Mm
|
| 44 |
+
+/dHi+Q9Tc+0tX5pKOIpQMlMxMHw8MfPmUjC3AAd9lsmCtuybYoeN2IRdbzzchJ8
|
| 45 |
+
l1ZuoI3gH7pcIeElfVSqSBkCAwEAAaNRME8wCwYDVR0PBAQDAgGGMA8GA1UdEwEB
|
| 46 |
+
/wQFMAMBAf8wHQYDVR0OBBYEFKu5xf+h7+ZTHTM5IoTRdtQ3Ti1qMBAGCSsGAQQB
|
| 47 |
+
gjcVAQQDAgEAMA0GCSqGSIb3DQEBDQUAA4ICAQAVpyJ1qLjqRLC34F1UXkC3vxpO
|
| 48 |
+
nV6WgzpzA+DUNog4Y6RhTnh0Bsir+I+FTl0zFCm7JpT/3NP9VjfEitMkHehmHhQK
|
| 49 |
+
c7cIBZSF62K477OTvLz+9ku2O/bGTtYv9fAvR4BmzFfyPDoAKOjJSghD1p/7El+1
|
| 50 |
+
eSjvcUBzLnBUtxO/iYXRNo7B3+1qo4F5Hz7rPRLI0UWW/0UAfVCO2fFtyF6C1iEY
|
| 51 |
+
/q0Ldbf3YIaMkf2WgGhnX9yH/8OiIij2r0LVNHS811apyycjep8y/NkG4q1Z9jEi
|
| 52 |
+
VEX3P6NEL8dWtXQlvlNGMcfDT3lmB+tS32CPEUwce/Ble646rukbERRwFfxXojpf
|
| 53 |
+
C6ium+LtJc7qnK6ygnYF4D6mz4H+3WaxJd1S1hGQxOb/3WVw63tZFnN62F6/nc5g
|
| 54 |
+
6T44Yb7ND6y3nVcygLpbQsws6HsjX65CoSjrrPn0YhKxNBscF7M7tLTW/5LK9uhk
|
| 55 |
+
yjRCkJ0YagpeLxfV1l1ZJZaTPZvY9+ylHnWHhzlq0FzcrooSSsp4i44DB2K7O2ID
|
| 56 |
+
87leymZkKUY6PMDa4GkDJx0dG4UXDhRETMf+NkYgtLJ+UIzMNskwVDcxO4kVL+Hi
|
| 57 |
+
Pj78bnC5yCw8P5YylR45LdxLzLO68unoXOyFz1etGXzszw8lJI9LNubYxk77mK8H
|
| 58 |
+
LpuQKbSbIERsmR+QqQ==
|
| 59 |
+
-----END CERTIFICATE-----
|
server.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, List, Dict, Any
|
| 2 |
+
from contextlib import asynccontextmanager
|
| 3 |
+
|
| 4 |
+
from fastapi import FastAPI, HTTPException, Query
|
| 5 |
+
from pydantic import BaseModel
|
| 6 |
+
|
| 7 |
+
from src import RAG
|
| 8 |
+
from src.db_utils.history_utils import (
|
| 9 |
+
init_history_table,
|
| 10 |
+
log_query,
|
| 11 |
+
get_all_history,
|
| 12 |
+
get_history_by_dialogue,
|
| 13 |
+
search_history,
|
| 14 |
+
get_history_stats,
|
| 15 |
+
delete_history,
|
| 16 |
+
get_recent_dialogues
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# --- Lifespan для инициализации при старте ---
|
| 21 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: prepare the history table before serving.

    A failure of ``init_history_table()`` is reported with ``print`` and
    otherwise ignored, so the application still starts.  Nothing is done on
    shutdown.
    """
    # Startup: initialise the history table (best-effort).
    try:
        init_history_table()
    except Exception as e:
        print(f"⚠️ Не удалось инициализировать таблицу истории: {e}")
    yield
    # Shutdown: nothing to do.
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# FastAPI application object; the `lifespan` context manager is registered so
# its startup code runs before requests are served.
app = FastAPI(
    title="RAG API",
    version="1.0.0",
    lifespan=lifespan,
)

# --- Initialise RAG once, at module import ---
# The single `rag` instance is shared by every request handled by /rag.
rag = RAG(
    embed_model_name="deepvk/USER-bge-m3",
    embed_index_name="recursive_USER-bge-m3",
)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# --- Request / Response схемы ---
|
| 46 |
+
|
| 47 |
+
# Request body of POST /rag.
# (Documented with comments rather than a docstring: a class docstring on a
# pydantic model is published as the schema description.)
class QueryRequest(BaseModel):
    query: str  # user question forwarded to rag.invoke
    dialogue_id: Optional[str] = None  # dialogue the query belongs to, if any
    history: Optional[List[Dict[str, Any]]] = None  # Dialogue history used as context
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# Response body of POST /rag.
class QueryResponse(BaseModel):
    answer: str
    reason: str
    query_id: Optional[int] = None  # ID of the record in the history table
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
# One logged query, as returned by the /history endpoints that declare
# `response_model=List[HistoryEntry]`.
class HistoryEntry(BaseModel):
    id: int
    timestamp: str
    dialogue_id: str
    query: str
    answer: str
    reason: Optional[str] = None
    search_period: Optional[Dict[str, Any]] = None
    metadata: Optional[Dict[str, Any]] = None
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# Aggregate figures returned by GET /history/stats.
class HistoryStats(BaseModel):
    total_queries: int
    unique_dialogues: int
    last_query_time: Optional[str] = None
    first_query_time: Optional[str] = None
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# Summary of one dialogue, as returned by GET /history/dialogues.
class DialogueInfo(BaseModel):
    dialogue_id: str
    message_count: int
    started_at: Optional[str] = None
    last_message_at: Optional[str] = None
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# --- RAG Endpoint ---
|
| 85 |
+
|
| 86 |
+
@app.post("/rag", response_model=QueryResponse)
def rag_query(request: QueryRequest):
    """Основной endpoint для запросов к RAG. Логирует запрос после получения ответа."""
    # The docstring above is kept verbatim: FastAPI publishes an endpoint's
    # docstring as the operation description in the generated API docs.
    #
    # Flow: choose the dialogue context, ask the RAG pipeline, then record the
    # exchange in the history table and return the answer.

    # An explicitly supplied history wins; otherwise the stored history of the
    # given dialogue is loaded; with neither, the query runs without context.
    if request.history:
        history = request.history
    elif request.dialogue_id:
        history = get_history_by_dialogue(request.dialogue_id)
    else:
        history = None

    # The history is passed to the RAG pipeline together with the question.
    result = rag.invoke(request.query, history=history)
    answer = result.get("answer", "")
    reason = result.get("reason", "")

    # Logging is best-effort: the answer is already computed, so a history
    # failure must not turn the request into an error.  This matches the
    # startup hook, which also tolerates an unavailable history table.
    # `query_id` is optional in the response model, so None is a valid value.
    query_id = None
    try:
        query_id = log_query(
            query=request.query,
            answer=answer,
            reason=reason,
            dialogue_id=request.dialogue_id,
        )
    except Exception as e:
        print(f"⚠️ Не удалось записать запрос в историю: {e}")

    return QueryResponse(
        answer=answer,
        reason=reason,
        query_id=query_id,
    )
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# --- History Endpoints ---
|
| 121 |
+
|
| 122 |
+
@app.get("/history", response_model=List[HistoryEntry])
def get_history(
    limit: int = Query(default=100, ge=1, le=1000),
    offset: int = Query(default=0, ge=0)
):
    """Получить историю запросов"""
    # Paginated listing of logged queries: `limit` is validated to 1..1000 and
    # `offset` to >= 0 before both are forwarded to get_all_history.
    # (Docstring kept verbatim: FastAPI exposes it in the generated API docs.)
    return get_all_history(limit=limit, offset=offset)
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
@app.get("/history/stats", response_model=HistoryStats)
def get_stats():
    """Получить статистику по истории"""
    # Docstring kept verbatim: FastAPI exposes it in the generated API docs.
    # Builds the response from the mapping returned by get_history_stats():
    # missing counters default to 0, missing timestamps to None.
    raw = get_history_stats()
    counters = {
        name: raw.get(name, 0)
        for name in ("total_queries", "unique_dialogues")
    }
    timestamps = {
        name: raw.get(name)
        for name in ("last_query_time", "first_query_time")
    }
    return HistoryStats(**counters, **timestamps)
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
@app.get("/history/search", response_model=List[HistoryEntry])
def search_in_history(
    q: str = Query(..., min_length=1, description="Текст для поиска"),
    limit: int = Query(default=50, ge=1, le=500)
):
    """Поиск по истории запросов"""
    # Text search over logged queries: `q` is required and non-empty, `limit`
    # is validated to 1..500; both are forwarded to search_history.
    # (Docstring kept verbatim: FastAPI exposes it in the generated API docs.)
    return search_history(search_text=q, limit=limit)
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
@app.get("/history/dialogues", response_model=List[DialogueInfo])
def get_dialogues(
    limit: int = Query(default=10, ge=1, le=100)
):
    """Получить список последних диалогов"""
    # Returns what get_recent_dialogues yields for `limit` (validated to
    # 1..100).
    # (Docstring kept verbatim: FastAPI exposes it in the generated API docs.)
    return get_recent_dialogues(limit=limit)
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
@app.get("/history/dialogue/{dialogue_id}", response_model=List[HistoryEntry])
def get_dialogue(dialogue_id: str):
    """Получить историю конкретного диалога"""
    # `dialogue_id` comes from the URL path and is forwarded unchanged to
    # get_history_by_dialogue.
    # (Docstring kept verbatim: FastAPI exposes it in the generated API docs.)
    return get_history_by_dialogue(dialogue_id)
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
@app.delete("/history")
def clear_history(dialogue_id: Optional[str] = None):
    """Удалить историю (всю или конкретного диалога)"""
    # Docstring kept verbatim: FastAPI exposes it in the generated API docs.
    # Deletes via delete_history; any failure there is reported to the client
    # as HTTP 500 carrying the exception text.
    try:
        delete_history(dialogue_id=dialogue_id)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

    # A truthy dialogue_id means one dialogue was targeted; otherwise the
    # confirmation reports that the whole history was removed.
    confirmation = (
        f"История диалога {dialogue_id} удалена"
        if dialogue_id
        else "Вся история удалена"
    )
    return {"message": confirmation}
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
# --- Healthcheck ---
|
| 179 |
+
|
| 180 |
+
@app.get("/health")
def health():
    # Liveness probe: always answers with a constant payload and touches no
    # other component.
    return {"status": "ok"}
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
# --- Entry point ---
|
| 186 |
+
if __name__ == "__main__":
    # Imported here so that importing this module elsewhere does not require
    # uvicorn.
    import uvicorn

    # Serve the app by import string ("server:app") on all interfaces, port
    # 8000, with auto-reload enabled.
    # NOTE(review): reload=True looks like a development setting — confirm it
    # is not used for production runs.
    uvicorn.run(
        "server:app",
        host="0.0.0.0",
        port=8000,
        reload=True,
    )
|
src/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
RAG Telegram 2025 - Source Package
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from src.rag import RAG
|
src/config.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from qdrant_client import QdrantClient
|
| 5 |
+
from sqlalchemy import create_engine
|
| 6 |
+
|
| 7 |
+
# from src.data.parser import PyroSource
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# Load variables from a .env file into the process environment before any of
# the lookups below.
load_dotenv()

# Telegram parser credentials (None when the variable is not set).
API_ID = os.environ.get("TELEGRAM_API_ID")
API_HASH = os.environ.get("TELEGRAM_API_HASH")
CHANNEL_ID = os.environ.get("CHANNEL_ID")

# pyro_source = PyroSource(api_id=API_ID, api_hash=API_HASH)

# SQL database settings.
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
PATH_TO_CERT = os.getenv("PATH_TO_CERT")

# NOTE(review): the values are interpolated into the URL as-is.  A user name
# or password containing characters such as "@", ":" or "/" would presumably
# need URL-escaping to be parsed correctly — confirm against the credentials
# actually in use.  An unset variable is rendered as the text "None".
connection_str = f"postgresql+psycopg2://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

# Shared SQLAlchemy engine, created at import time.  The connect_args are
# handed to the driver: TLS with full verification against the certificate
# file at PATH_TO_CERT, and a read-write session target.
sql_client = create_engine(
    connection_str,
    connect_args={
        "sslmode": "verify-full",
        "sslrootcert": PATH_TO_CERT,
        "target_session_attrs": "read-write"
    }
)

# Qdrant: shared client, created at import time.
QDRANT_URL = os.getenv("QDRANT_URL")

qdrant_client = QdrantClient(url=QDRANT_URL)

# LLM: API key from the environment and the model identifier.
LLM_API_KEY = os.environ.get("OPENROUTER_API_KEY")
LLM = "qwen/qwen3-next-80b-a3b-instruct"

# Chat settings
CHAT_HISTORY_LENGTH = 5
ENABLE_QUESTION_ENRICHMENT = True
|
src/data/__init__.py
ADDED
|
File without changes
|
src/data/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (149 Bytes). View file
|
|
|
src/data/__pycache__/parser.cpython-313.pyc
ADDED
|
Binary file (4.43 kB). View file
|
|
|
src/data/__pycache__/splitter.cpython-313.pyc
ADDED
|
Binary file (5.99 kB). View file
|
|
|
src/data/clean.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def strip_edges_allow_punct(s: str):
    """Trim decoration from both ends of *s*.

    The result starts at the first alphanumeric character and ends at the
    last character that is alphanumeric or one of ``.,!?;:-–—`` — so closing
    punctuation is kept while symbols, emoji and whitespace at the edges are
    dropped.  Returns ``""`` when nothing qualifies.
    """
    closing_punct = frozenset(".,!?;:-–—")  # can be extended

    # Index of the first letter/digit, or len(s) when there is none.
    start = next((i for i, ch in enumerate(s) if ch.isalnum()), len(s))

    # Index of the last letter/digit/allowed punctuation mark, or -1.
    end = next(
        (
            i
            for i in range(len(s) - 1, -1, -1)
            if s[i].isalnum() or s[i] in closing_punct
        ),
        -1,
    )

    # The whole string is junk.
    if end < start:
        return ""

    return s[start:end + 1]
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def process_str(s: str) -> str:
    """
    Clean the text of a post.

    Steps, in order:
    1. every non-empty line is trimmed with ``strip_edges_allow_punct``;
    2. known promotional footers are removed from the end of the text;
    3. trailing lines containing "Фото:" or "Данные:" are dropped.

    s: raw post text

    Returns the cleaned text; its non-empty lines are joined by a newline.
    """
    # Trim junk from the edges of every non-empty line.
    s = "\n".join(strip_edges_allow_punct(p) for p in s.split("\n") if p)

    # Each footer is checked once, in this order, against the current end of the text.
    for suf in (
        "Слушать прямой эфир",
        "Читать РБК Стиль в Telegram",
        "РБК Events, 18",
        "Подписаться | Онлайн-сомелье",
        "Читать РБК в Telegram",
        "Следить за новостями РБК в Telegram",
        "Следить за новостями РБК в МАХ",
        "Другие видео этого дня — в телеграм-канале РБК",
        "РБК в Telegram и MAX",
        "РБК в Telegram | MAX",
        "Подписаться на «РБК Спорт",
        "Картина дня — в телеграм-канале РБК",
        "Самые важные новости — в канале РБК в МАХ",
        "Больше инфографики — в телеграм-канале РБК",
        "Подписаться на «Сам ты инвестор!",
        "Читать РБК Недвижимость в Telegram",
    ):
        s = s.removesuffix(suf).strip()

    parts = [p for p in s.split("\n") if p]

    # Drop credit lines from the end until the last line is regular content.
    # A direct loop replaces the former length-sentinel, which skipped the
    # clean-up for texts with exactly 1000 lines.
    while parts and ("Фото:" in parts[-1] or "Данные:" in parts[-1]):
        parts.pop()

    return "\n".join(parts)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def is_advertisement(s: str) -> bool:
    """
    Check whether a post is an advertisement.

    A post is treated as an ad when its last non-empty line contains
    "Реклама." or "Реклама,".

    s: post text

    Returns False for text without any non-empty line (previously this
    raised IndexError).
    """
    # Last non-empty line, or "" when there is none.
    last_part = next((p for p in reversed(s.split("\n")) if p), "")
    return any(marker in last_part for marker in ("Реклама.", "Реклама,"))
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare a DataFrame of posts: normalise column types, clean the text
    and drop advertisements.

    df: frame with the columns "message_id", "channel_id", "message_dt",
        "views" and "content"

    Returns a new frame with exactly those five columns; the input frame
    is left untouched.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()

    df["message_dt"] = pd.to_datetime(df["message_dt"]).dt.date
    # Missing content is treated as empty text so that cleaning does not fail on NaN.
    df["content"] = df["content"].fillna("").apply(process_str)
    df["views"] = df["views"].astype(int)

    # Posts whose content is empty after cleaning cannot be ads and are kept.
    is_ad = df["content"].apply(
        lambda text: bool(text) and is_advertisement(text)
    ).astype(bool)
    df = df[~is_ad]

    return df[["message_id", "channel_id", "message_dt", "views", "content"]]
|
src/data/parser.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import datetime
|
| 3 |
+
from typing import Union, Generator, List, Dict, Any, Optional
|
| 4 |
+
|
| 5 |
+
from pyrogram import Client
|
| 6 |
+
from pyrogram.types import Message
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class PyroSource:
    """
    Loads posts of a Telegram channel through a Pyrogram client.
    """

    def __init__(
        self,
        api_id: Union[int, str],
        api_hash: str,
        app_name: str = "default_app",
    ):
        """
        api_id: value forwarded to the Pyrogram client as ``api_id``
        api_hash: value forwarded to the Pyrogram client as ``api_hash``
        app_name: value forwarded to the Pyrogram client as ``name``
        """
        self.client = Client(name=app_name, api_id=api_id, api_hash=api_hash)

    @staticmethod
    def _to_post(msg: "Message", channel_id: Union[int, str]) -> Dict[str, Any]:
        """Convert one message into the flat dictionary stored for a post."""
        forwarded_from = msg.forward_from_chat
        return {
            "message_dt": msg.date.strftime("%Y-%m-%d"),
            "message_id": msg.id,
            "channel_id": channel_id,
            # Text posts carry `text`, media posts carry `caption`.
            "content": msg.text or msg.caption or '',
            "views": msg.views,
            "original_author": forwarded_from.username if forwarded_from else '',
        }

    def load_messages(
        self,
        channel_id: Union[int, str],
        limit: int,
        offset: int = 0,
        offset_id: int = 0,
        time_sleep: float = 0.05,
    ) -> List[Dict[str, Any]]:
        """
        Load messages from the channel history.

        channel_id: channel id or username
        limit: number of messages to load
        offset: offset index
        offset_id: message id offset
        time_sleep: pause in seconds before processing each message

        Returns a list of post dictionaries in the order the history yields them.
        """
        posts = []

        with self.client as app:
            messages = app.get_chat_history(
                chat_id=channel_id,
                limit=limit,
                offset=offset,
                offset_id=offset_id,
            )

            for msg in messages:
                time.sleep(time_sleep)
                posts.append(self._to_post(msg, channel_id))

        return posts

    def load_days(
        self,
        channel_id: Union[int, str],
        from_date: datetime.date,
        to_date: Optional[datetime.date] = None,
        limit: int = 1000,
        time_sleep: float = 0.05,
    ) -> List[Dict[str, Any]]:
        """
        Load messages within the date range [from_date, to_date].

        channel_id: channel id or username
        from_date: start date (inclusive)
        to_date: end date (inclusive); when omitted the range is open-ended
            and starts from the newest message
        limit: safety limit on the number of messages requested from the
            history (messages skipped by the date filter count towards it)
        time_sleep: pause in seconds before processing each message

        Returns a list of post dictionaries, newest first.
        """
        posts = []

        history_kwargs: Dict[str, Any] = {"chat_id": channel_id, "limit": limit}
        if to_date is not None:
            # The history is walked from `offset_date` towards older messages,
            # so the walk has to start right after the END of the range.
            # Starting at from_date + 1 day (as before) returned a single day.
            # NOTE(review): naive datetimes are used here, as in the original
            # code — confirm they match the timezone of `msg.date`.
            history_kwargs["offset_date"] = datetime.datetime.combine(
                to_date + datetime.timedelta(days=1),
                datetime.time.min,
            )

        with self.client as app:
            messages = app.get_chat_history(**history_kwargs)

            for msg in messages:
                time.sleep(time_sleep)

                msg_date = msg.date.date()

                # Went too far into the past: stop.
                if msg_date < from_date:
                    break

                # Message is newer than the requested end of the range: skip it.
                if to_date and msg_date > to_date:
                    continue

                posts.append(self._to_post(msg, channel_id))

        return posts
|
src/data/splitter.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Literal
|
| 2 |
+
|
| 3 |
+
from transformers import AutoTokenizer
|
| 4 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter, NLTKTextSplitter
|
| 5 |
+
from langchain_experimental.text_splitter import SemanticChunker
|
| 6 |
+
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class Splitter:
|
| 10 |
+
"""
|
| 11 |
+
Класс описывает функционал разделения текста на чанки тремя способами на выбор:
|
| 12 |
+
- рекурсивно разбивая чанки различными разделителями
|
| 13 |
+
в порядке возрастания "жесткости" их эффекта;
|
| 14 |
+
|
| 15 |
+
- объединяя выделенные с помощью библиотеки NLTK предложения
|
| 16 |
+
в чанки определенного размера и с наложением;
|
| 17 |
+
|
| 18 |
+
- разбивая текст на семантически связанные блоки
|
| 19 |
+
с помощью векторных представлений текстов;
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
def __init__(
|
| 23 |
+
self,
|
| 24 |
+
mode: Literal["recursive", "nltk", "semantic"],
|
| 25 |
+
model_name: str = "deepvk/USER-bge-m3",
|
| 26 |
+
chunk_size: int = 256,
|
| 27 |
+
chunk_overlap: int = 64,
|
| 28 |
+
**splitter_kwargs,
|
| 29 |
+
):
|
| 30 |
+
self.chunk_size = chunk_size
|
| 31 |
+
self.chunk_overlap = chunk_overlap
|
| 32 |
+
|
| 33 |
+
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 34 |
+
|
| 35 |
+
match mode:
|
| 36 |
+
|
| 37 |
+
case "recursive":
|
| 38 |
+
self.splitter = RecursiveCharacterTextSplitter(
|
| 39 |
+
separators=[
|
| 40 |
+
"\n### ", "\n## ", "\n# ",
|
| 41 |
+
"\n\n", "\n",
|
| 42 |
+
"!", "?", ". ", ";", ",", ")", " ", "",
|
| 43 |
+
],
|
| 44 |
+
keep_separator="end",
|
| 45 |
+
chunk_size=chunk_size,
|
| 46 |
+
chunk_overlap=chunk_overlap,
|
| 47 |
+
length_function=lambda x: len(self.tokenizer.encode(x, add_special_tokens=False)),
|
| 48 |
+
**splitter_kwargs,
|
| 49 |
+
)
|
| 50 |
+
self.split_fn = self._recursive_split
|
| 51 |
+
|
| 52 |
+
case "nltk":
|
| 53 |
+
self.splitter = NLTKTextSplitter(
|
| 54 |
+
language="russian",
|
| 55 |
+
**splitter_kwargs,
|
| 56 |
+
)
|
| 57 |
+
self.split_fn = self._nltk_split
|
| 58 |
+
|
| 59 |
+
case "semantic":
|
| 60 |
+
self.splitter = SemanticChunker(
|
| 61 |
+
HuggingFaceEmbeddings(
|
| 62 |
+
model_name=model_name,
|
| 63 |
+
encode_kwargs={"normalize_embeddings": True},
|
| 64 |
+
),
|
| 65 |
+
**splitter_kwargs,
|
| 66 |
+
)
|
| 67 |
+
self.split_fn = self._semantic_split
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def split_text(self, text: str) -> list[str]:
|
| 71 |
+
"""
|
| 72 |
+
Доступная пользователю функция разделения текста на чанки
|
| 73 |
+
"""
|
| 74 |
+
return self.split_fn(text)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def _recursive_split(self, text: str) -> list[str]:
|
| 78 |
+
"""
|
| 79 |
+
Функция разделения текста на чанки при self.splitter == RecursiveCharacterTextSplitter
|
| 80 |
+
"""
|
| 81 |
+
return [
|
| 82 |
+
chunk
|
| 83 |
+
for chunk in self.splitter.split_text(text)
|
| 84 |
+
if any(ch.isalpha() for ch in set(chunk))
|
| 85 |
+
]
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def _nltk_split(self, text: str) -> list[str]:
|
| 89 |
+
"""
|
| 90 |
+
Функция разделения текста на чанки при self.splitter == NLTKTextSplitter
|
| 91 |
+
"""
|
| 92 |
+
sentences = self.splitter.split_text(text)[0].split("\n\n")
|
| 93 |
+
sent_sizes = [
|
| 94 |
+
len(self.tokenizer.encode(sent, add_special_tokens=False))
|
| 95 |
+
for sent in sentences
|
| 96 |
+
]
|
| 97 |
+
|
| 98 |
+
chunks = []
|
| 99 |
+
i, n = 0, len(sentences)
|
| 100 |
+
while i < n:
|
| 101 |
+
cur_len, cur_texts = 0, []
|
| 102 |
+
|
| 103 |
+
# --- Собираем строки в чанк ---
|
| 104 |
+
j = i
|
| 105 |
+
while (j < n) and (cur_len + sent_sizes[j] <= self.chunk_size):
|
| 106 |
+
cur_texts.append(sentences[j])
|
| 107 |
+
cur_len += sent_sizes[j]
|
| 108 |
+
j += 1
|
| 109 |
+
|
| 110 |
+
chunks.append(cur_texts)
|
| 111 |
+
|
| 112 |
+
# --- Сдвигаем окно с overlap ---
|
| 113 |
+
if j >= n:
|
| 114 |
+
break
|
| 115 |
+
|
| 116 |
+
# Держим overlap в токенах, но не превышая его
|
| 117 |
+
overlap_len, k = 0, j - 1
|
| 118 |
+
while (k >= i) and (overlap_len + sent_sizes[k] <= self.chunk_overlap):
|
| 119 |
+
overlap_len += sent_sizes[k]
|
| 120 |
+
k -= 1 # идём назад от конца чанка
|
| 121 |
+
|
| 122 |
+
# Следующий старт = k+1
|
| 123 |
+
i = k + 1
|
| 124 |
+
|
| 125 |
+
return chunks
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def _semantic_split(self, text: str) -> list[str]:
|
| 129 |
+
"""
|
| 130 |
+
Функция разделения текста на чанки при self.splitter == SemanticChunker
|
| 131 |
+
"""
|
| 132 |
+
return self.splitter.split_text(text)
|
src/dataset/rbc/channel_rbc_news_posts.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/dataset/test_cases.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/db_utils/__init__.py
ADDED
|
File without changes
|
src/db_utils/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (153 Bytes). View file
|
|
|
src/db_utils/__pycache__/history_utils.cpython-313.pyc
ADDED
|
Binary file (13 kB). View file
|
|
|
src/db_utils/__pycache__/qdrant_utils.cpython-313.pyc
ADDED
|
Binary file (2.3 kB). View file
|
|
|
src/db_utils/__pycache__/sql_utils.cpython-313.pyc
ADDED
|
Binary file (3.45 kB). View file
|
|
|
src/db_utils/db_example_usage.ipynb
ADDED
|
@@ -0,0 +1,881 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "70227cfd",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"from sqlalchemy import create_engine\n",
|
| 11 |
+
"from dotenv import load_dotenv\n",
|
| 12 |
+
"from sqlalchemy import text\n",
|
| 13 |
+
"import pandas as pd\n",
|
| 14 |
+
"import os\n",
|
| 15 |
+
"\n",
|
| 16 |
+
"load_dotenv()\n",
|
| 17 |
+
"\n",
|
| 18 |
+
"DB_USER = os.getenv('DB_USER')\n",
|
| 19 |
+
"DB_PASS = os.getenv('DB_PASS')\n",
|
| 20 |
+
"DB_HOST = os.getenv('DB_HOST')\n",
|
| 21 |
+
"DB_PORT = os.getenv('DB_PORT')\n",
|
| 22 |
+
"DB_NAME = os.getenv('DB_NAME')\n",
|
| 23 |
+
"PATH_TO_CERT = os.getenv('PATH_TO_CERT')\n",
|
| 24 |
+
"\n",
|
| 25 |
+
"connection_str = f\"postgresql+psycopg2://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}\"\n",
|
| 26 |
+
"\n",
|
| 27 |
+
"engine = create_engine(\n",
|
| 28 |
+
" connection_str,\n",
|
| 29 |
+
" connect_args={\n",
|
| 30 |
+
" \"sslmode\": \"verify-full\",\n",
|
| 31 |
+
" \"sslrootcert\": PATH_TO_CERT,\n",
|
| 32 |
+
" \"target_session_attrs\": \"read-write\"\n",
|
| 33 |
+
" }\n",
|
| 34 |
+
")"
|
| 35 |
+
]
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"cell_type": "code",
|
| 39 |
+
"execution_count": 2,
|
| 40 |
+
"id": "fd49e25a",
|
| 41 |
+
"metadata": {},
|
| 42 |
+
"outputs": [
|
| 43 |
+
{
|
| 44 |
+
"data": {
|
| 45 |
+
"text/plain": [
|
| 46 |
+
"167"
|
| 47 |
+
]
|
| 48 |
+
},
|
| 49 |
+
"execution_count": 2,
|
| 50 |
+
"metadata": {},
|
| 51 |
+
"output_type": "execute_result"
|
| 52 |
+
}
|
| 53 |
+
],
|
| 54 |
+
"source": [
|
| 55 |
+
"df = pd.read_csv(\"/Users/incllude/dev/rag_tg_2025/generated_qa.csv\")\n",
|
| 56 |
+
"df.to_sql('test_cases', engine, if_exists='replace', index=False)"
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"cell_type": "code",
|
| 61 |
+
"execution_count": 11,
|
| 62 |
+
"id": "0bb4f789",
|
| 63 |
+
"metadata": {},
|
| 64 |
+
"outputs": [],
|
| 65 |
+
"source": [
|
| 66 |
+
"create_table_query = \"\"\"\n",
|
| 67 |
+
"drop table if exists posts;\n",
|
| 68 |
+
"\"\"\"\n",
|
| 69 |
+
"\n",
|
| 70 |
+
"try:\n",
|
| 71 |
+
" with engine.begin() as conn:\n",
|
| 72 |
+
" conn.execute(text(create_table_query))\n",
|
| 73 |
+
"except Exception as e:\n",
|
| 74 |
+
" print(\"Ошибка:\", e)\n"
|
| 75 |
+
]
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"cell_type": "code",
|
| 79 |
+
"execution_count": 6,
|
| 80 |
+
"id": "e687fcdb",
|
| 81 |
+
"metadata": {},
|
| 82 |
+
"outputs": [
|
| 83 |
+
{
|
| 84 |
+
"data": {
|
| 85 |
+
"text/html": [
|
| 86 |
+
"<div>\n",
|
| 87 |
+
"<style scoped>\n",
|
| 88 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 89 |
+
" vertical-align: middle;\n",
|
| 90 |
+
" }\n",
|
| 91 |
+
"\n",
|
| 92 |
+
" .dataframe tbody tr th {\n",
|
| 93 |
+
" vertical-align: top;\n",
|
| 94 |
+
" }\n",
|
| 95 |
+
"\n",
|
| 96 |
+
" .dataframe thead th {\n",
|
| 97 |
+
" text-align: right;\n",
|
| 98 |
+
" }\n",
|
| 99 |
+
"</style>\n",
|
| 100 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 101 |
+
" <thead>\n",
|
| 102 |
+
" <tr style=\"text-align: right;\">\n",
|
| 103 |
+
" <th></th>\n",
|
| 104 |
+
" <th>message_id</th>\n",
|
| 105 |
+
" <th>channel_id</th>\n",
|
| 106 |
+
" <th>message_dt</th>\n",
|
| 107 |
+
" <th>views</th>\n",
|
| 108 |
+
" <th>content</th>\n",
|
| 109 |
+
" </tr>\n",
|
| 110 |
+
" </thead>\n",
|
| 111 |
+
" <tbody>\n",
|
| 112 |
+
" <tr>\n",
|
| 113 |
+
" <th>0</th>\n",
|
| 114 |
+
" <td>137228</td>\n",
|
| 115 |
+
" <td>rbc_news</td>\n",
|
| 116 |
+
" <td>2025-12-03</td>\n",
|
| 117 |
+
" <td>40045</td>\n",
|
| 118 |
+
" <td>Суд признал писателя Бориса Акунина (настоящее...</td>\n",
|
| 119 |
+
" </tr>\n",
|
| 120 |
+
" <tr>\n",
|
| 121 |
+
" <th>1</th>\n",
|
| 122 |
+
" <td>137226</td>\n",
|
| 123 |
+
" <td>rbc_news</td>\n",
|
| 124 |
+
" <td>2025-12-03</td>\n",
|
| 125 |
+
" <td>53463</td>\n",
|
| 126 |
+
" <td>На характере переговоров с США, прошедших нака...</td>\n",
|
| 127 |
+
" </tr>\n",
|
| 128 |
+
" <tr>\n",
|
| 129 |
+
" <th>2</th>\n",
|
| 130 |
+
" <td>137224</td>\n",
|
| 131 |
+
" <td>rbc_news</td>\n",
|
| 132 |
+
" <td>2025-12-03</td>\n",
|
| 133 |
+
" <td>56667</td>\n",
|
| 134 |
+
" <td>Еврокомиссия намерена запретить исполнение вну...</td>\n",
|
| 135 |
+
" </tr>\n",
|
| 136 |
+
" <tr>\n",
|
| 137 |
+
" <th>3</th>\n",
|
| 138 |
+
" <td>137223</td>\n",
|
| 139 |
+
" <td>rbc_news</td>\n",
|
| 140 |
+
" <td>2025-12-03</td>\n",
|
| 141 |
+
" <td>55604</td>\n",
|
| 142 |
+
" <td>Прямо сейчас в эфире Радио РБК обсуждаем планы...</td>\n",
|
| 143 |
+
" </tr>\n",
|
| 144 |
+
" <tr>\n",
|
| 145 |
+
" <th>4</th>\n",
|
| 146 |
+
" <td>137222</td>\n",
|
| 147 |
+
" <td>rbc_news</td>\n",
|
| 148 |
+
" <td>2025-12-03</td>\n",
|
| 149 |
+
" <td>61798</td>\n",
|
| 150 |
+
" <td>Коллегия Еврокомиссии одобрила «потенциальный ...</td>\n",
|
| 151 |
+
" </tr>\n",
|
| 152 |
+
" <tr>\n",
|
| 153 |
+
" <th>...</th>\n",
|
| 154 |
+
" <td>...</td>\n",
|
| 155 |
+
" <td>...</td>\n",
|
| 156 |
+
" <td>...</td>\n",
|
| 157 |
+
" <td>...</td>\n",
|
| 158 |
+
" <td>...</td>\n",
|
| 159 |
+
" </tr>\n",
|
| 160 |
+
" <tr>\n",
|
| 161 |
+
" <th>4795</th>\n",
|
| 162 |
+
" <td>116045</td>\n",
|
| 163 |
+
" <td>rbc_news</td>\n",
|
| 164 |
+
" <td>2025-04-15</td>\n",
|
| 165 |
+
" <td>108803</td>\n",
|
| 166 |
+
" <td>Суд на Сахалине вынес первое решение по делу о...</td>\n",
|
| 167 |
+
" </tr>\n",
|
| 168 |
+
" <tr>\n",
|
| 169 |
+
" <th>4796</th>\n",
|
| 170 |
+
" <td>116044</td>\n",
|
| 171 |
+
" <td>rbc_news</td>\n",
|
| 172 |
+
" <td>2025-04-15</td>\n",
|
| 173 |
+
" <td>108074</td>\n",
|
| 174 |
+
" <td>Оператор национальных лотерей Франции стал отв...</td>\n",
|
| 175 |
+
" </tr>\n",
|
| 176 |
+
" <tr>\n",
|
| 177 |
+
" <th>4797</th>\n",
|
| 178 |
+
" <td>116043</td>\n",
|
| 179 |
+
" <td>rbc_news</td>\n",
|
| 180 |
+
" <td>2025-04-15</td>\n",
|
| 181 |
+
" <td>143013</td>\n",
|
| 182 |
+
" <td>Слоны во время землетрясения в зоопарке Сан-Ди...</td>\n",
|
| 183 |
+
" </tr>\n",
|
| 184 |
+
" <tr>\n",
|
| 185 |
+
" <th>4798</th>\n",
|
| 186 |
+
" <td>116041</td>\n",
|
| 187 |
+
" <td>rbc_news</td>\n",
|
| 188 |
+
" <td>2025-04-15</td>\n",
|
| 189 |
+
" <td>125020</td>\n",
|
| 190 |
+
" <td>Аэропорт южнокорейского города Муан регулярно ...</td>\n",
|
| 191 |
+
" </tr>\n",
|
| 192 |
+
" <tr>\n",
|
| 193 |
+
" <th>4799</th>\n",
|
| 194 |
+
" <td>116039</td>\n",
|
| 195 |
+
" <td>rbc_news</td>\n",
|
| 196 |
+
" <td>2025-04-15</td>\n",
|
| 197 |
+
" <td>156002</td>\n",
|
| 198 |
+
" <td>Первоклассники не должны заниматься уроками бо...</td>\n",
|
| 199 |
+
" </tr>\n",
|
| 200 |
+
" </tbody>\n",
|
| 201 |
+
"</table>\n",
|
| 202 |
+
"<p>4800 rows × 5 columns</p>\n",
|
| 203 |
+
"</div>"
|
| 204 |
+
],
|
| 205 |
+
"text/plain": [
|
| 206 |
+
" message_id channel_id message_dt views \\\n",
|
| 207 |
+
"0 137228 rbc_news 2025-12-03 40045 \n",
|
| 208 |
+
"1 137226 rbc_news 2025-12-03 53463 \n",
|
| 209 |
+
"2 137224 rbc_news 2025-12-03 56667 \n",
|
| 210 |
+
"3 137223 rbc_news 2025-12-03 55604 \n",
|
| 211 |
+
"4 137222 rbc_news 2025-12-03 61798 \n",
|
| 212 |
+
"... ... ... ... ... \n",
|
| 213 |
+
"4795 116045 rbc_news 2025-04-15 108803 \n",
|
| 214 |
+
"4796 116044 rbc_news 2025-04-15 108074 \n",
|
| 215 |
+
"4797 116043 rbc_news 2025-04-15 143013 \n",
|
| 216 |
+
"4798 116041 rbc_news 2025-04-15 125020 \n",
|
| 217 |
+
"4799 116039 rbc_news 2025-04-15 156002 \n",
|
| 218 |
+
"\n",
|
| 219 |
+
" content \n",
|
| 220 |
+
"0 Суд признал писателя Бориса Акунина (настоящее... \n",
|
| 221 |
+
"1 На характере переговоров с США, прошедших нака... \n",
|
| 222 |
+
"2 Еврокомиссия намерена запретить исполнение вну... \n",
|
| 223 |
+
"3 Прямо сейчас в эфире Радио РБК обсуждаем планы... \n",
|
| 224 |
+
"4 Коллегия Еврокомиссии одобрила «потенциальный ... \n",
|
| 225 |
+
"... ... \n",
|
| 226 |
+
"4795 Суд на Сахалине вынес первое решение по делу о... \n",
|
| 227 |
+
"4796 Оператор национальных лотерей Франции стал отв... \n",
|
| 228 |
+
"4797 Слоны во время землетрясения в зоопарке Сан-Ди... \n",
|
| 229 |
+
"4798 Аэропорт южнокорейского города Муан регулярно ... \n",
|
| 230 |
+
"4799 Первоклассники не должны заниматься уроками бо... \n",
|
| 231 |
+
"\n",
|
| 232 |
+
"[4800 rows x 5 columns]"
|
| 233 |
+
]
|
| 234 |
+
},
|
| 235 |
+
"execution_count": 6,
|
| 236 |
+
"metadata": {},
|
| 237 |
+
"output_type": "execute_result"
|
| 238 |
+
}
|
| 239 |
+
],
|
| 240 |
+
"source": [
|
| 241 |
+
"with engine.connect() as conn:\n",
|
| 242 |
+
" df = pd.read_sql('''\n",
|
| 243 |
+
"select * from posts\n",
|
| 244 |
+
" ''', conn)\n",
|
| 245 |
+
"\n",
|
| 246 |
+
"df"
|
| 247 |
+
]
|
| 248 |
+
},
|
| 249 |
+
{
|
| 250 |
+
"cell_type": "code",
|
| 251 |
+
"execution_count": 73,
|
| 252 |
+
"id": "8abd0803",
|
| 253 |
+
"metadata": {},
|
| 254 |
+
"outputs": [
|
| 255 |
+
{
|
| 256 |
+
"data": {
|
| 257 |
+
"text/html": [
|
| 258 |
+
"<div>\n",
|
| 259 |
+
"<style scoped>\n",
|
| 260 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 261 |
+
" vertical-align: middle;\n",
|
| 262 |
+
" }\n",
|
| 263 |
+
"\n",
|
| 264 |
+
" .dataframe tbody tr th {\n",
|
| 265 |
+
" vertical-align: top;\n",
|
| 266 |
+
" }\n",
|
| 267 |
+
"\n",
|
| 268 |
+
" .dataframe thead th {\n",
|
| 269 |
+
" text-align: right;\n",
|
| 270 |
+
" }\n",
|
| 271 |
+
"</style>\n",
|
| 272 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 273 |
+
" <thead>\n",
|
| 274 |
+
" <tr style=\"text-align: right;\">\n",
|
| 275 |
+
" <th></th>\n",
|
| 276 |
+
" <th>message_id</th>\n",
|
| 277 |
+
" <th>channel_id</th>\n",
|
| 278 |
+
" <th>message_dt</th>\n",
|
| 279 |
+
" <th>views</th>\n",
|
| 280 |
+
" <th>content</th>\n",
|
| 281 |
+
" </tr>\n",
|
| 282 |
+
" </thead>\n",
|
| 283 |
+
" <tbody>\n",
|
| 284 |
+
" <tr>\n",
|
| 285 |
+
" <th>4498</th>\n",
|
| 286 |
+
" <td>130471</td>\n",
|
| 287 |
+
" <td>rbc_news</td>\n",
|
| 288 |
+
" <td>2025-09-12</td>\n",
|
| 289 |
+
" <td>139946</td>\n",
|
| 290 |
+
" <td>Адвокат бывшего мэра Стамбула Экрема Имамоглу ...</td>\n",
|
| 291 |
+
" </tr>\n",
|
| 292 |
+
" <tr>\n",
|
| 293 |
+
" <th>1054</th>\n",
|
| 294 |
+
" <td>134025</td>\n",
|
| 295 |
+
" <td>rbc_news</td>\n",
|
| 296 |
+
" <td>2025-10-22</td>\n",
|
| 297 |
+
" <td>112469</td>\n",
|
| 298 |
+
" <td>Госдума приняла в первом чтении проект бюджета...</td>\n",
|
| 299 |
+
" </tr>\n",
|
| 300 |
+
" <tr>\n",
|
| 301 |
+
" <th>283</th>\n",
|
| 302 |
+
" <td>136520</td>\n",
|
| 303 |
+
" <td>rbc_news</td>\n",
|
| 304 |
+
" <td>2025-11-24</td>\n",
|
| 305 |
+
" <td>118238</td>\n",
|
| 306 |
+
" <td>Силы ПВО в период с 14:00 до 20:00 мск уничтож...</td>\n",
|
| 307 |
+
" </tr>\n",
|
| 308 |
+
" <tr>\n",
|
| 309 |
+
" <th>4406</th>\n",
|
| 310 |
+
" <td>124142</td>\n",
|
| 311 |
+
" <td>rbc_news</td>\n",
|
| 312 |
+
" <td>2025-07-15</td>\n",
|
| 313 |
+
" <td>122075</td>\n",
|
| 314 |
+
" <td>В Госдуме из-за мощных ливней затопило курилку...</td>\n",
|
| 315 |
+
" </tr>\n",
|
| 316 |
+
" <tr>\n",
|
| 317 |
+
" <th>1378</th>\n",
|
| 318 |
+
" <td>135060</td>\n",
|
| 319 |
+
" <td>rbc_news</td>\n",
|
| 320 |
+
" <td>2025-11-05</td>\n",
|
| 321 |
+
" <td>114842</td>\n",
|
| 322 |
+
" <td>Американская актриса и посол доброй воли ЮНИСЕ...</td>\n",
|
| 323 |
+
" </tr>\n",
|
| 324 |
+
" <tr>\n",
|
| 325 |
+
" <th>...</th>\n",
|
| 326 |
+
" <td>...</td>\n",
|
| 327 |
+
" <td>...</td>\n",
|
| 328 |
+
" <td>...</td>\n",
|
| 329 |
+
" <td>...</td>\n",
|
| 330 |
+
" <td>...</td>\n",
|
| 331 |
+
" </tr>\n",
|
| 332 |
+
" <tr>\n",
|
| 333 |
+
" <th>1467</th>\n",
|
| 334 |
+
" <td>130799</td>\n",
|
| 335 |
+
" <td>rbc_news</td>\n",
|
| 336 |
+
" <td>2025-09-17</td>\n",
|
| 337 |
+
" <td>135139</td>\n",
|
| 338 |
+
" <td>Кинокомпании Walt Disney, Universal и Warner B...</td>\n",
|
| 339 |
+
" </tr>\n",
|
| 340 |
+
" <tr>\n",
|
| 341 |
+
" <th>4581</th>\n",
|
| 342 |
+
" <td>123624</td>\n",
|
| 343 |
+
" <td>rbc_news</td>\n",
|
| 344 |
+
" <td>2025-07-10</td>\n",
|
| 345 |
+
" <td>121628</td>\n",
|
| 346 |
+
" <td>Главные новости к утру — на телеканале РБК</td>\n",
|
| 347 |
+
" </tr>\n",
|
| 348 |
+
" <tr>\n",
|
| 349 |
+
" <th>4281</th>\n",
|
| 350 |
+
" <td>130775</td>\n",
|
| 351 |
+
" <td>rbc_news</td>\n",
|
| 352 |
+
" <td>2025-09-16</td>\n",
|
| 353 |
+
" <td>120652</td>\n",
|
| 354 |
+
" <td>Путин примерил тепловизионные очки «Стрекоза» ...</td>\n",
|
| 355 |
+
" </tr>\n",
|
| 356 |
+
" <tr>\n",
|
| 357 |
+
" <th>241</th>\n",
|
| 358 |
+
" <td>136585</td>\n",
|
| 359 |
+
" <td>rbc_news</td>\n",
|
| 360 |
+
" <td>2025-11-25</td>\n",
|
| 361 |
+
" <td>101214</td>\n",
|
| 362 |
+
" <td>Российского посла вызвали в МИД Молдавии из-за...</td>\n",
|
| 363 |
+
" </tr>\n",
|
| 364 |
+
" <tr>\n",
|
| 365 |
+
" <th>1947</th>\n",
|
| 366 |
+
" <td>128465</td>\n",
|
| 367 |
+
" <td>rbc_news</td>\n",
|
| 368 |
+
" <td>2025-08-23</td>\n",
|
| 369 |
+
" <td>163463</td>\n",
|
| 370 |
+
" <td>Ограничения полетов ввели в аэропорту Ульяновс...</td>\n",
|
| 371 |
+
" </tr>\n",
|
| 372 |
+
" </tbody>\n",
|
| 373 |
+
"</table>\n",
|
| 374 |
+
"<p>4800 rows × 5 columns</p>\n",
|
| 375 |
+
"</div>"
|
| 376 |
+
],
|
| 377 |
+
"text/plain": [
|
| 378 |
+
" message_id channel_id message_dt views \\\n",
|
| 379 |
+
"4498 130471 rbc_news 2025-09-12 139946 \n",
|
| 380 |
+
"1054 134025 rbc_news 2025-10-22 112469 \n",
|
| 381 |
+
"283 136520 rbc_news 2025-11-24 118238 \n",
|
| 382 |
+
"4406 124142 rbc_news 2025-07-15 122075 \n",
|
| 383 |
+
"1378 135060 rbc_news 2025-11-05 114842 \n",
|
| 384 |
+
"... ... ... ... ... \n",
|
| 385 |
+
"1467 130799 rbc_news 2025-09-17 135139 \n",
|
| 386 |
+
"4581 123624 rbc_news 2025-07-10 121628 \n",
|
| 387 |
+
"4281 130775 rbc_news 2025-09-16 120652 \n",
|
| 388 |
+
"241 136585 rbc_news 2025-11-25 101214 \n",
|
| 389 |
+
"1947 128465 rbc_news 2025-08-23 163463 \n",
|
| 390 |
+
"\n",
|
| 391 |
+
" content \n",
|
| 392 |
+
"4498 Адвокат бывшего мэра Стамбула Экрема Имамоглу ... \n",
|
| 393 |
+
"1054 Госдума приняла в первом чтении проект бюджета... \n",
|
| 394 |
+
"283 Силы ПВО в период с 14:00 до 20:00 мск уничтож... \n",
|
| 395 |
+
"4406 В Госдуме из-за мощных ливней затопило курилку... \n",
|
| 396 |
+
"1378 Американская актриса и посол доброй воли ЮНИСЕ... \n",
|
| 397 |
+
"... ... \n",
|
| 398 |
+
"1467 Кинокомпании Walt Disney, Universal и Warner B... \n",
|
| 399 |
+
"4581 Главные новости к утру — на телеканале РБК \n",
|
| 400 |
+
"4281 Путин примерил тепловизионные очки «Стрекоза» ... \n",
|
| 401 |
+
"241 Российского посла вызвали в МИД Молдавии из-за... \n",
|
| 402 |
+
"1947 Ограничения полетов ввели в аэропорту Ульяновс... \n",
|
| 403 |
+
"\n",
|
| 404 |
+
"[4800 rows x 5 columns]"
|
| 405 |
+
]
|
| 406 |
+
},
|
| 407 |
+
"execution_count": 73,
|
| 408 |
+
"metadata": {},
|
| 409 |
+
"output_type": "execute_result"
|
| 410 |
+
}
|
| 411 |
+
],
|
| 412 |
+
"source": [
|
| 413 |
def strip_edges_allow_punct(s: str, allowed_punct: str = ".,!?;:-–—") -> str:
    """
    Trim non-text characters from both ends of a line.

    The left edge is cut up to the first letter or digit. The right edge is cut
    back to the last character that is a letter, a digit, or one of
    ``allowed_punct``, so sentence-ending punctuation survives while emoji,
    quotes, brackets and whitespace are dropped.

    Args:
        s: The line to clean.
        allowed_punct: Characters that may remain at the right edge. The default
            is the punctuation set the function originally hard-coded.

    Returns:
        The trimmed slice of ``s``, or an empty string when nothing is left.
    """
    keep_right = frozenset(allowed_punct)

    # Index of the first letter/digit; len(s) when there is none.
    start = next((i for i, ch in enumerate(s) if ch.isalnum()), len(s))

    # Index of the last letter/digit/allowed punctuation mark; -1 when there is none.
    end = next(
        (i for i in range(len(s) - 1, -1, -1) if s[i].isalnum() or s[i] in keep_right),
        -1,
    )

    # The whole line was junk (or only punctuation with no letters before it).
    if end < start:
        return ""

    return s[start:end + 1]
|
| 431 |
+
"\n",
|
| 432 |
+
"\n",
|
| 433 |
+
"\n",
|
| 434 |
# Promotional footers appended to posts. Each one is checked once, in this order.
_PROMO_SUFFIXES = (
    "Слушать прямой эфир",
    "Читать РБК Стиль в Telegram",
    "РБК Events, 18",
    "Подписаться | Онлайн-сомелье",
    "Читать РБК в Telegram",
    "Следить за новостями РБК в Telegram",
    "Следить за новостями РБК в МАХ",
    "Другие видео этого дня — в телеграм-канале РБК",
    "РБК в Telegram и MAX",
    "РБК в Telegram | MAX",
    "Подписаться на «РБК Спорт",
    "Картина дня — в телеграм-канале РБК",
    "Самые важные новости — в канале РБК в МАХ",
    "Больше инфографики — в телеграм-канале РБК",
    "Подписаться на «Сам ты инвестор!",
    "Читать РБК Недвижимость в Telegram",
)

# Substrings that mark a trailing attribution line (photo / data credits).
_CREDIT_MARKERS = ("Фото:", "Данные:")


def process_str(s):
    """
    Clean the text of one post.

    Steps:
      1. Trim every non-empty line with ``strip_edges_allow_punct``.
      2. Remove known promotional footers from the end of the text.
      3. Drop trailing lines that contain a photo/data credit marker.

    Args:
        s: Raw post text. Any non-string value (``None``, NaN from a missing
            CSV cell) is treated as an empty post.

    Returns:
        The cleaned text with lines joined by a newline; ``""`` if nothing remains.
    """
    # Posts without text arrive as NaN/None from pandas; there is nothing to clean.
    if not isinstance(s, str):
        return ""

    s = "\n".join(strip_edges_allow_punct(p) for p in s.split("\n") if p)

    for suf in _PROMO_SUFFIXES:
        s = s.removesuffix(suf).strip()

    parts = [p for p in s.split("\n") if p]

    # Peel credit lines off the end until a regular line (or nothing) is left.
    # This replaces a fixed-size "previous length" sentinel that silently
    # skipped the loop for posts with exactly 1000 lines.
    while parts and any(marker in parts[-1] for marker in _CREDIT_MARKERS):
        parts.pop()

    return "\n".join(parts)
|
| 466 |
+
"\n",
|
| 467 |
+
"import pandas as pd\n",
|
| 468 |
+
"\n",
|
| 469 |
def _is_regular_post(content):
    """Return True for non-empty posts whose last non-empty line has no ad label."""
    if not content:
        return False
    last_line = [p for p in content.split("\n") if p][-1]
    return not any(label in last_line for label in ["Реклама.", "Реклама,"])


# Load the raw channel export and keep only the date part of the timestamp.
rbc = pd.read_csv("src/dataset/rbc/channel_rbc_news_posts.csv")
rbc["message_dt"] = pd.to_datetime(rbc["message_dt"]).dt.date

# Keep the stored columns, force integer view counts and shuffle the rows.
kept_columns = ["message_id", "channel_id", "message_dt", "views", "content"]
rbc = rbc[kept_columns].astype({"views": int}).sample(len(rbc))

# Clean every post, then drop empty posts and posts ending with an ad label.
rbc["content"] = rbc["content"].apply(process_str)
rbc = rbc[rbc["content"].apply(_is_regular_post)]
rbc
|
| 477 |
+
]
|
| 478 |
+
},
|
| 479 |
+
{
|
| 480 |
+
"cell_type": "code",
|
| 481 |
+
"execution_count": 74,
|
| 482 |
+
"id": "85bf4cbf",
|
| 483 |
+
"metadata": {},
|
| 484 |
+
"outputs": [
|
| 485 |
+
{
|
| 486 |
+
"data": {
|
| 487 |
+
"text/plain": [
|
| 488 |
+
"800"
|
| 489 |
+
]
|
| 490 |
+
},
|
| 491 |
+
"execution_count": 74,
|
| 492 |
+
"metadata": {},
|
| 493 |
+
"output_type": "execute_result"
|
| 494 |
+
}
|
| 495 |
+
],
|
| 496 |
+
"source": [
|
| 497 |
+
"create_table_query = \"\"\"\n",
|
| 498 |
+
"drop table if exists posts;\n",
|
| 499 |
+
"\"\"\"\n",
|
| 500 |
+
"\n",
|
| 501 |
+
"try:\n",
|
| 502 |
+
" with engine.begin() as conn:\n",
|
| 503 |
+
" conn.execute(text(create_table_query))\n",
|
| 504 |
+
"except Exception as e:\n",
|
| 505 |
+
" print(\"Ошибка:\", e)\n",
|
| 506 |
+
"\n",
|
| 507 |
+
"rbc.to_sql('posts', engine, if_exists='replace', index=False)"
|
| 508 |
+
]
|
| 509 |
+
},
|
| 510 |
+
{
|
| 511 |
+
"cell_type": "code",
|
| 512 |
+
"execution_count": 22,
|
| 513 |
+
"id": "cc99786f",
|
| 514 |
+
"metadata": {},
|
| 515 |
+
"outputs": [
|
| 516 |
+
{
|
| 517 |
+
"data": {
|
| 518 |
+
"text/html": [
|
| 519 |
+
"<div>\n",
|
| 520 |
+
"<style scoped>\n",
|
| 521 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 522 |
+
" vertical-align: middle;\n",
|
| 523 |
+
" }\n",
|
| 524 |
+
"\n",
|
| 525 |
+
" .dataframe tbody tr th {\n",
|
| 526 |
+
" vertical-align: top;\n",
|
| 527 |
+
" }\n",
|
| 528 |
+
"\n",
|
| 529 |
+
" .dataframe thead th {\n",
|
| 530 |
+
" text-align: right;\n",
|
| 531 |
+
" }\n",
|
| 532 |
+
"</style>\n",
|
| 533 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 534 |
+
" <thead>\n",
|
| 535 |
+
" <tr style=\"text-align: right;\">\n",
|
| 536 |
+
" <th></th>\n",
|
| 537 |
+
" <th>message_id</th>\n",
|
| 538 |
+
" <th>channel_id</th>\n",
|
| 539 |
+
" <th>message_dt</th>\n",
|
| 540 |
+
" <th>views</th>\n",
|
| 541 |
+
" <th>content</th>\n",
|
| 542 |
+
" </tr>\n",
|
| 543 |
+
" </thead>\n",
|
| 544 |
+
" <tbody>\n",
|
| 545 |
+
" <tr>\n",
|
| 546 |
+
" <th>0</th>\n",
|
| 547 |
+
" <td>137228</td>\n",
|
| 548 |
+
" <td>rbc_news</td>\n",
|
| 549 |
+
" <td>2025-12-03</td>\n",
|
| 550 |
+
" <td>40045</td>\n",
|
| 551 |
+
" <td>Суд признал писателя Бориса Акунина (настоящее...</td>\n",
|
| 552 |
+
" </tr>\n",
|
| 553 |
+
" <tr>\n",
|
| 554 |
+
" <th>1</th>\n",
|
| 555 |
+
" <td>137226</td>\n",
|
| 556 |
+
" <td>rbc_news</td>\n",
|
| 557 |
+
" <td>2025-12-03</td>\n",
|
| 558 |
+
" <td>53463</td>\n",
|
| 559 |
+
" <td>На характере переговоров с США, прошедших нака...</td>\n",
|
| 560 |
+
" </tr>\n",
|
| 561 |
+
" <tr>\n",
|
| 562 |
+
" <th>2</th>\n",
|
| 563 |
+
" <td>137224</td>\n",
|
| 564 |
+
" <td>rbc_news</td>\n",
|
| 565 |
+
" <td>2025-12-03</td>\n",
|
| 566 |
+
" <td>56667</td>\n",
|
| 567 |
+
" <td>Еврокомиссия намерена запретить исполнение вну...</td>\n",
|
| 568 |
+
" </tr>\n",
|
| 569 |
+
" <tr>\n",
|
| 570 |
+
" <th>3</th>\n",
|
| 571 |
+
" <td>137223</td>\n",
|
| 572 |
+
" <td>rbc_news</td>\n",
|
| 573 |
+
" <td>2025-12-03</td>\n",
|
| 574 |
+
" <td>55604</td>\n",
|
| 575 |
+
" <td>🎙 Прямо сейчас в эфире Радио РБК обсуждаем пла...</td>\n",
|
| 576 |
+
" </tr>\n",
|
| 577 |
+
" <tr>\n",
|
| 578 |
+
" <th>4</th>\n",
|
| 579 |
+
" <td>137222</td>\n",
|
| 580 |
+
" <td>rbc_news</td>\n",
|
| 581 |
+
" <td>2025-12-03</td>\n",
|
| 582 |
+
" <td>61798</td>\n",
|
| 583 |
+
" <td>Коллегия Еврокомиссии одобрила «потенциальный ...</td>\n",
|
| 584 |
+
" </tr>\n",
|
| 585 |
+
" <tr>\n",
|
| 586 |
+
" <th>...</th>\n",
|
| 587 |
+
" <td>...</td>\n",
|
| 588 |
+
" <td>...</td>\n",
|
| 589 |
+
" <td>...</td>\n",
|
| 590 |
+
" <td>...</td>\n",
|
| 591 |
+
" <td>...</td>\n",
|
| 592 |
+
" </tr>\n",
|
| 593 |
+
" <tr>\n",
|
| 594 |
+
" <th>4820</th>\n",
|
| 595 |
+
" <td>116046</td>\n",
|
| 596 |
+
" <td>rbc_news</td>\n",
|
| 597 |
+
" <td>2025-04-15</td>\n",
|
| 598 |
+
" <td>106022</td>\n",
|
| 599 |
+
" <td>Репортаж телеканала РБК из Курска, который ноч...</td>\n",
|
| 600 |
+
" </tr>\n",
|
| 601 |
+
" <tr>\n",
|
| 602 |
+
" <th>4821</th>\n",
|
| 603 |
+
" <td>116045</td>\n",
|
| 604 |
+
" <td>rbc_news</td>\n",
|
| 605 |
+
" <td>2025-04-15</td>\n",
|
| 606 |
+
" <td>108803</td>\n",
|
| 607 |
+
" <td>Суд на Сахалине вынес первое решение по делу о...</td>\n",
|
| 608 |
+
" </tr>\n",
|
| 609 |
+
" <tr>\n",
|
| 610 |
+
" <th>4822</th>\n",
|
| 611 |
+
" <td>116044</td>\n",
|
| 612 |
+
" <td>rbc_news</td>\n",
|
| 613 |
+
" <td>2025-04-15</td>\n",
|
| 614 |
+
" <td>108074</td>\n",
|
| 615 |
+
" <td>Оператор национальных лотерей Франции стал отв...</td>\n",
|
| 616 |
+
" </tr>\n",
|
| 617 |
+
" <tr>\n",
|
| 618 |
+
" <th>4823</th>\n",
|
| 619 |
+
" <td>116041</td>\n",
|
| 620 |
+
" <td>rbc_news</td>\n",
|
| 621 |
+
" <td>2025-04-15</td>\n",
|
| 622 |
+
" <td>125020</td>\n",
|
| 623 |
+
" <td>Аэропорт южнокорейского города Муан регулярно ...</td>\n",
|
| 624 |
+
" </tr>\n",
|
| 625 |
+
" <tr>\n",
|
| 626 |
+
" <th>4824</th>\n",
|
| 627 |
+
" <td>116039</td>\n",
|
| 628 |
+
" <td>rbc_news</td>\n",
|
| 629 |
+
" <td>2025-04-15</td>\n",
|
| 630 |
+
" <td>156002</td>\n",
|
| 631 |
+
" <td>Первоклассники не должны заниматься уроками бо...</td>\n",
|
| 632 |
+
" </tr>\n",
|
| 633 |
+
" </tbody>\n",
|
| 634 |
+
"</table>\n",
|
| 635 |
+
"<p>4825 rows × 5 columns</p>\n",
|
| 636 |
+
"</div>"
|
| 637 |
+
],
|
| 638 |
+
"text/plain": [
|
| 639 |
+
" message_id channel_id message_dt views \\\n",
|
| 640 |
+
"0 137228 rbc_news 2025-12-03 40045 \n",
|
| 641 |
+
"1 137226 rbc_news 2025-12-03 53463 \n",
|
| 642 |
+
"2 137224 rbc_news 2025-12-03 56667 \n",
|
| 643 |
+
"3 137223 rbc_news 2025-12-03 55604 \n",
|
| 644 |
+
"4 137222 rbc_news 2025-12-03 61798 \n",
|
| 645 |
+
"... ... ... ... ... \n",
|
| 646 |
+
"4820 116046 rbc_news 2025-04-15 106022 \n",
|
| 647 |
+
"4821 116045 rbc_news 2025-04-15 108803 \n",
|
| 648 |
+
"4822 116044 rbc_news 2025-04-15 108074 \n",
|
| 649 |
+
"4823 116041 rbc_news 2025-04-15 125020 \n",
|
| 650 |
+
"4824 116039 rbc_news 2025-04-15 156002 \n",
|
| 651 |
+
"\n",
|
| 652 |
+
" content \n",
|
| 653 |
+
"0 Суд признал писателя Бориса Акунина (настоящее... \n",
|
| 654 |
+
"1 На характере переговоров с США, прошедших нака... \n",
|
| 655 |
+
"2 Еврокомиссия намерена запретить исполнение вну... \n",
|
| 656 |
+
"3 🎙 Прямо сейчас в эфире Радио РБК обсуждаем пла... \n",
|
| 657 |
+
"4 Коллегия Еврокомиссии одобрила «потенциальный ... \n",
|
| 658 |
+
"... ... \n",
|
| 659 |
+
"4820 Репортаж телеканала РБК из Курска, который ноч... \n",
|
| 660 |
+
"4821 Суд на Сахалине вынес первое решение по делу о... \n",
|
| 661 |
+
"4822 Оператор национальных лотерей Франции стал отв... \n",
|
| 662 |
+
"4823 Аэропорт южнокорейского города Муан регулярно ... \n",
|
| 663 |
+
"4824 Первоклассники не должны заниматься уроками бо... \n",
|
| 664 |
+
"\n",
|
| 665 |
+
"[4825 rows x 5 columns]"
|
| 666 |
+
]
|
| 667 |
+
},
|
| 668 |
+
"execution_count": 22,
|
| 669 |
+
"metadata": {},
|
| 670 |
+
"output_type": "execute_result"
|
| 671 |
+
}
|
| 672 |
+
],
|
| 673 |
+
"source": [
|
| 674 |
+
"with engine.connect() as conn:\n",
|
| 675 |
+
" df = pd.read_sql('''\n",
|
| 676 |
+
"select * from posts\n",
|
| 677 |
+
" ''', conn)\n",
|
| 678 |
+
"\n",
|
| 679 |
+
"df"
|
| 680 |
+
]
|
| 681 |
+
},
|
| 682 |
+
{
|
| 683 |
+
"cell_type": "code",
|
| 684 |
+
"execution_count": 7,
|
| 685 |
+
"id": "18c9a8be",
|
| 686 |
+
"metadata": {},
|
| 687 |
+
"outputs": [
|
| 688 |
+
{
|
| 689 |
+
"name": "stderr",
|
| 690 |
+
"output_type": "stream",
|
| 691 |
+
"text": [
|
| 692 |
+
"/var/folders/gv/dw4pvdvn4kqgq0tgn3m5qp940000gn/T/ipykernel_42516/890927509.py:10: UserWarning: Qdrant client version 1.16.2 is incompatible with server version 1.14.1. Major versions should match and minor version difference must not exceed 1. Set check_compatibility=False to skip version check.\n",
|
| 693 |
+
" client = QdrantClient(\n"
|
| 694 |
+
]
|
| 695 |
+
}
|
| 696 |
+
],
|
| 697 |
+
"source": [
|
| 698 |
+
"from qdrant_client import QdrantClient, models\n",
|
| 699 |
+
"from dotenv import load_dotenv\n",
|
| 700 |
+
"import numpy as np\n",
|
| 701 |
+
"import os\n",
|
| 702 |
+
"\n",
|
| 703 |
+
"load_dotenv()\n",
|
| 704 |
+
"\n",
|
| 705 |
+
"QDRANT_URL = os.getenv('QDRANT_URL')\n",
|
| 706 |
+
"\n",
|
| 707 |
+
"client = QdrantClient(\n",
|
| 708 |
+
" url=QDRANT_URL,\n",
|
| 709 |
+
")"
|
| 710 |
+
]
|
| 711 |
+
},
|
| 712 |
+
{
|
| 713 |
+
"cell_type": "code",
|
| 714 |
+
"execution_count": 9,
|
| 715 |
+
"id": "a0c3386a",
|
| 716 |
+
"metadata": {},
|
| 717 |
+
"outputs": [
|
| 718 |
+
{
|
| 719 |
+
"data": {
|
| 720 |
+
"text/plain": [
|
| 721 |
+
"True"
|
| 722 |
+
]
|
| 723 |
+
},
|
| 724 |
+
"execution_count": 9,
|
| 725 |
+
"metadata": {},
|
| 726 |
+
"output_type": "execute_result"
|
| 727 |
+
}
|
| 728 |
+
],
|
| 729 |
+
"source": [
|
| 730 |
+
"client.delete_collection(collection_name=\"recursive_USER-bge-m3\")"
|
| 731 |
+
]
|
| 732 |
+
},
|
| 733 |
+
{
|
| 734 |
+
"cell_type": "code",
|
| 735 |
+
"execution_count": 2,
|
| 736 |
+
"id": "7431dcf9",
|
| 737 |
+
"metadata": {},
|
| 738 |
+
"outputs": [
|
| 739 |
+
{
|
| 740 |
+
"data": {
|
| 741 |
+
"text/plain": [
|
| 742 |
+
"True"
|
| 743 |
+
]
|
| 744 |
+
},
|
| 745 |
+
"execution_count": 2,
|
| 746 |
+
"metadata": {},
|
| 747 |
+
"output_type": "execute_result"
|
| 748 |
+
}
|
| 749 |
+
],
|
| 750 |
+
"source": [
|
| 751 |
+
"client.create_collection(\n",
|
| 752 |
+
" collection_name=\"my_collection\",\n",
|
| 753 |
+
" vectors_config=models.VectorParams(\n",
|
| 754 |
+
" size=384, \n",
|
| 755 |
+
" distance=models.Distance.COSINE\n",
|
| 756 |
+
" )\n",
|
| 757 |
+
")\n"
|
| 758 |
+
]
|
| 759 |
+
},
|
| 760 |
+
{
|
| 761 |
+
"cell_type": "code",
|
| 762 |
+
"execution_count": null,
|
| 763 |
+
"id": "7e9e6318",
|
| 764 |
+
"metadata": {},
|
| 765 |
+
"outputs": [
|
| 766 |
+
{
|
| 767 |
+
"data": {
|
| 768 |
+
"text/plain": [
|
| 769 |
+
"UpdateResult(operation_id=3, status=<UpdateStatus.COMPLETED: 'completed'>)"
|
| 770 |
+
]
|
| 771 |
+
},
|
| 772 |
+
"execution_count": 15,
|
| 773 |
+
"metadata": {},
|
| 774 |
+
"output_type": "execute_result"
|
| 775 |
+
}
|
| 776 |
+
],
|
| 777 |
+
"source": [
|
| 778 |
+
"points = [\n",
|
| 779 |
+
" models.PointStruct(\n",
|
| 780 |
+
" id=1,\n",
|
| 781 |
+
" vector=np.ones(384).tolist(),\n",
|
| 782 |
+
" payload={\"text\": \"Пример документа 1\"}\n",
|
| 783 |
+
" ),\n",
|
| 784 |
+
" models.PointStruct(\n",
|
| 785 |
+
" id=2,\n",
|
| 786 |
+
" vector=(-np.ones(384)).tolist(),\n",
|
| 787 |
+
" payload={\"text\": \"Пример документа 2\"}\n",
|
| 788 |
+
" )\n",
|
| 789 |
+
"]\n",
|
| 790 |
+
"\n",
|
| 791 |
+
"client.upsert(\n",
|
| 792 |
+
" collection_name=\"my_collection\",\n",
|
| 793 |
+
" points=points,\n",
|
| 794 |
+
" wait=True\n",
|
| 795 |
+
")"
|
| 796 |
+
]
|
| 797 |
+
},
|
| 798 |
+
{
|
| 799 |
+
"cell_type": "code",
|
| 800 |
+
"execution_count": 3,
|
| 801 |
+
"id": "2186428b",
|
| 802 |
+
"metadata": {},
|
| 803 |
+
"outputs": [
|
| 804 |
+
{
|
| 805 |
+
"ename": "UnexpectedResponse",
|
| 806 |
+
"evalue": "Unexpected Response: 404 (Not Found)\nRaw response content:\nb'{\"status\":{\"error\":\"Not found: Collection `my_collection` doesn\\'t exist!\"},\"time\":0.004926893}'",
|
| 807 |
+
"output_type": "error",
|
| 808 |
+
"traceback": [
|
| 809 |
+
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
| 810 |
+
"\u001b[31mUnexpectedResponse\u001b[39m Traceback (most recent call last)",
|
| 811 |
+
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m results = \u001b[43mclient\u001b[49m\u001b[43m.\u001b[49m\u001b[43mquery_points\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 2\u001b[39m \u001b[43m \u001b[49m\u001b[43mcollection_name\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmy_collection\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 3\u001b[39m \u001b[43m \u001b[49m\u001b[43mquery\u001b[49m\u001b[43m=\u001b[49m\u001b[43m(\u001b[49m\u001b[43m-\u001b[49m\u001b[43mnp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mones\u001b[49m\u001b[43m(\u001b[49m\u001b[32;43m384\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtolist\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4\u001b[39m \u001b[43m \u001b[49m\u001b[43mlimit\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m1\u001b[39;49m\n\u001b[32m 5\u001b[39m \u001b[43m)\u001b[49m\n",
|
| 812 |
+
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag_tg_2025/venv/lib/python3.13/site-packages/qdrant_client/qdrant_client.py:423\u001b[39m, in \u001b[36mQdrantClient.query_points\u001b[39m\u001b[34m(self, collection_name, query, using, prefetch, query_filter, search_params, limit, offset, with_payload, with_vectors, score_threshold, lookup_from, consistency, shard_key_selector, timeout, **kwargs)\u001b[39m\n\u001b[32m 408\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 409\u001b[39m prefetch = (\n\u001b[32m 410\u001b[39m \u001b[38;5;28mnext\u001b[39m(\n\u001b[32m 411\u001b[39m \u001b[38;5;28miter\u001b[39m(\n\u001b[32m (...)\u001b[39m\u001b[32m 420\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 421\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m423\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_client\u001b[49m\u001b[43m.\u001b[49m\u001b[43mquery_points\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 424\u001b[39m \u001b[43m \u001b[49m\u001b[43mcollection_name\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcollection_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 425\u001b[39m \u001b[43m \u001b[49m\u001b[43mquery\u001b[49m\u001b[43m=\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 426\u001b[39m \u001b[43m \u001b[49m\u001b[43mprefetch\u001b[49m\u001b[43m=\u001b[49m\u001b[43mprefetch\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 427\u001b[39m \u001b[43m \u001b[49m\u001b[43mquery_filter\u001b[49m\u001b[43m=\u001b[49m\u001b[43mquery_filter\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 428\u001b[39m \u001b[43m \u001b[49m\u001b[43msearch_params\u001b[49m\u001b[43m=\u001b[49m\u001b[43msearch_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 429\u001b[39m \u001b[43m \u001b[49m\u001b[43mlimit\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlimit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 430\u001b[39m \u001b[43m 
\u001b[49m\u001b[43moffset\u001b[49m\u001b[43m=\u001b[49m\u001b[43moffset\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 431\u001b[39m \u001b[43m \u001b[49m\u001b[43mwith_payload\u001b[49m\u001b[43m=\u001b[49m\u001b[43mwith_payload\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 432\u001b[39m \u001b[43m \u001b[49m\u001b[43mwith_vectors\u001b[49m\u001b[43m=\u001b[49m\u001b[43mwith_vectors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 433\u001b[39m \u001b[43m \u001b[49m\u001b[43mscore_threshold\u001b[49m\u001b[43m=\u001b[49m\u001b[43mscore_threshold\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 434\u001b[39m \u001b[43m \u001b[49m\u001b[43musing\u001b[49m\u001b[43m=\u001b[49m\u001b[43musing\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 435\u001b[39m \u001b[43m \u001b[49m\u001b[43mlookup_from\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlookup_from\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 436\u001b[39m \u001b[43m \u001b[49m\u001b[43mconsistency\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconsistency\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 437\u001b[39m \u001b[43m \u001b[49m\u001b[43mshard_key_selector\u001b[49m\u001b[43m=\u001b[49m\u001b[43mshard_key_selector\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 438\u001b[39m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 439\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 440\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 813 |
+
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag_tg_2025/venv/lib/python3.13/site-packages/qdrant_client/qdrant_remote.py:538\u001b[39m, in \u001b[36mQdrantRemote.query_points\u001b[39m\u001b[34m(self, collection_name, query, using, prefetch, query_filter, search_params, limit, offset, with_payload, with_vectors, score_threshold, lookup_from, consistency, shard_key_selector, timeout, **kwargs)\u001b[39m\n\u001b[32m 521\u001b[39m lookup_from = GrpcToRest.convert_lookup_location(lookup_from)\n\u001b[32m 523\u001b[39m query_request = models.QueryRequest(\n\u001b[32m 524\u001b[39m shard_key=shard_key_selector,\n\u001b[32m 525\u001b[39m prefetch=prefetch,\n\u001b[32m (...)\u001b[39m\u001b[32m 535\u001b[39m lookup_from=lookup_from,\n\u001b[32m 536\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m538\u001b[39m query_result = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mhttp\u001b[49m\u001b[43m.\u001b[49m\u001b[43msearch_api\u001b[49m\u001b[43m.\u001b[49m\u001b[43mquery_points\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 539\u001b[39m \u001b[43m \u001b[49m\u001b[43mcollection_name\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcollection_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 540\u001b[39m \u001b[43m \u001b[49m\u001b[43mconsistency\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconsistency\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 541\u001b[39m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 542\u001b[39m \u001b[43m \u001b[49m\u001b[43mquery_request\u001b[49m\u001b[43m=\u001b[49m\u001b[43mquery_request\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 543\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 545\u001b[39m result: models.QueryResponse | \u001b[38;5;28;01mNone\u001b[39;00m = query_result.result\n\u001b[32m 546\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m result \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[33m\"\u001b[39m\u001b[33mSearch returned None\u001b[39m\u001b[33m\"\u001b[39m\n",
|
| 814 |
+
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag_tg_2025/venv/lib/python3.13/site-packages/qdrant_client/http/api/search_api.py:783\u001b[39m, in \u001b[36mSyncSearchApi.query_points\u001b[39m\u001b[34m(self, collection_name, consistency, timeout, query_request)\u001b[39m\n\u001b[32m 773\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mquery_points\u001b[39m(\n\u001b[32m 774\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m 775\u001b[39m collection_name: \u001b[38;5;28mstr\u001b[39m,\n\u001b[32m (...)\u001b[39m\u001b[32m 778\u001b[39m query_request: m.QueryRequest = \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m 779\u001b[39m ) -> m.InlineResponse20021:\n\u001b[32m 780\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 781\u001b[39m \u001b[33;03m Universally query points. This endpoint covers all capabilities of search, recommend, discover, filters. But also enables hybrid and multi-stage queries.\u001b[39;00m\n\u001b[32m 782\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m783\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_build_for_query_points\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 784\u001b[39m \u001b[43m \u001b[49m\u001b[43mcollection_name\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcollection_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 785\u001b[39m \u001b[43m \u001b[49m\u001b[43mconsistency\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconsistency\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 786\u001b[39m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 787\u001b[39m \u001b[43m \u001b[49m\u001b[43mquery_request\u001b[49m\u001b[43m=\u001b[49m\u001b[43mquery_request\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 788\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
|
| 815 |
+
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag_tg_2025/venv/lib/python3.13/site-packages/qdrant_client/http/api/search_api.py:181\u001b[39m, in \u001b[36m_SearchApi._build_for_query_points\u001b[39m\u001b[34m(self, collection_name, consistency, timeout, query_request)\u001b[39m\n\u001b[32m 179\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mContent-Type\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m headers:\n\u001b[32m 180\u001b[39m headers[\u001b[33m\"\u001b[39m\u001b[33mContent-Type\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[33m\"\u001b[39m\u001b[33mapplication/json\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m--> \u001b[39m\u001b[32m181\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mapi_client\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 182\u001b[39m \u001b[43m \u001b[49m\u001b[43mtype_\u001b[49m\u001b[43m=\u001b[49m\u001b[43mm\u001b[49m\u001b[43m.\u001b[49m\u001b[43mInlineResponse20021\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 183\u001b[39m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mPOST\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 184\u001b[39m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m/collections/\u001b[39;49m\u001b[38;5;132;43;01m{collection_name}\u001b[39;49;00m\u001b[33;43m/points/query\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 185\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m 
\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 186\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath_params\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpath_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 187\u001b[39m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m=\u001b[49m\u001b[43mquery_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 188\u001b[39m \u001b[43m \u001b[49m\u001b[43mcontent\u001b[49m\u001b[43m=\u001b[49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 189\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 816 |
+
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag_tg_2025/venv/lib/python3.13/site-packages/qdrant_client/http/api_client.py:95\u001b[39m, in \u001b[36mApiClient.request\u001b[39m\u001b[34m(self, type_, method, url, path_params, **kwargs)\u001b[39m\n\u001b[32m 93\u001b[39m kwargs[\u001b[33m\"\u001b[39m\u001b[33mtimeout\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[38;5;28mint\u001b[39m(kwargs[\u001b[33m\"\u001b[39m\u001b[33mparams\u001b[39m\u001b[33m\"\u001b[39m][\u001b[33m\"\u001b[39m\u001b[33mtimeout\u001b[39m\u001b[33m\"\u001b[39m])\n\u001b[32m 94\u001b[39m request = \u001b[38;5;28mself\u001b[39m._client.build_request(method, url, **kwargs)\n\u001b[32m---> \u001b[39m\u001b[32m95\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtype_\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 817 |
+
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag_tg_2025/venv/lib/python3.13/site-packages/qdrant_client/http/api_client.py:130\u001b[39m, in \u001b[36mApiClient.send\u001b[39m\u001b[34m(self, request, type_)\u001b[39m\n\u001b[32m 128\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m ValidationError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 129\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m ResponseHandlingException(e)\n\u001b[32m--> \u001b[39m\u001b[32m130\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m UnexpectedResponse.for_response(response)\n",
|
| 818 |
+
"\u001b[31mUnexpectedResponse\u001b[39m: Unexpected Response: 404 (Not Found)\nRaw response content:\nb'{\"status\":{\"error\":\"Not found: Collection `my_collection` doesn\\'t exist!\"},\"time\":0.004926893}'"
|
| 819 |
+
]
|
| 820 |
+
}
|
| 821 |
+
],
|
| 822 |
+
"source": [
|
| 823 |
+
"results = client.query_points(\n",
|
| 824 |
+
" collection_name=\"my_collection\",\n",
|
| 825 |
+
" query=(-np.ones(384)).tolist(),\n",
|
| 826 |
+
" limit=1\n",
|
| 827 |
+
")"
|
| 828 |
+
]
|
| 829 |
+
},
|
| 830 |
+
{
|
| 831 |
+
"cell_type": "code",
|
| 832 |
+
"execution_count": 43,
|
| 833 |
+
"id": "afd7344b",
|
| 834 |
+
"metadata": {},
|
| 835 |
+
"outputs": [
|
| 836 |
+
{
|
| 837 |
+
"data": {
|
| 838 |
+
"text/plain": [
|
| 839 |
+
"ScoredPoint(id=2, version=3, score=0.9999998, payload={'text': 'Пример документа 2'}, vector=None, shard_key=None, order_value=None)"
|
| 840 |
+
]
|
| 841 |
+
},
|
| 842 |
+
"execution_count": 43,
|
| 843 |
+
"metadata": {},
|
| 844 |
+
"output_type": "execute_result"
|
| 845 |
+
}
|
| 846 |
+
],
|
| 847 |
+
"source": [
|
| 848 |
+
"results.points[0]"
|
| 849 |
+
]
|
| 850 |
+
},
|
| 851 |
+
{
|
| 852 |
+
"cell_type": "code",
|
| 853 |
+
"execution_count": null,
|
| 854 |
+
"id": "8060434c",
|
| 855 |
+
"metadata": {},
|
| 856 |
+
"outputs": [],
|
| 857 |
+
"source": []
|
| 858 |
+
}
|
| 859 |
+
],
|
| 860 |
+
"metadata": {
|
| 861 |
+
"kernelspec": {
|
| 862 |
+
"display_name": "venv",
|
| 863 |
+
"language": "python",
|
| 864 |
+
"name": "python3"
|
| 865 |
+
},
|
| 866 |
+
"language_info": {
|
| 867 |
+
"codemirror_mode": {
|
| 868 |
+
"name": "ipython",
|
| 869 |
+
"version": 3
|
| 870 |
+
},
|
| 871 |
+
"file_extension": ".py",
|
| 872 |
+
"mimetype": "text/x-python",
|
| 873 |
+
"name": "python",
|
| 874 |
+
"nbconvert_exporter": "python",
|
| 875 |
+
"pygments_lexer": "ipython3",
|
| 876 |
+
"version": "3.13.3"
|
| 877 |
+
}
|
| 878 |
+
},
|
| 879 |
+
"nbformat": 4,
|
| 880 |
+
"nbformat_minor": 5
|
| 881 |
+
}
|
src/db_utils/history_utils.py
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Утилиты для работы с историей запросов в PostgreSQL
|
| 3 |
+
Используется на бэкенде для логирования запросов к RAG
|
| 4 |
+
"""
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
from typing import List, Dict, Optional
|
| 7 |
+
import json
|
| 8 |
+
|
| 9 |
+
from sqlalchemy import text
|
| 10 |
+
from sqlalchemy.exc import SQLAlchemyError
|
| 11 |
+
|
| 12 |
+
from src.config import sql_client
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def init_history_table():
    """Create the ``query_history`` table and its two indexes if missing.

    Every DDL statement is executed on the same connection obtained from
    ``sql_client.begin()``, so they share one transaction.

    Raises:
        SQLAlchemyError: printed and then re-raised unchanged.
    """
    ddl_statements = (
        """
        CREATE TABLE IF NOT EXISTS query_history (
            id SERIAL PRIMARY KEY,
            timestamp TIMESTAMP NOT NULL DEFAULT NOW(),
            dialogue_id VARCHAR(255) NOT NULL,
            query TEXT NOT NULL,
            answer TEXT NOT NULL,
            reason TEXT,
            search_period JSONB,
            metadata JSONB
        )
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_query_history_dialogue_id
        ON query_history(dialogue_id)
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_query_history_timestamp
        ON query_history(timestamp DESC)
        """,
    )

    try:
        with sql_client.begin() as conn:
            # Table first, then the indexes that depend on it.
            for ddl in ddl_statements:
                conn.execute(text(ddl))
            print("✅ Таблица query_history инициализирована")
    except SQLAlchemyError as e:
        print(f"❌ Ошибка при инициализации таблицы: {e}")
        raise
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def log_query(
    query: str,
    answer: str,
    reason: str,
    dialogue_id: Optional[str] = None,
    search_period: Optional[Dict] = None,
    metadata_: Optional[Dict] = None
) -> Optional[int]:
    """Store one question/answer pair in ``query_history``.

    Args:
        query: Text of the user's question.
        answer: Answer produced by the system.
        reason: Justification for the answer.
        dialogue_id: Dialogue identifier; when empty or omitted a
            ``single_<timestamp>`` identifier is generated.
        search_period: Search period, stored as JSONB (``{}`` when omitted).
        metadata_: Extra metadata, stored as JSONB (``{}`` when omitted).

    Returns:
        The id of the inserted row, or ``None`` when the payload cannot be
        serialized to JSON or the database reports an error.
    """
    # Generate a dialogue id when the caller did not supply one.
    if not dialogue_id:
        dialogue_id = f"single_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}"

    # Serialize before opening a transaction: json.dumps raises TypeError /
    # ValueError (not SQLAlchemyError) for unsupported values, which would
    # otherwise escape the handler below and break the "None on error" contract.
    try:
        search_period_json = json.dumps(search_period or {})
        metadata_json = json.dumps(metadata_ or {})
    except (TypeError, ValueError) as e:
        print(f"❌ Ошибка при логировании запроса: {e}")
        return None

    try:
        with sql_client.begin() as conn:
            result = conn.execute(
                text("""
                    INSERT INTO query_history
                    (timestamp, dialogue_id, query, answer, reason, search_period, metadata)
                    VALUES (:timestamp, :dialogue_id, :query, :answer, :reason,
                    CAST(:search_period AS JSONB), CAST(:metadata AS JSONB))
                    RETURNING id
                """),
                {
                    "timestamp": datetime.now(),
                    "dialogue_id": dialogue_id,
                    "query": query,
                    "answer": answer,
                    "reason": reason,
                    "search_period": search_period_json,
                    "metadata": metadata_json,
                }
            )
            query_id = result.scalar()
            return query_id
    except SQLAlchemyError as e:
        print(f"❌ Ошибка при логировании запроса: {e}")
        return None
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def get_all_history(limit: int = 100, offset: int = 0) -> List[Dict]:
    """Return one page of the query history, newest records first.

    Args:
        limit: Maximum number of records to return.
        offset: Number of newest records to skip.

    Returns:
        A list of row dicts whose ``timestamp`` is an ISO-8601 string (or
        ``None``); an empty list when the database reports an error.
    """
    sql = """
        SELECT id, timestamp, dialogue_id, query, answer, reason,
        search_period, metadata
        FROM query_history
        ORDER BY timestamp DESC
        LIMIT :limit OFFSET :offset
    """
    try:
        with sql_client.connect() as conn:
            records = conn.execute(
                text(sql), {"limit": limit, "offset": offset}
            ).mappings().all()

            history = []
            for record in records:
                item = dict(record)
                # datetime objects are not JSON serializable; expose ISO text.
                item["timestamp"] = (
                    record["timestamp"].isoformat() if record["timestamp"] else None
                )
                history.append(item)
            return history
    except SQLAlchemyError as e:
        print(f"❌ Ошибка при получении истории: {e}")
        return []
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def get_history_by_dialogue(dialogue_id: str) -> List[Dict]:
    """Return every record of one dialogue in chronological order.

    Args:
        dialogue_id: Identifier of the dialogue to load.

    Returns:
        A list of row dicts whose ``timestamp`` is an ISO-8601 string (or
        ``None``); an empty list when the database reports an error.
    """
    sql = """
        SELECT id, timestamp, dialogue_id, query, answer, reason,
        search_period, metadata
        FROM query_history
        WHERE dialogue_id = :dialogue_id
        ORDER BY timestamp ASC
    """
    try:
        with sql_client.connect() as conn:
            records = conn.execute(
                text(sql), {"dialogue_id": dialogue_id}
            ).mappings().all()

            messages = []
            for record in records:
                item = dict(record)
                item["timestamp"] = (
                    record["timestamp"].isoformat() if record["timestamp"] else None
                )
                messages.append(item)
            return messages
    except SQLAlchemyError as e:
        print(f"❌ Ошибка при получении диалога: {e}")
        return []
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def search_history(search_text: str, limit: int = 50) -> List[Dict]:
    """Find history records whose question or answer contains ``search_text``.

    The match is a case-insensitive substring search (``ILIKE``). The
    characters ``\\``, ``%`` and ``_`` in ``search_text`` are escaped so they
    match literally instead of acting as pattern wildcards.

    Args:
        search_text: Text to look for.
        limit: Maximum number of records to return.

    Returns:
        Matching row dicts, newest first, with ``timestamp`` as an ISO-8601
        string (or ``None``); an empty list when the database reports an error.
    """
    # Backslash is the default LIKE/ILIKE escape character in PostgreSQL, so
    # it must be doubled first, before it is used to escape the wildcards.
    escaped_text = (
        search_text.replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
    )

    try:
        with sql_client.connect() as conn:
            result = conn.execute(
                text("""
                    SELECT id, timestamp, dialogue_id, query, answer, reason,
                    search_period, metadata
                    FROM query_history
                    WHERE query ILIKE :search_pattern
                    OR answer ILIKE :search_pattern
                    ORDER BY timestamp DESC
                    LIMIT :limit
                """),
                {
                    "search_pattern": f"%{escaped_text}%",
                    "limit": limit
                }
            )

            rows = result.mappings().all()
            return [
                {
                    **dict(row),
                    "timestamp": row["timestamp"].isoformat() if row["timestamp"] else None
                }
                for row in rows
            ]
    except SQLAlchemyError as e:
        print(f"❌ Ошибка при поиске в истории: {e}")
        return []
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def get_history_stats() -> Dict:
    """Return aggregate statistics over ``query_history``.

    Returns:
        A dict with ``total_queries``, ``unique_dialogues``,
        ``last_query_time`` and ``first_query_time`` (the two times as
        ISO-8601 strings or ``None``); an empty dict when no row comes back
        or the database reports an error.
    """
    def _iso_or_none(moment):
        # Aggregates over an empty table are NULL, hence the None branch.
        return moment.isoformat() if moment else None

    sql = """
        SELECT
        COUNT(*) as total_queries,
        COUNT(DISTINCT dialogue_id) as unique_dialogues,
        MAX(timestamp) as last_query_time,
        MIN(timestamp) as first_query_time
        FROM query_history
    """
    try:
        with sql_client.connect() as conn:
            stats = conn.execute(text(sql)).mappings().first()
            if not stats:
                return {}
            return {
                "total_queries": stats["total_queries"],
                "unique_dialogues": stats["unique_dialogues"],
                "last_query_time": _iso_or_none(stats["last_query_time"]),
                "first_query_time": _iso_or_none(stats["first_query_time"]),
            }
    except SQLAlchemyError as e:
        print(f"❌ Ошибка при получении статистики: {e}")
        return {}
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def delete_history(dialogue_id: Optional[str] = None):
    """Delete history records.

    Args:
        dialogue_id: When given, only records of that dialogue are removed.
            When ``None`` (the default), the whole table is cleared.

    Raises:
        SQLAlchemyError: printed and then re-raised unchanged.
    """
    try:
        with sql_client.begin() as conn:
            # Compare against None explicitly: a falsy id such as "" must not
            # fall through to the "delete everything" branch.
            if dialogue_id is not None:
                conn.execute(
                    text("DELETE FROM query_history WHERE dialogue_id = :dialogue_id"),
                    {"dialogue_id": dialogue_id}
                )
                print(f"✅ История диалога {dialogue_id} удалена")
            else:
                conn.execute(text("DELETE FROM query_history"))
                print("✅ Вся история удалена")
    except SQLAlchemyError as e:
        print(f"❌ Ошибка при удалении истории: {e}")
        raise
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def get_recent_dialogues(limit: int = 10) -> List[Dict]:
    """List the dialogues with the most recent activity.

    Args:
        limit: Maximum number of dialogues to return.

    Returns:
        One dict per dialogue with ``dialogue_id``, ``message_count``,
        ``started_at`` and ``last_message_at`` (times as ISO-8601 strings or
        ``None``), ordered by latest message first; an empty list when the
        database reports an error.
    """
    def _iso_or_none(moment):
        return moment.isoformat() if moment else None

    sql = """
        SELECT
        dialogue_id,
        COUNT(*) as message_count,
        MIN(timestamp) as started_at,
        MAX(timestamp) as last_message_at
        FROM query_history
        GROUP BY dialogue_id
        ORDER BY MAX(timestamp) DESC
        LIMIT :limit
    """
    try:
        with sql_client.connect() as conn:
            records = conn.execute(text(sql), {"limit": limit}).mappings().all()

            dialogues = []
            for record in records:
                dialogues.append({
                    "dialogue_id": record["dialogue_id"],
                    "message_count": record["message_count"],
                    "started_at": _iso_or_none(record["started_at"]),
                    "last_message_at": _iso_or_none(record["last_message_at"]),
                })
            return dialogues
    except SQLAlchemyError as e:
        print(f"❌ Ошибка при получении списка диалогов: {e}")
        return []
|
src/db_utils/qdrant_utils.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import uuid
|
| 2 |
+
from typing import Literal, Any
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from qdrant_client import models
|
| 6 |
+
|
| 7 |
+
from src.config import qdrant_client
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def qdrant_create_index(
|
| 11 |
+
index_name: str,
|
| 12 |
+
dim: int,
|
| 13 |
+
distance: Literal["cosine", "euclid", "manhattan"],
|
| 14 |
+
):
|
| 15 |
+
distance_mode = None
|
| 16 |
+
match distance:
|
| 17 |
+
case "cosine":
|
| 18 |
+
distance_mode = models.Distance.COSINE
|
| 19 |
+
case "euclid":
|
| 20 |
+
distance_mode = models.Distance.EUCLID
|
| 21 |
+
case "manhattan":
|
| 22 |
+
distance_mode = models.Distance.MANHATTAN
|
| 23 |
+
case _:
|
| 24 |
+
return ValueError(distance)
|
| 25 |
+
|
| 26 |
+
return qdrant_client.create_collection(
|
| 27 |
+
collection_name=index_name,
|
| 28 |
+
vectors_config=models.VectorParams(
|
| 29 |
+
size=dim,
|
| 30 |
+
distance=distance_mode,
|
| 31 |
+
)
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def qdrant_insert(df: pd.DataFrame, index_name: str) -> Any:
    """Upsert every row of ``df`` into the collection ``index_name``.

    Rows are read through the ``doc_id``, ``text`` and ``vector`` attributes,
    i.e. ``df.columns == ["doc_id", "text", "vector"]`` is expected.

    Returns:
        Whatever ``qdrant_client.upsert`` returns.
    """
    points = []
    for chunk in df.itertuples(index=False):
        points.append(
            models.PointStruct(
                id=str(uuid.uuid4()),  # fresh unique id for each chunk
                vector=list(chunk.vector),  # the chunk's vector
                payload={
                    "doc_id": chunk.doc_id,  # link back to PostgreSQL
                    "text": chunk.text,
                },
            )
        )

    return qdrant_client.upsert(collection_name=index_name, points=points)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def qdrant_search(index_name: str, vector: list, limit: int = 5) -> list:
    """Query the collection ``index_name`` with ``vector``.

    Args:
        index_name: Collection to search.
        vector: Query vector.
        limit: Maximum number of hits requested.

    Returns:
        The result of ``qdrant_client.query_points``, passed through as is.
        NOTE(review): the ``-> list`` annotation may not match the object the
        client actually returns — confirm against the qdrant-client version.
    """
    search_args = {
        "collection_name": index_name,
        "query": vector,
        "limit": limit,
    }
    return qdrant_client.query_points(**search_args)
|
src/db_utils/sql_utils.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Hashable, Optional, Literal
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from sqlalchemy import text
|
| 5 |
+
|
| 6 |
+
from src.config import sql_client
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def sql_drop(table: str):
    """Drop ``table`` if it exists; best effort, never raises.

    Any exception is printed and swallowed.

    NOTE(review): ``table`` is interpolated into the SQL text unquoted, so it
    must come from trusted code, never from user input.
    """
    statement = f"drop table if exists {table};"
    try:
        with sql_client.begin() as conn:
            conn.execute(text(statement))
    except Exception as e:
        print("Ошибка:", e)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def sql_dump_df(
    df: pd.DataFrame,
    table: str,
    if_exists: Literal["replace", "append"] = "append",
) -> Optional[int]:
    """Write ``df`` to the SQL table ``table`` without its index.

    Args:
        df: Data to store.
        table: Target table name.
        if_exists: ``"append"`` (default) or ``"replace"``, forwarded to
            ``DataFrame.to_sql``.

    Returns:
        The value returned by ``DataFrame.to_sql``.
    """
    written = df.to_sql(table, sql_client, if_exists=if_exists, index=False)
    return written
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def sql_get_table(table: str) -> pd.DataFrame:
    """Load every row of ``table`` into a DataFrame.

    NOTE(review): ``table`` is interpolated into the SQL text unquoted, so it
    must come from trusted code.
    """
    select_all = f"""select * from {table}"""
    with sql_client.connect() as conn:
        return pd.read_sql(select_all, conn)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def sql_get_by_id(id_: Hashable) -> dict:
    """Fetch the first ``posts`` row whose ``ctid`` equals ``id_``.

    Returns:
        The row as a mapping, or ``None`` when nothing matches (the result of
        ``.mappings().first()``).
    """
    statement = text("SELECT * FROM posts WHERE ctid = :id")
    with sql_client.connect() as conn:
        result = conn.execute(statement, {"id": id_})
        post = result.mappings().first()

    return post
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def sql_get_by_ids(ids_: Hashable) -> list[dict]:
    """Fetch all ``posts`` rows whose ``ctid`` is among ``ids_``.

    ``ids_`` is bound to ``ANY(:ids)``.
    NOTE(review): presumably a list of ids despite the ``Hashable``
    annotation — confirm with callers.

    Returns:
        The matching rows as mappings (the result of ``.mappings().all()``).
    """
    statement = text("SELECT * FROM posts WHERE ctid = ANY(:ids)")
    with sql_client.connect() as conn:
        result = conn.execute(statement, {"ids": ids_})
        posts = result.mappings().all()

    return posts
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def sql_fetch_batch(batch_size: int = 16, offset: int = 0):
    """Return one ``ctid``-ordered page of ``(ctid, content)`` from ``posts``.

    Args:
        batch_size: Page size (SQL ``LIMIT``).
        offset: Number of rows to skip (SQL ``OFFSET``).

    Returns:
        The page rows as mappings.
    """
    page_query = text("""
        SELECT ctid, content
        FROM posts
        ORDER BY ctid
        LIMIT :limit
        OFFSET :offset
    """)
    bind_values = {"limit": batch_size, "offset": offset}

    with sql_client.connect() as conn:
        result = conn.execute(page_query, bind_values)
        page = result.mappings().all()

    return page
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def sql_get_by_date(message_date: str):
    """Fetch all ``posts`` rows whose ``message_dt`` equals ``message_date``.

    Returns:
        The matching rows as mappings.
    """
    by_date_query = text(
        """
        SELECT *
        FROM posts
        WHERE message_dt = :message_date
        """
    )
    with sql_client.connect() as conn:
        result = conn.execute(by_date_query, {"message_date": message_date})
        posts = result.mappings().all()

    return posts
|
src/evaluation/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Evaluation module for QA system testing.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from .qa_evaluator import (
|
| 6 |
+
QAEvaluator,
|
| 7 |
+
QuestionBatchIterator,
|
| 8 |
+
AnswerEvaluation,
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
__all__ = [
|
| 12 |
+
"QAEvaluator",
|
| 13 |
+
"QuestionBatchIterator",
|
| 14 |
+
"AnswerEvaluation",
|
| 15 |
+
]
|
| 16 |
+
|
src/evaluation/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (358 Bytes). View file
|
|
|
src/evaluation/__pycache__/qa_evaluator.cpython-313.pyc
ADDED
|
Binary file (11.4 kB). View file
|
|
|
src/evaluation/qa_evaluator.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_core.output_parsers import PydanticOutputParser
|
| 2 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 3 |
+
from langchain_openai import ChatOpenAI
|
| 4 |
+
from pydantic import BaseModel, Field
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Structured verdict for a single answer: one validity flag plus three scores,
# each constrained to the range [0.0, 1.0].
# NOTE(review): documented with comments rather than a class docstring on
# purpose — a docstring may be copied into the generated JSON schema and so
# into the parser's format instructions; confirm before adding one.
class AnswerEvaluation(BaseModel):
    # Overall verdict: is the answer correct for the question and source text.
    is_valid: bool = Field(
        description="Является ли ответ валидным и корректным относительно вопроса и оригинального текста"
    )
    # How closely the answer addresses the question.
    relevance_score: float = Field(
        description="Оценка релевантности ответа вопросу от 0.0 до 1.0", ge=0.0, le=1.0
    )
    # How much of the required information the answer covers.
    completeness_score: float = Field(
        description="Оценка полноты ответа от 0.0 до 1.0 (насколько ответ покрывает всю необходимую информацию)", ge=0.0, le=1.0
    )
    # Agreement of the answer's facts with the original text.
    factual_accuracy_score: float = Field(
        description="Оценка фактической точности ответа от 0.0 до 1.0 (соответствие фактам из оригинального текста)", ge=0.0, le=1.0
    )
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class QuestionBatchIterator:
    """One-shot iterator that yields ``questions`` in slices of ``batch_size``.

    The object is its own iterator: once exhausted it stays exhausted until
    ``reset()`` is called. ``len()`` gives the total number of batches.
    """

    def __init__(self, questions, batch_size):
        """Store the sequence and the batch size.

        Args:
            questions: Sliceable sequence with a length.
            batch_size: Positive number of items per batch.

        Raises:
            ValueError: if ``batch_size`` is not positive. A zero or negative
                size would never advance past the end of ``questions`` and
                would make ``__len__`` divide by zero.
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size!r}")
        self.questions = questions
        self.batch_size = batch_size
        self.current_idx = 0

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next batch; the last one may be shorter."""
        if self.current_idx >= len(self.questions):
            raise StopIteration

        batch = self.questions[self.current_idx:self.current_idx + self.batch_size]
        self.current_idx += self.batch_size
        return batch

    def __len__(self):
        # Ceiling division: a partial last batch still counts as a batch.
        return (len(self.questions) + self.batch_size - 1) // self.batch_size

    def reset(self):
        """Rewind to the first batch."""
        self.current_idx = 0
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class QAEvaluator:
    """Scores answers to questions about source texts with an LLM judge.

    Workflow: ``get_questions()`` hands out the questions in batches and
    remembers them together with their source texts; ``evaluate_answers()``
    then takes one answer per remembered question and returns aggregate
    metrics plus per-answer details.
    """

    def __init__(
        self,
        df,
        text_column="original_text",
        model="qwen/qwen3-next-80b-a3b-instruct",
        temperature=0.0,
        api_key=None,
        api_base="https://api.proxyapi.ru/openrouter/v1"
    ):
        """Copy ``df`` and build the judging chain.

        Args:
            df: DataFrame with the source texts and question columns.
            text_column: Column of ``df`` that holds the original text.
            model: Model name passed to ``ChatOpenAI``.
            temperature: Sampling temperature passed to ``ChatOpenAI``.
            api_key: API key passed to ``ChatOpenAI``.
            api_base: Base URL passed to ``ChatOpenAI``.
        """
        self.df = df.copy()
        self.original_text_column = text_column
        self.api_key = api_key

        self.llm = ChatOpenAI(
            model=model,
            temperature=temperature,
            openai_api_key=self.api_key,
            openai_api_base=api_base,
        )

        self._setup_evaluation_agent()
        # Both are filled in by get_questions().
        self._current_question_column = None
        self._questions_data = None

    def _setup_evaluation_agent(self):
        """Build the parser, the prompt and the prompt | llm | parser chain."""
        system_message = """Ты - эксперт по оценке качества ответов на вопросы по новостным текстам.

Твоя задача - оценить, насколько ответ корректен и полон относительно заданного вопроса и оригинального текста.

## Критерии оценки:

### is_valid (валидность):
- True: ответ корректно отвечает на вопрос и соответствует фактам из текста
- False: ответ неверный, не по теме, или содержит фактические ошибки

### relevance_score (релевантность, 0.0-1.0):
- 1.0: ответ полностью по теме вопроса
- 0.5: ответ частично по теме
- 0.0: ответ не имеет отношения к вопросу

### completeness_score (полнота, 0.0-1.0):
- 1.0: ответ содержит всю необходимую информацию
- 0.5: ответ содержит часть информации
- 0.0: ответ пустой или не содержит нужной информации

### factual_accuracy_score (фактическая точность, 0.0-1.0):
- 1.0: все факты в ответе соответствуют оригинальному тексту
- 0.5: есть небольшие неточности
- 0.0: факты в ответе противоречат оригинальному тексту

{format_instructions}"""
        human_message = """Оцени следующий ответ:

## Оригинальный текст поста:
{original_text}

## Вопрос:
{question}

## Ответ для оценки:
{answer}

Проанализируй и выдай оценку."""

        self.parser = PydanticOutputParser(pydantic_object=AnswerEvaluation)
        self.prompt = ChatPromptTemplate.from_messages([
            ("system", system_message),
            ("human", human_message),
        ])
        self.evaluation_chain = self.prompt | self.llm | self.parser

    def get_questions(self, question_column, batch_size=10):
        """Remember the questions of ``question_column`` and return them batched.

        Args:
            question_column: Column of the DataFrame that holds the questions.
            batch_size: Number of questions per batch.

        Returns:
            A ``QuestionBatchIterator`` over the question values.

        Raises:
            ValueError: if ``question_column`` is not a column of the DataFrame.
        """
        if question_column not in self.df.columns:
            raise ValueError(f"Колонка '{question_column}' не найдена в DataFrame. "
                             f"Доступные колонки: {list(self.df.columns)}")

        self._current_question_column = question_column

        # Keep index, question and source text together so that
        # evaluate_answers() can pair each answer with its context.
        collected = self._questions_data = []
        collected.extend(
            {
                "index": idx,
                "question": row[question_column],
                "original_text": row[self.original_text_column],
            }
            for idx, row in self.df.iterrows()
        )

        return QuestionBatchIterator(
            [entry["question"] for entry in collected], batch_size
        )

    def evaluate_answers(self, answers, show_progress=True):
        """Judge ``answers`` against the questions remembered by ``get_questions``.

        Args:
            answers: One answer per remembered question, in the same order.
            show_progress: Wrap the loop in a ``tqdm`` progress bar.

        Returns:
            A dict with ``total_questions``, ``valid_answers``,
            ``invalid_answers``, ``accuracy``, the three average scores,
            ``combined_score`` (their mean) and ``detailed_results`` (one
            dict per answer).

        Raises:
            ValueError: if ``get_questions`` was not called first or the
                number of answers differs from the number of questions.
        """
        if self._questions_data is None:
            raise ValueError("Сначала вызовите get_questions() для получения вопросов")

        if len(answers) != len(self._questions_data):
            raise ValueError(
                f"Количество ответов ({len(answers)}) не совпадает с количеством "
                f"вопросов ({len(self._questions_data)})"
            )

        total_questions = len(answers)

        pairs = zip(self._questions_data, answers)
        if show_progress:
            from tqdm import tqdm
            pairs = tqdm(pairs, total=len(answers), desc="Оценка ответов")

        valid_answers = 0
        invalid_answers = 0
        detailed_results = []

        for qa_item, answer in pairs:
            try:
                verdict = self._evaluate_single_answer(
                    original_text=qa_item["original_text"],
                    question=qa_item["question"],
                    answer=answer
                )
                is_valid = verdict.is_valid
                relevance = verdict.relevance_score
                completeness = verdict.completeness_score
                factual = verdict.factual_accuracy_score
            except Exception as e:
                # A failed evaluation is counted as an invalid, zero-score answer.
                print(f"Ошибка при оценке ответа: {e}")
                is_valid = False
                relevance = completeness = factual = 0.0

            if is_valid:
                valid_answers += 1
            else:
                invalid_answers += 1

            detailed_results.append({
                "index": qa_item["index"],
                "question": qa_item["question"],
                "answer": answer,
                "is_valid": is_valid,
                "relevance_score": relevance,
                "completeness_score": completeness,
                "factual_accuracy_score": factual,
            })

        def _average(values):
            return sum(values) / len(values) if values else 0.0

        avg_relevance = _average([r["relevance_score"] for r in detailed_results])
        avg_completeness = _average([r["completeness_score"] for r in detailed_results])
        avg_factual_accuracy = _average([r["factual_accuracy_score"] for r in detailed_results])

        accuracy = valid_answers / total_questions if total_questions > 0 else 0.0
        combined_score = (avg_relevance + avg_completeness + avg_factual_accuracy) / 3

        return {
            "total_questions": total_questions,
            "valid_answers": valid_answers,
            "invalid_answers": invalid_answers,
            "accuracy": accuracy,
            "avg_relevance": avg_relevance,
            "avg_completeness": avg_completeness,
            "avg_factual_accuracy": avg_factual_accuracy,
            "combined_score": combined_score,
            "detailed_results": detailed_results,
        }

    def _evaluate_single_answer(self, original_text, question, answer):
        """Return the judge's evaluation of one answer.

        A ``None`` or whitespace-only answer is scored as invalid with zero
        scores without invoking the chain.
        """
        is_blank = answer is None or (isinstance(answer, str) and answer.strip() == "")
        if not is_blank:
            return self.evaluation_chain.invoke({
                "original_text": original_text,
                "question": question,
                "answer": answer,
                "format_instructions": self.parser.get_format_instructions()
            })

        return AnswerEvaluation(
            is_valid=False,
            relevance_score=0.0,
            completeness_score=0.0,
            factual_accuracy_score=0.0,
        )

    def get_detailed_results_df(self, metrics):
        """Turn ``metrics["detailed_results"]`` into a DataFrame."""
        details = metrics["detailed_results"]
        return pd.DataFrame(details)
|
src/evaluation/score_system.ipynb
ADDED
|
@@ -0,0 +1,687 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "477f0fa2",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [
|
| 9 |
+
{
|
| 10 |
+
"data": {
|
| 11 |
+
"text/plain": [
|
| 12 |
+
"True"
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
"execution_count": 1,
|
| 16 |
+
"metadata": {},
|
| 17 |
+
"output_type": "execute_result"
|
| 18 |
+
}
|
| 19 |
+
],
|
| 20 |
+
"source": [
|
| 21 |
+
"import pandas as pd\n",
|
| 22 |
+
"import os \n",
|
| 23 |
+
"from dotenv import load_dotenv\n",
|
| 24 |
+
"from tqdm import tqdm\n",
|
| 25 |
+
"\n",
|
| 26 |
+
"load_dotenv()"
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"cell_type": "code",
|
| 31 |
+
"execution_count": 2,
|
| 32 |
+
"id": "ce00fa3f-b017-4dd9-b39b-fc106ff59c61",
|
| 33 |
+
"metadata": {},
|
| 34 |
+
"outputs": [],
|
| 35 |
+
"source": [
|
| 36 |
+
"import sys\n",
|
| 37 |
+
"from pathlib import Path\n",
|
| 38 |
+
"\n",
|
| 39 |
+
"SRC_ROOT = Path().resolve().parents[1]\n",
|
| 40 |
+
"\n",
|
| 41 |
+
"sys.path.insert(0, str(SRC_ROOT))\n"
|
| 42 |
+
]
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"cell_type": "code",
|
| 46 |
+
"execution_count": 3,
|
| 47 |
+
"id": "28a9a5e2",
|
| 48 |
+
"metadata": {},
|
| 49 |
+
"outputs": [
|
| 50 |
+
{
|
| 51 |
+
"name": "stderr",
|
| 52 |
+
"output_type": "stream",
|
| 53 |
+
"text": [
|
| 54 |
+
"/Users/kirill/rag_tg_2025/env/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 55 |
+
" from .autonotebook import tqdm as notebook_tqdm\n",
|
| 56 |
+
"/Users/kirill/rag_tg_2025/src/config.py:41: UserWarning: Qdrant client version 1.16.2 is incompatible with server version 1.14.1. Major versions should match and minor version difference must not exceed 1. Set check_compatibility=False to skip version check.\n",
|
| 57 |
+
" qdrant_client = QdrantClient(url=QDRANT_URL)\n"
|
| 58 |
+
]
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"source": [
|
| 62 |
+
"from src.evaluation.qa_evaluator import QAEvaluator\n",
|
| 63 |
+
"from src.rag.rag import RAG"
|
| 64 |
+
]
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"cell_type": "code",
|
| 68 |
+
"execution_count": 4,
|
| 69 |
+
"id": "8c548b19-ad11-4515-95dc-1ae687baaff7",
|
| 70 |
+
"metadata": {},
|
| 71 |
+
"outputs": [],
|
| 72 |
+
"source": [
|
| 73 |
+
"rag = RAG(\n",
|
| 74 |
+
" embed_model_name = \"Qwen/Qwen3-Embedding-0.6B\",\n",
|
| 75 |
+
" embed_index_name = \"recursive_Qwen3-Embedding-0.6B\"\n",
|
| 76 |
+
")"
|
| 77 |
+
]
|
| 78 |
+
},
|
| 79 |
+
{
|
| 80 |
+
"cell_type": "code",
|
| 81 |
+
"execution_count": 5,
|
| 82 |
+
"id": "b12d7862-83e2-4fde-a322-892512303e25",
|
| 83 |
+
"metadata": {},
|
| 84 |
+
"outputs": [],
|
| 85 |
+
"source": [
|
| 86 |
+
"test_cases = pd.read_csv(r'/Users/kirill/rag_tg_2025/src/dataset/test_cases.csv')"
|
| 87 |
+
]
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"cell_type": "code",
|
| 91 |
+
"execution_count": 6,
|
| 92 |
+
"id": "05fc2a82-27d4-4103-9044-a5b0298956f4",
|
| 93 |
+
"metadata": {},
|
| 94 |
+
"outputs": [
|
| 95 |
+
{
|
| 96 |
+
"data": {
|
| 97 |
+
"text/html": [
|
| 98 |
+
"<div>\n",
|
| 99 |
+
"<style scoped>\n",
|
| 100 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 101 |
+
" vertical-align: middle;\n",
|
| 102 |
+
" }\n",
|
| 103 |
+
"\n",
|
| 104 |
+
" .dataframe tbody tr th {\n",
|
| 105 |
+
" vertical-align: top;\n",
|
| 106 |
+
" }\n",
|
| 107 |
+
"\n",
|
| 108 |
+
" .dataframe thead th {\n",
|
| 109 |
+
" text-align: right;\n",
|
| 110 |
+
" }\n",
|
| 111 |
+
"</style>\n",
|
| 112 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 113 |
+
" <thead>\n",
|
| 114 |
+
" <tr style=\"text-align: right;\">\n",
|
| 115 |
+
" <th></th>\n",
|
| 116 |
+
" <th>message_id</th>\n",
|
| 117 |
+
" <th>original_text</th>\n",
|
| 118 |
+
" <th>strict_question</th>\n",
|
| 119 |
+
" <th>real_question</th>\n",
|
| 120 |
+
" </tr>\n",
|
| 121 |
+
" </thead>\n",
|
| 122 |
+
" <tbody>\n",
|
| 123 |
+
" <tr>\n",
|
| 124 |
+
" <th>0</th>\n",
|
| 125 |
+
" <td>130738</td>\n",
|
| 126 |
+
" <td>Итальянский суд принял решение экстрадировать ...</td>\n",
|
| 127 |
+
" <td>Какое решение приняло итальянское судопроизвод...</td>\n",
|
| 128 |
+
" <td>Что там с Кузнецовым — его в Германию выдадут ...</td>\n",
|
| 129 |
+
" </tr>\n",
|
| 130 |
+
" <tr>\n",
|
| 131 |
+
" <th>1</th>\n",
|
| 132 |
+
" <td>129361</td>\n",
|
| 133 |
+
" <td>Пять пассажиров автобуса №793 пострадали в ДТП...</td>\n",
|
| 134 |
+
" <td>Сколько пассажиров автобуса №793 пострадали в ...</td>\n",
|
| 135 |
+
" <td>Сколько человек в автобусе 793 пострадали, ког...</td>\n",
|
| 136 |
+
" </tr>\n",
|
| 137 |
+
" <tr>\n",
|
| 138 |
+
" <th>2</th>\n",
|
| 139 |
+
" <td>133468</td>\n",
|
| 140 |
+
" <td>Владимир Путин утвердил концепцию государствен...</td>\n",
|
| 141 |
+
" <td>Кто утвердил концепцию государственной миграци...</td>\n",
|
| 142 |
+
" <td>Кто там утвердил новую миграционную концепцию ...</td>\n",
|
| 143 |
+
" </tr>\n",
|
| 144 |
+
" <tr>\n",
|
| 145 |
+
" <th>3</th>\n",
|
| 146 |
+
" <td>123139</td>\n",
|
| 147 |
+
" <td>Генпрокуратура и Минюст подали в Верховный суд...</td>\n",
|
| 148 |
+
" <td>Какое юридическое действие предприняли Генерал...</td>\n",
|
| 149 |
+
" <td>Что Генпрокуратура и Минюст сделали с сатанист...</td>\n",
|
| 150 |
+
" </tr>\n",
|
| 151 |
+
" <tr>\n",
|
| 152 |
+
" <th>4</th>\n",
|
| 153 |
+
" <td>129894</td>\n",
|
| 154 |
+
" <td>Обломки дрона обнаружили польские пограничники...</td>\n",
|
| 155 |
+
" <td>Где и кем был обнаружен непилотируемый летател...</td>\n",
|
| 156 |
+
" <td>Что там польские пограничники нашли рядом с Бе...</td>\n",
|
| 157 |
+
" </tr>\n",
|
| 158 |
+
" </tbody>\n",
|
| 159 |
+
"</table>\n",
|
| 160 |
+
"</div>"
|
| 161 |
+
],
|
| 162 |
+
"text/plain": [
|
| 163 |
+
" message_id original_text \\\n",
|
| 164 |
+
"0 130738 Итальянский суд принял решение экстрадировать ... \n",
|
| 165 |
+
"1 129361 Пять пассажиров автобуса №793 пострадали в ДТП... \n",
|
| 166 |
+
"2 133468 Владимир Путин утвердил концепцию государствен... \n",
|
| 167 |
+
"3 123139 Генпрокуратура и Минюст подали в Верховный суд... \n",
|
| 168 |
+
"4 129894 Обломки дрона обнаружили польские пограничники... \n",
|
| 169 |
+
"\n",
|
| 170 |
+
" strict_question \\\n",
|
| 171 |
+
"0 Какое решение приняло итальянское судопроизвод... \n",
|
| 172 |
+
"1 Сколько пассажиров автобуса №793 пострадали в ... \n",
|
| 173 |
+
"2 Кто утвердил концепцию государственной миграци... \n",
|
| 174 |
+
"3 Какое юридическое действие предприняли Генерал... \n",
|
| 175 |
+
"4 Где и кем был обнаружен непилотируемый летател... \n",
|
| 176 |
+
"\n",
|
| 177 |
+
" real_question \n",
|
| 178 |
+
"0 Что там с Кузнецовым — его в Германию выдадут ... \n",
|
| 179 |
+
"1 Сколько человек в автобусе 793 пострадали, ког... \n",
|
| 180 |
+
"2 Кто там утвердил новую миграционную концепцию ... \n",
|
| 181 |
+
"3 Что Генпрокуратура и Минюст сделали с сатанист... \n",
|
| 182 |
+
"4 Что там польские пограничники нашли рядом с Бе... "
|
| 183 |
+
]
|
| 184 |
+
},
|
| 185 |
+
"execution_count": 6,
|
| 186 |
+
"metadata": {},
|
| 187 |
+
"output_type": "execute_result"
|
| 188 |
+
}
|
| 189 |
+
],
|
| 190 |
+
"source": [
|
| 191 |
+
"test_cases.head()"
|
| 192 |
+
]
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"cell_type": "code",
|
| 196 |
+
"execution_count": 7,
|
| 197 |
+
"id": "93c8f6dc-b210-43b6-ba0d-f555a82e1c93",
|
| 198 |
+
"metadata": {},
|
| 199 |
+
"outputs": [],
|
| 200 |
+
"source": [
|
| 201 |
+
"evaluator = QAEvaluator(\n",
|
| 202 |
+
" df=test_cases,\n",
|
| 203 |
+
" text_column=\"original_text\",\n",
|
| 204 |
+
" temperature=0.0,\n",
|
| 205 |
+
" api_key=os.getenv(\"OPENROUTER_API_KEY\")\n",
|
| 206 |
+
")\n"
|
| 207 |
+
]
|
| 208 |
+
},
|
| 209 |
+
{
|
| 210 |
+
"cell_type": "code",
|
| 211 |
+
"execution_count": 8,
|
| 212 |
+
"id": "4e5b93dd-98a2-44ae-862b-41af1ed9f15d",
|
| 213 |
+
"metadata": {},
|
| 214 |
+
"outputs": [
|
| 215 |
+
{
|
| 216 |
+
"name": "stdout",
|
| 217 |
+
"output_type": "stream",
|
| 218 |
+
"text": [
|
| 219 |
+
"Всего батчей: 11\n"
|
| 220 |
+
]
|
| 221 |
+
}
|
| 222 |
+
],
|
| 223 |
+
"source": [
|
| 224 |
+
"batch_size = 16\n",
|
| 225 |
+
"question_iterator = evaluator.get_questions(\n",
|
| 226 |
+
" question_column=\"strict_question\", # или \"real_question\"\n",
|
| 227 |
+
" batch_size=batch_size\n",
|
| 228 |
+
")\n",
|
| 229 |
+
"\n",
|
| 230 |
+
"print(f\"Всего батчей: {len(question_iterator)}\")"
|
| 231 |
+
]
|
| 232 |
+
},
|
| 233 |
+
{
|
| 234 |
+
"cell_type": "code",
|
| 235 |
+
"execution_count": 10,
|
| 236 |
+
"id": "ffc71d69-c387-4cb4-8dc9-23c197398440",
|
| 237 |
+
"metadata": {},
|
| 238 |
+
"outputs": [
|
| 239 |
+
{
|
| 240 |
+
"name": "stderr",
|
| 241 |
+
"output_type": "stream",
|
| 242 |
+
"text": [
|
| 243 |
+
"100%|█████████████████████████████████████████████████████████████████████████████████| 167/167 [09:47<00:00, 3.52s/it]\n"
|
| 244 |
+
]
|
| 245 |
+
}
|
| 246 |
+
],
|
| 247 |
+
"source": [
|
| 248 |
+
"generated_answers = []\n",
|
| 249 |
+
"\n",
|
| 250 |
+
"for query in tqdm(test_cases[\"real_question\"]):\n",
|
| 251 |
+
" results = rag.invoke(query)\n",
|
| 252 |
+
" generated_answers.append(\n",
|
| 253 |
+
" (results[\"answer\"], results[\"reason\"])\n",
|
| 254 |
+
" )"
|
| 255 |
+
]
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"cell_type": "code",
|
| 259 |
+
"execution_count": 11,
|
| 260 |
+
"id": "117fabac-a952-40f6-ad3d-0cf74994040d",
|
| 261 |
+
"metadata": {},
|
| 262 |
+
"outputs": [
|
| 263 |
+
{
|
| 264 |
+
"data": {
|
| 265 |
+
"text/html": [
|
| 266 |
+
"<div>\n",
|
| 267 |
+
"<style scoped>\n",
|
| 268 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 269 |
+
" vertical-align: middle;\n",
|
| 270 |
+
" }\n",
|
| 271 |
+
"\n",
|
| 272 |
+
" .dataframe tbody tr th {\n",
|
| 273 |
+
" vertical-align: top;\n",
|
| 274 |
+
" }\n",
|
| 275 |
+
"\n",
|
| 276 |
+
" .dataframe thead th {\n",
|
| 277 |
+
" text-align: right;\n",
|
| 278 |
+
" }\n",
|
| 279 |
+
"</style>\n",
|
| 280 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 281 |
+
" <thead>\n",
|
| 282 |
+
" <tr style=\"text-align: right;\">\n",
|
| 283 |
+
" <th></th>\n",
|
| 284 |
+
" <th>message_id</th>\n",
|
| 285 |
+
" <th>original_text</th>\n",
|
| 286 |
+
" <th>strict_question</th>\n",
|
| 287 |
+
" <th>real_question</th>\n",
|
| 288 |
+
" <th>generated_answer</th>\n",
|
| 289 |
+
" <th>generated_reason</th>\n",
|
| 290 |
+
" </tr>\n",
|
| 291 |
+
" </thead>\n",
|
| 292 |
+
" <tbody>\n",
|
| 293 |
+
" <tr>\n",
|
| 294 |
+
" <th>0</th>\n",
|
| 295 |
+
" <td>130738</td>\n",
|
| 296 |
+
" <td>Итальянский суд принял решение экстрадировать ...</td>\n",
|
| 297 |
+
" <td>Какое решение приняло итальянское судопроизвод...</td>\n",
|
| 298 |
+
" <td>Что там с Кузнецовым — его в Германию выдадут ...</td>\n",
|
| 299 |
+
" <td>Нет, его не выдадут в Германию — решение об эк...</td>\n",
|
| 300 |
+
" <td>Кассационный суд Италии отменил решение апелля...</td>\n",
|
| 301 |
+
" </tr>\n",
|
| 302 |
+
" <tr>\n",
|
| 303 |
+
" <th>1</th>\n",
|
| 304 |
+
" <td>129361</td>\n",
|
| 305 |
+
" <td>Пять пассажиров автобуса №793 пострадали в ДТП...</td>\n",
|
| 306 |
+
" <td>Сколько пассажиров автобуса №793 пострадали в ...</td>\n",
|
| 307 |
+
" <td>Сколько человек в автобусе 793 пострадали, ког...</td>\n",
|
| 308 |
+
" <td>Пять</td>\n",
|
| 309 |
+
" <td>В контексте в первом пункте указано, что пять ...</td>\n",
|
| 310 |
+
" </tr>\n",
|
| 311 |
+
" <tr>\n",
|
| 312 |
+
" <th>2</th>\n",
|
| 313 |
+
" <td>133468</td>\n",
|
| 314 |
+
" <td>Владимир Путин утвердил концепцию государствен...</td>\n",
|
| 315 |
+
" <td>Кто утвердил концепцию государственной миграци...</td>\n",
|
| 316 |
+
" <td>Кто там утвердил новую миграционную концепцию ...</td>\n",
|
| 317 |
+
" <td>Владимир Путин</td>\n",
|
| 318 |
+
" <td>В контексте в первом пункте прямо указано, что...</td>\n",
|
| 319 |
+
" </tr>\n",
|
| 320 |
+
" <tr>\n",
|
| 321 |
+
" <th>3</th>\n",
|
| 322 |
+
" <td>123139</td>\n",
|
| 323 |
+
" <td>Генпрокуратура и Минюст подали в Верховный суд...</td>\n",
|
| 324 |
+
" <td>Какое юридическое действие предприняли Генерал...</td>\n",
|
| 325 |
+
" <td>Что Генпрокуратура и Минюст сделали с сатанист...</td>\n",
|
| 326 |
+
" <td>Генпрокуратура и Минюст подали в Верховный суд...</td>\n",
|
| 327 |
+
" <td>В контексте указано, что Генеральная прокурату...</td>\n",
|
| 328 |
+
" </tr>\n",
|
| 329 |
+
" <tr>\n",
|
| 330 |
+
" <th>4</th>\n",
|
| 331 |
+
" <td>129894</td>\n",
|
| 332 |
+
" <td>Обломки дрона обнаружили польские пограничники...</td>\n",
|
| 333 |
+
" <td>Где и кем был обнаружен непилотируемый летател...</td>\n",
|
| 334 |
+
" <td>Что там польские пограничники нашли рядом с Бе...</td>\n",
|
| 335 |
+
" <td>Да, польские пограничники нашли дрон с надпися...</td>\n",
|
| 336 |
+
" <td>Согласно контексту (пункт 1), обломки дрона об...</td>\n",
|
| 337 |
+
" </tr>\n",
|
| 338 |
+
" </tbody>\n",
|
| 339 |
+
"</table>\n",
|
| 340 |
+
"</div>"
|
| 341 |
+
],
|
| 342 |
+
"text/plain": [
|
| 343 |
+
" message_id original_text \\\n",
|
| 344 |
+
"0 130738 Итальянский суд принял решение экстрадировать ... \n",
|
| 345 |
+
"1 129361 Пять пассажиров автобуса №793 пострадали в ДТП... \n",
|
| 346 |
+
"2 133468 Владимир Путин утвердил концепцию государствен... \n",
|
| 347 |
+
"3 123139 Генпрокуратура и Минюст подали в Верховный суд... \n",
|
| 348 |
+
"4 129894 Обломки дрона обнаружили польские пограничники... \n",
|
| 349 |
+
"\n",
|
| 350 |
+
" strict_question \\\n",
|
| 351 |
+
"0 Какое решение приняло итальянское судопроизвод... \n",
|
| 352 |
+
"1 Сколько пассажиров автобуса №793 пострадали в ... \n",
|
| 353 |
+
"2 Кто утвердил концепцию государственной миграци... \n",
|
| 354 |
+
"3 Какое юридическое действие предприняли Генерал... \n",
|
| 355 |
+
"4 Где и кем был обнаружен непилотируемый летател... \n",
|
| 356 |
+
"\n",
|
| 357 |
+
" real_question \\\n",
|
| 358 |
+
"0 Что там с Кузнецовым — его в Германию выдадут ... \n",
|
| 359 |
+
"1 Сколько человек в автобусе 793 пострадали, ког... \n",
|
| 360 |
+
"2 Кто там утвердил новую миграционную концепцию ... \n",
|
| 361 |
+
"3 Что Генпрокуратура и Минюст сделали с сатанист... \n",
|
| 362 |
+
"4 Что там польские пограничники нашли рядом с Бе... \n",
|
| 363 |
+
"\n",
|
| 364 |
+
" generated_answer \\\n",
|
| 365 |
+
"0 Нет, его не выдадут в Германию — решение об эк... \n",
|
| 366 |
+
"1 Пять \n",
|
| 367 |
+
"2 Владимир Путин \n",
|
| 368 |
+
"3 Генпрокуратура и Минюст подали в Верховный суд... \n",
|
| 369 |
+
"4 Да, польские пограничники нашли дрон с надпися... \n",
|
| 370 |
+
"\n",
|
| 371 |
+
" generated_reason \n",
|
| 372 |
+
"0 Кассационный суд Италии отменил решение апелля... \n",
|
| 373 |
+
"1 В контексте в первом пункте указано, что пять ... \n",
|
| 374 |
+
"2 В контексте в первом пункте прямо указано, что... \n",
|
| 375 |
+
"3 В контексте указано, что Генеральная прокурату... \n",
|
| 376 |
+
"4 Согласно контексту (пункт 1), обломки дрона об... "
|
| 377 |
+
]
|
| 378 |
+
},
|
| 379 |
+
"execution_count": 11,
|
| 380 |
+
"metadata": {},
|
| 381 |
+
"output_type": "execute_result"
|
| 382 |
+
}
|
| 383 |
+
],
|
| 384 |
+
"source": [
|
| 385 |
+
"test_cases[\"generated_answer\"] = [p[0] for p in generated_answers]\n",
|
| 386 |
+
"test_cases[\"generated_reason\"] = [p[1] for p in generated_answers]\n",
|
| 387 |
+
"test_cases.head()"
|
| 388 |
+
]
|
| 389 |
+
},
|
| 390 |
+
{
|
| 391 |
+
"cell_type": "code",
|
| 392 |
+
"execution_count": 12,
|
| 393 |
+
"id": "5e32d246-9a16-4ffc-95c8-542a400efe33",
|
| 394 |
+
"metadata": {},
|
| 395 |
+
"outputs": [
|
| 396 |
+
{
|
| 397 |
+
"name": "stderr",
|
| 398 |
+
"output_type": "stream",
|
| 399 |
+
"text": [
|
| 400 |
+
"Оценка ответов: 4%|██▊ | 7/167 [00:18<10:24, 3.90s/it]"
|
| 401 |
+
]
|
| 402 |
+
},
|
| 403 |
+
{
|
| 404 |
+
"name": "stdout",
|
| 405 |
+
"output_type": "stream",
|
| 406 |
+
"text": [
|
| 407 |
+
"Ошибка при оценке ответа: Failed to parse AnswerEvaluation from completion {}. Got: 4 validation errors for AnswerEvaluation\n",
|
| 408 |
+
"is_valid\n",
|
| 409 |
+
" Field required [type=missing, input_value={}, input_type=dict]\n",
|
| 410 |
+
" For further information visit https://errors.pydantic.dev/2.9/v/missing\n",
|
| 411 |
+
"relevance_score\n",
|
| 412 |
+
" Field required [type=missing, input_value={}, input_type=dict]\n",
|
| 413 |
+
" For further information visit https://errors.pydantic.dev/2.9/v/missing\n",
|
| 414 |
+
"completeness_score\n",
|
| 415 |
+
" Field required [type=missing, input_value={}, input_type=dict]\n",
|
| 416 |
+
" For further information visit https://errors.pydantic.dev/2.9/v/missing\n",
|
| 417 |
+
"factual_accuracy_score\n",
|
| 418 |
+
" Field required [type=missing, input_value={}, input_type=dict]\n",
|
| 419 |
+
" For further information visit https://errors.pydantic.dev/2.9/v/missing\n",
|
| 420 |
+
"For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE \n"
|
| 421 |
+
]
|
| 422 |
+
},
|
| 423 |
+
{
|
| 424 |
+
"name": "stderr",
|
| 425 |
+
"output_type": "stream",
|
| 426 |
+
"text": [
|
| 427 |
+
"Оценка ответов: 100%|█████████████████████████████████████████████████████████████████| 167/167 [03:53<00:00, 1.40s/it]\n"
|
| 428 |
+
]
|
| 429 |
+
}
|
| 430 |
+
],
|
| 431 |
+
"source": [
|
| 432 |
+
"answers = test_cases[\"generated_answer\"]\n",
|
| 433 |
+
"metrics = evaluator.evaluate_answers(answers, show_progress=True)"
|
| 434 |
+
]
|
| 435 |
+
},
|
| 436 |
+
{
|
| 437 |
+
"cell_type": "code",
|
| 438 |
+
"execution_count": 13,
|
| 439 |
+
"id": "342b07de-a8e5-4336-a83d-611abd192aff",
|
| 440 |
+
"metadata": {},
|
| 441 |
+
"outputs": [
|
| 442 |
+
{
|
| 443 |
+
"data": {
|
| 444 |
+
"text/html": [
|
| 445 |
+
"<div>\n",
|
| 446 |
+
"<style scoped>\n",
|
| 447 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 448 |
+
" vertical-align: middle;\n",
|
| 449 |
+
" }\n",
|
| 450 |
+
"\n",
|
| 451 |
+
" .dataframe tbody tr th {\n",
|
| 452 |
+
" vertical-align: top;\n",
|
| 453 |
+
" }\n",
|
| 454 |
+
"\n",
|
| 455 |
+
" .dataframe thead th {\n",
|
| 456 |
+
" text-align: right;\n",
|
| 457 |
+
" }\n",
|
| 458 |
+
"</style>\n",
|
| 459 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 460 |
+
" <thead>\n",
|
| 461 |
+
" <tr style=\"text-align: right;\">\n",
|
| 462 |
+
" <th></th>\n",
|
| 463 |
+
" <th>total_questions</th>\n",
|
| 464 |
+
" <th>valid_answers</th>\n",
|
| 465 |
+
" <th>accuracy</th>\n",
|
| 466 |
+
" <th>avg_relevance</th>\n",
|
| 467 |
+
" <th>avg_completeness</th>\n",
|
| 468 |
+
" <th>avg_factual_accuracy</th>\n",
|
| 469 |
+
" <th>combined_score</th>\n",
|
| 470 |
+
" </tr>\n",
|
| 471 |
+
" </thead>\n",
|
| 472 |
+
" <tbody>\n",
|
| 473 |
+
" <tr>\n",
|
| 474 |
+
" <th>0</th>\n",
|
| 475 |
+
" <td>167</td>\n",
|
| 476 |
+
" <td>125</td>\n",
|
| 477 |
+
" <td>0.748503</td>\n",
|
| 478 |
+
" <td>0.879641</td>\n",
|
| 479 |
+
" <td>0.697605</td>\n",
|
| 480 |
+
" <td>0.805389</td>\n",
|
| 481 |
+
" <td>0.794212</td>\n",
|
| 482 |
+
" </tr>\n",
|
| 483 |
+
" </tbody>\n",
|
| 484 |
+
"</table>\n",
|
| 485 |
+
"</div>"
|
| 486 |
+
],
|
| 487 |
+
"text/plain": [
|
| 488 |
+
" total_questions valid_answers accuracy avg_relevance avg_completeness \\\n",
|
| 489 |
+
"0 167 125 0.748503 0.879641 0.697605 \n",
|
| 490 |
+
"\n",
|
| 491 |
+
" avg_factual_accuracy combined_score \n",
|
| 492 |
+
"0 0.805389 0.794212 "
|
| 493 |
+
]
|
| 494 |
+
},
|
| 495 |
+
"execution_count": 13,
|
| 496 |
+
"metadata": {},
|
| 497 |
+
"output_type": "execute_result"
|
| 498 |
+
}
|
| 499 |
+
],
|
| 500 |
+
"source": [
|
| 501 |
+
"metrics_df = pd.DataFrame(\n",
|
| 502 |
+
" data=[(\n",
|
| 503 |
+
" metrics[\"total_questions\"], metrics[\"valid_answers\"], metrics[\"accuracy\"],\\\n",
|
| 504 |
+
" metrics[\"avg_relevance\"], metrics[\"avg_completeness\"], metrics[\"avg_factual_accuracy\"], \\\n",
|
| 505 |
+
" metrics[\"combined_score\"]\n",
|
| 506 |
+
" )],\n",
|
| 507 |
+
" columns=[\"total_questions\", \"valid_answers\", \"accuracy\", \"avg_relevance\", \"avg_completeness\", \\\n",
|
| 508 |
+
" \"avg_factual_accuracy\", \"combined_score\"]\n",
|
| 509 |
+
")\n",
|
| 510 |
+
"metrics_df"
|
| 511 |
+
]
|
| 512 |
+
},
|
| 513 |
+
{
|
| 514 |
+
"cell_type": "code",
|
| 515 |
+
"execution_count": 32,
|
| 516 |
+
"id": "488b4f03-556a-42a5-811a-1fbe36c5f7be",
|
| 517 |
+
"metadata": {},
|
| 518 |
+
"outputs": [
|
| 519 |
+
{
|
| 520 |
+
"data": {
|
| 521 |
+
"text/html": [
|
| 522 |
+
"<div>\n",
|
| 523 |
+
"<style scoped>\n",
|
| 524 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 525 |
+
" vertical-align: middle;\n",
|
| 526 |
+
" }\n",
|
| 527 |
+
"\n",
|
| 528 |
+
" .dataframe tbody tr th {\n",
|
| 529 |
+
" vertical-align: top;\n",
|
| 530 |
+
" }\n",
|
| 531 |
+
"\n",
|
| 532 |
+
" .dataframe thead th {\n",
|
| 533 |
+
" text-align: right;\n",
|
| 534 |
+
" }\n",
|
| 535 |
+
"</style>\n",
|
| 536 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 537 |
+
" <thead>\n",
|
| 538 |
+
" <tr style=\"text-align: right;\">\n",
|
| 539 |
+
" <th></th>\n",
|
| 540 |
+
" <th>index</th>\n",
|
| 541 |
+
" <th>question</th>\n",
|
| 542 |
+
" <th>answer</th>\n",
|
| 543 |
+
" <th>is_valid</th>\n",
|
| 544 |
+
" <th>relevance_score</th>\n",
|
| 545 |
+
" <th>completeness_score</th>\n",
|
| 546 |
+
" <th>factual_accuracy_score</th>\n",
|
| 547 |
+
" </tr>\n",
|
| 548 |
+
" </thead>\n",
|
| 549 |
+
" <tbody>\n",
|
| 550 |
+
" <tr>\n",
|
| 551 |
+
" <th>112</th>\n",
|
| 552 |
+
" <td>112</td>\n",
|
| 553 |
+
" <td>На каком уровне оказалась цена нефти марки Ura...</td>\n",
|
| 554 |
+
" <td>Да, цена российской нефти марки Urals в Новоро...</td>\n",
|
| 555 |
+
" <td>True</td>\n",
|
| 556 |
+
" <td>1.0</td>\n",
|
| 557 |
+
" <td>0.5</td>\n",
|
| 558 |
+
" <td>1.0</td>\n",
|
| 559 |
+
" </tr>\n",
|
| 560 |
+
" <tr>\n",
|
| 561 |
+
" <th>37</th>\n",
|
| 562 |
+
" <td>37</td>\n",
|
| 563 |
+
" <td>Согласно официальному сообщению Росавиации, ка...</td>\n",
|
| 564 |
+
" <td>В аэропорту Ярославля введены временные ограни...</td>\n",
|
| 565 |
+
" <td>True</td>\n",
|
| 566 |
+
" <td>1.0</td>\n",
|
| 567 |
+
" <td>1.0</td>\n",
|
| 568 |
+
" <td>1.0</td>\n",
|
| 569 |
+
" </tr>\n",
|
| 570 |
+
" <tr>\n",
|
| 571 |
+
" <th>140</th>\n",
|
| 572 |
+
" <td>140</td>\n",
|
| 573 |
+
" <td>По какому инциденту Следственный комитет Росси...</td>\n",
|
| 574 |
+
" <td>Следственный комитет завел уголовное дело по ф...</td>\n",
|
| 575 |
+
" <td>True</td>\n",
|
| 576 |
+
" <td>1.0</td>\n",
|
| 577 |
+
" <td>1.0</td>\n",
|
| 578 |
+
" <td>1.0</td>\n",
|
| 579 |
+
" </tr>\n",
|
| 580 |
+
" <tr>\n",
|
| 581 |
+
" <th>0</th>\n",
|
| 582 |
+
" <td>0</td>\n",
|
| 583 |
+
" <td>Какое решение приняло итальянское судопроизвод...</td>\n",
|
| 584 |
+
" <td>Не знаю.</td>\n",
|
| 585 |
+
" <td>False</td>\n",
|
| 586 |
+
" <td>0.5</td>\n",
|
| 587 |
+
" <td>0.0</td>\n",
|
| 588 |
+
" <td>0.0</td>\n",
|
| 589 |
+
" </tr>\n",
|
| 590 |
+
" <tr>\n",
|
| 591 |
+
" <th>98</th>\n",
|
| 592 |
+
" <td>98</td>\n",
|
| 593 |
+
" <td>Что произошло на фестивале в Пенсильвании, США...</td>\n",
|
| 594 |
+
" <td>На фестивале в Пенсильвании минивэн въехал в т...</td>\n",
|
| 595 |
+
" <td>True</td>\n",
|
| 596 |
+
" <td>1.0</td>\n",
|
| 597 |
+
" <td>1.0</td>\n",
|
| 598 |
+
" <td>1.0</td>\n",
|
| 599 |
+
" </tr>\n",
|
| 600 |
+
" </tbody>\n",
|
| 601 |
+
"</table>\n",
|
| 602 |
+
"</div>"
|
| 603 |
+
],
|
| 604 |
+
"text/plain": [
|
| 605 |
+
" index question \\\n",
|
| 606 |
+
"112 112 На каком уровне оказалась цена нефти марки Ura... \n",
|
| 607 |
+
"37 37 Согласно официальному сообщению Росавиации, ка... \n",
|
| 608 |
+
"140 140 По какому инциденту Следственный комитет Росси... \n",
|
| 609 |
+
"0 0 Какое решение приняло итальянское судопроизвод... \n",
|
| 610 |
+
"98 98 Что произошло на фестивале в Пенсильвании, США... \n",
|
| 611 |
+
"\n",
|
| 612 |
+
" answer is_valid \\\n",
|
| 613 |
+
"112 Да, цена российской нефти марки Urals в Новоро... True \n",
|
| 614 |
+
"37 В аэропорту Ярославля введены временные ограни... True \n",
|
| 615 |
+
"140 Следственный комитет завел уголовное дело по ф... True \n",
|
| 616 |
+
"0 Не знаю. False \n",
|
| 617 |
+
"98 На фестивале в Пенсильвании минивэн въехал в т... True \n",
|
| 618 |
+
"\n",
|
| 619 |
+
" relevance_score completeness_score factual_accuracy_score \n",
|
| 620 |
+
"112 1.0 0.5 1.0 \n",
|
| 621 |
+
"37 1.0 1.0 1.0 \n",
|
| 622 |
+
"140 1.0 1.0 1.0 \n",
|
| 623 |
+
"0 0.5 0.0 0.0 \n",
|
| 624 |
+
"98 1.0 1.0 1.0 "
|
| 625 |
+
]
|
| 626 |
+
},
|
| 627 |
+
"execution_count": 32,
|
| 628 |
+
"metadata": {},
|
| 629 |
+
"output_type": "execute_result"
|
| 630 |
+
}
|
| 631 |
+
],
|
| 632 |
+
"source": [
|
| 633 |
+
"pd.DataFrame(metrics[\"detailed_results\"]).sample(5)"
|
| 634 |
+
]
|
| 635 |
+
},
|
| 636 |
+
{
|
| 637 |
+
"cell_type": "code",
|
| 638 |
+
"execution_count": null,
|
| 639 |
+
"id": "731e781a",
|
| 640 |
+
"metadata": {},
|
| 641 |
+
"outputs": [],
|
| 642 |
+
"source": []
|
| 643 |
+
},
|
| 644 |
+
{
|
| 645 |
+
"cell_type": "code",
|
| 646 |
+
"execution_count": null,
|
| 647 |
+
"id": "76ed2ec5",
|
| 648 |
+
"metadata": {},
|
| 649 |
+
"outputs": [],
|
| 650 |
+
"source": []
|
| 651 |
+
}
|
| 652 |
+
],
|
| 653 |
+
"metadata": {
|
| 654 |
+
"kernelspec": {
|
| 655 |
+
"display_name": "Python 3 (ipykernel)",
|
| 656 |
+
"language": "python",
|
| 657 |
+
"name": "python3"
|
| 658 |
+
},
|
| 659 |
+
"language_info": {
|
| 660 |
+
"codemirror_mode": {
|
| 661 |
+
"name": "ipython",
|
| 662 |
+
"version": 3
|
| 663 |
+
},
|
| 664 |
+
"file_extension": ".py",
|
| 665 |
+
"mimetype": "text/x-python",
|
| 666 |
+
"name": "python",
|
| 667 |
+
"nbconvert_exporter": "python",
|
| 668 |
+
"pygments_lexer": "ipython3",
|
| 669 |
+
"version": "3.12.12"
|
| 670 |
+
},
|
| 671 |
+
"toc": {
|
| 672 |
+
"base_numbering": 1,
|
| 673 |
+
"nav_menu": {},
|
| 674 |
+
"number_sections": true,
|
| 675 |
+
"sideBar": true,
|
| 676 |
+
"skip_h1_title": false,
|
| 677 |
+
"title_cell": "Table of Contents",
|
| 678 |
+
"title_sidebar": "Contents",
|
| 679 |
+
"toc_cell": false,
|
| 680 |
+
"toc_position": {},
|
| 681 |
+
"toc_section_display": true,
|
| 682 |
+
"toc_window_display": false
|
| 683 |
+
}
|
| 684 |
+
},
|
| 685 |
+
"nbformat": 4,
|
| 686 |
+
"nbformat_minor": 5
|
| 687 |
+
}
|
src/parser/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (151 Bytes). View file
|
|
|
src/parser/__pycache__/pyrosource.cpython-313.pyc
ADDED
|
Binary file (2.48 kB). View file
|
|
|
src/rag/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .rag import RAG
|
src/rag/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (182 Bytes). View file
|
|
|
src/rag/__pycache__/llm.cpython-313.pyc
ADDED
|
Binary file (545 Bytes). View file
|
|
|
src/rag/__pycache__/question_enricher.cpython-313.pyc
ADDED
|
Binary file (5.77 kB). View file
|
|
|
src/rag/__pycache__/rag.cpython-313.pyc
ADDED
|
Binary file (4.53 kB). View file
|
|
|
src/rag/__pycache__/retriever.cpython-313.pyc
ADDED
|
Binary file (1.8 kB). View file
|
|
|
src/rag/llm.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_openai import ChatOpenAI
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def get_model(api_key: str, model: str):
    """Create a ChatOpenAI client that talks to the proxyapi.ru OpenRouter endpoint.

    Args:
        api_key: API key passed to the client as ``openai_api_key``.
        model: Model identifier passed to the client as ``model``.

    Returns:
        A ``ChatOpenAI`` instance with ``max_retries=2`` and ``verbose``
        switched off.
    """
    client_kwargs = {
        "model": model,
        "max_retries": 2,
        "openai_api_key": api_key,
        "openai_api_base": "https://api.proxyapi.ru/openrouter/v1",
    }
    chat_model = ChatOpenAI(**client_kwargs)
    # Verbosity is disabled on the instance after construction, as before.
    chat_model.verbose = False
    return chat_model
|
src/rag/question_enricher.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Question Enricher Agent
|
| 3 |
+
Обогащает вопрос пользователя контекстом из истории диалога
|
| 4 |
+
Заменяет местоимения и ссылки на конкретные сущности
|
| 5 |
+
"""
|
| 6 |
+
from typing import List, Dict, Optional
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 9 |
+
from langchain_core.output_parsers import JsonOutputParser
|
| 10 |
+
from langchain_core.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate
|
| 11 |
+
from langchain_core.exceptions import OutputParserException
|
| 12 |
+
from pydantic import ValidationError
|
| 13 |
+
|
| 14 |
+
from src.rag.llm import get_model
|
| 15 |
+
from src.config import LLM_API_KEY, LLM
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class EnrichedQuestion(BaseModel):
    """Schema of the JSON object the enrichment LLM is asked to produce."""

    # Required, non-empty rewritten question; the description text is part of
    # the schema handed to the output parser.
    enriched_question: str = Field(
        ...,
        min_length=1,
        description="Обогащенный вопрос с заменой местоимений и добавлением контекста",
    )
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class QuestionEnricher:
    """Agent that rewrites a user question using the dialogue history.

    Pronouns (he, she, it, there, ...) and incomplete references are replaced
    with the concrete entities mentioned earlier in the conversation, so the
    question can be understood without the history.
    """

    def __init__(self):
        """Build the LLM client, the JSON output parser and the prompt chain."""
        self.llm = get_model(LLM_API_KEY, LLM)
        self.parser = JsonOutputParser(pydantic_object=EnrichedQuestion)

        self.prompt = ChatPromptTemplate.from_messages([
            # The task instructions are sent with the system role, matching
            # how the RAG answer prompt passes its instructions.
            SystemMessagePromptTemplate.from_template(
                "Ты помощник, который обогащает вопросы пользователя контекстом из истории диалога.\n"
                "Твоя задача:\n"
                "1. Заменить местоимения (он, она, оно, они, это, то, там, тогда и т.д.) на конкретные сущности из истории общения с пользователем\n"
                "2. Дополнить неполные вопросы (например, 'А вчера?' -> 'Какой был курс доллара вчера?')\n"
                "3. Сделать вопрос самодостаточным и понятным без контекста истории\n"
                "4. Сохранить смысл и намерение пользователя\n\n"
                "Если вопрос уже полный и не требует обогащения, верни его без изменений.\n"
                "Если не получается понять, как правильно обогатить какую-то часть вопроса, то ее следует оставить неизмененной."
            ),
            HumanMessagePromptTemplate.from_template(
                "{format_instructions}\n\n"
                "История диалога:\n{history}\n\n"
                "Новый вопрос пользователя: {question}\n\n"
                "Обогати вопрос контекстом из истории."
            ),
        ])

        self.chain = self.prompt | self.llm | self.parser

    def _format_history(self, history: List[Dict]) -> str:
        """Render the chat history as numbered question/answer pairs.

        Args:
            history: Previous turns; each item is read through ``.get`` for
                the ``query`` and ``answer`` keys (missing keys become "").

        Returns:
            The formatted history, or a fixed placeholder sentence when the
            history is empty.
        """
        if not history:
            return "История диалога пуста."

        turns = [
            f"[{i}] Пользователь: {msg.get('query', '')}\n"
            f" Ответ: {msg.get('answer', '')}\n\n"
            for i, msg in enumerate(history, 1)
        ]
        return "".join(turns).strip()

    def enrich(self, question: str, history: Optional[List[Dict]] = None) -> str:
        """Enrich a question with context from the dialogue history.

        Enrichment is best-effort: whenever it cannot produce a usable result
        the original question is returned unchanged.

        Args:
            question: Original user question.
            history: Previous messages, ``[{"query": "...", "answer": "..."}, ...]``.

        Returns:
            The enriched question as a string, or ``question`` itself when
            there is no history, the chain fails, or the chain yields an
            empty / non-string ``enriched_question``.
        """
        # Nothing to resolve references against: skip the LLM call entirely.
        if not history:
            return question

        try:
            result = self.chain.invoke({
                "history": self._format_history(history),
                "question": question,
                "format_instructions": self.parser.get_format_instructions(),
            })
            enriched = result.get("enriched_question")
        except Exception:
            # Deliberate catch-all: a failed enrichment must not break the
            # caller, but the failure is logged instead of silently dropped.
            import logging

            logging.getLogger(__name__).warning(
                "Question enrichment failed; falling back to the original question",
                exc_info=True,
            )
            return question

        # Guard against a missing, empty or non-string value so callers never
        # receive a blank query.
        if isinstance(enriched, str) and enriched.strip():
            return enriched
        return question
|
src/rag/rag.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
from typing import List, Dict, Optional
|
| 3 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 4 |
+
from langchain_core.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate
|
| 5 |
+
from langchain_core.output_parsers import JsonOutputParser
|
| 6 |
+
from langchain_core.runnables import RunnablePassthrough
|
| 7 |
+
from langchain_core.exceptions import OutputParserException
|
| 8 |
+
from pydantic import ValidationError
|
| 9 |
+
|
| 10 |
+
from src.rag.retriever import Retriever
|
| 11 |
+
from src.rag.llm import get_model
|
| 12 |
+
from src.rag.question_enricher import QuestionEnricher
|
| 13 |
+
from src.config import LLM_API_KEY, LLM, CHAT_HISTORY_LENGTH, ENABLE_QUESTION_ENRICHMENT
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class LLMResponse(BaseModel):
    # Shape of the JSON reply requested from the LLM; RAG hands this model to
    # JsonOutputParser as ``pydantic_object``.
    # NOTE: documented with comments rather than a class docstring on purpose,
    # since pydantic would copy a docstring into the generated JSON schema.

    # Direct answer to the user's question (must be non-empty).
    answer: str = Field(
        ...,
        min_length=1,
        description="Прямой точный ответ на вопрос",
    )
    # Justification for the answer (must be non-empty).
    reason: str = Field(
        ...,
        min_length=1,
        description="Объяснение, почему ответ именно такой",
    )
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class RAG:
    """
    Retrieval-augmented question answering.

    Combines a ``Retriever`` (context lookup), an LLM obtained from
    ``get_model`` and a ``JsonOutputParser`` configured with ``LLMResponse``.
    When enrichment is enabled, the question is first rewritten by
    ``QuestionEnricher`` using the most recent chat history.
    """

    def __init__(self, embed_model_name: str, embed_index_name: str):
        """
        Args:
            embed_model_name: Embedding model name, forwarded to ``Retriever``.
            embed_index_name: Index name, forwarded to ``Retriever``.
        """
        self.retriever = Retriever(embed_model_name, embed_index_name)
        self.parser = JsonOutputParser(pydantic_object=LLMResponse)
        self.llm = get_model(LLM_API_KEY, LLM)
        self.history_length = CHAT_HISTORY_LENGTH
        self.enable_enrichment = ENABLE_QUESTION_ENRICHMENT

        self.prompt = ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(
                "Ты полезный и точный ассистент. "
                "Ответь на вопрос, опираясь ТОЛЬКО на предложенный контекст. "
                "Если в контексте нет ответа, ответь \"Не знаю.\""
            ),
            HumanMessagePromptTemplate.from_template(
                "{format_instructions}\n\n"
                "Контекст:\n{context}\n\n"
                "Вопрос: {question}"
            ),
        ])

        # The enricher is only constructed when the feature is switched on.
        self.question_enricher = QuestionEnricher() if self.enable_enrichment else None

    def _recent_history(self, history: Optional[List[Dict]]) -> List[Dict]:
        """Return at most ``self.history_length`` trailing messages of ``history``."""
        # Guard non-positive lengths explicitly: ``history[-0:]`` is the WHOLE
        # list, so a configured length of 0 would otherwise keep everything.
        if not history or self.history_length <= 0:
            return []
        return history[-self.history_length:]

    def invoke(self, query: str, history: Optional[List[Dict]] = None):
        """
        Answer a question with RAG, optionally using chat history.

        Args:
            query: User question.
            history: List of previous messages [{"query": "...", "answer": "..."}, ...].

        Returns:
            The output of the prompt | llm | parser chain on success. If the
            model output cannot be parsed, a fallback ``LLMResponse`` with the
            answer "Не знаю." is returned instead.
        """
        try:
            enriched_query = query

            # Rewrite the question only when the feature is on and there is
            # recent history to resolve references against.
            if self.enable_enrichment and self.question_enricher:
                recent_history = self._recent_history(history)
                if recent_history:
                    enriched_query = self.question_enricher.enrich(query, recent_history)

            # Retrieval and generation both use the (possibly) enriched query.
            context = self.retriever.chain.invoke(enriched_query)

            chain = self.prompt | self.llm | self.parser
            return chain.invoke({
                "context": context,
                "question": enriched_query,
                "format_instructions": self.parser.get_format_instructions(),
            })

        except (OutputParserException, ValidationError):
            # NOTE(review): this fallback is an LLMResponse instance, while the
            # success path returns whatever the JSON parser produced (presumably
            # a plain dict) - confirm callers handle both shapes.
            return LLMResponse(
                answer="Не знаю.",
                reason="Модель не смогла вернуть ответ в корректном формате."
            )
|
| 90 |
+
|
src/rag/retriever.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_core.runnables import RunnableLambda
|
| 2 |
+
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
|
| 3 |
+
|
| 4 |
+
from src.db_utils.qdrant_utils import qdrant_search
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class Retriever:
    """
    Turns a text query into a numbered context string.

    The query is embedded with a HuggingFace embedding model and the vector is
    passed to the project's ``qdrant_search`` helper together with the index
    name. ``chain`` exposes the lookup as a LangChain runnable.
    """

    def __init__(self, embed_model_name: str, embed_index_name: str):
        """
        Args:
            embed_model_name: Model name handed to ``HuggingFaceEmbeddings``.
            embed_index_name: Index name handed to ``qdrant_search``.
        """
        self.embed_index_name = embed_index_name
        self.embed_model = HuggingFaceEmbeddings(
            model_name=embed_model_name,
            encode_kwargs={"normalize_embeddings": True},
        )

        # Wrap the lookup so it can be invoked like any other runnable.
        self.chain = RunnableLambda(self._retrieve)

    def _retrieve(self, query: str) -> str:
        """Return the hits' ``text`` payloads as "1) ...", "2) ..." lines."""
        query_vector = self.embed_model.embed_query(query)
        hits = qdrant_search(self.embed_index_name, query_vector)

        numbered = []
        for position, hit in enumerate(hits.points, start=1):
            numbered.append(f"{position}) {hit.payload['text']}")
        return "\n".join(numbered)
|