Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,9 @@ import threading
|
|
| 8 |
import queue
|
| 9 |
import json
|
| 10 |
import os
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
app = Flask(__name__)
|
| 13 |
app.secret_key = 'super_secret_key'
|
|
@@ -18,6 +21,7 @@ message_queue = queue.Queue()
|
|
| 18 |
MISTRAL_MODEL = "mistral-large-latest"
|
| 19 |
N_CTX = 32768
|
| 20 |
MAX_RESULTS = 5
|
|
|
|
| 21 |
|
| 22 |
# Новый клиент Mistral
|
| 23 |
mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
|
|
@@ -36,6 +40,9 @@ SYSTEM_PROMPT = """
|
|
| 36 |
8. Указывай артикулы деталей при замене
|
| 37 |
9. Отвечай ТОЛЬКО на русском языке
|
| 38 |
10. Всегда проверяй точность кодов ошибок
|
|
|
|
|
|
|
|
|
|
| 39 |
"""
|
| 40 |
|
| 41 |
BLACKLISTED_DOMAINS = [
|
|
@@ -45,7 +52,19 @@ BLACKLISTED_DOMAINS = [
|
|
| 45 |
'facebook.com',
|
| 46 |
'youtube.com',
|
| 47 |
'x.com',
|
| 48 |
-
'twitter.com'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
]
|
| 50 |
|
| 51 |
logging.basicConfig(
|
|
@@ -57,8 +76,49 @@ logging.basicConfig(
|
|
| 57 |
]
|
| 58 |
)
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
def generate_search_query(prompt: str) -> dict:
|
| 61 |
-
# Создаем расширенный системный промт с требованием англоязычного вывода
|
| 62 |
system_prompt = """
|
| 63 |
You are a technical expert. Extract structured data from the user's query and generate an English search query.
|
| 64 |
Return data in strict JSON format with these fields:
|
|
@@ -74,48 +134,9 @@ def generate_search_query(prompt: str) -> dict:
|
|
| 74 |
3. Remove brand mentions and the word "error" from model name
|
| 75 |
4. If error code is specified - include it in search_query
|
| 76 |
5. Problem description should be concise technical terms (max 7 words)
|
| 77 |
-
|
| 78 |
-
Examples:
|
| 79 |
-
|
| 80 |
-
Query: "Коника Минольта bizhub 368 выдает ошибку C-2557"
|
| 81 |
-
Response: {
|
| 82 |
-
"brand": "Konica Minolta",
|
| 83 |
-
"model": "bizhub 368",
|
| 84 |
-
"error_code": "C-2557",
|
| 85 |
-
"problem_description": "C-2557 error code",
|
| 86 |
-
"search_query": "Konica Minolta bizhub 368 error code C-2557"
|
| 87 |
-
}
|
| 88 |
-
|
| 89 |
-
Query: "Не печатает HP LaserJet Pro M404dn"
|
| 90 |
-
Response: {
|
| 91 |
-
"brand": "HP",
|
| 92 |
-
"model": "LaserJet Pro M404dn",
|
| 93 |
-
"error_code": "",
|
| 94 |
-
"problem_description": "printer not printing",
|
| 95 |
-
"search_query": "HP LaserJet Pro M404dn not printing technical fix"
|
| 96 |
-
}
|
| 97 |
-
|
| 98 |
-
Query: "Hewlett-Packard Color LaserJet MFP E77825 Ошибка 63.00.41"
|
| 99 |
-
Response: {
|
| 100 |
-
"brand": "HP",
|
| 101 |
-
"model": "Color LaserJet MFP E77825",
|
| 102 |
-
"error_code": "63.00.41",
|
| 103 |
-
"problem_description": "error 63.00.41",
|
| 104 |
-
"search_query": "HP Color LaserJet MFP E77825 error code 63.00.41"
|
| 105 |
-
}
|
| 106 |
-
|
| 107 |
-
Query: "Canon imagePROGRAF TM-200 Ошибка EC12-2F2B"
|
| 108 |
-
Response: {
|
| 109 |
-
"brand": "Canon",
|
| 110 |
-
"model": "imagePROGRAF TM-200",
|
| 111 |
-
"error_code": "EC12-2F2B",
|
| 112 |
-
"problem_description": "EC12-2F2B error",
|
| 113 |
-
"search_query": "Canon imagePROGRAF TM-200 error code EC12-2F2B"
|
| 114 |
-
}
|
| 115 |
"""
|
| 116 |
|
| 117 |
try:
|
| 118 |
-
# Запрашиваем структурированные данные у Mistral
|
| 119 |
response = mistral_client.chat.complete(
|
| 120 |
model=MISTRAL_MODEL,
|
| 121 |
messages=[
|
|
@@ -127,16 +148,13 @@ def generate_search_query(prompt: str) -> dict:
|
|
| 127 |
response_format={"type": "json_object"}
|
| 128 |
)
|
| 129 |
|
| 130 |
-
# Парсим JSON ответ
|
| 131 |
json_data = json.loads(response.choices[0].message.content)
|
| 132 |
|
| 133 |
-
# Простая валидация и нормализация полей
|
| 134 |
required_fields = ['brand', 'model', 'error_code', 'problem_description', 'search_query']
|
| 135 |
for field in required_fields:
|
| 136 |
if field not in json_data:
|
| 137 |
json_data[field] = ""
|
| 138 |
|
| 139 |
-
# Очистка модели от бренда
|
| 140 |
if json_data['brand'] and json_data['model']:
|
| 141 |
json_data['model'] = re.sub(
|
| 142 |
re.escape(json_data['brand']),
|
|
@@ -145,9 +163,7 @@ def generate_search_query(prompt: str) -> dict:
|
|
| 145 |
flags=re.IGNORECASE
|
| 146 |
).strip()
|
| 147 |
|
| 148 |
-
# Обеспечиваем англоязычный поисковый запрос
|
| 149 |
if not json_data['search_query']:
|
| 150 |
-
# Fallback генерация поискового запроса
|
| 151 |
search_parts = [json_data['brand'], json_data['model']]
|
| 152 |
if json_data['error_code']:
|
| 153 |
search_parts.append(f"error {json_data['error_code']}")
|
|
@@ -158,16 +174,14 @@ def generate_search_query(prompt: str) -> dict:
|
|
| 158 |
return json_data
|
| 159 |
|
| 160 |
except Exception as e:
|
| 161 |
-
# Простой fallback на случай ошибки
|
| 162 |
error_msg = f"❌ Ошибка извлечения данных: {str(e)}"
|
| 163 |
message_queue.put(('log', error_msg))
|
| 164 |
-
|
| 165 |
return {
|
| 166 |
'brand': "",
|
| 167 |
'model': "",
|
| 168 |
'error_code': "",
|
| 169 |
'problem_description': "",
|
| 170 |
-
'search_query': prompt
|
| 171 |
}
|
| 172 |
|
| 173 |
def web_search(query: str) -> tuple:
|
|
@@ -191,7 +205,9 @@ def web_search(query: str) -> tuple:
|
|
| 191 |
|
| 192 |
combined_content = ""
|
| 193 |
sources = []
|
|
|
|
| 194 |
|
|
|
|
| 195 |
featured_snippet = data.get("featured_snippet", {})
|
| 196 |
if featured_snippet:
|
| 197 |
snippet = featured_snippet.get("snippet", "")
|
|
@@ -199,18 +215,13 @@ def web_search(query: str) -> tuple:
|
|
| 199 |
combined_content += f"[Автоответ Google]\n{snippet}\n\n"
|
| 200 |
sources.insert(0, {
|
| 201 |
"title": "Google — автоматический ответ",
|
| 202 |
-
"url": f"https://www.google.com/search?q={requests.utils.quote(query)}"
|
|
|
|
| 203 |
})
|
| 204 |
|
| 205 |
-
|
| 206 |
-
if knowledge_panel:
|
| 207 |
-
title = knowledge_panel.get("title", "")
|
| 208 |
-
description = knowledge_panel.get("description", "")
|
| 209 |
-
if description:
|
| 210 |
-
combined_content += f"[Knowledge Panel] {title}\n{description}\n\n"
|
| 211 |
-
|
| 212 |
organic_results = data.get("organic_results", [])
|
| 213 |
-
for i, res in enumerate(organic_results):
|
| 214 |
title = res.get("title", "Без заголовка")
|
| 215 |
link = res.get("link", "#")
|
| 216 |
snippet = res.get("snippet", "") or ""
|
|
@@ -218,16 +229,35 @@ def web_search(query: str) -> tuple:
|
|
| 218 |
if any(domain in link for domain in BLACKLISTED_DOMAINS):
|
| 219 |
continue
|
| 220 |
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
elapsed = time.time() - start_time
|
| 229 |
-
message_queue.put(('log', f"✅ Поиск был произведен за {elapsed:.2f}с"))
|
| 230 |
-
return combined_content[:
|
| 231 |
|
| 232 |
except Exception as e:
|
| 233 |
error_msg = f"❌ SerpAPI ошибка: {str(e)}"
|
|
@@ -235,26 +265,31 @@ def web_search(query: str) -> tuple:
|
|
| 235 |
return f"Поиск недоступен: {str(e)}", []
|
| 236 |
|
| 237 |
|
| 238 |
-
def clean_response(response: str) -> str:
|
| 239 |
-
# Удал
|
| 240 |
response = re.sub(r'</?assistant>|<\|system\|>|</s>', '', response, flags=re.IGNORECASE)
|
| 241 |
|
| 242 |
-
# Удал
|
| 243 |
response = re.sub(r'(\*\*Проблема:\*\*.+?)(\*\*Проблема:\*\*)', r'\1', response, flags=re.DOTALL)
|
| 244 |
response = re.sub(r'(\*\*Решение:\*\*.+?)(\*\*Решение:\*\*)', r'\1', response, flags=re.DOTALL)
|
| 245 |
-
response = re.sub(r'(\*\*Источники:\*\*.+?)(\*\*Источники:\*\*)', r'\1', response, flags=re.DOTALL)
|
| 246 |
|
| 247 |
-
# Удал
|
| 248 |
response = re.sub(r'\n\s*\n', '\n\n', response)
|
| 249 |
response = re.sub(r'[ \t]{2,}', ' ', response)
|
| 250 |
|
| 251 |
-
# Удал
|
| 252 |
-
response = re.sub(r'\s*\[Решение\]\s*', '', response)
|
| 253 |
-
|
| 254 |
-
# Удаляем звездочки в конце
|
| 255 |
response = re.sub(r'\*\*Источники:\*\*\s*$', '', response)
|
| 256 |
|
| 257 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
last_dot = response.rfind('.')
|
| 259 |
if last_dot != -1:
|
| 260 |
response = response[:last_dot + 1]
|
|
@@ -262,6 +297,52 @@ def clean_response(response: str) -> str:
|
|
| 262 |
return response.strip()
|
| 263 |
|
| 264 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
def process_query(prompt: str):
|
| 266 |
try:
|
| 267 |
start_time = time.time()
|
|
@@ -271,7 +352,6 @@ def process_query(prompt: str):
|
|
| 271 |
norm_data = generate_search_query(prompt)
|
| 272 |
message_queue.put(('log', f"⏏️ Извлечено: {json.dumps(norm_data, ensure_ascii=False)}"))
|
| 273 |
|
| 274 |
-
# Используем англоязычный поисковый запрос
|
| 275 |
search_query = norm_data['search_query']
|
| 276 |
search_data, sources = web_search(search_query)
|
| 277 |
|
|
@@ -305,8 +385,6 @@ def process_query(prompt: str):
|
|
| 305 |
Суть проблемы (на основе поиска): {extracted_problem}
|
| 306 |
Данные поиска:
|
| 307 |
{search_data}
|
| 308 |
-
Список источников (для справки, не включай в ответ):
|
| 309 |
-
{sources_text}
|
| 310 |
"""},
|
| 311 |
{"role": "user", "content": f"Проблема: {prompt}"}
|
| 312 |
]
|
|
@@ -318,7 +396,7 @@ def process_query(prompt: str):
|
|
| 318 |
for chunk in mistral_client.chat.stream(
|
| 319 |
model=MISTRAL_MODEL,
|
| 320 |
messages=messages,
|
| 321 |
-
max_tokens=
|
| 322 |
temperature=0.3
|
| 323 |
):
|
| 324 |
if chunk.data.choices[0].delta.content is not None:
|
|
@@ -326,27 +404,17 @@ def process_query(prompt: str):
|
|
| 326 |
full_response += chunk_text
|
| 327 |
message_queue.put(('response_chunk', chunk_text))
|
| 328 |
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
if "замените" in search_data.lower() or "replace" in search_data.lower():
|
| 335 |
-
critical_steps.append("При необходимости замените неисправные компоненты")
|
| 336 |
-
|
| 337 |
-
if critical_steps:
|
| 338 |
-
solution_section = re.search(r'\*\*Решение:\*\*(.+?)(\*\*Источники:\*\*|$)',
|
| 339 |
-
final_response,
|
| 340 |
-
flags=re.DOTALL)
|
| 341 |
-
if solution_section:
|
| 342 |
-
updated_solution = solution_section.group(1) + "\n" + "\n".join(critical_steps)
|
| 343 |
-
final_response = final_response.replace(solution_section.group(1), updated_solution)
|
| 344 |
|
| 345 |
message_queue.put(('response_end', final_response))
|
| 346 |
message_queue.put(('sources', json.dumps(sources)))
|
| 347 |
|
| 348 |
total_time = time.time() - start_time
|
| 349 |
-
message_queue.put(('log', f"💡 Ответ: {final_response}"))
|
| 350 |
message_queue.put(('log', f"⏱ Время: {total_time:.1f}с"))
|
| 351 |
message_queue.put(('done', ''))
|
| 352 |
|
|
|
|
| 8 |
import queue
|
| 9 |
import json
|
| 10 |
import os
|
| 11 |
+
import trafilatura
|
| 12 |
+
from bs4 import BeautifulSoup
|
| 13 |
+
import random
|
| 14 |
|
| 15 |
app = Flask(__name__)
# The session signing key must not live in source control. Read it from the
# FLASK_SECRET_KEY environment variable; the previous literal is kept only as
# a fallback so existing deployments keep working unchanged.
# NOTE(review): set FLASK_SECRET_KEY to a long random value in production.
app.secret_key = os.getenv('FLASK_SECRET_KEY', 'super_secret_key')
|
|
|
|
| 21 |
MISTRAL_MODEL = "mistral-large-latest"
|
| 22 |
N_CTX = 32768
|
| 23 |
MAX_RESULTS = 5
|
| 24 |
+
MAX_CONTENT_LENGTH = 10000 # Максимальная длина контента на источник
|
| 25 |
|
| 26 |
# Новый клиент Mistral
|
| 27 |
mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
|
|
|
|
| 40 |
8. Указывай артикулы деталей при замене
|
| 41 |
9. Отвечай ТОЛЬКО на русском языке
|
| 42 |
10. Всегда проверяй точность кодов ошибок
|
| 43 |
+
11. Основывайся ТОЛЬКО на предоставленных источниках
|
| 44 |
+
12. Для замены деталей указывай точные артикулы из источников
|
| 45 |
+
13. Если в источниках противоречия - укажи это в решении
|
| 46 |
"""
|
| 47 |
|
| 48 |
BLACKLISTED_DOMAINS = [
|
|
|
|
| 52 |
'facebook.com',
|
| 53 |
'youtube.com',
|
| 54 |
'x.com',
|
| 55 |
+
'twitter.com',
|
| 56 |
+
'tiktok.com',
|
| 57 |
+
'instagram.com'
|
| 58 |
+
]
|
| 59 |
+
|
| 60 |
+
USER_AGENTS = [
|
| 61 |
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
|
| 62 |
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
|
| 63 |
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
|
| 64 |
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
|
| 65 |
+
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
|
| 66 |
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
|
| 67 |
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15"
|
| 68 |
]
|
| 69 |
|
| 70 |
logging.basicConfig(
|
|
|
|
| 76 |
]
|
| 77 |
)
|
| 78 |
|
| 79 |
+
def get_random_headers():
    """Build a browser-like set of HTTP request headers.

    The User-Agent is drawn at random from the module-level USER_AGENTS list
    on every call; all other header values are fixed.

    Returns:
        dict: Header name mapped to header value.
    """
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    headers.update({
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://www.google.com/',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    })
    return headers
|
| 89 |
+
|
| 90 |
+
def extract_main_content(html, url):
    """Extract the main readable text of an HTML page.

    trafilatura is tried first. If it raises, or yields 500 characters or
    fewer, a BeautifulSoup heuristic is used instead: boilerplate tags are
    dropped, then the text is taken from the first of ``<main>``,
    ``<article>``, a ``<div>`` whose class matches
    content/main/article/post, ``<body>``, or the whole parsed document.

    Args:
        html: Page markup as a string.
        url: Address of the page. Currently unused by the extraction logic;
            kept for interface compatibility with callers.

    Returns:
        The extracted text truncated to MAX_CONTENT_LENGTH characters, or
        None when nothing usable could be extracted.
    """
    # Nothing to parse: skip both extractors instead of logging a parser error.
    if not html:
        return None

    # First attempt: trafilatura.
    try:
        content = trafilatura.extract(html, include_links=False, include_tables=False)
    except Exception as e:
        logging.error(f"Trafilatura error: {str(e)}")
    else:
        # Very short results are treated as failed extractions.
        if content and len(content) > 500:
            return content[:MAX_CONTENT_LENGTH]

    # Fallback: BeautifulSoup.
    try:
        soup = BeautifulSoup(html, 'html.parser')

        # Drop elements that never carry the main article text.
        for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form']):
            element.decompose()

        # Try to locate the main content container.
        container = (
            soup.find('main')
            or soup.find('article')
            or soup.find('div', class_=re.compile('content|main|article|post', re.I))
        )

        if container is None:
            # html.parser does not synthesize <body> for fragments, so
            # soup.body can be None; use the whole document in that case
            # rather than failing with AttributeError.
            container = soup.body if soup.body is not None else soup

        text = container.get_text(separator='\n', strip=True)
        return text[:MAX_CONTENT_LENGTH] if text else None
    except Exception as e:
        logging.error(f"BeautifulSoup error: {str(e)}")
        return None
|
| 120 |
+
|
| 121 |
def generate_search_query(prompt: str) -> dict:
|
|
|
|
| 122 |
system_prompt = """
|
| 123 |
You are a technical expert. Extract structured data from the user's query and generate an English search query.
|
| 124 |
Return data in strict JSON format with these fields:
|
|
|
|
| 134 |
3. Remove brand mentions and the word "error" from model name
|
| 135 |
4. If error code is specified - include it in search_query
|
| 136 |
5. Problem description should be concise technical terms (max 7 words)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
"""
|
| 138 |
|
| 139 |
try:
|
|
|
|
| 140 |
response = mistral_client.chat.complete(
|
| 141 |
model=MISTRAL_MODEL,
|
| 142 |
messages=[
|
|
|
|
| 148 |
response_format={"type": "json_object"}
|
| 149 |
)
|
| 150 |
|
|
|
|
| 151 |
json_data = json.loads(response.choices[0].message.content)
|
| 152 |
|
|
|
|
| 153 |
required_fields = ['brand', 'model', 'error_code', 'problem_description', 'search_query']
|
| 154 |
for field in required_fields:
|
| 155 |
if field not in json_data:
|
| 156 |
json_data[field] = ""
|
| 157 |
|
|
|
|
| 158 |
if json_data['brand'] and json_data['model']:
|
| 159 |
json_data['model'] = re.sub(
|
| 160 |
re.escape(json_data['brand']),
|
|
|
|
| 163 |
flags=re.IGNORECASE
|
| 164 |
).strip()
|
| 165 |
|
|
|
|
| 166 |
if not json_data['search_query']:
|
|
|
|
| 167 |
search_parts = [json_data['brand'], json_data['model']]
|
| 168 |
if json_data['error_code']:
|
| 169 |
search_parts.append(f"error {json_data['error_code']}")
|
|
|
|
| 174 |
return json_data
|
| 175 |
|
| 176 |
except Exception as e:
|
|
|
|
| 177 |
error_msg = f"❌ Ошибка извлечения данных: {str(e)}"
|
| 178 |
message_queue.put(('log', error_msg))
|
|
|
|
| 179 |
return {
|
| 180 |
'brand': "",
|
| 181 |
'model': "",
|
| 182 |
'error_code': "",
|
| 183 |
'problem_description': "",
|
| 184 |
+
'search_query': prompt
|
| 185 |
}
|
| 186 |
|
| 187 |
def web_search(query: str) -> tuple:
|
|
|
|
| 205 |
|
| 206 |
combined_content = ""
|
| 207 |
sources = []
|
| 208 |
+
full_contents = []
|
| 209 |
|
| 210 |
+
# Обработка featured snippet
|
| 211 |
featured_snippet = data.get("featured_snippet", {})
|
| 212 |
if featured_snippet:
|
| 213 |
snippet = featured_snippet.get("snippet", "")
|
|
|
|
| 215 |
combined_content += f"[Автоответ Google]\n{snippet}\n\n"
|
| 216 |
sources.insert(0, {
|
| 217 |
"title": "Google — автоматический ответ",
|
| 218 |
+
"url": f"https://www.google.com/search?q={requests.utils.quote(query)}",
|
| 219 |
+
"content": snippet
|
| 220 |
})
|
| 221 |
|
| 222 |
+
# Обработка organic results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
organic_results = data.get("organic_results", [])
|
| 224 |
+
for i, res in enumerate(organic_results[:5]): # Ограничиваемся топ-5
|
| 225 |
title = res.get("title", "Без заголовка")
|
| 226 |
link = res.get("link", "#")
|
| 227 |
snippet = res.get("snippet", "") or ""
|
|
|
|
| 229 |
if any(domain in link for domain in BLACKLISTED_DOMAINS):
|
| 230 |
continue
|
| 231 |
|
| 232 |
+
# Загрузка полного контента
|
| 233 |
+
content = None
|
| 234 |
+
try:
|
| 235 |
+
headers = get_random_headers()
|
| 236 |
+
page_response = requests.get(link, headers=headers, timeout=8)
|
| 237 |
+
if page_response.status_code == 200:
|
| 238 |
+
content = extract_main_content(page_response.text, link)
|
| 239 |
+
except Exception as e:
|
| 240 |
+
logging.error(f"Ошибка загрузки {link}: {str(e)}")
|
| 241 |
+
|
| 242 |
+
if not content:
|
| 243 |
+
content = snippet # Fallback на сниппет
|
| 244 |
+
|
| 245 |
+
# Форматирование контента
|
| 246 |
+
cleaned_content = re.sub(r'\s+', ' ', content).strip()
|
| 247 |
+
combined_content += f"[[Источник {i+1}]] {title}\n{cleaned_content}\n\n"
|
| 248 |
+
|
| 249 |
+
# Сохранение источника
|
| 250 |
+
source_data = {
|
| 251 |
+
"title": title,
|
| 252 |
+
"url": link,
|
| 253 |
+
"content": cleaned_content[:MAX_CONTENT_LENGTH]
|
| 254 |
+
}
|
| 255 |
+
sources.append(source_data)
|
| 256 |
+
full_contents.append(cleaned_content[:MAX_CONTENT_LENGTH])
|
| 257 |
|
| 258 |
elapsed = time.time() - start_time
|
| 259 |
+
message_queue.put(('log', f"✅ Поиск был произведен за {elapsed:.2f}с. Найдено {len(sources)} источников."))
|
| 260 |
+
return combined_content[:20000], sources
|
| 261 |
|
| 262 |
except Exception as e:
|
| 263 |
error_msg = f"❌ SerpAPI ошибка: {str(e)}"
|
|
|
|
| 265 |
return f"Поиск недоступен: {str(e)}", []
|
| 266 |
|
| 267 |
|
| 268 |
+
def clean_response(response: str, sources: list) -> str:
|
| 269 |
+
# Удаление служебных тегов
|
| 270 |
response = re.sub(r'</?assistant>|<\|system\|>|</s>', '', response, flags=re.IGNORECASE)
|
| 271 |
|
| 272 |
+
# Удаление дублирования разделов
|
| 273 |
response = re.sub(r'(\*\*Проблема:\*\*.+?)(\*\*Проблема:\*\*)', r'\1', response, flags=re.DOTALL)
|
| 274 |
response = re.sub(r'(\*\*Решение:\*\*.+?)(\*\*Решение:\*\*)', r'\1', response, flags=re.DOTALL)
|
|
|
|
| 275 |
|
| 276 |
+
# Удаление лишних переносов
|
| 277 |
response = re.sub(r'\n\s*\n', '\n\n', response)
|
| 278 |
response = re.sub(r'[ \t]{2,}', ' ', response)
|
| 279 |
|
| 280 |
+
# Удаление звездочек в конце
|
|
|
|
|
|
|
|
|
|
| 281 |
response = re.sub(r'\*\*Источники:\*\*\s*$', '', response)
|
| 282 |
|
| 283 |
+
# Добавление ссылок на источники
|
| 284 |
+
if sources:
|
| 285 |
+
sources_text = "\n\n**Источники информации:**\n"
|
| 286 |
+
for i, source in enumerate(sources[:3]): # Показываем первые 3 источника
|
| 287 |
+
domain = re.search(r'https?://([^/]+)', source['url'])
|
| 288 |
+
domain_name = domain.group(1) if domain else "Источник"
|
| 289 |
+
sources_text += f"• [{domain_name}]({source['url']}) - {source['title']}\n"
|
| 290 |
+
response += sources_text
|
| 291 |
+
|
| 292 |
+
# Обрезка до последней точки
|
| 293 |
last_dot = response.rfind('.')
|
| 294 |
if last_dot != -1:
|
| 295 |
response = response[:last_dot + 1]
|
|
|
|
| 297 |
return response.strip()
|
| 298 |
|
| 299 |
|
| 300 |
+
def verify_with_sources(response: str, sources: list) -> str:
    """Ask the LLM to check a drafted answer against the collected sources.

    A prompt containing the drafted answer and up to 1500 characters of each
    source's content is sent to the Mistral chat API, which is asked to
    return a corrected answer.

    Args:
        response: The drafted answer text to verify.
        sources: List of dicts; each is read for ``title`` and ``content``.

    Returns:
        The corrected answer with surrounding whitespace stripped. The
        original ``response`` is returned unchanged when there are no
        sources, when the model returns an empty reply, or when any error
        occurs (the error is reported through ``message_queue``).
    """
    # With nothing to verify against, the "remove steps not found in the
    # sources" rule could strip the whole answer, so skip verification.
    if not sources:
        return response

    try:
        message_queue.put(('log', "🔍 Проверяю соответствие ответа источникам..."))

        # Each source is capped at 1500 characters to bound the prompt size.
        # Missing keys are tolerated so one malformed source does not abort
        # the whole verification.
        sources_text = "\n\n".join(
            f"Источник {i+1} ({source.get('title', '')}):\n{(source.get('content') or '')[:1500]}"
            for i, source in enumerate(sources)
        )

        verification_prompt = f"""
        Проверь соответствие решения источникам:

        ### Ответ бота:
        {response}

        ### Источники:
        {sources_text}

        Правила проверки:
        1. Все шаги решения должны иметь подтверждение в источниках
        2. Детали замены должны точно соответствовать артикулам из источников
        3. Если в ответе есть шаги не из источников - удали их
        4. Если есть противоречия между источниками - укажи это в решении
        5. Если ошибки в кодах ошибок - исправь
        6. Сохрани оригинальную структуру ответа

        Верни исправленный ответ.
        """

        verification = mistral_client.chat.complete(
            model=MISTRAL_MODEL,
            messages=[{"role": "user", "content": verification_prompt}],
            max_tokens=2048,
            temperature=0.1
        )

        verified_response = (verification.choices[0].message.content or "").strip()
        # An empty reply must not replace a non-empty draft.
        return verified_response if verified_response else response

    except Exception as e:
        error_msg = f"❌ Ошибка верификации: {str(e)}"
        message_queue.put(('log', error_msg))
        return response
|
| 344 |
+
|
| 345 |
+
|
| 346 |
def process_query(prompt: str):
|
| 347 |
try:
|
| 348 |
start_time = time.time()
|
|
|
|
| 352 |
norm_data = generate_search_query(prompt)
|
| 353 |
message_queue.put(('log', f"⏏️ Извлечено: {json.dumps(norm_data, ensure_ascii=False)}"))
|
| 354 |
|
|
|
|
| 355 |
search_query = norm_data['search_query']
|
| 356 |
search_data, sources = web_search(search_query)
|
| 357 |
|
|
|
|
| 385 |
Суть проблемы (на основе поиска): {extracted_problem}
|
| 386 |
Данные поиска:
|
| 387 |
{search_data}
|
|
|
|
|
|
|
| 388 |
"""},
|
| 389 |
{"role": "user", "content": f"Проблема: {prompt}"}
|
| 390 |
]
|
|
|
|
| 396 |
for chunk in mistral_client.chat.stream(
|
| 397 |
model=MISTRAL_MODEL,
|
| 398 |
messages=messages,
|
| 399 |
+
max_tokens=2048,
|
| 400 |
temperature=0.3
|
| 401 |
):
|
| 402 |
if chunk.data.choices[0].delta.content is not None:
|
|
|
|
| 404 |
full_response += chunk_text
|
| 405 |
message_queue.put(('response_chunk', chunk_text))
|
| 406 |
|
| 407 |
+
# Проверка соответствия источникам
|
| 408 |
+
verified_response = verify_with_sources(full_response, sources)
|
| 409 |
+
|
| 410 |
+
# Очистка и форматирование ответа
|
| 411 |
+
final_response = clean_response(verified_response, sources)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
|
| 413 |
message_queue.put(('response_end', final_response))
|
| 414 |
message_queue.put(('sources', json.dumps(sources)))
|
| 415 |
|
| 416 |
total_time = time.time() - start_time
|
| 417 |
+
message_queue.put(('log', f"💡 Ответ: {final_response[:200]}..."))
|
| 418 |
message_queue.put(('log', f"⏱ Время: {total_time:.1f}с"))
|
| 419 |
message_queue.put(('done', ''))
|
| 420 |
|