Spaces:
Sleeping
Sleeping
Commit
·
df86177
1
Parent(s):
d6c8aaf
fixed errors - prompt improvement + json fix document column
Browse files
app.py
CHANGED
|
@@ -26,6 +26,8 @@ GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
|
|
| 26 |
CUSTOM_PROMPT_NEW = """
|
| 27 |
Вы являетесь высокоспециализированным Ассистентом для анализа документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы на основе анализа нормативной документации (НД). Все ваши ответы должны основываться исключительно на предоставленном контексте без использования внешних знаний или предположений.
|
| 28 |
|
|
|
|
|
|
|
| 29 |
История чата:
|
| 30 |
{chat_history}
|
| 31 |
|
|
@@ -105,25 +107,32 @@ def log_message(message):
|
|
| 105 |
sys.stdout.flush()
|
| 106 |
|
| 107 |
def table_to_document(table_json):
|
|
|
|
|
|
|
| 108 |
metadata = {
|
| 109 |
-
"document_id":
|
| 110 |
-
"section": table_json
|
| 111 |
-
"table_number": table_json
|
| 112 |
-
"table_title": table_json
|
| 113 |
}
|
| 114 |
|
| 115 |
-
description = table_json
|
| 116 |
-
headers =
|
| 117 |
-
|
| 118 |
-
rows = []
|
| 119 |
-
for row in table_json["data"]:
|
| 120 |
-
row_str = " | ".join([f"{k}: {v}" for k,v in row.items()])
|
| 121 |
-
rows.append(row_str)
|
| 122 |
|
| 123 |
-
table_text = f"Таблица {table_json
|
|
|
|
|
|
|
| 124 |
table_text += f"Описание: {description}\n"
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
return Document(text=table_text, metadata=metadata)
|
| 129 |
|
|
@@ -144,6 +153,8 @@ def download_table_data():
|
|
| 144 |
table_documents = []
|
| 145 |
for file_path in table_files:
|
| 146 |
try:
|
|
|
|
|
|
|
| 147 |
local_path = hf_hub_download(
|
| 148 |
repo_id=REPO_ID,
|
| 149 |
filename=file_path,
|
|
@@ -155,13 +166,23 @@ def download_table_data():
|
|
| 155 |
with open(local_path, 'r', encoding='utf-8') as f:
|
| 156 |
table_data = json.load(f)
|
| 157 |
|
| 158 |
-
if isinstance(table_data,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
for table_json in table_data:
|
| 160 |
doc = table_to_document(table_json)
|
| 161 |
table_documents.append(doc)
|
| 162 |
-
|
| 163 |
-
doc = table_to_document(table_data)
|
| 164 |
-
table_documents.append(doc)
|
| 165 |
|
| 166 |
except Exception as e:
|
| 167 |
log_message(f"❌ Ошибка обработки файла {file_path}: {str(e)}")
|
|
@@ -212,6 +233,7 @@ def initialize_models():
|
|
| 212 |
index_faiss = faiss.read_index(faiss_index_path)
|
| 213 |
chunks_df = pd.read_csv(chunks_csv_path)
|
| 214 |
log_message(f"📄 Загружено {len(chunks_df)} основных чанков")
|
|
|
|
| 215 |
|
| 216 |
table_documents = download_table_data()
|
| 217 |
|
|
@@ -231,7 +253,8 @@ def initialize_models():
|
|
| 231 |
if text_column is None:
|
| 232 |
text_column = chunks_df.columns[0]
|
| 233 |
|
| 234 |
-
log_message("📝
|
|
|
|
| 235 |
documents = []
|
| 236 |
for i, (_, row) in enumerate(chunks_df.iterrows()):
|
| 237 |
doc = Document(
|
|
@@ -282,16 +305,30 @@ def answer_question(question, history):
|
|
| 282 |
try:
|
| 283 |
start_time = time.time()
|
| 284 |
|
|
|
|
|
|
|
| 285 |
chat_history_text = format_chat_history()
|
|
|
|
| 286 |
|
| 287 |
query_with_history = question
|
| 288 |
-
|
|
|
|
| 289 |
retrieved_nodes = query_engine.retriever.retrieve(query_with_history)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
|
| 291 |
end_time = time.time()
|
| 292 |
processing_time = end_time - start_time
|
| 293 |
|
| 294 |
bot_response = response.response
|
|
|
|
|
|
|
| 295 |
chat_history.append((question, bot_response))
|
| 296 |
|
| 297 |
if len(chat_history) > 10:
|
|
@@ -307,6 +344,7 @@ def answer_question(question, history):
|
|
| 307 |
|
| 308 |
except Exception as e:
|
| 309 |
error_msg = f"❌ Ошибка обработки вопроса: {str(e)}"
|
|
|
|
| 310 |
history.append([question, error_msg])
|
| 311 |
return history, ""
|
| 312 |
|
|
@@ -348,8 +386,16 @@ def generate_sources_html(nodes):
|
|
| 348 |
def clear_chat():
|
| 349 |
global chat_history
|
| 350 |
chat_history = []
|
|
|
|
| 351 |
return [], ""
|
| 352 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
def create_demo_interface():
|
| 354 |
with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
|
| 355 |
|
|
@@ -397,22 +443,15 @@ def create_demo_interface():
|
|
| 397 |
value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся источники...</div>",
|
| 398 |
)
|
| 399 |
|
| 400 |
-
def
|
| 401 |
-
return
|
| 402 |
-
|
| 403 |
-
def bot_message(history):
|
| 404 |
-
if history and history[-1][1] is None:
|
| 405 |
-
user_msg = history[-1][0]
|
| 406 |
-
updated_history, sources = answer_question(user_msg, history[:-1])
|
| 407 |
-
return updated_history, sources
|
| 408 |
-
return history, ""
|
| 409 |
|
| 410 |
-
msg.submit(
|
| 411 |
-
|
| 412 |
)
|
| 413 |
|
| 414 |
-
send_btn.click(
|
| 415 |
-
|
| 416 |
)
|
| 417 |
|
| 418 |
clear_btn.click(clear_chat, outputs=[chatbot, sources_output])
|
|
|
|
| 26 |
CUSTOM_PROMPT_NEW = """
|
| 27 |
Вы являетесь высокоспециализированным Ассистентом для анализа документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы на основе анализа нормативной документации (НД). Все ваши ответы должны основываться исключительно на предоставленном контексте без использования внешних знаний или предположений.
|
| 28 |
|
| 29 |
+
ВАЖНО: ВСЕ ОТВЕТЫ ДОЛЖНЫ БЫТЬ ТОЛЬКО НА РУССКОМ ЯЗЫКЕ!
|
| 30 |
+
|
| 31 |
История чата:
|
| 32 |
{chat_history}
|
| 33 |
|
|
|
|
| 107 |
sys.stdout.flush()
|
| 108 |
|
| 109 |
def table_to_document(table_json):
    """Convert one table description (a JSON object) into a ``Document``.

    The document text is a line-oriented rendering of the table: a title
    line, the owning document, the section, the description, an optional
    header line, and one line per data row. The metadata carries
    ``document_id``, ``section``, ``table_number`` and ``table_title``.

    Args:
        table_json: Mapping read from a table JSON file. Every key is
            optional; missing keys fall back to an empty value, and the
            document id falls back from ``"document_id"`` to ``"document"``
            to the literal ``"unknown"``.

    Returns:
        A ``Document`` built from the rendered text and the metadata dict.
    """
    document_id = table_json.get("document_id") or table_json.get("document", "unknown")
    section = table_json.get("section", "")
    table_number = table_json.get("table_number", "")
    table_title = table_json.get("table_title", "")

    metadata = {
        "document_id": document_id,
        "section": section,
        "table_number": table_number,
        "table_title": table_title,
    }

    description = table_json.get("table_description", "")
    headers = table_json.get("headers", [])

    lines = [
        f"Таблица {table_number} - {table_title}",
        f"Документ: {document_id}",
        f"Раздел: {section}",
        f"Описание: {description}",
    ]

    if headers:
        # Header cells coming from JSON may be numbers or null; str.join
        # only accepts strings, so coerce each cell before joining.
        lines.append(f"Заголовки: {' | '.join(str(h) for h in headers)}")

    # Only dict rows are rendered; rows of any other shape are skipped.
    for row in table_json.get("data", []) or []:
        if isinstance(row, dict):
            lines.append(" | ".join(f"{k}: {v}" for k, v in row.items()))

    # Every line, including the last one, is newline-terminated.
    table_text = "".join(f"{line}\n" for line in lines)

    return Document(text=table_text, metadata=metadata)
|
| 138 |
|
|
|
|
| 153 |
table_documents = []
|
| 154 |
for file_path in table_files:
|
| 155 |
try:
|
| 156 |
+
log_message(f"🔄 Обработка файла: {file_path}")
|
| 157 |
+
|
| 158 |
local_path = hf_hub_download(
|
| 159 |
repo_id=REPO_ID,
|
| 160 |
filename=file_path,
|
|
|
|
| 166 |
with open(local_path, 'r', encoding='utf-8') as f:
|
| 167 |
table_data = json.load(f)
|
| 168 |
|
| 169 |
+
log_message(f"📋 Структура JSON: {list(table_data.keys()) if isinstance(table_data, dict) else 'Список'}")
|
| 170 |
+
|
| 171 |
+
if isinstance(table_data, dict):
|
| 172 |
+
if 'sheets' in table_data:
|
| 173 |
+
for sheet in table_data['sheets']:
|
| 174 |
+
doc = table_to_document(sheet)
|
| 175 |
+
table_documents.append(doc)
|
| 176 |
+
log_message(f"✅ Создан документ из таблицы: {sheet.get('table_number', 'unknown')}")
|
| 177 |
+
else:
|
| 178 |
+
doc = table_to_document(table_data)
|
| 179 |
+
table_documents.append(doc)
|
| 180 |
+
log_message(f"✅ Создан документ из JSON объекта")
|
| 181 |
+
elif isinstance(table_data, list):
|
| 182 |
for table_json in table_data:
|
| 183 |
doc = table_to_document(table_json)
|
| 184 |
table_documents.append(doc)
|
| 185 |
+
log_message(f"✅ Создан документ из элемента списка")
|
|
|
|
|
|
|
| 186 |
|
| 187 |
except Exception as e:
|
| 188 |
log_message(f"❌ Ошибка обработки файла {file_path}: {str(e)}")
|
|
|
|
| 233 |
index_faiss = faiss.read_index(faiss_index_path)
|
| 234 |
chunks_df = pd.read_csv(chunks_csv_path)
|
| 235 |
log_message(f"📄 Загружено {len(chunks_df)} основных чанков")
|
| 236 |
+
log_message(f"📋 Колонки в chunks_df: {list(chunks_df.columns)}")
|
| 237 |
|
| 238 |
table_documents = download_table_data()
|
| 239 |
|
|
|
|
| 253 |
if text_column is None:
|
| 254 |
text_column = chunks_df.columns[0]
|
| 255 |
|
| 256 |
+
log_message(f"📝 Используется колонка для текста: {text_column}")
|
| 257 |
+
|
| 258 |
documents = []
|
| 259 |
for i, (_, row) in enumerate(chunks_df.iterrows()):
|
| 260 |
doc = Document(
|
|
|
|
| 305 |
try:
|
| 306 |
start_time = time.time()
|
| 307 |
|
| 308 |
+
log_message(f"🔍 Получен вопрос: {question}")
|
| 309 |
+
|
| 310 |
chat_history_text = format_chat_history()
|
| 311 |
+
log_message(f"📜 История чата: {len(chat_history)} сообщений")
|
| 312 |
|
| 313 |
query_with_history = question
|
| 314 |
+
|
| 315 |
+
log_message("🔎 Поиск релевантных чанков...")
|
| 316 |
retrieved_nodes = query_engine.retriever.retrieve(query_with_history)
|
| 317 |
+
log_message(f"📊 Найдено {len(retrieved_nodes)} релевантных чанков")
|
| 318 |
+
|
| 319 |
+
for i, node in enumerate(retrieved_nodes[:3]):
|
| 320 |
+
log_message(f"📄 Чанк {i+1}: {node.text[:100]}...")
|
| 321 |
+
log_message(f"🏷️ Метаданные: {node.metadata}")
|
| 322 |
+
|
| 323 |
+
log_message("🤖 Отправка запроса в LLM...")
|
| 324 |
+
response = query_engine.query(query_with_history)
|
| 325 |
|
| 326 |
end_time = time.time()
|
| 327 |
processing_time = end_time - start_time
|
| 328 |
|
| 329 |
bot_response = response.response
|
| 330 |
+
log_message(f"✅ Получен ответ: {bot_response[:100]}...")
|
| 331 |
+
|
| 332 |
chat_history.append((question, bot_response))
|
| 333 |
|
| 334 |
if len(chat_history) > 10:
|
|
|
|
| 344 |
|
| 345 |
except Exception as e:
|
| 346 |
error_msg = f"❌ Ошибка обработки вопроса: {str(e)}"
|
| 347 |
+
log_message(f"❌ Ошибка: {str(e)}")
|
| 348 |
history.append([question, error_msg])
|
| 349 |
return history, ""
|
| 350 |
|
|
|
|
| 386 |
def clear_chat():
    """Reset the module-level chat history and blank both UI outputs.

    Returns:
        A pair ``([], "")``: an empty list and an empty string, in the
        order the click handler's outputs are declared.
    """
    global chat_history
    chat_history = list()
    log_message("🗑️ История чата очищена")
    cleared_outputs = ([], "")
    return cleared_outputs
|
| 391 |
|
| 392 |
+
def handle_submit(message, history):
    """Answer a submitted chat message, ignoring blank submissions.

    Args:
        message: Text taken from the input box. ``None``, an empty string
            and whitespace-only text are all treated as "nothing to send".
        history: Current chat history, passed through to
            ``answer_question`` or returned unchanged for blank input.

    Returns:
        A pair ``(history, sources)``. For blank input this is the
        unchanged history and an empty string; otherwise it is whatever
        ``answer_question`` returns.
    """
    # Guard against None as well as blank text: calling .strip() on None
    # would raise AttributeError before the blank check could run.
    if not message or not message.strip():
        return history, ""

    updated_history, sources = answer_question(message, history)
    return updated_history, sources
|
| 398 |
+
|
| 399 |
def create_demo_interface():
|
| 400 |
with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
|
| 401 |
|
|
|
|
| 443 |
value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся источники...</div>",
|
| 444 |
)
|
| 445 |
|
| 446 |
+
def submit_message(message, history):
|
| 447 |
+
return handle_submit(message, history)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 448 |
|
| 449 |
+
msg.submit(submit_message, [msg, chatbot], [chatbot, sources_output]).then(
|
| 450 |
+
lambda: "", None, msg
|
| 451 |
)
|
| 452 |
|
| 453 |
+
send_btn.click(submit_message, [msg, chatbot], [chatbot, sources_output]).then(
|
| 454 |
+
lambda: "", None, msg
|
| 455 |
)
|
| 456 |
|
| 457 |
clear_btn.click(clear_chat, outputs=[chatbot, sources_output])
|