Spaces:
Sleeping
Sleeping
Commit
·
af31260
1
Parent(s):
0970ce1
new final version of the rag
Browse files- app.py +182 -35
- config.py +17 -4
- converters/converter.py +116 -0
- documents_prep.py +188 -76
- index_retriever.py +11 -24
- table_prep.py +0 -229
- utils.py +71 -41
app.py
CHANGED
|
@@ -10,6 +10,7 @@ from config import (
|
|
| 10 |
HF_REPO_ID, HF_TOKEN, DOWNLOAD_DIR, CHUNKS_FILENAME,
|
| 11 |
JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
|
| 12 |
)
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
def merge_table_chunks(chunk_info):
|
|
@@ -39,7 +40,6 @@ def merge_table_chunks(chunk_info):
|
|
| 39 |
|
| 40 |
return list(merged.values())
|
| 41 |
|
| 42 |
-
|
| 43 |
def create_chunks_display_html(chunk_info):
|
| 44 |
if not chunk_info:
|
| 45 |
return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"
|
|
@@ -142,19 +142,14 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
|
|
| 142 |
separator=" ",
|
| 143 |
backup_separators=["\n", ".", "!", "?"]
|
| 144 |
)
|
| 145 |
-
|
| 146 |
-
log_message(f"Configured chunk size: {CHUNK_SIZE} tokens")
|
| 147 |
-
log_message(f"Configured chunk overlap: {CHUNK_OVERLAP} tokens")
|
| 148 |
|
| 149 |
all_documents = []
|
| 150 |
chunks_df = None
|
| 151 |
|
| 152 |
-
# CHANGED: Use load_all_documents instead of loading separately
|
| 153 |
if use_json_instead_csv and json_files_dir:
|
| 154 |
log_message("Используем JSON файлы вместо CSV")
|
| 155 |
from documents_prep import load_all_documents
|
| 156 |
|
| 157 |
-
# This will handle text, tables, and images all together with proper logging
|
| 158 |
all_documents = load_all_documents(
|
| 159 |
repo_id=repo_id,
|
| 160 |
hf_token=hf_token,
|
|
@@ -163,12 +158,10 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
|
|
| 163 |
image_dir=image_data_dir if image_data_dir else ""
|
| 164 |
)
|
| 165 |
else:
|
| 166 |
-
# OLD PATH: Loading separately (fallback)
|
| 167 |
if chunks_filename:
|
| 168 |
log_message("Загружаем данные из CSV")
|
| 169 |
|
| 170 |
if table_data_dir:
|
| 171 |
-
log_message("Добавляю табличные данные")
|
| 172 |
from documents_prep import load_table_documents
|
| 173 |
|
| 174 |
table_chunks = load_table_documents(repo_id, hf_token, table_data_dir)
|
|
@@ -176,7 +169,6 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
|
|
| 176 |
all_documents.extend(table_chunks)
|
| 177 |
|
| 178 |
if image_data_dir:
|
| 179 |
-
log_message("Добавляю данные изображений")
|
| 180 |
from documents_prep import load_image_documents
|
| 181 |
|
| 182 |
image_documents = load_image_documents(repo_id, hf_token, image_data_dir)
|
|
@@ -188,7 +180,6 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
|
|
| 188 |
vector_index = create_vector_index(all_documents)
|
| 189 |
query_engine = create_query_engine(vector_index)
|
| 190 |
|
| 191 |
-
# Create chunk_info for display (extract from documents metadata)
|
| 192 |
chunk_info = []
|
| 193 |
for doc in all_documents:
|
| 194 |
chunk_info.append({
|
|
@@ -233,16 +224,48 @@ def switch_model(model_name, vector_index):
|
|
| 233 |
log_message(error_msg)
|
| 234 |
return None, f"❌ {error_msg}"
|
| 235 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
def main_answer_question(question):
|
| 237 |
-
global query_engine, reranker, current_model, chunks_df
|
| 238 |
if not question.strip():
|
| 239 |
return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
|
| 240 |
"<div style='color: black;'>Источники появятся после обработки запроса</div>",
|
| 241 |
"<div style='color: black;'>Чанки появятся после обработки запроса</div>")
|
| 242 |
|
| 243 |
try:
|
| 244 |
-
|
| 245 |
-
|
|
|
|
|
|
|
| 246 |
return answer_html, sources_html, chunks_html
|
| 247 |
|
| 248 |
except Exception as e:
|
|
@@ -251,6 +274,36 @@ def main_answer_question(question):
|
|
| 251 |
"<div style='color: black;'>Источники недоступны из-за ошибки</div>",
|
| 252 |
"<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
|
| 253 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
def retrieve_chunks(question: str, top_k: int = 20) -> list:
|
| 255 |
from index_retriever import rerank_nodes
|
| 256 |
global query_engine, reranker
|
|
@@ -362,24 +415,128 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
|
|
| 362 |
label="Релевантные чанки",
|
| 363 |
value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
|
| 364 |
)
|
|
|
|
|
|
|
|
|
|
| 365 |
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
)
|
| 371 |
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
)
|
| 377 |
|
| 378 |
-
|
| 379 |
-
fn=
|
| 380 |
-
|
| 381 |
-
outputs=[answer_output, sources_output, chunks_output]
|
| 382 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
return demo
|
| 384 |
|
| 385 |
|
|
@@ -389,13 +546,6 @@ reranker = None
|
|
| 389 |
vector_index = None
|
| 390 |
current_model = DEFAULT_MODEL
|
| 391 |
|
| 392 |
-
def main_answer_question(question):
|
| 393 |
-
global query_engine, reranker, current_model, chunks_df
|
| 394 |
-
answer_html, sources_html, chunks_html = answer_question(
|
| 395 |
-
question, query_engine, reranker, current_model, chunks_df
|
| 396 |
-
)
|
| 397 |
-
return answer_html, sources_html, chunks_html
|
| 398 |
-
|
| 399 |
def main_switch_model(model_name):
|
| 400 |
global query_engine, vector_index, current_model
|
| 401 |
|
|
@@ -406,9 +556,6 @@ def main_switch_model(model_name):
|
|
| 406 |
|
| 407 |
return status_message
|
| 408 |
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
def main():
|
| 413 |
global query_engine, chunks_df, reranker, vector_index, current_model
|
| 414 |
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
|
|
|
|
| 10 |
HF_REPO_ID, HF_TOKEN, DOWNLOAD_DIR, CHUNKS_FILENAME,
|
| 11 |
JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
|
| 12 |
)
|
| 13 |
+
from converters.converter import convert_single_excel_to_json, convert_single_excel_to_csv
|
| 14 |
|
| 15 |
|
| 16 |
def merge_table_chunks(chunk_info):
|
|
|
|
| 40 |
|
| 41 |
return list(merged.values())
|
| 42 |
|
|
|
|
| 43 |
def create_chunks_display_html(chunk_info):
|
| 44 |
if not chunk_info:
|
| 45 |
return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"
|
|
|
|
| 142 |
separator=" ",
|
| 143 |
backup_separators=["\n", ".", "!", "?"]
|
| 144 |
)
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
all_documents = []
|
| 147 |
chunks_df = None
|
| 148 |
|
|
|
|
| 149 |
if use_json_instead_csv and json_files_dir:
|
| 150 |
log_message("Используем JSON файлы вместо CSV")
|
| 151 |
from documents_prep import load_all_documents
|
| 152 |
|
|
|
|
| 153 |
all_documents = load_all_documents(
|
| 154 |
repo_id=repo_id,
|
| 155 |
hf_token=hf_token,
|
|
|
|
| 158 |
image_dir=image_data_dir if image_data_dir else ""
|
| 159 |
)
|
| 160 |
else:
|
|
|
|
| 161 |
if chunks_filename:
|
| 162 |
log_message("Загружаем данные из CSV")
|
| 163 |
|
| 164 |
if table_data_dir:
|
|
|
|
| 165 |
from documents_prep import load_table_documents
|
| 166 |
|
| 167 |
table_chunks = load_table_documents(repo_id, hf_token, table_data_dir)
|
|
|
|
| 169 |
all_documents.extend(table_chunks)
|
| 170 |
|
| 171 |
if image_data_dir:
|
|
|
|
| 172 |
from documents_prep import load_image_documents
|
| 173 |
|
| 174 |
image_documents = load_image_documents(repo_id, hf_token, image_data_dir)
|
|
|
|
| 180 |
vector_index = create_vector_index(all_documents)
|
| 181 |
query_engine = create_query_engine(vector_index)
|
| 182 |
|
|
|
|
| 183 |
chunk_info = []
|
| 184 |
for doc in all_documents:
|
| 185 |
chunk_info.append({
|
|
|
|
| 224 |
log_message(error_msg)
|
| 225 |
return None, f"❌ {error_msg}"
|
| 226 |
|
| 227 |
+
# Module-level retrieval settings shared by the query-engine factory, the
# answer handler and the settings tab. The values mirror the keyword defaults
# of create_query_engine and the initial slider positions in the UI.
retrieval_params = dict(
    vector_top_k=50,
    bm25_top_k=50,
    similarity_cutoff=0.55,
    hybrid_top_k=100,
    rerank_top_k=20,
)
|
| 234 |
+
|
| 235 |
+
def create_query_engine(vector_index, vector_top_k=50, bm25_top_k=50,
                        similarity_cutoff=0.55, hybrid_top_k=100):
    """Build a query engine for ``vector_index`` and log the parameters used.

    Thin wrapper around ``index_retriever.create_query_engine``: the four
    retrieval settings are forwarded unchanged as keyword arguments.

    Args:
        vector_index: Index object handed straight to the underlying factory.
        vector_top_k: Forwarded as ``vector_top_k``.
        bm25_top_k: Forwarded as ``bm25_top_k``.
        similarity_cutoff: Forwarded as ``similarity_cutoff``.
        hybrid_top_k: Forwarded as ``hybrid_top_k``.

    Returns:
        Whatever ``index_retriever.create_query_engine`` returns.

    Raises:
        Exception: Any failure (including a failed import) is logged and
            re-raised unchanged.
    """
    try:
        # NOTE(review): CUSTOM_PROMPT is imported but not used in this block;
        # kept so a missing config entry still fails here as before.
        from config import CUSTOM_PROMPT
        from index_retriever import create_query_engine as create_index_query_engine

        engine_kwargs = {
            "vector_index": vector_index,
            "vector_top_k": vector_top_k,
            "bm25_top_k": bm25_top_k,
            "similarity_cutoff": similarity_cutoff,
            "hybrid_top_k": hybrid_top_k,
        }
        engine = create_index_query_engine(**engine_kwargs)

        summary = (f"Query engine created with params: vector_top_k={vector_top_k}, "
                   f"bm25_top_k={bm25_top_k}, cutoff={similarity_cutoff}, hybrid_top_k={hybrid_top_k}")
        log_message(summary)
        return engine

    except Exception as e:
        log_message(f"Ошибка создания query engine: {str(e)}")
        raise
|
| 256 |
+
|
| 257 |
def main_answer_question(question):
|
| 258 |
+
global query_engine, reranker, current_model, chunks_df, retrieval_params
|
| 259 |
if not question.strip():
|
| 260 |
return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
|
| 261 |
"<div style='color: black;'>Источники появятся после обработки запроса</div>",
|
| 262 |
"<div style='color: black;'>Чанки появятся после обработки запроса</div>")
|
| 263 |
|
| 264 |
try:
|
| 265 |
+
answer_html, sources_html, chunks_html = answer_question(
|
| 266 |
+
question, query_engine, reranker, current_model, chunks_df,
|
| 267 |
+
rerank_top_k=retrieval_params['rerank_top_k']
|
| 268 |
+
)
|
| 269 |
return answer_html, sources_html, chunks_html
|
| 270 |
|
| 271 |
except Exception as e:
|
|
|
|
| 274 |
"<div style='color: black;'>Источники недоступны из-за ошибки</div>",
|
| 275 |
"<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
|
| 276 |
|
| 277 |
+
def update_retrieval_params(vector_top_k, bm25_top_k, similarity_cutoff, hybrid_top_k, rerank_top_k):
    """Apply new retrieval settings and rebuild the global query engine.

    Args:
        vector_top_k: New ``vector_top_k`` value for the query engine.
        bm25_top_k: New ``bm25_top_k`` value for the query engine.
        similarity_cutoff: New ``similarity_cutoff`` value for the query engine.
        hybrid_top_k: New ``hybrid_top_k`` value for the query engine.
        rerank_top_k: New ``rerank_top_k`` value; only stored in
            ``retrieval_params``, it is not passed to the engine factory.

    Returns:
        A status string: a success message, a "system not initialised"
        message when ``vector_index`` is ``None``, or an error message
        containing the exception text.
    """
    global query_engine, vector_index, retrieval_params

    requested = {
        'vector_top_k': vector_top_k,
        'bm25_top_k': bm25_top_k,
        'similarity_cutoff': similarity_cutoff,
        'hybrid_top_k': hybrid_top_k,
        'rerank_top_k': rerank_top_k,
    }

    try:
        if vector_index is None:
            # Same as before: the values are remembered even though there is
            # no engine to rebuild yet.
            retrieval_params.update(requested)
            return "❌ Система не инициализирована"

        # Rebuild first and commit afterwards: if the rebuild raises, both the
        # live engine and the stored parameters keep their previous values
        # instead of drifting apart.
        rebuilt_engine = create_query_engine(
            vector_index=vector_index,
            vector_top_k=vector_top_k,
            bm25_top_k=bm25_top_k,
            similarity_cutoff=similarity_cutoff,
            hybrid_top_k=hybrid_top_k
        )
        query_engine = rebuilt_engine
        retrieval_params.update(requested)

        log_message(f"Параметры поиска обновлены: vector_top_k={vector_top_k}, "
                    f"bm25_top_k={bm25_top_k}, cutoff={similarity_cutoff}, "
                    f"hybrid_top_k={hybrid_top_k}, rerank_top_k={rerank_top_k}")
        return "✅ Параметры обновлены"
    except Exception as e:
        error_msg = f"Ошибка обновления параметров: {str(e)}"
        log_message(error_msg)
        return f"❌ {error_msg}"
|
| 306 |
+
|
| 307 |
def retrieve_chunks(question: str, top_k: int = 20) -> list:
|
| 308 |
from index_retriever import rerank_nodes
|
| 309 |
global query_engine, reranker
|
|
|
|
| 415 |
label="Релевантные чанки",
|
| 416 |
value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
|
| 417 |
)
|
| 418 |
+
|
| 419 |
+
with gr.Tab("⚙️ Параметры поиска"):
|
| 420 |
+
gr.Markdown("### Настройка параметров векторного поиска и переранжирования")
|
| 421 |
|
| 422 |
+
with gr.Row():
|
| 423 |
+
with gr.Column():
|
| 424 |
+
vector_top_k = gr.Slider(
|
| 425 |
+
minimum=10,
|
| 426 |
+
maximum=200,
|
| 427 |
+
value=50,
|
| 428 |
+
step=10,
|
| 429 |
+
label="Vector Top K",
|
| 430 |
+
info="Количество результатов из векторного поиска"
|
| 431 |
+
)
|
| 432 |
+
|
| 433 |
+
with gr.Column():
|
| 434 |
+
bm25_top_k = gr.Slider(
|
| 435 |
+
minimum=10,
|
| 436 |
+
maximum=200,
|
| 437 |
+
value=50,
|
| 438 |
+
step=10,
|
| 439 |
+
label="BM25 Top K",
|
| 440 |
+
info="Количество результатов из BM25 поиска"
|
| 441 |
+
)
|
| 442 |
+
|
| 443 |
+
with gr.Row():
|
| 444 |
+
with gr.Column():
|
| 445 |
+
similarity_cutoff = gr.Slider(
|
| 446 |
+
minimum=0.0,
|
| 447 |
+
maximum=1.0,
|
| 448 |
+
value=0.55,
|
| 449 |
+
step=0.05,
|
| 450 |
+
label="Similarity Cutoff",
|
| 451 |
+
info="Минимальный порог схожести для векторного поиска"
|
| 452 |
+
)
|
| 453 |
+
|
| 454 |
+
with gr.Column():
|
| 455 |
+
hybrid_top_k = gr.Slider(
|
| 456 |
+
minimum=10,
|
| 457 |
+
maximum=300,
|
| 458 |
+
value=100,
|
| 459 |
+
step=10,
|
| 460 |
+
label="Hybrid Top K",
|
| 461 |
+
info="Количество результатов из гибридного поиска"
|
| 462 |
+
)
|
| 463 |
+
|
| 464 |
+
with gr.Row():
|
| 465 |
+
with gr.Column():
|
| 466 |
+
rerank_top_k = gr.Slider(
|
| 467 |
+
minimum=5,
|
| 468 |
+
maximum=100,
|
| 469 |
+
value=20,
|
| 470 |
+
step=5,
|
| 471 |
+
label="Rerank Top K",
|
| 472 |
+
info="Количество результатов после переранжирования"
|
| 473 |
+
)
|
| 474 |
+
|
| 475 |
+
with gr.Column():
|
| 476 |
+
update_btn = gr.Button("Применить параметры", variant="primary")
|
| 477 |
+
update_status = gr.Textbox(
|
| 478 |
+
value="Параметры готовы к применению",
|
| 479 |
+
label="Статус",
|
| 480 |
+
interactive=False
|
| 481 |
+
)
|
| 482 |
+
|
| 483 |
+
gr.Markdown("""
|
| 484 |
+
### Рекомендации:
|
| 485 |
+
- **Vector Top K**: Увеличьте для более полного поиска по семантике (50-100)
|
| 486 |
+
- **BM25 Top K**: Увеличьте для лучшего поиска по ключевым словам (30-80)
|
| 487 |
+
- **Similarity Cutoff**: Снизьте для более мягких критериев (0.3-0.6), повысьте для строгих (0.7-0.9)
|
| 488 |
+
- **Hybrid Top K**: Объединённые результаты (100-150)
|
| 489 |
+
- **Rerank Top K**: Финальные результаты (10-30)
|
| 490 |
+
""")
|
| 491 |
+
|
| 492 |
+
update_btn.click(
|
| 493 |
+
fn=update_retrieval_params,
|
| 494 |
+
inputs=[vector_top_k, bm25_top_k, similarity_cutoff, hybrid_top_k, rerank_top_k],
|
| 495 |
+
outputs=[update_status]
|
| 496 |
+
)
|
| 497 |
+
|
| 498 |
+
gr.Markdown("### Текущие параметры:")
|
| 499 |
+
current_params_display = gr.Textbox(
|
| 500 |
+
value="Vector: 50 | BM25: 50 | Cutoff: 0.55 | Hybrid: 100 | Rerank: 20",
|
| 501 |
+
label="",
|
| 502 |
+
interactive=False,
|
| 503 |
+
lines=2
|
| 504 |
)
|
| 505 |
|
| 506 |
+
def display_current_params():
|
| 507 |
+
return f"""Vector Top K: {retrieval_params['vector_top_k']}
|
| 508 |
+
BM25 Top K: {retrieval_params['bm25_top_k']}
|
| 509 |
+
Similarity Cutoff: {retrieval_params['similarity_cutoff']}
|
| 510 |
+
Hybrid Top K: {retrieval_params['hybrid_top_k']}
|
| 511 |
+
Rerank Top K: {retrieval_params['rerank_top_k']}"""
|
| 512 |
+
|
| 513 |
+
demo.load(
|
| 514 |
+
fn=display_current_params,
|
| 515 |
+
outputs=[current_params_display]
|
| 516 |
)
|
| 517 |
|
| 518 |
+
update_btn.click(
|
| 519 |
+
fn=display_current_params,
|
| 520 |
+
outputs=[current_params_display]
|
|
|
|
| 521 |
)
|
| 522 |
+
|
| 523 |
+
switch_btn.click(
|
| 524 |
+
fn=switch_model_func,
|
| 525 |
+
inputs=[model_dropdown],
|
| 526 |
+
outputs=[model_status]
|
| 527 |
+
)
|
| 528 |
+
|
| 529 |
+
ask_btn.click(
|
| 530 |
+
fn=answer_question_func,
|
| 531 |
+
inputs=[question_input],
|
| 532 |
+
outputs=[answer_output, sources_output, chunks_output]
|
| 533 |
+
)
|
| 534 |
+
|
| 535 |
+
question_input.submit(
|
| 536 |
+
fn=answer_question_func,
|
| 537 |
+
inputs=[question_input],
|
| 538 |
+
outputs=[answer_output, sources_output, chunks_output]
|
| 539 |
+
)
|
| 540 |
return demo
|
| 541 |
|
| 542 |
|
|
|
|
| 546 |
vector_index = None
|
| 547 |
current_model = DEFAULT_MODEL
|
| 548 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 549 |
def main_switch_model(model_name):
|
| 550 |
global query_engine, vector_index, current_model
|
| 551 |
|
|
|
|
| 556 |
|
| 557 |
return status_message
|
| 558 |
|
|
|
|
|
|
|
|
|
|
| 559 |
def main():
|
| 560 |
global query_engine, chunks_df, reranker, vector_index, current_model
|
| 561 |
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
|
config.py
CHANGED
|
@@ -5,7 +5,7 @@ SIMILARITY_THRESHOLD = 0.7
|
|
| 5 |
RAG_FILES_DIR = "rag_files"
|
| 6 |
PROCESSED_DATA_FILE = "processed_chunks.csv"
|
| 7 |
|
| 8 |
-
REPO_ID = "
|
| 9 |
faiss_index_filename = "cleaned_faiss_index.index"
|
| 10 |
CHUNKS_FILENAME = "processed_chunks.csv"
|
| 11 |
TABLE_DATA_DIR = "Табличные данные_JSON"
|
|
@@ -18,7 +18,6 @@ OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
|
|
| 18 |
HF_REPO_ID = "MrSimple01/AIEXP_RAG_FILES"
|
| 19 |
HF_TOKEN = os.getenv('HF_TOKEN')
|
| 20 |
|
| 21 |
-
# Available models configuration
|
| 22 |
AVAILABLE_MODELS = {
|
| 23 |
"Gemini 2.5 Flash": {
|
| 24 |
"provider": "google",
|
|
@@ -52,8 +51,22 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
|
|
| 52 |
CHUNK_SIZE = 1500
|
| 53 |
CHUNK_OVERLAP = 128
|
| 54 |
|
| 55 |
-
MAX_CHARS_TABLE =
|
| 56 |
-
MAX_ROWS_TABLE =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
CUSTOM_PROMPT = """
|
| 59 |
Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.
|
|
|
|
| 5 |
RAG_FILES_DIR = "rag_files"
|
| 6 |
PROCESSED_DATA_FILE = "processed_chunks.csv"
|
| 7 |
|
| 8 |
+
REPO_ID = "RAG-AIEXP/ragfiles"
|
| 9 |
faiss_index_filename = "cleaned_faiss_index.index"
|
| 10 |
CHUNKS_FILENAME = "processed_chunks.csv"
|
| 11 |
TABLE_DATA_DIR = "Табличные данные_JSON"
|
|
|
|
| 18 |
HF_REPO_ID = "MrSimple01/AIEXP_RAG_FILES"
|
| 19 |
HF_TOKEN = os.getenv('HF_TOKEN')
|
| 20 |
|
|
|
|
| 21 |
AVAILABLE_MODELS = {
|
| 22 |
"Gemini 2.5 Flash": {
|
| 23 |
"provider": "google",
|
|
|
|
| 51 |
CHUNK_SIZE = 1500
|
| 52 |
CHUNK_OVERLAP = 128
|
| 53 |
|
| 54 |
+
MAX_CHARS_TABLE = 3000
|
| 55 |
+
MAX_ROWS_TABLE = 30
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
QUERY_EXPANSION_PROMPT = """Ты — интеллектуальный помощник для расширения поисковых запросов по стандартам ГОСТ, ТУ, ISO, EN и другой технической документации.
|
| 59 |
+
Твоя цель — помочь системе найти все возможные формулировки вопроса, включая те, где встречаются редкие или неочевидные термины.
|
| 60 |
+
Пользователь задал вопрос: "{original_query}"
|
| 61 |
+
|
| 62 |
+
Сгенерируй 5 вариантов запроса, которые:
|
| 63 |
+
Сохраняют смысл исходного вопроса
|
| 64 |
+
Используют синонимы и технические термины (например: "сталь" → "сплав", "марка", "материал")
|
| 65 |
+
Добавляют возможные контекстные уточнения (например: "ГОСТ", "ТУ", "марка", "лист", "труба", "прокат", "применение", "химический состав")
|
| 66 |
+
Могут охватывать как частотные, так и редкие термины
|
| 67 |
+
Краткие — не более 10 слов каждая
|
| 68 |
+
|
| 69 |
+
Верни только 5 запросов, каждый с новой строки, без нумерации и пояснений."""
|
| 70 |
|
| 71 |
CUSTOM_PROMPT = """
|
| 72 |
Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.
|
converters/converter.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from config import *
|
| 2 |
+
from utils import log_message
|
| 3 |
+
import json
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
def process_uploaded_file(file, file_type):
    """Upload a user-supplied file to the Hugging Face dataset repository.

    Excel files are converted before upload: to JSON when ``file_type`` is
    "Таблица", to CSV when it is "Изображение (метаданные)". Any other
    ``file_type`` uploads the file as-is into ``JSON_FILES_DIR``.

    Args:
        file: Upload object exposing a ``name`` attribute that is the path of
            the uploaded file on disk, or ``None`` when nothing was selected.
        file_type: UI label that selects the target directory and conversion.

    Returns:
        A human-readable status string (success or error); exceptions are
        logged and reported in the returned message rather than raised.
    """
    try:
        if file is None:
            return "❌ Файл не выбран"

        from huggingface_hub import HfApi
        import tempfile
        import shutil

        # Work on a private copy inside a temporary directory.
        with tempfile.TemporaryDirectory() as temp_dir:
            # Use only the base name: os.path.join drops temp_dir entirely
            # when its second argument is an absolute path, which made the
            # copy below a same-file copy (shutil.SameFileError).
            file_path = os.path.join(temp_dir, os.path.basename(file.name))
            shutil.copy(file.name, file_path)

            is_excel = file.name.endswith(('.xlsx', '.xls'))

            # Pick the target directory in the repo and convert if needed.
            if file_type == "Таблица":
                target_dir = TABLE_DATA_DIR
                if is_excel:
                    upload_file = convert_single_excel_to_json(file_path, temp_dir)
                else:
                    upload_file = file_path
            elif file_type == "Изображение (метаданные)":
                target_dir = IMAGE_DATA_DIR
                if is_excel:
                    upload_file = convert_single_excel_to_csv(file_path, temp_dir)
                else:
                    upload_file = file_path
            else:  # plain JSON document
                target_dir = JSON_FILES_DIR
                upload_file = file_path

            # Push the (possibly converted) file to the dataset repo.
            api = HfApi()
            api.upload_file(
                path_or_fileobj=upload_file,
                path_in_repo=f"{target_dir}/{os.path.basename(upload_file)}",
                repo_id=HF_REPO_ID,
                token=HF_TOKEN,
                repo_type="dataset"
            )

            log_message(f"Файл {file.name} успешно загружен в {target_dir}")
            return f"✅ Файл успешно загружен и обработан: {os.path.basename(upload_file)}\n⚠️ Перезапустите систему для применения изменений"

    except Exception as e:
        error_msg = f"Ошибка обработки файла: {str(e)}"
        log_message(error_msg)
        return f"❌ {error_msg}"
|
| 61 |
+
|
| 62 |
+
def convert_single_excel_to_json(excel_path, output_dir):
    """Convert one Excel workbook into the JSON layout used for table data.

    Every sheet that is non-empty and has a "Номер таблицы" column is split
    into one entry per distinct table number. The descriptive columns
    (document id, section, table number, title, note) are taken from the
    first row of each group; all remaining columns become the table's
    ``headers`` and per-row ``data`` (values stringified).

    Args:
        excel_path: Path of the workbook to read.
        output_dir: Directory in which the JSON file is written.

    Returns:
        Path of the written JSON file: the workbook's base name with its
        extension replaced by ``.json``.
    """
    meta_columns = ("Обозначение документа", "Раздел документа", "Номер таблицы",
                    "Название таблицы", "Примечание")

    df_dict = pd.read_excel(excel_path, sheet_name=None)

    result = {
        "document": os.path.basename(excel_path),
        "total_sheets": len(df_dict),
        "sheets": []
    }

    for sheet_name, df in df_dict.items():
        if df.empty or "Номер таблицы" not in df.columns:
            continue

        df = df.dropna(how='all').fillna("")
        # The data columns are the same for every table of the sheet, so the
        # list is computed once per sheet rather than once per group.
        headers = [col for col in df.columns if col not in meta_columns]

        for table_number, group in df.groupby("Номер таблицы"):
            first_row = group.reset_index(drop=True).iloc[0]

            sheet_data = {
                "sheet_name": sheet_name,
                "document_id": str(first_row.get("Обозначение документа", "")),
                "section": str(first_row.get("Раздел документа", "")),
                "table_number": str(table_number),
                "table_title": str(first_row.get("Название таблицы", "")),
                "table_description": str(first_row.get("Примечание", "")),
                "headers": list(headers),
                "data": [
                    {col: str(row[col]) if pd.notna(row[col]) else "" for col in headers}
                    for _, row in group.iterrows()
                ],
            }
            result["sheets"].append(sheet_data)

    # splitext swaps only the real extension; chained str.replace also
    # rewrote ".xls" occurring in the middle of the name and ignored
    # upper-case extensions.
    stem, _ = os.path.splitext(os.path.basename(excel_path))
    json_path = os.path.join(output_dir, stem + ".json")

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    return json_path
|
| 109 |
+
|
| 110 |
+
def convert_single_excel_to_csv(excel_path, output_dir):
    """Convert one Excel file into a UTF-8 CSV file (used for image metadata).

    Only the sheet that ``pandas.read_excel`` reads by default is exported;
    the DataFrame index is not written.

    Args:
        excel_path: Path of the workbook to read.
        output_dir: Directory in which the CSV file is written.

    Returns:
        Path of the written CSV file: the workbook's base name with its
        extension replaced by ``.csv``.
    """
    df = pd.read_excel(excel_path)

    # splitext swaps only the real extension; chained str.replace also
    # rewrote ".xls" occurring in the middle of the name and ignored
    # upper-case extensions.
    stem, _ = os.path.splitext(os.path.basename(excel_path))
    csv_path = os.path.join(output_dir, stem + ".csv")

    df.to_csv(csv_path, index=False, encoding='utf-8')
    return csv_path
|
documents_prep.py
CHANGED
|
@@ -6,21 +6,83 @@ from llama_index.core import Document
|
|
| 6 |
from llama_index.core.text_splitter import SentenceSplitter
|
| 7 |
from my_logging import log_message
|
| 8 |
from config import CHUNK_SIZE, CHUNK_OVERLAP, MAX_CHARS_TABLE, MAX_ROWS_TABLE
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
def chunk_text_documents(documents):
|
| 11 |
text_splitter = SentenceSplitter(
|
| 12 |
chunk_size=CHUNK_SIZE,
|
| 13 |
chunk_overlap=CHUNK_OVERLAP
|
| 14 |
)
|
|
|
|
|
|
|
| 15 |
|
| 16 |
chunked = []
|
| 17 |
for doc in documents:
|
| 18 |
chunks = text_splitter.get_nodes_from_documents([doc])
|
| 19 |
for i, chunk in enumerate(chunks):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
chunk.metadata.update({
|
| 21 |
'chunk_id': i,
|
| 22 |
'total_chunks': len(chunks),
|
| 23 |
-
'chunk_size': len(chunk.text)
|
| 24 |
})
|
| 25 |
chunked.append(chunk)
|
| 26 |
|
|
@@ -31,23 +93,14 @@ def chunk_text_documents(documents):
|
|
| 31 |
max_size = max(len(c.text) for c in chunked)
|
| 32 |
log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
|
| 33 |
log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
def normalize_text(text):
|
| 38 |
-
if not text:
|
| 39 |
-
return text
|
| 40 |
-
|
| 41 |
-
# Replace Cyrillic 'C' with Latin 'С' (U+0421)
|
| 42 |
-
# This is for welding types like C-25 -> С-25
|
| 43 |
-
text = text.replace('С-', 'C')
|
| 44 |
-
|
| 45 |
-
# Also handle cases like "Type C" or variations
|
| 46 |
-
import re
|
| 47 |
-
# Match "C" followed by digit or space in context of welding types
|
| 48 |
-
text = re.sub(r'\bС(\d)', r'С\1', text)
|
| 49 |
|
| 50 |
-
return
|
| 51 |
|
| 52 |
def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
|
| 53 |
headers = table_data.get('headers', [])
|
|
@@ -55,49 +108,108 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 55 |
table_num = table_data.get('table_number', 'unknown')
|
| 56 |
table_title = table_data.get('table_title', '')
|
| 57 |
section = table_data.get('section', '')
|
|
|
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
table_num_clean = str(table_num).strip()
|
| 60 |
-
table_title_normalized = normalize_text(str(table_title)) # NORMALIZE TITLE
|
| 61 |
|
| 62 |
import re
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
if
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
else:
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
else:
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
if not rows:
|
| 74 |
return []
|
| 75 |
|
| 76 |
log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
|
| 77 |
|
| 78 |
-
#
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
base_size = len(base_content)
|
| 81 |
available_space = max_chars - base_size - 200
|
| 82 |
|
| 83 |
# If entire table fits, return as one chunk
|
| 84 |
-
full_rows_content = format_table_rows([{**row, '_idx': i+1}
|
| 85 |
-
|
|
|
|
|
|
|
| 86 |
content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
|
| 87 |
|
| 88 |
metadata = {
|
| 89 |
'type': 'table',
|
| 90 |
'document_id': doc_id,
|
| 91 |
-
'table_number': table_num_clean,
|
| 92 |
-
'table_identifier':
|
| 93 |
-
'table_title':
|
| 94 |
'section': section,
|
| 95 |
-
'
|
|
|
|
| 96 |
'chunk_size': len(content),
|
| 97 |
-
'is_complete_table': True
|
|
|
|
| 98 |
}
|
| 99 |
|
| 100 |
-
log_message(f" Single chunk: {len(content)} chars, {len(
|
| 101 |
return [Document(text=content, metadata=metadata)]
|
| 102 |
|
| 103 |
chunks = []
|
|
@@ -105,30 +217,33 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 105 |
current_size = 0
|
| 106 |
chunk_num = 0
|
| 107 |
|
| 108 |
-
for i, row in enumerate(
|
| 109 |
row_text = format_single_row(row, i + 1)
|
| 110 |
row_size = len(row_text)
|
| 111 |
|
| 112 |
-
should_split = (current_size + row_size > available_space or
|
|
|
|
| 113 |
|
| 114 |
if should_split:
|
| 115 |
content = base_content + format_table_rows(current_rows)
|
| 116 |
-
content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(
|
| 117 |
content += format_table_footer(table_identifier, doc_id)
|
| 118 |
|
| 119 |
metadata = {
|
| 120 |
'type': 'table',
|
| 121 |
'document_id': doc_id,
|
| 122 |
-
'table_number': table_num_clean,
|
| 123 |
-
'table_identifier':
|
| 124 |
-
'table_title':
|
| 125 |
'section': section,
|
|
|
|
| 126 |
'chunk_id': chunk_num,
|
| 127 |
'row_start': current_rows[0]['_idx'] - 1,
|
| 128 |
'row_end': current_rows[-1]['_idx'],
|
| 129 |
-
'total_rows': len(
|
| 130 |
'chunk_size': len(content),
|
| 131 |
-
'is_complete_table': False
|
|
|
|
| 132 |
}
|
| 133 |
|
| 134 |
chunks.append(Document(text=content, metadata=metadata))
|
|
@@ -138,31 +253,31 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 138 |
current_rows = []
|
| 139 |
current_size = 0
|
| 140 |
|
| 141 |
-
# Add row with index
|
| 142 |
row_copy = row.copy() if isinstance(row, dict) else {'data': row}
|
| 143 |
row_copy['_idx'] = i + 1
|
| 144 |
current_rows.append(row_copy)
|
| 145 |
current_size += row_size
|
| 146 |
-
|
| 147 |
-
# Add final chunk
|
| 148 |
if current_rows:
|
| 149 |
content = base_content + format_table_rows(current_rows)
|
| 150 |
-
content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(
|
| 151 |
content += format_table_footer(table_identifier, doc_id)
|
| 152 |
|
| 153 |
metadata = {
|
| 154 |
'type': 'table',
|
| 155 |
'document_id': doc_id,
|
| 156 |
-
'table_number': table_num_clean,
|
| 157 |
-
'table_identifier':
|
| 158 |
-
'table_title':
|
| 159 |
'section': section,
|
|
|
|
| 160 |
'chunk_id': chunk_num,
|
| 161 |
'row_start': current_rows[0]['_idx'] - 1,
|
| 162 |
'row_end': current_rows[-1]['_idx'],
|
| 163 |
-
'total_rows': len(
|
| 164 |
'chunk_size': len(content),
|
| 165 |
-
'is_complete_table': False
|
|
|
|
| 166 |
}
|
| 167 |
|
| 168 |
chunks.append(Document(text=content, metadata=metadata))
|
|
@@ -170,33 +285,36 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 170 |
|
| 171 |
return chunks
|
| 172 |
|
| 173 |
-
|
| 174 |
-
# MODIFIED: Update format_table_header function
|
| 175 |
-
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
|
| 176 |
content = f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"
|
| 177 |
|
| 178 |
-
# Add
|
| 179 |
-
if table_num:
|
| 180 |
-
content += f"
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
if table_title:
|
| 183 |
content += f"НАЗВАНИЕ: {normalize_text(table_title)}\n"
|
| 184 |
|
| 185 |
if section:
|
| 186 |
content += f"РАЗДЕЛ: {section}\n"
|
|
|
|
|
|
|
| 187 |
|
| 188 |
content += f"{'='*70}\n"
|
| 189 |
|
| 190 |
if headers:
|
| 191 |
-
|
|
|
|
|
|
|
| 192 |
content += f"ЗАГОЛОВКИ: {header_str}\n\n"
|
| 193 |
|
| 194 |
content += "ДАННЫЕ:\n"
|
| 195 |
return content
|
| 196 |
|
| 197 |
-
|
| 198 |
def format_single_row(row, idx):
|
| 199 |
-
"""Format a single row"""
|
| 200 |
if isinstance(row, dict):
|
| 201 |
parts = [f"{k}: {v}" for k, v in row.items()
|
| 202 |
if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
|
|
@@ -208,18 +326,14 @@ def format_single_row(row, idx):
|
|
| 208 |
return f"{idx}. {' | '.join(parts)}\n"
|
| 209 |
return ""
|
| 210 |
|
| 211 |
-
|
| 212 |
def format_table_rows(rows):
|
| 213 |
-
"""Format multiple rows"""
|
| 214 |
content = ""
|
| 215 |
for row in rows:
|
| 216 |
idx = row.get('_idx', 0)
|
| 217 |
content += format_single_row(row, idx)
|
| 218 |
return content
|
| 219 |
|
| 220 |
-
|
| 221 |
def format_table_footer(table_identifier, doc_id):
|
| 222 |
-
"""Format table footer"""
|
| 223 |
return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
|
| 224 |
|
| 225 |
def load_json_documents(repo_id, hf_token, json_dir):
|
|
@@ -290,7 +404,6 @@ def load_json_documents(repo_id, hf_token, json_dir):
|
|
| 290 |
stats['failed'] += 1
|
| 291 |
continue
|
| 292 |
|
| 293 |
-
# Try UTF-8 first (most common)
|
| 294 |
try:
|
| 295 |
text_content = file_content.decode('utf-8')
|
| 296 |
except UnicodeDecodeError:
|
|
@@ -298,7 +411,6 @@ def load_json_documents(repo_id, hf_token, json_dir):
|
|
| 298 |
text_content = file_content.decode('utf-8-sig')
|
| 299 |
except UnicodeDecodeError:
|
| 300 |
try:
|
| 301 |
-
# Try UTF-16 (the issue you're seeing)
|
| 302 |
text_content = file_content.decode('utf-16')
|
| 303 |
except UnicodeDecodeError:
|
| 304 |
try:
|
|
@@ -345,13 +457,11 @@ def load_json_documents(repo_id, hf_token, json_dir):
|
|
| 345 |
log_message(f" Success: {stats['success']}")
|
| 346 |
log_message(f" Empty: {stats['empty']}")
|
| 347 |
log_message(f" Failed: {stats['failed']}")
|
| 348 |
-
log_message(f" Total sections: {len(documents)}")
|
| 349 |
log_message(f"="*60)
|
| 350 |
|
| 351 |
return documents
|
| 352 |
|
| 353 |
def extract_sections_from_json(json_path):
|
| 354 |
-
"""Extract sections from a single JSON file"""
|
| 355 |
documents = []
|
| 356 |
|
| 357 |
try:
|
|
@@ -401,14 +511,15 @@ def extract_sections_from_json(json_path):
|
|
| 401 |
|
| 402 |
return documents
|
| 403 |
|
| 404 |
-
|
| 405 |
def load_table_documents(repo_id, hf_token, table_dir):
|
| 406 |
log_message("Loading tables...")
|
| 407 |
-
|
| 408 |
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
| 409 |
table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
|
| 410 |
|
| 411 |
all_chunks = []
|
|
|
|
|
|
|
| 412 |
for file_path in table_files:
|
| 413 |
try:
|
| 414 |
local_path = hf_hub_download(
|
|
@@ -425,20 +536,22 @@ def load_table_documents(repo_id, hf_token, table_dir):
|
|
| 425 |
|
| 426 |
for sheet in data.get('sheets', []):
|
| 427 |
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
|
|
|
| 428 |
|
| 429 |
-
|
| 430 |
-
|
|
|
|
| 431 |
all_chunks.extend(chunks)
|
| 432 |
|
| 433 |
except Exception as e:
|
| 434 |
log_message(f"Error loading {file_path}: {e}")
|
| 435 |
|
| 436 |
-
log_message(f"✓ Loaded {len(all_chunks)} table chunks")
|
|
|
|
|
|
|
| 437 |
return all_chunks
|
| 438 |
|
| 439 |
-
|
| 440 |
def load_image_documents(repo_id, hf_token, image_dir):
|
| 441 |
-
"""Load image descriptions"""
|
| 442 |
log_message("Loading images...")
|
| 443 |
|
| 444 |
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
|
@@ -484,7 +597,6 @@ def load_image_documents(repo_id, hf_token, image_dir):
|
|
| 484 |
|
| 485 |
return documents
|
| 486 |
|
| 487 |
-
|
| 488 |
def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
|
| 489 |
"""Main loader - combines all document types"""
|
| 490 |
log_message("="*60)
|
|
|
|
| 6 |
from llama_index.core.text_splitter import SentenceSplitter
|
| 7 |
from my_logging import log_message
|
| 8 |
from config import CHUNK_SIZE, CHUNK_OVERLAP, MAX_CHARS_TABLE, MAX_ROWS_TABLE
|
| 9 |
+
import re
|
| 10 |
+
|
| 11 |
+
def normalize_text(text):
    """Normalise the weld-type prefix letter ("C"/"С" homoglyphs) in *text*.

    Falsy input (``None``, ``""``, ``0``) is returned unchanged. Any other
    non-string value is converted with ``str()`` first, so numeric table
    numbers or header cells do not raise ``AttributeError``.

    Args:
        text: Value to normalise; normally a string.

    Returns:
        The normalised string, or the original object when it is falsy.
    """
    if not text:
        return text

    if not isinstance(text, str):
        # Callers may hand over values that are not strings (e.g. a numeric
        # table number); str.replace / re.sub below require a string.
        text = str(text)

    # Step 1: every "<letter>-" pair is replaced by the bare replacement
    # letter, i.e. the hyphen is NOT kept.
    # NOTE(review): the two letters below are homoglyphs (Latin U+0043 and
    # Cyrillic U+0421). The original comment described the goal as
    # "C-25 -> С-25" (hyphen preserved) and called U+0421 "Latin", which does
    # not match this code. Confirm the intended canonical form before
    # changing it - indexed text and queries must be normalised identically.
    text = text.replace('С-', 'C')
    # Step 2: the letter directly followed by a digit at a word boundary is
    # rewritten to the replacement letter plus the same digit.
    text = re.sub(r'\bС(\d)', r'С\1', text)
    return text
|
| 20 |
+
|
| 21 |
+
def normalize_steel_designations(text):
    """Rewrite Cyrillic look-alike capitals inside steel designations to Latin.

    A designation is 1-3 digits followed by one or more groups of
    "capital letter + optional digits" (e.g. 08Х18Н10Т, 12Х18Н9), optionally
    preceded by the welding-wire prefix "СВ-" / "CB-" (e.g. СВ-08Х19Н10).
    Only text matched by that pattern is touched; everything else is
    returned verbatim.

    A wire designation is matched as ONE unit (prefix + grade), so it is
    counted and reported once rather than once for the grade and once more
    for the prefix.

    Args:
        text: Text to normalise. Falsy values are returned unchanged; other
            non-string values are converted with ``str()``.

    Returns:
        tuple: ``(normalized_text, changes_count, changes_list)`` where
        ``changes_list`` holds one ``"original → converted"`` entry per
        designation that actually changed, in order of appearance, and
        ``changes_count == len(changes_list)``.
    """
    if not text:
        return text, 0, []

    import re

    if not isinstance(text, str):
        # re.sub needs a string; callers do not always pre-convert.
        text = str(text)

    # Cyrillic capitals that look identical to Latin ones:
    #   Х Н Т С В К М А Р  ->  X H T C B K M A P
    # Written as escapes so the two alphabets cannot be confused in review.
    homoglyphs = str.maketrans(
        "\u0425\u041d\u0422\u0421\u0412\u041a\u041c\u0410\u0420",
        "XHTCBKMAP",
    )

    # Letters allowed inside a grade: Latin A-Z, Cyrillic А-Я and Ё.
    grade = r'\d{1,3}(?:[A-Z\u0410-\u042f\u0401]\d*)+'
    # Optional wire prefix: (С|C)(В|B) followed by a hyphen.
    pattern = re.compile(r'\b(?:[\u0421C][\u0412B]-)?' + grade + r'\b')

    changes_list = []

    def replace_in_steel_grade(match):
        original = match.group(0)
        converted = original.translate(homoglyphs)
        if converted != original:
            changes_list.append(f"{original} → {converted}")
        return converted

    normalized_text = pattern.sub(replace_in_steel_grade, text)

    return normalized_text, len(changes_list), changes_list
|
| 62 |
|
| 63 |
def chunk_text_documents(documents):
|
| 64 |
text_splitter = SentenceSplitter(
|
| 65 |
chunk_size=CHUNK_SIZE,
|
| 66 |
chunk_overlap=CHUNK_OVERLAP
|
| 67 |
)
|
| 68 |
+
total_normalizations = 0
|
| 69 |
+
chunks_with_changes = 0
|
| 70 |
|
| 71 |
chunked = []
|
| 72 |
for doc in documents:
|
| 73 |
chunks = text_splitter.get_nodes_from_documents([doc])
|
| 74 |
for i, chunk in enumerate(chunks):
|
| 75 |
+
original_text = chunk.text
|
| 76 |
+
chunk.text, changes, change_list = normalize_steel_designations(chunk.text)
|
| 77 |
+
|
| 78 |
+
if changes > 0:
|
| 79 |
+
chunks_with_changes += 1
|
| 80 |
+
total_normalizations += changes
|
| 81 |
+
|
| 82 |
chunk.metadata.update({
|
| 83 |
'chunk_id': i,
|
| 84 |
'total_chunks': len(chunks),
|
| 85 |
+
'chunk_size': len(chunk.text)
|
| 86 |
})
|
| 87 |
chunked.append(chunk)
|
| 88 |
|
|
|
|
| 93 |
max_size = max(len(c.text) for c in chunked)
|
| 94 |
log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
|
| 95 |
log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
|
| 96 |
+
log_message(f" Steel designation normalization:")
|
| 97 |
+
log_message(f" - Chunks with changes: {chunks_with_changes}/{len(chunked)}")
|
| 98 |
+
log_message(f" - Total steel grades normalized: {total_normalizations}")
|
| 99 |
+
log_message(f" - Avg per affected chunk: {total_normalizations/chunks_with_changes:.1f}" if chunks_with_changes > 0 else " - No normalizations needed")
|
| 100 |
|
| 101 |
+
log_message("="*60)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
+
return chunked
|
| 104 |
|
| 105 |
def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
|
| 106 |
headers = table_data.get('headers', [])
|
|
|
|
| 108 |
table_num = table_data.get('table_number', 'unknown')
|
| 109 |
table_title = table_data.get('table_title', '')
|
| 110 |
section = table_data.get('section', '')
|
| 111 |
+
sheet_name = table_data.get('sheet_name', '')
|
| 112 |
|
| 113 |
+
# Apply steel designation normalization to title and section
|
| 114 |
+
table_title, title_changes, title_list = normalize_steel_designations(str(table_title))
|
| 115 |
+
section, section_changes, section_list = normalize_steel_designations(section)
|
| 116 |
+
|
| 117 |
table_num_clean = str(table_num).strip()
|
|
|
|
| 118 |
|
| 119 |
import re
|
| 120 |
+
|
| 121 |
+
if table_num_clean in ['-', '', 'unknown', 'nan']:
|
| 122 |
+
if 'приложени' in sheet_name.lower() or 'приложени' in section.lower():
|
| 123 |
+
appendix_match = re.search(r'приложени[еия]\s*[№]?\s*(\d+)',
|
| 124 |
+
(sheet_name + ' ' + section).lower())
|
| 125 |
+
if appendix_match:
|
| 126 |
+
appendix_num = appendix_match.group(1)
|
| 127 |
+
table_identifier = f"Приложение {appendix_num}"
|
| 128 |
+
else:
|
| 129 |
+
table_identifier = "Приложение"
|
| 130 |
else:
|
| 131 |
+
if table_title:
|
| 132 |
+
first_words = ' '.join(table_title.split()[:5])
|
| 133 |
+
table_identifier = f"{first_words}"
|
| 134 |
+
else:
|
| 135 |
+
table_identifier = section.split(',')[0] if section else "БезНомера"
|
| 136 |
else:
|
| 137 |
+
if 'приложени' in section.lower():
|
| 138 |
+
appendix_match = re.search(r'приложени[еия]\s*[№]?\s*(\d+)', section.lower())
|
| 139 |
+
if appendix_match:
|
| 140 |
+
appendix_num = appendix_match.group(1)
|
| 141 |
+
table_identifier = f"{table_num_clean} Приложение {appendix_num}"
|
| 142 |
+
else:
|
| 143 |
+
table_identifier = table_num_clean
|
| 144 |
+
else:
|
| 145 |
+
table_identifier = table_num_clean
|
| 146 |
|
| 147 |
if not rows:
|
| 148 |
return []
|
| 149 |
|
| 150 |
log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
|
| 151 |
|
| 152 |
+
# Normalize all row content (including steel designations)
|
| 153 |
+
normalized_rows = []
|
| 154 |
+
total_row_changes = 0
|
| 155 |
+
rows_with_changes = 0
|
| 156 |
+
all_row_changes = []
|
| 157 |
+
|
| 158 |
+
for row in rows:
|
| 159 |
+
if isinstance(row, dict):
|
| 160 |
+
normalized_row = {}
|
| 161 |
+
row_had_changes = False
|
| 162 |
+
for k, v in row.items():
|
| 163 |
+
normalized_val, changes, change_list = normalize_steel_designations(str(v))
|
| 164 |
+
normalized_row[k] = normalized_val
|
| 165 |
+
if changes > 0:
|
| 166 |
+
total_row_changes += changes
|
| 167 |
+
row_had_changes = True
|
| 168 |
+
all_row_changes.extend(change_list) # NEW
|
| 169 |
+
if row_had_changes:
|
| 170 |
+
rows_with_changes += 1
|
| 171 |
+
normalized_rows.append(normalized_row)
|
| 172 |
+
else:
|
| 173 |
+
normalized_rows.append(row)
|
| 174 |
+
|
| 175 |
+
# Log normalization stats with examples
|
| 176 |
+
if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
|
| 177 |
+
log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
|
| 178 |
+
f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
|
| 179 |
+
if title_list:
|
| 180 |
+
log_message(f" Title changes: {', '.join(title_list[:3])}")
|
| 181 |
+
if section_list:
|
| 182 |
+
log_message(f" Section changes: {', '.join(section_list[:3])}")
|
| 183 |
+
if all_row_changes:
|
| 184 |
+
log_message(f" Row examples: {', '.join(all_row_changes[:5])}")
|
| 185 |
+
base_content = format_table_header(doc_id, table_identifier, table_num,
|
| 186 |
+
table_title, section, headers,
|
| 187 |
+
sheet_name)
|
| 188 |
base_size = len(base_content)
|
| 189 |
available_space = max_chars - base_size - 200
|
| 190 |
|
| 191 |
# If entire table fits, return as one chunk
|
| 192 |
+
full_rows_content = format_table_rows([{**row, '_idx': i+1}
|
| 193 |
+
for i, row in enumerate(normalized_rows)])
|
| 194 |
+
|
| 195 |
+
if base_size + len(full_rows_content) <= max_chars and len(normalized_rows) <= max_rows:
|
| 196 |
content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
|
| 197 |
|
| 198 |
metadata = {
|
| 199 |
'type': 'table',
|
| 200 |
'document_id': doc_id,
|
| 201 |
+
'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
|
| 202 |
+
'table_identifier': table_identifier,
|
| 203 |
+
'table_title': table_title,
|
| 204 |
'section': section,
|
| 205 |
+
'sheet_name': sheet_name,
|
| 206 |
+
'total_rows': len(normalized_rows),
|
| 207 |
'chunk_size': len(content),
|
| 208 |
+
'is_complete_table': True,
|
| 209 |
+
'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
|
| 210 |
}
|
| 211 |
|
| 212 |
+
log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
|
| 213 |
return [Document(text=content, metadata=metadata)]
|
| 214 |
|
| 215 |
chunks = []
|
|
|
|
| 217 |
current_size = 0
|
| 218 |
chunk_num = 0
|
| 219 |
|
| 220 |
+
for i, row in enumerate(normalized_rows):
|
| 221 |
row_text = format_single_row(row, i + 1)
|
| 222 |
row_size = len(row_text)
|
| 223 |
|
| 224 |
+
should_split = (current_size + row_size > available_space or
|
| 225 |
+
len(current_rows) >= max_rows) and current_rows
|
| 226 |
|
| 227 |
if should_split:
|
| 228 |
content = base_content + format_table_rows(current_rows)
|
| 229 |
+
content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(normalized_rows)}\n"
|
| 230 |
content += format_table_footer(table_identifier, doc_id)
|
| 231 |
|
| 232 |
metadata = {
|
| 233 |
'type': 'table',
|
| 234 |
'document_id': doc_id,
|
| 235 |
+
'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
|
| 236 |
+
'table_identifier': table_identifier,
|
| 237 |
+
'table_title': table_title,
|
| 238 |
'section': section,
|
| 239 |
+
'sheet_name': sheet_name,
|
| 240 |
'chunk_id': chunk_num,
|
| 241 |
'row_start': current_rows[0]['_idx'] - 1,
|
| 242 |
'row_end': current_rows[-1]['_idx'],
|
| 243 |
+
'total_rows': len(normalized_rows),
|
| 244 |
'chunk_size': len(content),
|
| 245 |
+
'is_complete_table': False,
|
| 246 |
+
'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
|
| 247 |
}
|
| 248 |
|
| 249 |
chunks.append(Document(text=content, metadata=metadata))
|
|
|
|
| 253 |
current_rows = []
|
| 254 |
current_size = 0
|
| 255 |
|
|
|
|
| 256 |
row_copy = row.copy() if isinstance(row, dict) else {'data': row}
|
| 257 |
row_copy['_idx'] = i + 1
|
| 258 |
current_rows.append(row_copy)
|
| 259 |
current_size += row_size
|
| 260 |
+
|
|
|
|
| 261 |
if current_rows:
|
| 262 |
content = base_content + format_table_rows(current_rows)
|
| 263 |
+
content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(normalized_rows)}\n"
|
| 264 |
content += format_table_footer(table_identifier, doc_id)
|
| 265 |
|
| 266 |
metadata = {
|
| 267 |
'type': 'table',
|
| 268 |
'document_id': doc_id,
|
| 269 |
+
'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
|
| 270 |
+
'table_identifier': table_identifier,
|
| 271 |
+
'table_title': table_title,
|
| 272 |
'section': section,
|
| 273 |
+
'sheet_name': sheet_name,
|
| 274 |
'chunk_id': chunk_num,
|
| 275 |
'row_start': current_rows[0]['_idx'] - 1,
|
| 276 |
'row_end': current_rows[-1]['_idx'],
|
| 277 |
+
'total_rows': len(normalized_rows),
|
| 278 |
'chunk_size': len(content),
|
| 279 |
+
'is_complete_table': False,
|
| 280 |
+
'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
|
| 281 |
}
|
| 282 |
|
| 283 |
chunks.append(Document(text=content, metadata=metadata))
|
|
|
|
| 285 |
|
| 286 |
return chunks
|
| 287 |
|
| 288 |
+
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers, sheet_name=''):
    """Build the text preamble that precedes a table's rows in a chunk.

    The result always starts with the table/document line, always contains
    the fixed keyword line and the 70-character ``=`` rule, and always ends
    with the ``ДАННЫЕ:`` marker. The number, sheet, title, section and
    column-header lines are emitted only when the matching argument is
    non-empty.

    Args:
        doc_id: Document identifier, interpolated as-is.
        table_identifier: Display identifier of the table; passed through
            ``normalize_text``.
        table_num: Raw table number. May be a non-string (e.g. an int); it is
            converted to a stripped string before use. Empty values and the
            placeholders ``-``, ``unknown`` and ``nan`` produce no line.
        table_title: Table title; passed through ``normalize_text``.
        section: Section label, interpolated as-is.
        headers: Iterable of column headers; each is converted with ``str()``
            and passed through ``normalize_text``.
        sheet_name: Optional sheet name.

    Returns:
        str: The header text.
    """
    parts = [f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"]

    # Add multiple searchable identifiers.
    # Coerce first: normalize_text works on strings, and the placeholder
    # comparison should see the same stripped value the caller uses when it
    # decides whether a table number is missing.
    number = str(table_num).strip() if table_num else ''
    if number and number not in ('-', 'unknown', 'nan'):
        parts.append(f"НОМЕР ТАБЛИЦЫ: {normalize_text(number)}\n")

    if sheet_name:
        parts.append(f"ЛИСТ: {sheet_name}\n")

    if table_title:
        parts.append(f"НАЗВАНИЕ: {normalize_text(str(table_title))}\n")

    if section:
        parts.append(f"РАЗДЕЛ: {section}\n")

    parts.append(f"КЛЮЧЕВЫЕ СЛОВА: материалы стали марки стандарты {doc_id}\n")

    parts.append(f"{'='*70}\n")

    if headers:
        # Column headers get the same normalisation as the identifiers above.
        header_str = ' | '.join(normalize_text(str(h)) for h in headers)
        parts.append(f"ЗАГОЛОВКИ: {header_str}\n\n")

    parts.append("ДАННЫЕ:\n")
    return ''.join(parts)
|
| 316 |
|
|
|
|
| 317 |
def format_single_row(row, idx):
|
|
|
|
| 318 |
if isinstance(row, dict):
|
| 319 |
parts = [f"{k}: {v}" for k, v in row.items()
|
| 320 |
if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
|
|
|
|
| 326 |
return f"{idx}. {' | '.join(parts)}\n"
|
| 327 |
return ""
|
| 328 |
|
|
|
|
| 329 |
def format_table_rows(rows):
    """Render each row with ``format_single_row`` and concatenate the results.

    The index passed for a row is its ``'_idx'`` entry, or 0 when the row has
    no such key. An empty *rows* yields an empty string.
    """
    return "".join(
        format_single_row(entry, entry.get('_idx', 0))
        for entry in rows
    )
|
| 335 |
|
|
|
|
| 336 |
def format_table_footer(table_identifier, doc_id):
    """Return the closing banner for a table chunk.

    The banner is a blank line, a rule of 70 ``=`` characters, and a line
    naming the table identifier and the document id, ending with a newline.
    """
    rule = '=' * 70
    closing = f"КОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}"
    return f"\n{rule}\n{closing}\n"
|
| 338 |
|
| 339 |
def load_json_documents(repo_id, hf_token, json_dir):
|
|
|
|
| 404 |
stats['failed'] += 1
|
| 405 |
continue
|
| 406 |
|
|
|
|
| 407 |
try:
|
| 408 |
text_content = file_content.decode('utf-8')
|
| 409 |
except UnicodeDecodeError:
|
|
|
|
| 411 |
text_content = file_content.decode('utf-8-sig')
|
| 412 |
except UnicodeDecodeError:
|
| 413 |
try:
|
|
|
|
| 414 |
text_content = file_content.decode('utf-16')
|
| 415 |
except UnicodeDecodeError:
|
| 416 |
try:
|
|
|
|
| 457 |
log_message(f" Success: {stats['success']}")
|
| 458 |
log_message(f" Empty: {stats['empty']}")
|
| 459 |
log_message(f" Failed: {stats['failed']}")
|
|
|
|
| 460 |
log_message(f"="*60)
|
| 461 |
|
| 462 |
return documents
|
| 463 |
|
| 464 |
def extract_sections_from_json(json_path):
|
|
|
|
| 465 |
documents = []
|
| 466 |
|
| 467 |
try:
|
|
|
|
| 511 |
|
| 512 |
return documents
|
| 513 |
|
|
|
|
| 514 |
def load_table_documents(repo_id, hf_token, table_dir):
|
| 515 |
log_message("Loading tables...")
|
| 516 |
+
log_message("="*60)
|
| 517 |
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
| 518 |
table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
|
| 519 |
|
| 520 |
all_chunks = []
|
| 521 |
+
tables_processed = 0
|
| 522 |
+
|
| 523 |
for file_path in table_files:
|
| 524 |
try:
|
| 525 |
local_path = hf_hub_download(
|
|
|
|
| 536 |
|
| 537 |
for sheet in data.get('sheets', []):
|
| 538 |
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 539 |
+
tables_processed += 1
|
| 540 |
|
| 541 |
+
chunks = chunk_table_by_content(sheet, sheet_doc_id,
|
| 542 |
+
max_chars=MAX_CHARS_TABLE,
|
| 543 |
+
max_rows=MAX_ROWS_TABLE)
|
| 544 |
all_chunks.extend(chunks)
|
| 545 |
|
| 546 |
except Exception as e:
|
| 547 |
log_message(f"Error loading {file_path}: {e}")
|
| 548 |
|
| 549 |
+
log_message(f"✓ Loaded {len(all_chunks)} table chunks from {tables_processed} tables")
|
| 550 |
+
log_message("="*60)
|
| 551 |
+
|
| 552 |
return all_chunks
|
| 553 |
|
|
|
|
| 554 |
def load_image_documents(repo_id, hf_token, image_dir):
|
|
|
|
| 555 |
log_message("Loading images...")
|
| 556 |
|
| 557 |
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
|
|
|
| 597 |
|
| 598 |
return documents
|
| 599 |
|
|
|
|
| 600 |
def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
|
| 601 |
"""Main loader - combines all document types"""
|
| 602 |
log_message("="*60)
|
index_retriever.py
CHANGED
|
@@ -10,7 +10,6 @@ from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
|
|
| 10 |
|
| 11 |
def create_vector_index(documents):
|
| 12 |
log_message("Строю векторный индекс")
|
| 13 |
-
|
| 14 |
connection_type_sources = {}
|
| 15 |
table_count = 0
|
| 16 |
|
|
@@ -22,21 +21,9 @@ def create_vector_index(documents):
|
|
| 22 |
table_id = f"{doc.metadata.get('document_id', 'unknown')} Table {doc.metadata.get('table_number', 'N/A')}"
|
| 23 |
if conn_type not in connection_type_sources:
|
| 24 |
connection_type_sources[conn_type] = []
|
| 25 |
-
connection_type_sources[conn_type].append(table_id)
|
| 26 |
-
|
| 27 |
-
log_message("="*60)
|
| 28 |
-
log_message(f"INDEXING {table_count} TABLE CHUNKS")
|
| 29 |
-
log_message("CONNECTION TYPES IN INDEX WITH SOURCES:")
|
| 30 |
-
for conn_type in sorted(connection_type_sources.keys()):
|
| 31 |
-
sources = list(set(connection_type_sources[conn_type])) # Unique sources
|
| 32 |
-
log_message(f" {conn_type}: {len(connection_type_sources[conn_type])} chunks from {len(sources)} tables")
|
| 33 |
-
for src in sources:
|
| 34 |
-
log_message(f" - {src}")
|
| 35 |
-
log_message("="*60)
|
| 36 |
-
|
| 37 |
return VectorStoreIndex.from_documents(documents)
|
| 38 |
|
| 39 |
-
|
| 40 |
def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
|
| 41 |
if not nodes or not reranker:
|
| 42 |
return nodes[:top_k]
|
|
@@ -48,13 +35,10 @@ def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
|
|
| 48 |
scores = reranker.predict(pairs)
|
| 49 |
scored_nodes = list(zip(nodes, scores))
|
| 50 |
|
| 51 |
-
scored_nodes.sort(key=lambda x: x[1], reverse=True)
|
| 52 |
-
|
| 53 |
-
# Apply threshold
|
| 54 |
filtered = [(node, score) for node, score in scored_nodes if score >= min_score_threshold]
|
| 55 |
|
| 56 |
if not filtered:
|
| 57 |
-
# Lower threshold if nothing passes
|
| 58 |
filtered = scored_nodes[:top_k]
|
| 59 |
|
| 60 |
log_message(f"Выбрано {min(len(filtered), top_k)} узлов")
|
|
@@ -65,24 +49,25 @@ def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
|
|
| 65 |
log_message(f"Ошибка переранжировки: {str(e)}")
|
| 66 |
return nodes[:top_k]
|
| 67 |
|
| 68 |
-
def create_query_engine(vector_index
|
|
|
|
| 69 |
try:
|
| 70 |
from config import CUSTOM_PROMPT
|
| 71 |
|
| 72 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 73 |
docstore=vector_index.docstore,
|
| 74 |
-
similarity_top_k=
|
| 75 |
)
|
| 76 |
|
| 77 |
vector_retriever = VectorIndexRetriever(
|
| 78 |
index=vector_index,
|
| 79 |
-
similarity_top_k=
|
| 80 |
-
similarity_cutoff=
|
| 81 |
)
|
| 82 |
|
| 83 |
hybrid_retriever = QueryFusionRetriever(
|
| 84 |
[vector_retriever, bm25_retriever],
|
| 85 |
-
similarity_top_k=
|
| 86 |
num_queries=1
|
| 87 |
)
|
| 88 |
|
|
@@ -97,7 +82,9 @@ def create_query_engine(vector_index):
|
|
| 97 |
response_synthesizer=response_synthesizer
|
| 98 |
)
|
| 99 |
|
| 100 |
-
log_message("Query engine
|
|
|
|
|
|
|
| 101 |
return query_engine
|
| 102 |
|
| 103 |
except Exception as e:
|
|
|
|
| 10 |
|
| 11 |
def create_vector_index(documents):
|
| 12 |
log_message("Строю векторный индекс")
|
|
|
|
| 13 |
connection_type_sources = {}
|
| 14 |
table_count = 0
|
| 15 |
|
|
|
|
| 21 |
table_id = f"{doc.metadata.get('document_id', 'unknown')} Table {doc.metadata.get('table_number', 'N/A')}"
|
| 22 |
if conn_type not in connection_type_sources:
|
| 23 |
connection_type_sources[conn_type] = []
|
| 24 |
+
connection_type_sources[conn_type].append(table_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
return VectorStoreIndex.from_documents(documents)
|
| 26 |
|
|
|
|
| 27 |
def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
|
| 28 |
if not nodes or not reranker:
|
| 29 |
return nodes[:top_k]
|
|
|
|
| 35 |
scores = reranker.predict(pairs)
|
| 36 |
scored_nodes = list(zip(nodes, scores))
|
| 37 |
|
| 38 |
+
scored_nodes.sort(key=lambda x: x[1], reverse=True)
|
|
|
|
|
|
|
| 39 |
filtered = [(node, score) for node, score in scored_nodes if score >= min_score_threshold]
|
| 40 |
|
| 41 |
if not filtered:
|
|
|
|
| 42 |
filtered = scored_nodes[:top_k]
|
| 43 |
|
| 44 |
log_message(f"Выбрано {min(len(filtered), top_k)} узлов")
|
|
|
|
| 49 |
log_message(f"Ошибка переранжировки: {str(e)}")
|
| 50 |
return nodes[:top_k]
|
| 51 |
|
| 52 |
+
def create_query_engine(vector_index, vector_top_k=50, bm25_top_k=50,
|
| 53 |
+
similarity_cutoff=0.55, hybrid_top_k=100):
|
| 54 |
try:
|
| 55 |
from config import CUSTOM_PROMPT
|
| 56 |
|
| 57 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 58 |
docstore=vector_index.docstore,
|
| 59 |
+
similarity_top_k=bm25_top_k
|
| 60 |
)
|
| 61 |
|
| 62 |
vector_retriever = VectorIndexRetriever(
|
| 63 |
index=vector_index,
|
| 64 |
+
similarity_top_k=vector_top_k,
|
| 65 |
+
similarity_cutoff=similarity_cutoff
|
| 66 |
)
|
| 67 |
|
| 68 |
hybrid_retriever = QueryFusionRetriever(
|
| 69 |
[vector_retriever, bm25_retriever],
|
| 70 |
+
similarity_top_k=hybrid_top_k,
|
| 71 |
num_queries=1
|
| 72 |
)
|
| 73 |
|
|
|
|
| 82 |
response_synthesizer=response_synthesizer
|
| 83 |
)
|
| 84 |
|
| 85 |
+
log_message(f"Query engine created: vector_top_k={vector_top_k}, "
|
| 86 |
+
f"bm25_top_k={bm25_top_k}, similarity_cutoff={similarity_cutoff}, "
|
| 87 |
+
f"hybrid_top_k={hybrid_top_k}")
|
| 88 |
return query_engine
|
| 89 |
|
| 90 |
except Exception as e:
|
table_prep.py
DELETED
|
@@ -1,229 +0,0 @@
|
|
| 1 |
-
from collections import defaultdict
|
| 2 |
-
import json
|
| 3 |
-
from huggingface_hub import hf_hub_download, list_repo_files
|
| 4 |
-
from llama_index.core import Document
|
| 5 |
-
from my_logging import log_message
|
| 6 |
-
from config import MAX_CHARS_TABLE, MAX_ROWS_TABLE
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
def create_table_content(table_data):
    """Render a table dictionary as plain text.

    The output starts with four lines (table number, title, document,
    section). Missing keys fall back to ``'Неизвестно'``; the document id is
    taken from ``'document_id'``, then ``'document'``. A header line is added
    when ``'headers'`` is non-empty, and one line per dict row is added when
    ``'data'`` is a list. Rows that are not dicts are skipped but still
    consume a row number.

    Args:
        table_data: Mapping with optional keys ``document_id``/``document``,
            ``table_number``, ``table_title``, ``section``, ``headers`` and
            ``data``.

    Returns:
        str: The rendered table text.
    """
    doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')

    parts = [
        f"Таблица: {table_num}\n",
        f"Название: {table_title}\n",
        f"Документ: {doc_id}\n",
        f"Раздел: {section}\n",
    ]

    headers = table_data.get('headers', [])
    if headers:
        # Headers are not guaranteed to be strings (numeric column names,
        # None); str.join would raise TypeError on them.
        parts.append(f"\nЗаголовки: {' | '.join(str(h) for h in headers)}\n")

    def has_value(value):
        # Empty cells (None, '', empty containers) are omitted, but a numeric
        # zero is real table data and must be kept.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return True
        return bool(value)

    rows = table_data.get('data')
    if isinstance(rows, list):
        parts.append("\nДанные таблицы:\n")
        for row_idx, row in enumerate(rows, start=1):
            if isinstance(row, dict):
                row_text = " | ".join(f"{k}: {v}" for k, v in row.items() if has_value(v))
                parts.append(f"Строка {row_idx}: {row_text}\n")

    return ''.join(parts)
|
| 32 |
-
|
| 33 |
-
def chunk_table_document(doc, max_chunk_size=MAX_CHARS_TABLE, max_rows_per_chunk=MAX_ROWS_TABLE):
    """Split a rendered table document into several smaller table documents.

    The text of ``doc`` is scanned line by line. Everything up to and
    including the ``Данные таблицы:`` marker is treated as a header that is
    repeated at the top of every chunk. After the marker, only lines starting
    with ``Строка`` are kept as data rows.

    Args:
        doc: Object exposing ``text`` and a ``metadata`` mapping.
        max_chunk_size: Character budget per chunk, header included.
        max_rows_per_chunk: Maximum number of data rows per chunk.

    Returns:
        list: ``[doc]`` unchanged when no data rows are found, otherwise one
        ``Document`` per chunk with ``chunk_id`` / ``total_chunks`` metadata.
    """
    preamble = []
    table_rows = []
    marker_seen = False

    for text_line in doc.text.strip().split('\n'):
        if text_line.startswith('Данные таблицы:'):
            marker_seen = True
            preamble.append(text_line)
        elif not marker_seen:
            preamble.append(text_line)
        elif text_line.startswith('Строка'):
            table_rows.append(text_line)

    header = '\n'.join(preamble) + '\n'

    if not table_rows:
        return [doc]

    header_size = len(header)
    pieces = []
    pending = []
    running_size = header_size

    for table_row in table_rows:
        cost = len(table_row) + 1  # +1 accounts for the joining newline
        limit_reached = (
            running_size + cost > max_chunk_size
            or len(pending) >= max_rows_per_chunk
        )
        # Flush only when something is pending, so an oversized single row
        # still ends up in a chunk of its own instead of an empty one.
        if pending and limit_reached:
            piece = header + '\n'.join(pending)
            pieces.append(piece)
            log_message(f"Создана часть таблицы размером {len(piece)} символов с {len(pending)} строками")
            pending = []
            running_size = header_size

        pending.append(table_row)
        running_size += cost
        log_message(f"Добавлена строка к текущему чанку, текущий размер {running_size} символов")

    # Whatever is still pending becomes the last chunk.
    if pending:
        piece = header + '\n'.join(pending)
        pieces.append(piece)
        log_message(f"Создана финальная часть таблицы размером {len(piece)} символов с {len(pending)} строками")

    total = len(pieces)
    return [
        Document(
            text=piece,
            metadata={
                "type": "table",
                "table_number": doc.metadata.get('table_number'),
                "document_id": doc.metadata.get('document_id'),
                "section": doc.metadata.get('section'),
                "chunk_id": position,
                "total_chunks": total,
                "is_chunked": True
            }
        )
        for position, piece in enumerate(pieces)
    ]
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
# def table_to_document(table_data, document_id=None):
|
| 99 |
-
# if not isinstance(table_data, dict):
|
| 100 |
-
# return []
|
| 101 |
-
|
| 102 |
-
# doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
|
| 103 |
-
# table_num = table_data.get('table_number', 'Неизвестно')
|
| 104 |
-
# table_title = table_data.get('table_title', 'Неизвестно')
|
| 105 |
-
# section = table_data.get('section', 'Неизвестно')
|
| 106 |
-
# table_rows = table_data.get('data', [])
|
| 107 |
-
|
| 108 |
-
# if not table_rows:
|
| 109 |
-
# return []
|
| 110 |
-
|
| 111 |
-
# # Build table content
|
| 112 |
-
# content = f"Таблица: {table_num}\n"
|
| 113 |
-
# content += f"Название: {table_title}\n"
|
| 114 |
-
# content += f"Документ: {doc_id}\n"
|
| 115 |
-
# content += f"Раздел: {section}\n"
|
| 116 |
-
|
| 117 |
-
# headers = table_data.get('headers', [])
|
| 118 |
-
# if headers:
|
| 119 |
-
# content += f"\nЗаголовки: {' | '.join(headers)}\n"
|
| 120 |
-
|
| 121 |
-
# content += "\nДанные таблицы:\n"
|
| 122 |
-
# for row_idx, row in enumerate(table_rows, start=1):
|
| 123 |
-
# if isinstance(row, dict):
|
| 124 |
-
# row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
|
| 125 |
-
# content += f"Строка {row_idx}: {row_text}\n"
|
| 126 |
-
|
| 127 |
-
# # Create base document
|
| 128 |
-
# base_doc = Document(
|
| 129 |
-
# text=content,
|
| 130 |
-
# metadata={
|
| 131 |
-
# "type": "table",
|
| 132 |
-
# "table_number": table_num,
|
| 133 |
-
# "document_id": doc_id,
|
| 134 |
-
# "section": section
|
| 135 |
-
# }
|
| 136 |
-
# )
|
| 137 |
-
# if len(content) > 4000:
|
| 138 |
-
# chunks = chunk_table_document(base_doc)
|
| 139 |
-
# log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
|
| 140 |
-
# return chunk_table_document(base_doc)
|
| 141 |
-
# return [base_doc]
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
# def load_table_data(repo_id, hf_token, table_data_dir):
|
| 145 |
-
# try:
|
| 146 |
-
# files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
| 147 |
-
# table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
|
| 148 |
-
|
| 149 |
-
# log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
|
| 150 |
-
|
| 151 |
-
# table_documents = []
|
| 152 |
-
# stats = {
|
| 153 |
-
# 'total_tables': 0,
|
| 154 |
-
# 'total_size': 0,
|
| 155 |
-
# 'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
|
| 156 |
-
# }
|
| 157 |
-
|
| 158 |
-
# for file_path in table_files:
|
| 159 |
-
# try:
|
| 160 |
-
# local_path = hf_hub_download(
|
| 161 |
-
# repo_id=repo_id,
|
| 162 |
-
# filename=file_path,
|
| 163 |
-
# local_dir='',
|
| 164 |
-
# repo_type="dataset",
|
| 165 |
-
# token=hf_token
|
| 166 |
-
# )
|
| 167 |
-
|
| 168 |
-
# log_message(f"\nОбработка файла: {file_path}")
|
| 169 |
-
|
| 170 |
-
# with open(local_path, 'r', encoding='utf-8') as f:
|
| 171 |
-
# table_data = json.load(f)
|
| 172 |
-
|
| 173 |
-
# if isinstance(table_data, dict):
|
| 174 |
-
# document_id = table_data.get('document', 'unknown')
|
| 175 |
-
|
| 176 |
-
# if 'sheets' in table_data:
|
| 177 |
-
# sorted_sheets = sorted(
|
| 178 |
-
# table_data['sheets'],
|
| 179 |
-
# key=lambda sheet: sheet.get('table_number', '') # or use 'table_number'
|
| 180 |
-
# )
|
| 181 |
-
|
| 182 |
-
# for sheet in sorted_sheets:
|
| 183 |
-
# sheet['document'] = document_id
|
| 184 |
-
# docs_list = table_to_document(sheet, document_id)
|
| 185 |
-
# table_documents.extend(docs_list)
|
| 186 |
-
|
| 187 |
-
# for doc in docs_list:
|
| 188 |
-
# stats['total_tables'] += 1
|
| 189 |
-
# size = doc.metadata.get('content_size', 0)
|
| 190 |
-
# stats['total_size'] += size
|
| 191 |
-
# stats['by_document'][document_id]['count'] += 1
|
| 192 |
-
# stats['by_document'][document_id]['size'] += size
|
| 193 |
-
# log_message(f"Добавлена таблица {sheet.get('table_number', 'Неизвестно')} из документа {document_id}, размер {size} символов")
|
| 194 |
-
# else:
|
| 195 |
-
# docs_list = table_to_document(table_data, document_id)
|
| 196 |
-
# table_documents.extend(docs_list)
|
| 197 |
-
|
| 198 |
-
# for doc in docs_list:
|
| 199 |
-
# stats['total_tables'] += 1
|
| 200 |
-
# size = doc.metadata.get('content_size', 0)
|
| 201 |
-
# stats['total_size'] += size
|
| 202 |
-
# stats['by_document'][document_id]['count'] += 1
|
| 203 |
-
# stats['by_document'][document_id]['size'] += size
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
# except Exception as e:
|
| 207 |
-
# log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
|
| 208 |
-
# continue
|
| 209 |
-
|
| 210 |
-
# # Log summary statistics
|
| 211 |
-
# log_message("\n" + "=" * 60)
|
| 212 |
-
# log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
|
| 213 |
-
# log_message("=" * 60)
|
| 214 |
-
# log_message(f"Всего таблиц добавлено: {stats['total_tables']}")
|
| 215 |
-
# log_message(f"Общий размер: {stats['total_size']:,} символов")
|
| 216 |
-
# log_message(f"Средний размер таблицы: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} символов")
|
| 217 |
-
|
| 218 |
-
# log_message("\nПо документам:")
|
| 219 |
-
# for doc_id, doc_stats in sorted(stats['by_document'].items()):
|
| 220 |
-
# log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
|
| 221 |
-
# f"{doc_stats['size']:,} символов")
|
| 222 |
-
|
| 223 |
-
# log_message("=" * 60)
|
| 224 |
-
|
| 225 |
-
# return table_documents
|
| 226 |
-
|
| 227 |
-
# except Exception as e:
|
| 228 |
-
# log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
|
| 229 |
-
# return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils.py
CHANGED
|
@@ -9,6 +9,18 @@ import time
|
|
| 9 |
from index_retriever import rerank_nodes
|
| 10 |
from my_logging import log_message
|
| 11 |
from config import PROMPT_SIMPLE_POISK
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
def get_llm_model(model_name):
|
| 14 |
try:
|
|
@@ -172,65 +184,83 @@ def deduplicate_nodes(nodes):
|
|
| 172 |
|
| 173 |
return unique_nodes
|
| 174 |
|
| 175 |
-
def
|
| 176 |
-
|
| 177 |
-
all_nodes = list(vector_index.docstore.docs.values())
|
| 178 |
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
if node.metadata.get('type') == 'table':
|
| 182 |
-
text = node.get_content()
|
| 183 |
-
if search_term in text or search_term in node.metadata.get('table_title', ''):
|
| 184 |
-
matching.append({
|
| 185 |
-
'doc_id': node.metadata.get('document_id'),
|
| 186 |
-
'table_num': node.metadata.get('table_number'),
|
| 187 |
-
'title': node.metadata.get('table_title', '')[:100]
|
| 188 |
-
})
|
| 189 |
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
-
from documents_prep import normalize_text
|
| 199 |
|
| 200 |
-
|
| 201 |
-
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
|
| 202 |
-
# NORMALIZE the question to convert C to С
|
| 203 |
normalized_question = normalize_text(question)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
if query_engine is None:
|
| 206 |
return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
|
| 207 |
|
| 208 |
try:
|
| 209 |
start_time = time.time()
|
| 210 |
-
|
| 211 |
-
retrieved_nodes = query_engine.retriever.retrieve(normalized_question)
|
| 212 |
log_message(f"user query: {question}")
|
| 213 |
-
log_message(f"
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
|
| 217 |
-
|
| 218 |
unique_retrieved = deduplicate_nodes(retrieved_nodes)
|
| 219 |
-
|
| 220 |
-
# DEBUG: Log what was retrieved
|
| 221 |
log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
|
| 222 |
-
for i, node in enumerate(unique_retrieved):
|
| 223 |
-
|
| 224 |
-
table_title = node.metadata.get('table_title', 'N/A')
|
| 225 |
doc_id = node.metadata.get('document_id', 'N/A')
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
|
| 228 |
|
| 229 |
-
|
| 230 |
-
|
| 231 |
|
| 232 |
-
|
| 233 |
-
response = query_engine.query(normalized_question)
|
| 234 |
|
| 235 |
end_time = time.time()
|
| 236 |
processing_time = end_time - start_time
|
|
@@ -243,7 +273,7 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
|
|
| 243 |
<h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
|
| 244 |
<div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>
|
| 245 |
<div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>
|
| 246 |
-
Время обработки: {processing_time:.2f} секунд
|
| 247 |
</div>
|
| 248 |
</div>"""
|
| 249 |
log_message(f"Model Answer: {response.response}")
|
|
|
|
| 9 |
from index_retriever import rerank_nodes
|
| 10 |
from my_logging import log_message
|
| 11 |
from config import PROMPT_SIMPLE_POISK
|
| 12 |
+
from config import QUERY_EXPANSION_PROMPT
|
| 13 |
+
from documents_prep import normalize_text, normalize_steel_designations
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# Designation -> product-form terms that enhance_query_with_keywords appends
# to a query mentioning that designation.
# NOTE(review): matching is a plain upper-cased substring test, so the alphabet
# of each key (Latin vs. look-alike Cyrillic letters) has to agree with what the
# query normalisation produces - confirm against normalize_steel_designations.
KEYWORD_EXPANSIONS = {
    "08X18H10T": [
        "Листы",
        "Трубы",
        "Поковки",
        "Крепежные изделия",
        "Сортовой прокат",
        "Отливки",
    ],
    "12X18H10T": [
        "Листы",
        "Поковки",
        "Сортовой прокат",
    ],
    "10X17H13M2T": [
        "Трубы",
        "Арматура",
        "Поковки",
        "Фланцы",
    ],
    "20X23H18": [
        "Листы",
        "Сортовой прокат",
        "Поковки",
    ],
    "03X17H14M3": [
        "Трубы",
        "Листы",
        "Проволока",
    ],
    "СВ-08X19H10": [
        "Сварочная проволока",
        "Сварка",
        "Сварочные материалы",
    ],
}
|
| 24 |
|
| 25 |
def get_llm_model(model_name):
|
| 26 |
try:
|
|
|
|
| 184 |
|
| 185 |
return unique_nodes
|
| 186 |
|
| 187 |
+
def enhance_query_with_keywords(query):
    """Append expansion words for every known keyword found in *query*.

    Each key of ``KEYWORD_EXPANSIONS`` is looked up in the query as a
    case-insensitive substring. The expansion phrases of all matching keys
    are split into words, de-duplicated, and appended to the query after a
    single space.

    Args:
        query: The query text.

    Returns:
        str: ``query`` followed by the de-duplicated expansion words, or
        ``query`` unchanged when no keyword matches.
    """
    query_upper = query.upper()

    matched_keywords = []
    context_words = []

    for keyword, expansions in KEYWORD_EXPANSIONS.items():
        if keyword.upper() not in query_upper:
            continue
        context = ' '.join(expansions)
        matched_keywords.append(keyword)
        context_words.extend(context.split())
        log_message(f" Found keyword '{keyword}': added context '{context}'")

    if not matched_keywords:
        return query

    # dict.fromkeys de-duplicates while keeping first-seen order. A set would
    # emit the words in a hash-dependent order that changes between process
    # runs, making the enhanced query non-reproducible.
    unique_context = ' '.join(dict.fromkeys(context_words))
    enhanced = f"{query} {unique_context}"

    log_message(f"Enhanced query with keywords: {', '.join(matched_keywords)}")
    log_message(f"Added context: {unique_context[:100]}...")

    return enhanced
|
| 211 |
|
|
|
|
| 212 |
|
| 213 |
+
def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
|
|
|
|
|
|
|
| 214 |
normalized_question = normalize_text(question)
|
| 215 |
+
normalized_question_2, query_changes, change_list = normalize_steel_designations(question)
|
| 216 |
+
enhanced_question = enhance_query_with_keywords(normalized_question_2)
|
| 217 |
+
|
| 218 |
+
try:
|
| 219 |
+
llm = get_llm_model(current_model)
|
| 220 |
+
expansion_prompt = QUERY_EXPANSION_PROMPT.format(original_query=enhanced_question)
|
| 221 |
+
expanded_queries = llm.complete(expansion_prompt).text.strip()
|
| 222 |
+
enhanced_question = f"{enhanced_question} {expanded_queries}"
|
| 223 |
+
log_message(f"LLM expanded query: {expanded_queries[:200]}...")
|
| 224 |
+
except Exception as e:
|
| 225 |
+
log_message(f"Query expansion failed: {e}, using keyword-only enhancement")
|
| 226 |
|
| 227 |
+
if change_list:
|
| 228 |
+
log_message(f"Query changes: {', '.join(change_list)}")
|
| 229 |
+
if change_list:
|
| 230 |
+
log_message(f"Query changes: {', '.join(change_list)}")
|
| 231 |
if query_engine is None:
|
| 232 |
return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
|
| 233 |
|
| 234 |
try:
|
| 235 |
start_time = time.time()
|
| 236 |
+
retrieved_nodes = query_engine.retriever.retrieve(enhanced_question)
|
|
|
|
| 237 |
log_message(f"user query: {question}")
|
| 238 |
+
log_message(f"after steel normalization: {normalized_question_2}")
|
| 239 |
+
log_message(f"enhanced query: {enhanced_question}")
|
|
|
|
|
|
|
|
|
|
| 240 |
unique_retrieved = deduplicate_nodes(retrieved_nodes)
|
|
|
|
|
|
|
| 241 |
log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
|
| 242 |
+
for i, node in enumerate(unique_retrieved):
|
| 243 |
+
node_type = node.metadata.get('type', 'text')
|
|
|
|
| 244 |
doc_id = node.metadata.get('document_id', 'N/A')
|
| 245 |
+
|
| 246 |
+
if node_type == 'table':
|
| 247 |
+
table_num = node.metadata.get('table_number', 'N/A')
|
| 248 |
+
table_id = node.metadata.get('table_identifier', 'N/A')
|
| 249 |
+
table_title = node.metadata.get('table_title', 'N/A')
|
| 250 |
+
content_preview = node.text[:200].replace('\n', ' ')
|
| 251 |
+
log_message(f" [{i+1}] {doc_id} - Table {table_num} | ID: {table_id}")
|
| 252 |
+
log_message(f" Title: {table_title[:80]}")
|
| 253 |
+
log_message(f" Content: {content_preview}...")
|
| 254 |
+
else:
|
| 255 |
+
section = node.metadata.get('section_id', 'N/A')
|
| 256 |
+
log_message(f" [{i+1}] {doc_id} - Text section {section}")
|
| 257 |
+
|
| 258 |
log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
|
| 259 |
|
| 260 |
+
reranked_nodes = rerank_nodes(enhanced_question, unique_retrieved, reranker,
|
| 261 |
+
top_k=rerank_top_k)
|
| 262 |
|
| 263 |
+
response = query_engine.query(enhanced_question)
|
|
|
|
| 264 |
|
| 265 |
end_time = time.time()
|
| 266 |
processing_time = end_time - start_time
|
|
|
|
| 273 |
<h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
|
| 274 |
<div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>
|
| 275 |
<div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>
|
| 276 |
+
Время обработки: {processing_time:.2f} секунд
|
| 277 |
</div>
|
| 278 |
</div>"""
|
| 279 |
log_message(f"Model Answer: {response.response}")
|