Spaces:
Sleeping
Sleeping
Commit
·
a90618e
1
Parent(s):
566457a
a new version with the normalization = 3rd release
Browse files- app.py +26 -23
- config.py +2 -1
- documents_prep.py +106 -59
- index_retriever.py +28 -3
- table_prep.py +107 -107
- utils.py +54 -5
app.py
CHANGED
|
@@ -149,37 +149,39 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
|
|
| 149 |
all_documents = []
|
| 150 |
chunks_df = None
|
| 151 |
|
|
|
|
| 152 |
if use_json_instead_csv and json_files_dir:
|
| 153 |
log_message("Используем JSON файлы вместо CSV")
|
| 154 |
-
from documents_prep import
|
| 155 |
|
| 156 |
-
#
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
| 161 |
else:
|
|
|
|
| 162 |
if chunks_filename:
|
| 163 |
log_message("Загружаем данные из CSV")
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
if table_data_dir:
|
| 167 |
-
log_message("Добавляю табличные данные")
|
| 168 |
-
from documents_prep import load_table_documents
|
| 169 |
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
from documents_prep import load_image_documents
|
| 178 |
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
log_message(f"Всего документов после всей обработки: {len(all_documents)}")
|
| 185 |
|
|
@@ -197,6 +199,7 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
|
|
| 197 |
'table_number': doc.metadata.get('table_number', ''),
|
| 198 |
'image_number': doc.metadata.get('image_number', ''),
|
| 199 |
'section': doc.metadata.get('section', ''),
|
|
|
|
| 200 |
})
|
| 201 |
|
| 202 |
log_message(f"Система успешно инициализирована")
|
|
|
|
| 149 |
all_documents = []
|
| 150 |
chunks_df = None
|
| 151 |
|
| 152 |
+
# CHANGED: Use load_all_documents instead of loading separately
|
| 153 |
if use_json_instead_csv and json_files_dir:
|
| 154 |
log_message("Используем JSON файлы вместо CSV")
|
| 155 |
+
from documents_prep import load_all_documents
|
| 156 |
|
| 157 |
+
# This will handle text, tables, and images all together with proper logging
|
| 158 |
+
all_documents = load_all_documents(
|
| 159 |
+
repo_id=repo_id,
|
| 160 |
+
hf_token=hf_token,
|
| 161 |
+
json_dir=json_files_dir,
|
| 162 |
+
table_dir=table_data_dir if table_data_dir else "",
|
| 163 |
+
image_dir=image_data_dir if image_data_dir else ""
|
| 164 |
+
)
|
| 165 |
else:
|
| 166 |
+
# OLD PATH: Loading separately (fallback)
|
| 167 |
if chunks_filename:
|
| 168 |
log_message("Загружаем данные из CSV")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
+
if table_data_dir:
|
| 171 |
+
log_message("Добавляю табличные данные")
|
| 172 |
+
from documents_prep import load_table_documents
|
| 173 |
+
|
| 174 |
+
table_chunks = load_table_documents(repo_id, hf_token, table_data_dir)
|
| 175 |
+
log_message(f"Загружено {len(table_chunks)} табличных чанков")
|
| 176 |
+
all_documents.extend(table_chunks)
|
|
|
|
| 177 |
|
| 178 |
+
if image_data_dir:
|
| 179 |
+
log_message("Добавляю данные изображений")
|
| 180 |
+
from documents_prep import load_image_documents
|
| 181 |
+
|
| 182 |
+
image_documents = load_image_documents(repo_id, hf_token, image_data_dir)
|
| 183 |
+
log_message(f"Загружено {len(image_documents)} документов изображений")
|
| 184 |
+
all_documents.extend(image_documents)
|
| 185 |
|
| 186 |
log_message(f"Всего документов после всей обработки: {len(all_documents)}")
|
| 187 |
|
|
|
|
| 199 |
'table_number': doc.metadata.get('table_number', ''),
|
| 200 |
'image_number': doc.metadata.get('image_number', ''),
|
| 201 |
'section': doc.metadata.get('section', ''),
|
| 202 |
+
'connection_type': doc.metadata.get('connection_type', '') # ADD THIS
|
| 203 |
})
|
| 204 |
|
| 205 |
log_message(f"Система успешно инициализирована")
|
config.py
CHANGED
|
@@ -53,10 +53,11 @@ CHUNK_SIZE = 1500
|
|
| 53 |
CHUNK_OVERLAP = 128
|
| 54 |
|
| 55 |
MAX_CHARS_TABLE = 2500
|
| 56 |
-
MAX_ROWS_TABLE =
|
| 57 |
|
| 58 |
CUSTOM_PROMPT = """
|
| 59 |
Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.
|
|
|
|
| 60 |
|
| 61 |
ПРАВИЛА АНАЛИЗА ЗАПРОСА:
|
| 62 |
|
|
|
|
| 53 |
CHUNK_OVERLAP = 128
|
| 54 |
|
| 55 |
MAX_CHARS_TABLE = 2500
|
| 56 |
+
MAX_ROWS_TABLE = 15
|
| 57 |
|
| 58 |
CUSTOM_PROMPT = """
|
| 59 |
Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.
|
| 60 |
+
СТРОГО ОТВЕТИТЬ ТОЛЬКО НА РУССКОМ!
|
| 61 |
|
| 62 |
ПРАВИЛА АНАЛИЗА ЗАПРОСА:
|
| 63 |
|
documents_prep.py
CHANGED
|
@@ -34,6 +34,26 @@ def chunk_text_documents(documents):
|
|
| 34 |
|
| 35 |
return chunked
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
|
| 39 |
headers = table_data.get('headers', [])
|
|
@@ -41,6 +61,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 41 |
table_num = table_data.get('table_number', 'unknown')
|
| 42 |
table_title = table_data.get('table_title', '')
|
| 43 |
section = table_data.get('section', '')
|
|
|
|
| 44 |
|
| 45 |
table_num_clean = str(table_num).strip()
|
| 46 |
|
|
@@ -60,8 +81,13 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 60 |
|
| 61 |
log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
|
| 62 |
|
| 63 |
-
# Calculate base metadata size
|
| 64 |
base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
base_size = len(base_content)
|
| 66 |
available_space = max_chars - base_size - 200
|
| 67 |
|
|
@@ -79,7 +105,9 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 79 |
'section': section,
|
| 80 |
'total_rows': len(rows),
|
| 81 |
'chunk_size': len(content),
|
| 82 |
-
'is_complete_table': True
|
|
|
|
|
|
|
| 83 |
}
|
| 84 |
|
| 85 |
log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
|
|
@@ -113,7 +141,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 113 |
'row_end': current_rows[-1]['_idx'],
|
| 114 |
'total_rows': len(rows),
|
| 115 |
'chunk_size': len(content),
|
| 116 |
-
'is_complete_table': False
|
|
|
|
| 117 |
}
|
| 118 |
|
| 119 |
chunks.append(Document(text=content, metadata=metadata))
|
|
@@ -155,37 +184,62 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 155 |
|
| 156 |
return chunks
|
| 157 |
|
| 158 |
-
|
| 159 |
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
|
| 160 |
-
content = f"
|
|
|
|
|
|
|
| 161 |
if table_title:
|
| 162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
if section:
|
| 164 |
-
content += f"
|
| 165 |
-
content += f"{'='*70}\n"
|
| 166 |
|
| 167 |
-
|
| 168 |
-
header_str = ' | '.join(str(h) for h in headers)
|
| 169 |
-
content += f"ЗАГОЛОВКИ: {header_str}\n\n"
|
| 170 |
|
| 171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
return content
|
| 173 |
|
| 174 |
|
| 175 |
def format_single_row(row, idx):
|
| 176 |
-
"""Format a single row"""
|
| 177 |
if isinstance(row, dict):
|
| 178 |
-
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
if parts:
|
| 181 |
return f"{idx}. {' | '.join(parts)}\n"
|
| 182 |
elif isinstance(row, list):
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
if parts:
|
| 185 |
return f"{idx}. {' | '.join(parts)}\n"
|
| 186 |
return ""
|
| 187 |
|
| 188 |
-
|
| 189 |
def format_table_rows(rows):
|
| 190 |
"""Format multiple rows"""
|
| 191 |
content = ""
|
|
@@ -199,40 +253,6 @@ def format_table_footer(table_identifier, doc_id):
|
|
| 199 |
"""Format table footer"""
|
| 200 |
return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
|
| 201 |
|
| 202 |
-
def load_table_documents(repo_id, hf_token, table_dir):
|
| 203 |
-
log_message("Loading tables...")
|
| 204 |
-
|
| 205 |
-
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
| 206 |
-
table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
|
| 207 |
-
|
| 208 |
-
all_chunks = []
|
| 209 |
-
for file_path in table_files:
|
| 210 |
-
try:
|
| 211 |
-
local_path = hf_hub_download(
|
| 212 |
-
repo_id=repo_id,
|
| 213 |
-
filename=file_path,
|
| 214 |
-
repo_type="dataset",
|
| 215 |
-
token=hf_token
|
| 216 |
-
)
|
| 217 |
-
|
| 218 |
-
with open(local_path, 'r', encoding='utf-8') as f:
|
| 219 |
-
data = json.load(f)
|
| 220 |
-
|
| 221 |
-
file_doc_id = data.get('document_id', data.get('document', 'unknown'))
|
| 222 |
-
|
| 223 |
-
for sheet in data.get('sheets', []):
|
| 224 |
-
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 225 |
-
|
| 226 |
-
chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1000)
|
| 227 |
-
all_chunks.extend(chunks)
|
| 228 |
-
|
| 229 |
-
except Exception as e:
|
| 230 |
-
log_message(f"Error loading {file_path}: {e}")
|
| 231 |
-
|
| 232 |
-
log_message(f"✓ Loaded {len(all_chunks)} table chunks")
|
| 233 |
-
return all_chunks
|
| 234 |
-
|
| 235 |
-
|
| 236 |
def load_json_documents(repo_id, hf_token, json_dir):
|
| 237 |
import zipfile
|
| 238 |
import tempfile
|
|
@@ -414,13 +434,14 @@ def extract_sections_from_json(json_path):
|
|
| 414 |
|
| 415 |
|
| 416 |
def load_table_documents(repo_id, hf_token, table_dir):
|
| 417 |
-
"""Load and chunk tables"""
|
| 418 |
log_message("Loading tables...")
|
| 419 |
|
| 420 |
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
| 421 |
table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
|
| 422 |
|
| 423 |
all_chunks = []
|
|
|
|
|
|
|
| 424 |
for file_path in table_files:
|
| 425 |
try:
|
| 426 |
local_path = hf_hub_download(
|
|
@@ -433,23 +454,38 @@ def load_table_documents(repo_id, hf_token, table_dir):
|
|
| 433 |
with open(local_path, 'r', encoding='utf-8') as f:
|
| 434 |
data = json.load(f)
|
| 435 |
|
| 436 |
-
# Extract file-level document_id
|
| 437 |
file_doc_id = data.get('document_id', data.get('document', 'unknown'))
|
| 438 |
|
| 439 |
for sheet in data.get('sheets', []):
|
| 440 |
-
# Use sheet-level document_id if available, otherwise use file-level
|
| 441 |
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
|
|
|
|
|
|
| 442 |
|
| 443 |
-
|
| 444 |
-
chunks = chunk_table_by_content(sheet, sheet_doc_id)
|
| 445 |
all_chunks.extend(chunks)
|
| 446 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
except Exception as e:
|
| 448 |
log_message(f"Error loading {file_path}: {e}")
|
| 449 |
|
| 450 |
log_message(f"✓ Loaded {len(all_chunks)} table chunks")
|
| 451 |
-
return all_chunks
|
| 452 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 453 |
|
| 454 |
def load_image_documents(repo_id, hf_token, image_dir):
|
| 455 |
"""Load image descriptions"""
|
|
@@ -498,9 +534,7 @@ def load_image_documents(repo_id, hf_token, image_dir):
|
|
| 498 |
|
| 499 |
return documents
|
| 500 |
|
| 501 |
-
|
| 502 |
def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
|
| 503 |
-
"""Main loader - combines all document types"""
|
| 504 |
log_message("="*60)
|
| 505 |
log_message("STARTING DOCUMENT LOADING")
|
| 506 |
log_message("="*60)
|
|
@@ -512,6 +546,19 @@ def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
|
|
| 512 |
# Load tables (already chunked)
|
| 513 |
table_chunks = load_table_documents(repo_id, hf_token, table_dir)
|
| 514 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 515 |
# Load images (no chunking needed)
|
| 516 |
image_docs = load_image_documents(repo_id, hf_token, image_dir)
|
| 517 |
|
|
|
|
| 34 |
|
| 35 |
return chunked
|
| 36 |
|
| 37 |
+
def normalize_connection_type(s):
|
| 38 |
+
# Replace Cyrillic with Latin
|
| 39 |
+
s = s.replace('С', 'C').replace('с', 'c')
|
| 40 |
+
s = s.replace('У', 'U').replace('у', 'u')
|
| 41 |
+
s = s.replace('Т', 'T').replace('т', 't')
|
| 42 |
+
s= s.replace('С-', 'C-').replace('с-', 'c-')
|
| 43 |
+
s = s.replace('У-', 'U-').replace('у-', 'u-')
|
| 44 |
+
s = s.replace('Т-', 'T-').replace('т-', 't-')
|
| 45 |
+
# REMOVE ALL HYPHENS for consistent tokenization
|
| 46 |
+
s = s.replace('-', '')
|
| 47 |
+
return s
|
| 48 |
+
|
| 49 |
+
def extract_connection_type(text):
|
| 50 |
+
import re
|
| 51 |
+
# Match pattern with or without hyphens: C-25, C-25-1, С25, etc.
|
| 52 |
+
match = re.search(r'[СCс]-?\d+(?:-\d+)*', text)
|
| 53 |
+
if match:
|
| 54 |
+
normalized = normalize_connection_type(match.group(0))
|
| 55 |
+
return normalized
|
| 56 |
+
return ''
|
| 57 |
|
| 58 |
def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
|
| 59 |
headers = table_data.get('headers', [])
|
|
|
|
| 61 |
table_num = table_data.get('table_number', 'unknown')
|
| 62 |
table_title = table_data.get('table_title', '')
|
| 63 |
section = table_data.get('section', '')
|
| 64 |
+
table_description = table_data.get('table_description', '')
|
| 65 |
|
| 66 |
table_num_clean = str(table_num).strip()
|
| 67 |
|
|
|
|
| 81 |
|
| 82 |
log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
|
| 83 |
|
| 84 |
+
# Calculate base metadata size - NOW INCLUDING DESCRIPTION
|
| 85 |
base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
|
| 86 |
+
|
| 87 |
+
# ADD DESCRIPTION HERE if it exists
|
| 88 |
+
if table_description:
|
| 89 |
+
base_content += f"ОПИСАНИЕ: {table_description}\n\n"
|
| 90 |
+
|
| 91 |
base_size = len(base_content)
|
| 92 |
available_space = max_chars - base_size - 200
|
| 93 |
|
|
|
|
| 105 |
'section': section,
|
| 106 |
'total_rows': len(rows),
|
| 107 |
'chunk_size': len(content),
|
| 108 |
+
'is_complete_table': True,
|
| 109 |
+
'connection_type': extract_connection_type(table_title) if table_title else '' # NEW
|
| 110 |
+
|
| 111 |
}
|
| 112 |
|
| 113 |
log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
|
|
|
|
| 141 |
'row_end': current_rows[-1]['_idx'],
|
| 142 |
'total_rows': len(rows),
|
| 143 |
'chunk_size': len(content),
|
| 144 |
+
'is_complete_table': False,
|
| 145 |
+
'connection_type': extract_connection_type(table_title) if table_title else '' # NEW
|
| 146 |
}
|
| 147 |
|
| 148 |
chunks.append(Document(text=content, metadata=metadata))
|
|
|
|
| 184 |
|
| 185 |
return chunks
|
| 186 |
|
|
|
|
| 187 |
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
|
| 188 |
+
content = f"ДОКУМЕНТ: {doc_id}\n"
|
| 189 |
+
content += f"ТАБЛИЦА: {table_identifier}\n"
|
| 190 |
+
|
| 191 |
if table_title:
|
| 192 |
+
# Normalize the title text itself for better searchability
|
| 193 |
+
normalized_title = normalize_connection_type(table_title)
|
| 194 |
+
content += f"НАЗВАНИЕ ТАБЛИЦЫ: {normalized_title}\n"
|
| 195 |
+
|
| 196 |
+
# Extract and store the normalized connection type
|
| 197 |
+
connection_type = extract_connection_type(table_title)
|
| 198 |
+
if connection_type:
|
| 199 |
+
content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
|
| 200 |
+
|
| 201 |
+
if table_num and table_num != table_identifier:
|
| 202 |
+
content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
|
| 203 |
+
|
| 204 |
if section:
|
| 205 |
+
content += f"РАЗДЕЛ ДОКУМЕНТА: {section}\n"
|
|
|
|
| 206 |
|
| 207 |
+
content += f"\n{'='*70}\n"
|
|
|
|
|
|
|
| 208 |
|
| 209 |
+
if headers:
|
| 210 |
+
content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
|
| 211 |
+
for i, h in enumerate(headers, 1):
|
| 212 |
+
# NORMALIZE HEADERS TOO
|
| 213 |
+
normalized_header = normalize_connection_type(h)
|
| 214 |
+
content += f" {i}. {normalized_header}\n"
|
| 215 |
+
content += "\n"
|
| 216 |
+
|
| 217 |
+
content += "ДАННЫЕ ТАБЛИЦЫ:\n"
|
| 218 |
return content
|
| 219 |
|
| 220 |
|
| 221 |
def format_single_row(row, idx):
|
| 222 |
+
"""Format a single row with normalization"""
|
| 223 |
if isinstance(row, dict):
|
| 224 |
+
# NORMALIZE VALUES IN ROWS
|
| 225 |
+
parts = []
|
| 226 |
+
for k, v in row.items():
|
| 227 |
+
if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']:
|
| 228 |
+
normalized_v = normalize_connection_type(str(v))
|
| 229 |
+
parts.append(f"{k}: {normalized_v}")
|
| 230 |
if parts:
|
| 231 |
return f"{idx}. {' | '.join(parts)}\n"
|
| 232 |
elif isinstance(row, list):
|
| 233 |
+
# NORMALIZE LIST VALUES
|
| 234 |
+
parts = []
|
| 235 |
+
for v in row:
|
| 236 |
+
if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']:
|
| 237 |
+
normalized_v = normalize_connection_type(str(v))
|
| 238 |
+
parts.append(normalized_v)
|
| 239 |
if parts:
|
| 240 |
return f"{idx}. {' | '.join(parts)}\n"
|
| 241 |
return ""
|
| 242 |
|
|
|
|
| 243 |
def format_table_rows(rows):
|
| 244 |
"""Format multiple rows"""
|
| 245 |
content = ""
|
|
|
|
| 253 |
"""Format table footer"""
|
| 254 |
return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
|
| 255 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
def load_json_documents(repo_id, hf_token, json_dir):
|
| 257 |
import zipfile
|
| 258 |
import tempfile
|
|
|
|
| 434 |
|
| 435 |
|
| 436 |
def load_table_documents(repo_id, hf_token, table_dir):
|
|
|
|
| 437 |
log_message("Loading tables...")
|
| 438 |
|
| 439 |
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
| 440 |
table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
|
| 441 |
|
| 442 |
all_chunks = []
|
| 443 |
+
connection_type_sources = {} # Track which table each type comes from
|
| 444 |
+
|
| 445 |
for file_path in table_files:
|
| 446 |
try:
|
| 447 |
local_path = hf_hub_download(
|
|
|
|
| 454 |
with open(local_path, 'r', encoding='utf-8') as f:
|
| 455 |
data = json.load(f)
|
| 456 |
|
|
|
|
| 457 |
file_doc_id = data.get('document_id', data.get('document', 'unknown'))
|
| 458 |
|
| 459 |
for sheet in data.get('sheets', []):
|
|
|
|
| 460 |
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 461 |
+
table_num = sheet.get('table_number', 'unknown')
|
| 462 |
+
table_title = sheet.get('table_title', '')
|
| 463 |
|
| 464 |
+
chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
|
|
|
|
| 465 |
all_chunks.extend(chunks)
|
| 466 |
|
| 467 |
+
# Track connection type source
|
| 468 |
+
conn_type = extract_connection_type(table_title)
|
| 469 |
+
if conn_type:
|
| 470 |
+
if conn_type not in connection_type_sources:
|
| 471 |
+
connection_type_sources[conn_type] = []
|
| 472 |
+
connection_type_sources[conn_type].append(f"{sheet_doc_id} Table {table_num}")
|
| 473 |
+
|
| 474 |
except Exception as e:
|
| 475 |
log_message(f"Error loading {file_path}: {e}")
|
| 476 |
|
| 477 |
log_message(f"✓ Loaded {len(all_chunks)} table chunks")
|
|
|
|
| 478 |
|
| 479 |
+
log_message("="*60)
|
| 480 |
+
log_message("CONNECTION TYPES AND THEIR SOURCES:")
|
| 481 |
+
for conn_type in sorted(connection_type_sources.keys()):
|
| 482 |
+
sources = connection_type_sources[conn_type]
|
| 483 |
+
log_message(f" {conn_type}: {len(sources)} tables")
|
| 484 |
+
for src in sources:
|
| 485 |
+
log_message(f" - {src}")
|
| 486 |
+
log_message("="*60)
|
| 487 |
+
|
| 488 |
+
return all_chunks
|
| 489 |
|
| 490 |
def load_image_documents(repo_id, hf_token, image_dir):
|
| 491 |
"""Load image descriptions"""
|
|
|
|
| 534 |
|
| 535 |
return documents
|
| 536 |
|
|
|
|
| 537 |
def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
|
|
|
|
| 538 |
log_message("="*60)
|
| 539 |
log_message("STARTING DOCUMENT LOADING")
|
| 540 |
log_message("="*60)
|
|
|
|
| 546 |
# Load tables (already chunked)
|
| 547 |
table_chunks = load_table_documents(repo_id, hf_token, table_dir)
|
| 548 |
|
| 549 |
+
# NEW: Analyze connection types in tables
|
| 550 |
+
connection_types = {}
|
| 551 |
+
for chunk in table_chunks:
|
| 552 |
+
conn_type = chunk.metadata.get('connection_type', '')
|
| 553 |
+
if conn_type:
|
| 554 |
+
connection_types[conn_type] = connection_types.get(conn_type, 0) + 1
|
| 555 |
+
|
| 556 |
+
log_message("="*60)
|
| 557 |
+
log_message("CONNECTION TYPES FOUND IN TABLES:")
|
| 558 |
+
for conn_type, count in sorted(connection_types.items()):
|
| 559 |
+
log_message(f" {conn_type}: {count} chunks")
|
| 560 |
+
log_message("="*60)
|
| 561 |
+
|
| 562 |
# Load images (no chunking needed)
|
| 563 |
image_docs = load_image_documents(repo_id, hf_token, image_dir)
|
| 564 |
|
index_retriever.py
CHANGED
|
@@ -10,8 +10,33 @@ from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
|
|
| 10 |
|
| 11 |
def create_vector_index(documents):
|
| 12 |
log_message("Строю векторный индекс")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
return VectorStoreIndex.from_documents(documents)
|
| 14 |
|
|
|
|
| 15 |
def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
|
| 16 |
if not nodes or not reranker:
|
| 17 |
return nodes[:top_k]
|
|
@@ -46,18 +71,18 @@ def create_query_engine(vector_index):
|
|
| 46 |
|
| 47 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 48 |
docstore=vector_index.docstore,
|
| 49 |
-
similarity_top_k=
|
| 50 |
)
|
| 51 |
|
| 52 |
vector_retriever = VectorIndexRetriever(
|
| 53 |
index=vector_index,
|
| 54 |
-
similarity_top_k=
|
| 55 |
similarity_cutoff=0.55
|
| 56 |
)
|
| 57 |
|
| 58 |
hybrid_retriever = QueryFusionRetriever(
|
| 59 |
[vector_retriever, bm25_retriever],
|
| 60 |
-
similarity_top_k=
|
| 61 |
num_queries=1
|
| 62 |
)
|
| 63 |
|
|
|
|
| 10 |
|
| 11 |
def create_vector_index(documents):
|
| 12 |
log_message("Строю векторный индекс")
|
| 13 |
+
|
| 14 |
+
connection_type_sources = {}
|
| 15 |
+
table_count = 0
|
| 16 |
+
|
| 17 |
+
for doc in documents:
|
| 18 |
+
if doc.metadata.get('type') == 'table':
|
| 19 |
+
table_count += 1
|
| 20 |
+
conn_type = doc.metadata.get('connection_type', '')
|
| 21 |
+
if conn_type:
|
| 22 |
+
table_id = f"{doc.metadata.get('document_id', 'unknown')} Table {doc.metadata.get('table_number', 'N/A')}"
|
| 23 |
+
if conn_type not in connection_type_sources:
|
| 24 |
+
connection_type_sources[conn_type] = []
|
| 25 |
+
connection_type_sources[conn_type].append(table_id)
|
| 26 |
+
|
| 27 |
+
log_message("="*60)
|
| 28 |
+
log_message(f"INDEXING {table_count} TABLE CHUNKS")
|
| 29 |
+
log_message("CONNECTION TYPES IN INDEX WITH SOURCES:")
|
| 30 |
+
for conn_type in sorted(connection_type_sources.keys()):
|
| 31 |
+
sources = list(set(connection_type_sources[conn_type])) # Unique sources
|
| 32 |
+
log_message(f" {conn_type}: {len(connection_type_sources[conn_type])} chunks from {len(sources)} tables")
|
| 33 |
+
for src in sources:
|
| 34 |
+
log_message(f" - {src}")
|
| 35 |
+
log_message("="*60)
|
| 36 |
+
|
| 37 |
return VectorStoreIndex.from_documents(documents)
|
| 38 |
|
| 39 |
+
|
| 40 |
def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
|
| 41 |
if not nodes or not reranker:
|
| 42 |
return nodes[:top_k]
|
|
|
|
| 71 |
|
| 72 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 73 |
docstore=vector_index.docstore,
|
| 74 |
+
similarity_top_k=100
|
| 75 |
)
|
| 76 |
|
| 77 |
vector_retriever = VectorIndexRetriever(
|
| 78 |
index=vector_index,
|
| 79 |
+
similarity_top_k=100,
|
| 80 |
similarity_cutoff=0.55
|
| 81 |
)
|
| 82 |
|
| 83 |
hybrid_retriever = QueryFusionRetriever(
|
| 84 |
[vector_retriever, bm25_retriever],
|
| 85 |
+
similarity_top_k=100,
|
| 86 |
num_queries=1
|
| 87 |
)
|
| 88 |
|
table_prep.py
CHANGED
|
@@ -95,135 +95,135 @@ def chunk_table_document(doc, max_chunk_size=MAX_CHARS_TABLE, max_rows_per_chunk
|
|
| 95 |
return chunked_docs
|
| 96 |
|
| 97 |
|
| 98 |
-
def table_to_document(table_data, document_id=None):
|
| 99 |
-
|
| 100 |
-
|
| 101 |
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
|
| 108 |
-
|
| 109 |
-
|
| 110 |
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
|
| 143 |
|
| 144 |
-
def load_table_data(repo_id, hf_token, table_data_dir):
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
|
| 149 |
-
|
| 150 |
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
|
| 168 |
-
|
| 169 |
|
| 170 |
-
|
| 171 |
-
|
| 172 |
|
| 173 |
-
|
| 174 |
-
|
| 175 |
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
|
| 205 |
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
|
| 223 |
-
|
| 224 |
|
| 225 |
-
|
| 226 |
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
|
|
|
| 95 |
return chunked_docs
|
| 96 |
|
| 97 |
|
| 98 |
+
# def table_to_document(table_data, document_id=None):
|
| 99 |
+
# if not isinstance(table_data, dict):
|
| 100 |
+
# return []
|
| 101 |
|
| 102 |
+
# doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
|
| 103 |
+
# table_num = table_data.get('table_number', 'Неизвестно')
|
| 104 |
+
# table_title = table_data.get('table_title', 'Неизвестно')
|
| 105 |
+
# section = table_data.get('section', 'Неизвестно')
|
| 106 |
+
# table_rows = table_data.get('data', [])
|
| 107 |
|
| 108 |
+
# if not table_rows:
|
| 109 |
+
# return []
|
| 110 |
|
| 111 |
+
# # Build table content
|
| 112 |
+
# content = f"Таблица: {table_num}\n"
|
| 113 |
+
# content += f"Название: {table_title}\n"
|
| 114 |
+
# content += f"Документ: {doc_id}\n"
|
| 115 |
+
# content += f"Раздел: {section}\n"
|
| 116 |
|
| 117 |
+
# headers = table_data.get('headers', [])
|
| 118 |
+
# if headers:
|
| 119 |
+
# content += f"\nЗаголовки: {' | '.join(headers)}\n"
|
| 120 |
|
| 121 |
+
# content += "\nДанные таблицы:\n"
|
| 122 |
+
# for row_idx, row in enumerate(table_rows, start=1):
|
| 123 |
+
# if isinstance(row, dict):
|
| 124 |
+
# row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
|
| 125 |
+
# content += f"Строка {row_idx}: {row_text}\n"
|
| 126 |
|
| 127 |
+
# # Create base document
|
| 128 |
+
# base_doc = Document(
|
| 129 |
+
# text=content,
|
| 130 |
+
# metadata={
|
| 131 |
+
# "type": "table",
|
| 132 |
+
# "table_number": table_num,
|
| 133 |
+
# "document_id": doc_id,
|
| 134 |
+
# "section": section
|
| 135 |
+
# }
|
| 136 |
+
# )
|
| 137 |
+
# if len(content) > 4000:
|
| 138 |
+
# chunks = chunk_table_document(base_doc)
|
| 139 |
+
# log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
|
| 140 |
+
# return chunk_table_document(base_doc)
|
| 141 |
+
# return [base_doc]
|
| 142 |
|
| 143 |
|
| 144 |
+
# def load_table_data(repo_id, hf_token, table_data_dir):
|
| 145 |
+
# try:
|
| 146 |
+
# files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
| 147 |
+
# table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
|
| 148 |
|
| 149 |
+
# log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
|
| 150 |
|
| 151 |
+
# table_documents = []
|
| 152 |
+
# stats = {
|
| 153 |
+
# 'total_tables': 0,
|
| 154 |
+
# 'total_size': 0,
|
| 155 |
+
# 'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
|
| 156 |
+
# }
|
| 157 |
|
| 158 |
+
# for file_path in table_files:
|
| 159 |
+
# try:
|
| 160 |
+
# local_path = hf_hub_download(
|
| 161 |
+
# repo_id=repo_id,
|
| 162 |
+
# filename=file_path,
|
| 163 |
+
# local_dir='',
|
| 164 |
+
# repo_type="dataset",
|
| 165 |
+
# token=hf_token
|
| 166 |
+
# )
|
| 167 |
|
| 168 |
+
# log_message(f"\nОбработка файла: {file_path}")
|
| 169 |
|
| 170 |
+
# with open(local_path, 'r', encoding='utf-8') as f:
|
| 171 |
+
# table_data = json.load(f)
|
| 172 |
|
| 173 |
+
# if isinstance(table_data, dict):
|
| 174 |
+
# document_id = table_data.get('document', 'unknown')
|
| 175 |
|
| 176 |
+
# if 'sheets' in table_data:
|
| 177 |
+
# sorted_sheets = sorted(
|
| 178 |
+
# table_data['sheets'],
|
| 179 |
+
# key=lambda sheet: sheet.get('table_number', '') # or use 'table_number'
|
| 180 |
+
# )
|
| 181 |
|
| 182 |
+
# for sheet in sorted_sheets:
|
| 183 |
+
# sheet['document'] = document_id
|
| 184 |
+
# docs_list = table_to_document(sheet, document_id)
|
| 185 |
+
# table_documents.extend(docs_list)
|
| 186 |
|
| 187 |
+
# for doc in docs_list:
|
| 188 |
+
# stats['total_tables'] += 1
|
| 189 |
+
# size = doc.metadata.get('content_size', 0)
|
| 190 |
+
# stats['total_size'] += size
|
| 191 |
+
# stats['by_document'][document_id]['count'] += 1
|
| 192 |
+
# stats['by_document'][document_id]['size'] += size
|
| 193 |
+
# log_message(f"Добавлена таблица {sheet.get('table_number', 'Неизвестно')} из документа {document_id}, размер {size} символов")
|
| 194 |
+
# else:
|
| 195 |
+
# docs_list = table_to_document(table_data, document_id)
|
| 196 |
+
# table_documents.extend(docs_list)
|
| 197 |
|
| 198 |
+
# for doc in docs_list:
|
| 199 |
+
# stats['total_tables'] += 1
|
| 200 |
+
# size = doc.metadata.get('content_size', 0)
|
| 201 |
+
# stats['total_size'] += size
|
| 202 |
+
# stats['by_document'][document_id]['count'] += 1
|
| 203 |
+
# stats['by_document'][document_id]['size'] += size
|
| 204 |
|
| 205 |
|
| 206 |
+
# except Exception as e:
|
| 207 |
+
# log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
|
| 208 |
+
# continue
|
| 209 |
|
| 210 |
+
# # Log summary statistics
|
| 211 |
+
# log_message("\n" + "=" * 60)
|
| 212 |
+
# log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
|
| 213 |
+
# log_message("=" * 60)
|
| 214 |
+
# log_message(f"Всего таблиц добавлено: {stats['total_tables']}")
|
| 215 |
+
# log_message(f"Общий размер: {stats['total_size']:,} символов")
|
| 216 |
+
# log_message(f"Средний размер таблицы: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} символов")
|
| 217 |
|
| 218 |
+
# log_message("\nПо документам:")
|
| 219 |
+
# for doc_id, doc_stats in sorted(stats['by_document'].items()):
|
| 220 |
+
# log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
|
| 221 |
+
# f"{doc_stats['size']:,} символов")
|
| 222 |
|
| 223 |
+
# log_message("=" * 60)
|
| 224 |
|
| 225 |
+
# return table_documents
|
| 226 |
|
| 227 |
+
# except Exception as e:
|
| 228 |
+
# log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
|
| 229 |
+
# return []
|
utils.py
CHANGED
|
@@ -9,6 +9,7 @@ import time
|
|
| 9 |
from index_retriever import rerank_nodes
|
| 10 |
from my_logging import log_message
|
| 11 |
from config import PROMPT_SIMPLE_POISK
|
|
|
|
| 12 |
|
| 13 |
def get_llm_model(model_name):
|
| 14 |
try:
|
|
@@ -172,6 +173,14 @@ def deduplicate_nodes(nodes):
|
|
| 172 |
|
| 173 |
return unique_nodes
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
|
| 176 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
|
| 177 |
if query_engine is None:
|
|
@@ -179,18 +188,58 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
|
|
| 179 |
|
| 180 |
try:
|
| 181 |
start_time = time.time()
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
|
| 185 |
|
| 186 |
unique_retrieved = deduplicate_nodes(retrieved_nodes)
|
| 187 |
log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
|
| 188 |
|
| 189 |
-
#
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
-
#
|
| 193 |
-
response = query_engine.query(
|
| 194 |
|
| 195 |
end_time = time.time()
|
| 196 |
processing_time = end_time - start_time
|
|
|
|
| 9 |
from index_retriever import rerank_nodes
|
| 10 |
from my_logging import log_message
|
| 11 |
from config import PROMPT_SIMPLE_POISK
|
| 12 |
+
import re
|
| 13 |
|
| 14 |
def get_llm_model(model_name):
|
| 15 |
try:
|
|
|
|
| 173 |
|
| 174 |
return unique_nodes
|
| 175 |
|
| 176 |
+
def normalize_query(query):
    """Normalize connection-type codes such as 'С-25', 'с 25', or 'c-25' to 'C25'.

    Replaces Cyrillic homoglyphs С/Т/У (either case) with Latin C/T/U,
    upper-cases Latin letters, and drops the optional hyphen or space
    before the digits, so retrieval matches metadata stored as e.g. 'C25'.

    Args:
        query: Raw user query string.

    Returns:
        The query with every matched code rewritten; all other text unchanged.
    """
    # Cyrillic-to-Latin homoglyph table; built once per call, not per match.
    cyr_to_lat = {'С': 'C', 'с': 'C', 'Т': 'T', 'т': 'T', 'У': 'U', 'у': 'U'}

    def repl(m):
        # Translate a Cyrillic letter if present, then upper-case so that
        # lowercase Latin input ('c-25') also normalizes to 'C25'.
        letter = cyr_to_lat.get(m.group(1), m.group(1)).upper()
        return f"{letter}{m.group(2)}"

    # Lowercase Latin c/t/u added to the class (the original already handled
    # lowercase Cyrillic); output is unchanged for all previously-matched input.
    return re.sub(r'\b([СсТтУуCcTtUu])[-\s]?(\d+)\b', repl, query)
|
| 183 |
+
|
| 184 |
|
| 185 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
|
| 186 |
if query_engine is None:
|
|
|
|
| 188 |
|
| 189 |
try:
|
| 190 |
start_time = time.time()
|
| 191 |
+
|
| 192 |
+
# NORMALIZE QUERY: Convert Cyrillic to Latin and remove hyphens
|
| 193 |
+
normalized_question = normalize_query(question)
|
| 194 |
+
log_message(f"Original query: {question}")
|
| 195 |
+
log_message(f"Normalized query: {normalized_question}")
|
| 196 |
+
|
| 197 |
+
# Use normalized query for retrieval
|
| 198 |
+
retrieved_nodes = query_engine.retriever.retrieve(normalized_question)
|
| 199 |
+
log_message(f"user query: {question}")
|
| 200 |
|
| 201 |
log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
|
| 202 |
|
| 203 |
unique_retrieved = deduplicate_nodes(retrieved_nodes)
|
| 204 |
log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
|
| 205 |
|
| 206 |
+
# Check for connection types
|
| 207 |
+
conn_types_retrieved = {}
|
| 208 |
+
for node in unique_retrieved:
|
| 209 |
+
if node.metadata.get('type') == 'table':
|
| 210 |
+
conn_type = node.metadata.get('connection_type', '')
|
| 211 |
+
if conn_type:
|
| 212 |
+
conn_types_retrieved[conn_type] = conn_types_retrieved.get(conn_type, 0) + 1
|
| 213 |
+
|
| 214 |
+
if conn_types_retrieved:
|
| 215 |
+
log_message("CONNECTION TYPES IN RETRIEVED:")
|
| 216 |
+
for ct, cnt in sorted(conn_types_retrieved.items()):
|
| 217 |
+
log_message(f" {ct}: {cnt} chunks")
|
| 218 |
+
|
| 219 |
+
# Check if target type was retrieved
|
| 220 |
+
# Normalize the check as well
|
| 221 |
+
normalized_check = normalize_query('С-25') # Will become C25
|
| 222 |
+
if normalized_check in question or 'С-25' in question or 'C-25' in question:
|
| 223 |
+
if 'C25' in conn_types_retrieved:
|
| 224 |
+
log_message(f"✓ C25 RETRIEVED: {conn_types_retrieved['C25']} chunks")
|
| 225 |
+
else:
|
| 226 |
+
log_message("✗ C25 NOT RETRIEVED despite being in query!")
|
| 227 |
+
|
| 228 |
+
# Sample of retrieved tables
|
| 229 |
+
log_message("SAMPLE OF RETRIEVED TABLES:")
|
| 230 |
+
for i, node in enumerate(unique_retrieved[:10]):
|
| 231 |
+
if node.metadata.get('type') == 'table':
|
| 232 |
+
table_num = node.metadata.get('table_number', 'N/A')
|
| 233 |
+
table_title = node.metadata.get('table_title', 'N/A')
|
| 234 |
+
conn_type = node.metadata.get('connection_type', 'N/A')
|
| 235 |
+
doc_id = node.metadata.get('document_id', 'N/A')
|
| 236 |
+
log_message(f" [{i+1}] {doc_id} - Table {table_num} - Type: {conn_type}")
|
| 237 |
+
|
| 238 |
+
# Rerank - use normalized query for consistency
|
| 239 |
+
reranked_nodes = rerank_nodes(normalized_question, unique_retrieved, reranker, top_k=20)
|
| 240 |
|
| 241 |
+
# CRITICAL FIX: Use normalized query for LLM as well
|
| 242 |
+
response = query_engine.query(normalized_question)
|
| 243 |
|
| 244 |
end_time = time.time()
|
| 245 |
processing_time = end_time - start_time
|