MrSimple07 commited on
Commit
e4fd158
·
1 Parent(s): a90618e

new version top k 60, 0.6, chunk size 4500, chunkrow 50

Browse files
Files changed (4) hide show
  1. config.py +2 -2
  2. documents_prep.py +47 -108
  3. index_retriever.py +4 -4
  4. utils.py +41 -49
config.py CHANGED
@@ -52,8 +52,8 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
52
  CHUNK_SIZE = 1500
53
  CHUNK_OVERLAP = 128
54
 
55
- MAX_CHARS_TABLE = 2500
56
- MAX_ROWS_TABLE = 15
57
 
58
  CUSTOM_PROMPT = """
59
  Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.
 
52
  CHUNK_SIZE = 1500
53
  CHUNK_OVERLAP = 128
54
 
55
+ MAX_CHARS_TABLE = 4500
56
+ MAX_ROWS_TABLE = 50
57
 
58
  CUSTOM_PROMPT = """
59
  Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.
documents_prep.py CHANGED
@@ -34,26 +34,20 @@ def chunk_text_documents(documents):
34
 
35
  return chunked
36
 
37
- def normalize_connection_type(s):
38
- # Replace Cyrillic with Latin
39
- s = s.replace('С', 'C').replace('с', 'c')
40
- s = s.replace('У', 'U').replace('у', 'u')
41
- s = s.replace('Т', 'T').replace('т', 't')
42
- s= s.replace('С-', 'C-').replace('с-', 'c-')
43
- s = s.replace('У-', 'U-').replace('у-', 'u-')
44
- s = s.replace('Т-', 'T-').replace('т-', 't-')
45
- # REMOVE ALL HYPHENS for consistent tokenization
46
- s = s.replace('-', '')
47
- return s
48
-
49
- def extract_connection_type(text):
50
  import re
51
- # Match pattern with or without hyphens: C-25, C-25-1, С25, etc.
52
- match = re.search(r'[СCс]-?\d+(?:-\d+)*', text)
53
- if match:
54
- normalized = normalize_connection_type(match.group(0))
55
- return normalized
56
- return ''
57
 
58
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
59
  headers = table_data.get('headers', [])
@@ -61,9 +55,9 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
61
  table_num = table_data.get('table_number', 'unknown')
62
  table_title = table_data.get('table_title', '')
63
  section = table_data.get('section', '')
64
- table_description = table_data.get('table_description', '')
65
 
66
  table_num_clean = str(table_num).strip()
 
67
 
68
  import re
69
  if 'приложени' in section.lower():
@@ -81,13 +75,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
81
 
82
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
83
 
84
- # Calculate base metadata size - NOW INCLUDING DESCRIPTION
85
- base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
86
-
87
- # ADD DESCRIPTION HERE if it exists
88
- if table_description:
89
- base_content += f"ОПИСАНИЕ: {table_description}\n\n"
90
-
91
  base_size = len(base_content)
92
  available_space = max_chars - base_size - 200
93
 
@@ -100,14 +89,12 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
100
  'type': 'table',
101
  'document_id': doc_id,
102
  'table_number': table_num_clean,
103
- 'table_identifier': table_identifier,
104
- 'table_title': table_title,
105
  'section': section,
106
  'total_rows': len(rows),
107
  'chunk_size': len(content),
108
- 'is_complete_table': True,
109
- 'connection_type': extract_connection_type(table_title) if table_title else '' # NEW
110
-
111
  }
112
 
113
  log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
@@ -133,16 +120,15 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
133
  'type': 'table',
134
  'document_id': doc_id,
135
  'table_number': table_num_clean,
136
- 'table_identifier': table_identifier,
137
- 'table_title': table_title,
138
  'section': section,
139
  'chunk_id': chunk_num,
140
  'row_start': current_rows[0]['_idx'] - 1,
141
  'row_end': current_rows[-1]['_idx'],
142
  'total_rows': len(rows),
143
  'chunk_size': len(content),
144
- 'is_complete_table': False,
145
- 'connection_type': extract_connection_type(table_title) if table_title else '' # NEW
146
  }
147
 
148
  chunks.append(Document(text=content, metadata=metadata))
@@ -168,8 +154,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
168
  'type': 'table',
169
  'document_id': doc_id,
170
  'table_number': table_num_clean,
171
- 'table_identifier': table_identifier,
172
- 'table_title': table_title,
173
  'section': section,
174
  'chunk_id': chunk_num,
175
  'row_start': current_rows[0]['_idx'] - 1,
@@ -184,62 +170,45 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
184
 
185
  return chunks
186
 
 
 
187
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
188
- content = f"ДОКУМЕНТ: {doc_id}\n"
189
- content += f"ТАБЛИЦА: {table_identifier}\n"
190
 
191
- if table_title:
192
- # Normalize the title text itself for better searchability
193
- normalized_title = normalize_connection_type(table_title)
194
- content += f"НАЗВАНИЕ ТАБЛИЦЫ: {normalized_title}\n"
195
-
196
- # Extract and store the normalized connection type
197
- connection_type = extract_connection_type(table_title)
198
- if connection_type:
199
- content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
200
 
201
- if table_num and table_num != table_identifier:
202
- content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
203
 
204
  if section:
205
- content += f"РАЗДЕЛ ДОКУМЕНТА: {section}\n"
206
 
207
- content += f"\n{'='*70}\n"
208
 
209
  if headers:
210
- content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
211
- for i, h in enumerate(headers, 1):
212
- # NORMALIZE HEADERS TOO
213
- normalized_header = normalize_connection_type(h)
214
- content += f" {i}. {normalized_header}\n"
215
- content += "\n"
216
-
217
- content += "ДАННЫЕ ТАБЛИЦЫ:\n"
218
  return content
219
 
220
 
221
  def format_single_row(row, idx):
222
- """Format a single row with normalization"""
223
  if isinstance(row, dict):
224
- # NORMALIZE VALUES IN ROWS
225
- parts = []
226
- for k, v in row.items():
227
- if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']:
228
- normalized_v = normalize_connection_type(str(v))
229
- parts.append(f"{k}: {normalized_v}")
230
  if parts:
231
  return f"{idx}. {' | '.join(parts)}\n"
232
  elif isinstance(row, list):
233
- # NORMALIZE LIST VALUES
234
- parts = []
235
- for v in row:
236
- if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']:
237
- normalized_v = normalize_connection_type(str(v))
238
- parts.append(normalized_v)
239
  if parts:
240
  return f"{idx}. {' | '.join(parts)}\n"
241
  return ""
242
 
 
243
  def format_table_rows(rows):
244
  """Format multiple rows"""
245
  content = ""
@@ -440,8 +409,6 @@ def load_table_documents(repo_id, hf_token, table_dir):
440
  table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
441
 
442
  all_chunks = []
443
- connection_type_sources = {} # Track which table each type comes from
444
-
445
  for file_path in table_files:
446
  try:
447
  local_path = hf_hub_download(
@@ -458,35 +425,18 @@ def load_table_documents(repo_id, hf_token, table_dir):
458
 
459
  for sheet in data.get('sheets', []):
460
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
461
- table_num = sheet.get('table_number', 'unknown')
462
- table_title = sheet.get('table_title', '')
463
 
 
464
  chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
465
  all_chunks.extend(chunks)
466
 
467
- # Track connection type source
468
- conn_type = extract_connection_type(table_title)
469
- if conn_type:
470
- if conn_type not in connection_type_sources:
471
- connection_type_sources[conn_type] = []
472
- connection_type_sources[conn_type].append(f"{sheet_doc_id} Table {table_num}")
473
-
474
  except Exception as e:
475
  log_message(f"Error loading {file_path}: {e}")
476
 
477
  log_message(f"✓ Loaded {len(all_chunks)} table chunks")
478
-
479
- log_message("="*60)
480
- log_message("CONNECTION TYPES AND THEIR SOURCES:")
481
- for conn_type in sorted(connection_type_sources.keys()):
482
- sources = connection_type_sources[conn_type]
483
- log_message(f" {conn_type}: {len(sources)} tables")
484
- for src in sources:
485
- log_message(f" - {src}")
486
- log_message("="*60)
487
-
488
  return all_chunks
489
 
 
490
  def load_image_documents(repo_id, hf_token, image_dir):
491
  """Load image descriptions"""
492
  log_message("Loading images...")
@@ -534,7 +484,9 @@ def load_image_documents(repo_id, hf_token, image_dir):
534
 
535
  return documents
536
 
 
537
  def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
 
538
  log_message("="*60)
539
  log_message("STARTING DOCUMENT LOADING")
540
  log_message("="*60)
@@ -546,19 +498,6 @@ def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
546
  # Load tables (already chunked)
547
  table_chunks = load_table_documents(repo_id, hf_token, table_dir)
548
 
549
- # NEW: Analyze connection types in tables
550
- connection_types = {}
551
- for chunk in table_chunks:
552
- conn_type = chunk.metadata.get('connection_type', '')
553
- if conn_type:
554
- connection_types[conn_type] = connection_types.get(conn_type, 0) + 1
555
-
556
- log_message("="*60)
557
- log_message("CONNECTION TYPES FOUND IN TABLES:")
558
- for conn_type, count in sorted(connection_types.items()):
559
- log_message(f" {conn_type}: {count} chunks")
560
- log_message("="*60)
561
-
562
  # Load images (no chunking needed)
563
  image_docs = load_image_documents(repo_id, hf_token, image_dir)
564
 
 
34
 
35
  return chunked
36
 
37
def normalize_text(text):
    """Canonicalise weld-type designations to the Cyrillic letter Es.

    Latin ``C`` (U+0043) and Cyrillic ``С`` (U+0421) render identically but
    are different code points, so ``C-25`` typed with one letter is not
    equal to ``С-25`` typed with the other. This function rewrites a Latin
    ``C`` to the Cyrillic letter only where it begins a designation: the
    letter stands at a word boundary and is followed by an optional hyphen
    and a digit. The hyphen and the digits are kept exactly as written, and
    text that already uses the Cyrillic letter is returned unchanged, so the
    function is idempotent.

    Args:
        text: Value to normalise. Falsy values and non-strings are returned
            as-is so callers may pass raw metadata without pre-checking.

    Returns:
        The normalised string, or ``text`` itself when it is falsy or not a
        string.
    """
    if not text or not isinstance(text, str):
        return text

    import re

    # Spelled as escapes on purpose: the two letters are indistinguishable
    # on screen, which makes literal homoglyphs in source easy to mix up.
    latin_c = '\u0043'
    cyrillic_es = '\u0421'

    # The lookahead keeps the hyphen/digits out of the match, so only the
    # letter itself is replaced ("C-25" -> "С-25", "C25" -> "С25").
    return re.sub(rf'\b{latin_c}(?=-?\d)', cyrillic_es, text)
 
 
51
 
52
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
53
  headers = table_data.get('headers', [])
 
55
  table_num = table_data.get('table_number', 'unknown')
56
  table_title = table_data.get('table_title', '')
57
  section = table_data.get('section', '')
 
58
 
59
  table_num_clean = str(table_num).strip()
60
+ table_title_normalized = normalize_text(str(table_title)) # NORMALIZE TITLE
61
 
62
  import re
63
  if 'приложени' in section.lower():
 
75
 
76
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
77
 
78
+ # Calculate base metadata size with NORMALIZED title
79
+ base_content = format_table_header(doc_id, table_identifier, table_num, table_title_normalized, section, headers)
 
 
 
 
 
80
  base_size = len(base_content)
81
  available_space = max_chars - base_size - 200
82
 
 
89
  'type': 'table',
90
  'document_id': doc_id,
91
  'table_number': table_num_clean,
92
+ 'table_identifier': normalize_text(table_identifier), # NORMALIZE identifier
93
+ 'table_title': table_title_normalized, # NORMALIZED
94
  'section': section,
95
  'total_rows': len(rows),
96
  'chunk_size': len(content),
97
+ 'is_complete_table': True
 
 
98
  }
99
 
100
  log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
 
120
  'type': 'table',
121
  'document_id': doc_id,
122
  'table_number': table_num_clean,
123
+ 'table_identifier': normalize_text(table_identifier), # NORMALIZE
124
+ 'table_title': table_title_normalized, # NORMALIZED
125
  'section': section,
126
  'chunk_id': chunk_num,
127
  'row_start': current_rows[0]['_idx'] - 1,
128
  'row_end': current_rows[-1]['_idx'],
129
  'total_rows': len(rows),
130
  'chunk_size': len(content),
131
+ 'is_complete_table': False
 
132
  }
133
 
134
  chunks.append(Document(text=content, metadata=metadata))
 
154
  'type': 'table',
155
  'document_id': doc_id,
156
  'table_number': table_num_clean,
157
+ 'table_identifier': normalize_text(table_identifier), # NORMALIZE
158
+ 'table_title': table_title_normalized, # NORMALIZED
159
  'section': section,
160
  'chunk_id': chunk_num,
161
  'row_start': current_rows[0]['_idx'] - 1,
 
170
 
171
  return chunks
172
 
173
+
174
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
    """Build the text preamble that precedes a table's rows in a chunk.

    The preamble names the table and its document, then optionally lists the
    table number, title and section, a separator line, the column headers,
    and finally the marker line that introduces the row data.

    Args:
        doc_id: Identifier of the document the table belongs to.
        table_identifier: Display identifier of the table; passed through
            ``normalize_text``.
        table_num: Table number. Omitted from the output when falsy.
        table_title: Table title. Omitted from the output when falsy.
        section: Document section. Omitted from the output when falsy.
        headers: Iterable of column headers. Omitted when empty.

    Returns:
        The header text, always ending with the data marker line.
    """
    parts = [f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"]

    if table_num:
        # The caller forwards the raw 'table_number' value, which may not be
        # a string; coerce it before normalising so a numeric table number
        # does not fail inside normalize_text.
        parts.append(f"ТИП: {normalize_text(str(table_num))}\n")

    if table_title:
        parts.append(f"НАЗВАНИЕ: {normalize_text(str(table_title))}\n")

    if section:
        parts.append(f"РАЗДЕЛ: {section}\n")

    parts.append(f"{'='*70}\n")

    if headers:
        header_str = ' | '.join(str(h) for h in headers)
        parts.append(f"ЗАГОЛОВКИ: {header_str}\n\n")

    parts.append("ДАННЫЕ:\n")
    return ''.join(parts)
196
 
197
 
def _is_blank_cell(value):
    """Return True when a table cell carries no information."""
    # Numeric zero is real data; every other falsy value (None, '', False,
    # empty containers) is treated as an empty cell.
    is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
    if not value and not is_number:
        return True
    text = str(value)
    return not text.strip() or text.lower() in ('nan', 'none', '')


def format_single_row(row, idx):
    """Format one table row as a numbered, pipe-separated line.

    Args:
        row: A dict (rendered as ``key: value`` pairs) or a list (rendered
            as bare values). Blank cells (``None``, empty or
            whitespace-only strings, and the strings ``nan``/``none`` in any
            letter case) are skipped. Numeric zeros are kept.
        idx: Row number placed in front of the line.

    Returns:
        ``"{idx}. cell | cell | ...\\n"``, or an empty string when ``row`` is
        neither a dict nor a list or when no cell survives filtering.
    """
    if isinstance(row, dict):
        parts = [f"{k}: {v}" for k, v in row.items() if not _is_blank_cell(v)]
    elif isinstance(row, list):
        parts = [str(v) for v in row if not _is_blank_cell(v)]
    else:
        return ""

    if not parts:
        return ""
    return f"{idx}. {' | '.join(parts)}\n"
210
 
211
+
212
  def format_table_rows(rows):
213
  """Format multiple rows"""
214
  content = ""
 
409
  table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
410
 
411
  all_chunks = []
 
 
412
  for file_path in table_files:
413
  try:
414
  local_path = hf_hub_download(
 
425
 
426
  for sheet in data.get('sheets', []):
427
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
 
 
428
 
429
+ # Use the consistent MAX_CHARS_TABLE from config
430
  chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
431
  all_chunks.extend(chunks)
432
 
 
 
 
 
 
 
 
433
  except Exception as e:
434
  log_message(f"Error loading {file_path}: {e}")
435
 
436
  log_message(f"✓ Loaded {len(all_chunks)} table chunks")
 
 
 
 
 
 
 
 
 
 
437
  return all_chunks
438
 
439
+
440
  def load_image_documents(repo_id, hf_token, image_dir):
441
  """Load image descriptions"""
442
  log_message("Loading images...")
 
484
 
485
  return documents
486
 
487
+
488
  def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
489
+ """Main loader - combines all document types"""
490
  log_message("="*60)
491
  log_message("STARTING DOCUMENT LOADING")
492
  log_message("="*60)
 
498
  # Load tables (already chunked)
499
  table_chunks = load_table_documents(repo_id, hf_token, table_dir)
500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501
  # Load images (no chunking needed)
502
  image_docs = load_image_documents(repo_id, hf_token, image_dir)
503
 
index_retriever.py CHANGED
@@ -71,18 +71,18 @@ def create_query_engine(vector_index):
71
 
72
  bm25_retriever = BM25Retriever.from_defaults(
73
  docstore=vector_index.docstore,
74
- similarity_top_k=100
75
  )
76
 
77
  vector_retriever = VectorIndexRetriever(
78
  index=vector_index,
79
- similarity_top_k=100,
80
- similarity_cutoff=0.55
81
  )
82
 
83
  hybrid_retriever = QueryFusionRetriever(
84
  [vector_retriever, bm25_retriever],
85
- similarity_top_k=100,
86
  num_queries=1
87
  )
88
 
 
71
 
72
  bm25_retriever = BM25Retriever.from_defaults(
73
  docstore=vector_index.docstore,
74
+ similarity_top_k=60
75
  )
76
 
77
  vector_retriever = VectorIndexRetriever(
78
  index=vector_index,
79
+ similarity_top_k=60,
80
+ similarity_cutoff=0.6
81
  )
82
 
83
  hybrid_retriever = QueryFusionRetriever(
84
  [vector_retriever, bm25_retriever],
85
+ similarity_top_k=120,
86
  num_queries=1
87
  )
88
 
utils.py CHANGED
@@ -9,7 +9,6 @@ import time
9
  from index_retriever import rerank_nodes
10
  from my_logging import log_message
11
  from config import PROMPT_SIMPLE_POISK
12
- import re
13
 
14
  def get_llm_model(model_name):
15
  try:
@@ -173,72 +172,64 @@ def deduplicate_nodes(nodes):
173
 
174
  return unique_nodes
175
 
176
- def normalize_query(query):
177
- def repl(m):
178
- cyr_to_lat = {'С': 'C', 'с': 'C', 'Т': 'T', 'т': 'T', 'У': 'U', 'у': 'U'}
179
- letter = cyr_to_lat.get(m.group(1), m.group(1))
180
- return f"{letter}{m.group(2)}"
181
-
182
- return re.sub(r'\b([СсТтУуCTU])[-\s]?(\d+)\b', repl, query)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
 
184
 
 
185
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
 
 
 
186
  if query_engine is None:
187
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
188
 
189
  try:
190
  start_time = time.time()
191
-
192
- # NORMALIZE QUERY: Convert Cyrillic to Latin and remove hyphens
193
- normalized_question = normalize_query(question)
194
- log_message(f"Original query: {question}")
195
- log_message(f"Normalized query: {normalized_question}")
196
-
197
- # Use normalized query for retrieval
198
  retrieved_nodes = query_engine.retriever.retrieve(normalized_question)
199
  log_message(f"user query: {question}")
 
 
200
 
201
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
202
 
203
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
 
 
 
 
 
 
 
 
204
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
205
 
206
- # Check for connection types
207
- conn_types_retrieved = {}
208
- for node in unique_retrieved:
209
- if node.metadata.get('type') == 'table':
210
- conn_type = node.metadata.get('connection_type', '')
211
- if conn_type:
212
- conn_types_retrieved[conn_type] = conn_types_retrieved.get(conn_type, 0) + 1
213
-
214
- if conn_types_retrieved:
215
- log_message("CONNECTION TYPES IN RETRIEVED:")
216
- for ct, cnt in sorted(conn_types_retrieved.items()):
217
- log_message(f" {ct}: {cnt} chunks")
218
-
219
- # Check if target type was retrieved
220
- # Normalize the check as well
221
- normalized_check = normalize_query('С-25') # Will become C25
222
- if normalized_check in question or 'С-25' in question or 'C-25' in question:
223
- if 'C25' in conn_types_retrieved:
224
- log_message(f"✓ C25 RETRIEVED: {conn_types_retrieved['C25']} chunks")
225
- else:
226
- log_message("✗ C25 NOT RETRIEVED despite being in query!")
227
-
228
- # Sample of retrieved tables
229
- log_message("SAMPLE OF RETRIEVED TABLES:")
230
- for i, node in enumerate(unique_retrieved[:10]):
231
- if node.metadata.get('type') == 'table':
232
- table_num = node.metadata.get('table_number', 'N/A')
233
- table_title = node.metadata.get('table_title', 'N/A')
234
- conn_type = node.metadata.get('connection_type', 'N/A')
235
- doc_id = node.metadata.get('document_id', 'N/A')
236
- log_message(f" [{i+1}] {doc_id} - Table {table_num} - Type: {conn_type}")
237
-
238
- # Rerank - use normalized query for consistency
239
  reranked_nodes = rerank_nodes(normalized_question, unique_retrieved, reranker, top_k=20)
240
 
241
- # CRITICAL FIX: Use normalized query for LLM as well
242
  response = query_engine.query(normalized_question)
243
 
244
  end_time = time.time()
@@ -255,6 +246,7 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
255
  Время обработки: {processing_time:.2f} секунд
256
  </div>
257
  </div>"""
 
258
 
259
  chunk_info = []
260
  for node in reranked_nodes:
 
9
  from index_retriever import rerank_nodes
10
  from my_logging import log_message
11
  from config import PROMPT_SIMPLE_POISK
 
12
 
13
  def get_llm_model(model_name):
14
  try:
 
172
 
173
  return unique_nodes
174
 
175
def debug_search_tables(vector_index, search_term="С-25"):
    """Log and return every table node that mentions ``search_term``.

    Walks all nodes in ``vector_index.docstore.docs`` and keeps those whose
    metadata ``type`` is ``'table'`` and whose content or ``table_title``
    metadata contains ``search_term`` as a substring.

    Args:
        vector_index: Index object exposing ``docstore.docs``.
        search_term: Substring to look for.

    Returns:
        A list of dicts with keys ``doc_id``, ``table_num`` and ``title``
        (the title cut to its first 100 characters), one per matching node.
    """
    matching = []
    for entry in list(vector_index.docstore.docs.values()):
        if entry.metadata.get('type') != 'table':
            continue

        body = entry.get_content()
        found = search_term in body or search_term in entry.metadata.get('table_title', '')
        if not found:
            continue

        record = {
            'doc_id': entry.metadata.get('document_id'),
            'table_num': entry.metadata.get('table_number'),
            'title': entry.metadata.get('table_title', '')[:100]
        }
        matching.append(record)

    # Report the hits between two separator lines.
    log_message(f"\n{'='*60}")
    log_message(f"DEBUG: Found {len(matching)} tables containing '{search_term}'")
    for hit in matching:
        log_message(f" • {hit['doc_id']} - Table {hit['table_num']}: {hit['title']}")
    log_message(f"{'='*60}\n")

    return matching
197
 
198
+ from documents_prep import normalize_text
199
 
200
+ # MODIFIED: Update answer_question function
201
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
202
+ # NORMALIZE the question to convert C to С
203
+ normalized_question = normalize_text(question)
204
+
205
  if query_engine is None:
206
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
207
 
208
  try:
209
  start_time = time.time()
210
+ # Use NORMALIZED question for retrieval
 
 
 
 
 
 
211
  retrieved_nodes = query_engine.retriever.retrieve(normalized_question)
212
  log_message(f"user query: {question}")
213
+ log_message(f"normalized query: {normalized_question}")
214
+
215
 
216
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
217
 
218
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
219
+
220
+ # DEBUG: Log what was retrieved
221
+ log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
222
+ for i, node in enumerate(unique_retrieved): # All debug
223
+ table_num = node.metadata.get('table_number', 'N/A')
224
+ table_title = node.metadata.get('table_title', 'N/A')
225
+ doc_id = node.metadata.get('document_id', 'N/A')
226
+ log_message(f" [{i+1}] {doc_id} - Table {table_num}: {table_title[:50]}")
227
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
228
 
229
+ # Simple reranking with NORMALIZED question
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  reranked_nodes = rerank_nodes(normalized_question, unique_retrieved, reranker, top_k=20)
231
 
232
+ # Direct query without formatting - use normalized question
233
  response = query_engine.query(normalized_question)
234
 
235
  end_time = time.time()
 
246
  Время обработки: {processing_time:.2f} секунд
247
  </div>
248
  </div>"""
249
+ log_message(f"Model Answer: {response.response}")
250
 
251
  chunk_info = []
252
  for node in reranked_nodes: