MrSimple07 commited on
Commit
a90618e
·
1 Parent(s): 566457a

a new version with the normalization = 3rd release

Browse files
Files changed (6) hide show
  1. app.py +26 -23
  2. config.py +2 -1
  3. documents_prep.py +106 -59
  4. index_retriever.py +28 -3
  5. table_prep.py +107 -107
  6. utils.py +54 -5
app.py CHANGED
@@ -149,37 +149,39 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
149
  all_documents = []
150
  chunks_df = None
151
 
 
152
  if use_json_instead_csv and json_files_dir:
153
  log_message("Используем JSON файлы вместо CSV")
154
- from documents_prep import load_json_documents, chunk_text_documents
155
 
156
- # Load JSON docs (returns list of Documents)
157
- json_documents = load_json_documents(repo_id, hf_token, json_files_dir)
158
- # Chunk them
159
- json_chunks = chunk_text_documents(json_documents)
160
- all_documents.extend(json_chunks)
 
 
 
161
  else:
 
162
  if chunks_filename:
163
  log_message("Загружаем данные из CSV")
164
-
165
-
166
- if table_data_dir:
167
- log_message("Добавляю табличные данные")
168
- from documents_prep import load_table_documents
169
 
170
- # load_table_documents already returns chunked documents
171
- table_chunks = load_table_documents(repo_id, hf_token, table_data_dir)
172
- log_message(f"Загружено {len(table_chunks)} табличных чанков")
173
- all_documents.extend(table_chunks)
174
-
175
- if image_data_dir:
176
- log_message("Добавляю данные изображений")
177
- from documents_prep import load_image_documents
178
 
179
- # load_image_documents returns documents (no chunking needed)
180
- image_documents = load_image_documents(repo_id, hf_token, image_data_dir)
181
- log_message(f"Загружено {len(image_documents)} документов изображений")
182
- all_documents.extend(image_documents)
 
 
 
183
 
184
  log_message(f"Всего документов после всей обработки: {len(all_documents)}")
185
 
@@ -197,6 +199,7 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
197
  'table_number': doc.metadata.get('table_number', ''),
198
  'image_number': doc.metadata.get('image_number', ''),
199
  'section': doc.metadata.get('section', ''),
 
200
  })
201
 
202
  log_message(f"Система успешно инициализирована")
 
149
  all_documents = []
150
  chunks_df = None
151
 
152
+ # CHANGED: Use load_all_documents instead of loading separately
153
  if use_json_instead_csv and json_files_dir:
154
  log_message("Используем JSON файлы вместо CSV")
155
+ from documents_prep import load_all_documents
156
 
157
+ # This will handle text, tables, and images all together with proper logging
158
+ all_documents = load_all_documents(
159
+ repo_id=repo_id,
160
+ hf_token=hf_token,
161
+ json_dir=json_files_dir,
162
+ table_dir=table_data_dir if table_data_dir else "",
163
+ image_dir=image_data_dir if image_data_dir else ""
164
+ )
165
  else:
166
+ # OLD PATH: Loading separately (fallback)
167
  if chunks_filename:
168
  log_message("Загружаем данные из CSV")
 
 
 
 
 
169
 
170
+ if table_data_dir:
171
+ log_message("Добавляю табличные данные")
172
+ from documents_prep import load_table_documents
173
+
174
+ table_chunks = load_table_documents(repo_id, hf_token, table_data_dir)
175
+ log_message(f"Загружено {len(table_chunks)} табличных чанков")
176
+ all_documents.extend(table_chunks)
 
177
 
178
+ if image_data_dir:
179
+ log_message("Добавляю данные изображений")
180
+ from documents_prep import load_image_documents
181
+
182
+ image_documents = load_image_documents(repo_id, hf_token, image_data_dir)
183
+ log_message(f"Загружено {len(image_documents)} документов изображений")
184
+ all_documents.extend(image_documents)
185
 
186
  log_message(f"Всего документов после всей обработки: {len(all_documents)}")
187
 
 
199
  'table_number': doc.metadata.get('table_number', ''),
200
  'image_number': doc.metadata.get('image_number', ''),
201
  'section': doc.metadata.get('section', ''),
202
+ 'connection_type': doc.metadata.get('connection_type', '') # ADD THIS
203
  })
204
 
205
  log_message(f"Система успешно инициализирована")
config.py CHANGED
@@ -53,10 +53,11 @@ CHUNK_SIZE = 1500
53
  CHUNK_OVERLAP = 128
54
 
55
  MAX_CHARS_TABLE = 2500
56
- MAX_ROWS_TABLE = 10
57
 
58
  CUSTOM_PROMPT = """
59
  Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.
 
60
 
61
  ПРАВИЛА АНАЛИЗА ЗАПРОСА:
62
 
 
53
  CHUNK_OVERLAP = 128
54
 
55
  MAX_CHARS_TABLE = 2500
56
+ MAX_ROWS_TABLE = 15
57
 
58
  CUSTOM_PROMPT = """
59
  Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.
60
+ СТРОГО ОТВЕТИТЬ ТОЛЬКО НА РУССКОМ!
61
 
62
  ПРАВИЛА АНАЛИЗА ЗАПРОСА:
63
 
documents_prep.py CHANGED
@@ -34,6 +34,26 @@ def chunk_text_documents(documents):
34
 
35
  return chunked
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
39
  headers = table_data.get('headers', [])
@@ -41,6 +61,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
41
  table_num = table_data.get('table_number', 'unknown')
42
  table_title = table_data.get('table_title', '')
43
  section = table_data.get('section', '')
 
44
 
45
  table_num_clean = str(table_num).strip()
46
 
@@ -60,8 +81,13 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
60
 
61
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
62
 
63
- # Calculate base metadata size
64
  base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
 
 
 
 
 
65
  base_size = len(base_content)
66
  available_space = max_chars - base_size - 200
67
 
@@ -79,7 +105,9 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
79
  'section': section,
80
  'total_rows': len(rows),
81
  'chunk_size': len(content),
82
- 'is_complete_table': True
 
 
83
  }
84
 
85
  log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
@@ -113,7 +141,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
113
  'row_end': current_rows[-1]['_idx'],
114
  'total_rows': len(rows),
115
  'chunk_size': len(content),
116
- 'is_complete_table': False
 
117
  }
118
 
119
  chunks.append(Document(text=content, metadata=metadata))
@@ -155,37 +184,62 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
155
 
156
  return chunks
157
 
158
-
159
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
160
- content = f"ТАБЛИЦА {table_identifier} из {doc_id}\n"
 
 
161
  if table_title:
162
- content += f"НАЗВАНИЕ: {table_title}\n"
 
 
 
 
 
 
 
 
 
 
 
163
  if section:
164
- content += f"РАЗДЕЛ: {section}\n"
165
- content += f"{'='*70}\n"
166
 
167
- if headers:
168
- header_str = ' | '.join(str(h) for h in headers)
169
- content += f"ЗАГОЛОВКИ: {header_str}\n\n"
170
 
171
- content += "ДАННЫЕ:\n"
 
 
 
 
 
 
 
 
172
  return content
173
 
174
 
175
  def format_single_row(row, idx):
176
- """Format a single row"""
177
  if isinstance(row, dict):
178
- parts = [f"{k}: {v}" for k, v in row.items()
179
- if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
 
 
 
 
180
  if parts:
181
  return f"{idx}. {' | '.join(parts)}\n"
182
  elif isinstance(row, list):
183
- parts = [str(v) for v in row if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
 
 
 
 
 
184
  if parts:
185
  return f"{idx}. {' | '.join(parts)}\n"
186
  return ""
187
 
188
-
189
  def format_table_rows(rows):
190
  """Format multiple rows"""
191
  content = ""
@@ -199,40 +253,6 @@ def format_table_footer(table_identifier, doc_id):
199
  """Format table footer"""
200
  return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
201
 
202
- def load_table_documents(repo_id, hf_token, table_dir):
203
- log_message("Loading tables...")
204
-
205
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
206
- table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
207
-
208
- all_chunks = []
209
- for file_path in table_files:
210
- try:
211
- local_path = hf_hub_download(
212
- repo_id=repo_id,
213
- filename=file_path,
214
- repo_type="dataset",
215
- token=hf_token
216
- )
217
-
218
- with open(local_path, 'r', encoding='utf-8') as f:
219
- data = json.load(f)
220
-
221
- file_doc_id = data.get('document_id', data.get('document', 'unknown'))
222
-
223
- for sheet in data.get('sheets', []):
224
- sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
225
-
226
- chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1000)
227
- all_chunks.extend(chunks)
228
-
229
- except Exception as e:
230
- log_message(f"Error loading {file_path}: {e}")
231
-
232
- log_message(f"✓ Loaded {len(all_chunks)} table chunks")
233
- return all_chunks
234
-
235
-
236
  def load_json_documents(repo_id, hf_token, json_dir):
237
  import zipfile
238
  import tempfile
@@ -414,13 +434,14 @@ def extract_sections_from_json(json_path):
414
 
415
 
416
  def load_table_documents(repo_id, hf_token, table_dir):
417
- """Load and chunk tables"""
418
  log_message("Loading tables...")
419
 
420
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
421
  table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
422
 
423
  all_chunks = []
 
 
424
  for file_path in table_files:
425
  try:
426
  local_path = hf_hub_download(
@@ -433,23 +454,38 @@ def load_table_documents(repo_id, hf_token, table_dir):
433
  with open(local_path, 'r', encoding='utf-8') as f:
434
  data = json.load(f)
435
 
436
- # Extract file-level document_id
437
  file_doc_id = data.get('document_id', data.get('document', 'unknown'))
438
 
439
  for sheet in data.get('sheets', []):
440
- # Use sheet-level document_id if available, otherwise use file-level
441
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
 
 
442
 
443
- # CRITICAL: Pass document_id to chunk function
444
- chunks = chunk_table_by_content(sheet, sheet_doc_id)
445
  all_chunks.extend(chunks)
446
 
 
 
 
 
 
 
 
447
  except Exception as e:
448
  log_message(f"Error loading {file_path}: {e}")
449
 
450
  log_message(f"✓ Loaded {len(all_chunks)} table chunks")
451
- return all_chunks
452
 
 
 
 
 
 
 
 
 
 
 
453
 
454
  def load_image_documents(repo_id, hf_token, image_dir):
455
  """Load image descriptions"""
@@ -498,9 +534,7 @@ def load_image_documents(repo_id, hf_token, image_dir):
498
 
499
  return documents
500
 
501
-
502
  def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
503
- """Main loader - combines all document types"""
504
  log_message("="*60)
505
  log_message("STARTING DOCUMENT LOADING")
506
  log_message("="*60)
@@ -512,6 +546,19 @@ def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
512
  # Load tables (already chunked)
513
  table_chunks = load_table_documents(repo_id, hf_token, table_dir)
514
 
 
 
 
 
 
 
 
 
 
 
 
 
 
515
  # Load images (no chunking needed)
516
  image_docs = load_image_documents(repo_id, hf_token, image_dir)
517
 
 
34
 
35
  return chunked
36
 
37
def normalize_connection_type(s):
    """Normalize a connection-type string for consistent indexing and search.

    Maps look-alike Cyrillic letters (С/У/Т, both cases) to their Latin
    counterparts and removes all hyphens, so that 'С-25-1' and 'C25-1'
    both tokenize as 'C251'.

    Note: the original version additionally re-ran the same replacements
    with a trailing hyphen ('С-'→'C-' etc.); that was dead code, because
    the plain-letter replacements had already fired. Removed here.
    """
    # Single C-level pass instead of six chained .replace() calls.
    s = s.translate(str.maketrans('СсУуТт', 'CcUuTt'))
    # REMOVE ALL HYPHENS for consistent tokenization ('C-25' -> 'C25').
    return s.replace('-', '')
49
def extract_connection_type(text):
    """Extract and normalize a connection-type designation from free text.

    Matches patterns like 'С-25', 'C25' or 'C-25-1' (Cyrillic or Latin
    leading letter, optional hyphens) and returns the canonical
    hyphen-free UPPER-case form ('C25', 'C251', ...), or '' if none found.

    Upper-casing fixes a cross-module mismatch: the query-side
    normalization maps lower-case Cyrillic 'с' to upper-case Latin 'C',
    and downstream checks compare against 'C25' — so a lower-case title
    previously yielded 'c25' metadata that could never match.
    """
    import re
    # Leading С/C/с, optional hyphen, digits, optional '-digits' suffixes.
    match = re.search(r'[СCс]-?\d+(?:-\d+)*', text)
    if match:
        # normalize_connection_type maps Cyrillic->Latin and strips hyphens;
        # .upper() makes the stored metadata case-consistent.
        return normalize_connection_type(match.group(0)).upper()
    return ''
 
58
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
59
  headers = table_data.get('headers', [])
 
61
  table_num = table_data.get('table_number', 'unknown')
62
  table_title = table_data.get('table_title', '')
63
  section = table_data.get('section', '')
64
+ table_description = table_data.get('table_description', '')
65
 
66
  table_num_clean = str(table_num).strip()
67
 
 
81
 
82
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
83
 
84
+ # Calculate base metadata size - NOW INCLUDING DESCRIPTION
85
  base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
86
+
87
+ # ADD DESCRIPTION HERE if it exists
88
+ if table_description:
89
+ base_content += f"ОПИСАНИЕ: {table_description}\n\n"
90
+
91
  base_size = len(base_content)
92
  available_space = max_chars - base_size - 200
93
 
 
105
  'section': section,
106
  'total_rows': len(rows),
107
  'chunk_size': len(content),
108
+ 'is_complete_table': True,
109
+ 'connection_type': extract_connection_type(table_title) if table_title else '' # NEW
110
+
111
  }
112
 
113
  log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
 
141
  'row_end': current_rows[-1]['_idx'],
142
  'total_rows': len(rows),
143
  'chunk_size': len(content),
144
+ 'is_complete_table': False,
145
+ 'connection_type': extract_connection_type(table_title) if table_title else '' # NEW
146
  }
147
 
148
  chunks.append(Document(text=content, metadata=metadata))
 
184
 
185
  return chunks
186
 
 
187
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
188
+ content = f"ДОКУМЕНТ: {doc_id}\n"
189
+ content += f"ТАБЛИЦА: {table_identifier}\n"
190
+
191
  if table_title:
192
+ # Normalize the title text itself for better searchability
193
+ normalized_title = normalize_connection_type(table_title)
194
+ content += f"НАЗВАНИЕ ТАБЛИЦЫ: {normalized_title}\n"
195
+
196
+ # Extract and store the normalized connection type
197
+ connection_type = extract_connection_type(table_title)
198
+ if connection_type:
199
+ content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
200
+
201
+ if table_num and table_num != table_identifier:
202
+ content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
203
+
204
  if section:
205
+ content += f"РАЗДЕЛ ДОКУМЕНТА: {section}\n"
 
206
 
207
+ content += f"\n{'='*70}\n"
 
 
208
 
209
+ if headers:
210
+ content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
211
+ for i, h in enumerate(headers, 1):
212
+ # NORMALIZE HEADERS TOO
213
+ normalized_header = normalize_connection_type(h)
214
+ content += f" {i}. {normalized_header}\n"
215
+ content += "\n"
216
+
217
+ content += "ДАННЫЕ ТАБЛИЦЫ:\n"
218
  return content
219
 
220
 
221
def format_single_row(row, idx):
    """Format one table row (dict or list) as a numbered ' | '-joined line.

    Cell values are passed through normalize_connection_type; empty,
    'nan' and 'none' values are dropped.  Returns '' when nothing usable
    remains or the row type is unsupported.
    """
    def _usable(v):
        # Skip falsy, blank, and NaN/None placeholder values.
        return bool(v) and bool(str(v).strip()) and str(v).lower() not in ['nan', 'none', '']

    if isinstance(row, dict):
        parts = [f"{k}: {normalize_connection_type(str(v))}"
                 for k, v in row.items() if _usable(v)]
        if parts:
            return f"{idx}. {' | '.join(parts)}\n"
    elif isinstance(row, list):
        parts = [normalize_connection_type(str(v)) for v in row if _usable(v)]
        if parts:
            return f"{idx}. {' | '.join(parts)}\n"
    return ""
242
 
 
243
  def format_table_rows(rows):
244
  """Format multiple rows"""
245
  content = ""
 
253
  """Format table footer"""
254
  return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  def load_json_documents(repo_id, hf_token, json_dir):
257
  import zipfile
258
  import tempfile
 
434
 
435
 
436
  def load_table_documents(repo_id, hf_token, table_dir):
 
437
  log_message("Loading tables...")
438
 
439
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
440
  table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
441
 
442
  all_chunks = []
443
+ connection_type_sources = {} # Track which table each type comes from
444
+
445
  for file_path in table_files:
446
  try:
447
  local_path = hf_hub_download(
 
454
  with open(local_path, 'r', encoding='utf-8') as f:
455
  data = json.load(f)
456
 
 
457
  file_doc_id = data.get('document_id', data.get('document', 'unknown'))
458
 
459
  for sheet in data.get('sheets', []):
 
460
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
461
+ table_num = sheet.get('table_number', 'unknown')
462
+ table_title = sheet.get('table_title', '')
463
 
464
+ chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
 
465
  all_chunks.extend(chunks)
466
 
467
+ # Track connection type source
468
+ conn_type = extract_connection_type(table_title)
469
+ if conn_type:
470
+ if conn_type not in connection_type_sources:
471
+ connection_type_sources[conn_type] = []
472
+ connection_type_sources[conn_type].append(f"{sheet_doc_id} Table {table_num}")
473
+
474
  except Exception as e:
475
  log_message(f"Error loading {file_path}: {e}")
476
 
477
  log_message(f"✓ Loaded {len(all_chunks)} table chunks")
 
478
 
479
+ log_message("="*60)
480
+ log_message("CONNECTION TYPES AND THEIR SOURCES:")
481
+ for conn_type in sorted(connection_type_sources.keys()):
482
+ sources = connection_type_sources[conn_type]
483
+ log_message(f" {conn_type}: {len(sources)} tables")
484
+ for src in sources:
485
+ log_message(f" - {src}")
486
+ log_message("="*60)
487
+
488
+ return all_chunks
489
 
490
  def load_image_documents(repo_id, hf_token, image_dir):
491
  """Load image descriptions"""
 
534
 
535
  return documents
536
 
 
537
  def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
 
538
  log_message("="*60)
539
  log_message("STARTING DOCUMENT LOADING")
540
  log_message("="*60)
 
546
  # Load tables (already chunked)
547
  table_chunks = load_table_documents(repo_id, hf_token, table_dir)
548
 
549
+ # NEW: Analyze connection types in tables
550
+ connection_types = {}
551
+ for chunk in table_chunks:
552
+ conn_type = chunk.metadata.get('connection_type', '')
553
+ if conn_type:
554
+ connection_types[conn_type] = connection_types.get(conn_type, 0) + 1
555
+
556
+ log_message("="*60)
557
+ log_message("CONNECTION TYPES FOUND IN TABLES:")
558
+ for conn_type, count in sorted(connection_types.items()):
559
+ log_message(f" {conn_type}: {count} chunks")
560
+ log_message("="*60)
561
+
562
  # Load images (no chunking needed)
563
  image_docs = load_image_documents(repo_id, hf_token, image_dir)
564
 
index_retriever.py CHANGED
@@ -10,8 +10,33 @@ from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
10
 
11
  def create_vector_index(documents):
12
  log_message("Строю векторный индекс")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  return VectorStoreIndex.from_documents(documents)
14
 
 
15
  def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
16
  if not nodes or not reranker:
17
  return nodes[:top_k]
@@ -46,18 +71,18 @@ def create_query_engine(vector_index):
46
 
47
  bm25_retriever = BM25Retriever.from_defaults(
48
  docstore=vector_index.docstore,
49
- similarity_top_k=70
50
  )
51
 
52
  vector_retriever = VectorIndexRetriever(
53
  index=vector_index,
54
- similarity_top_k=70,
55
  similarity_cutoff=0.55
56
  )
57
 
58
  hybrid_retriever = QueryFusionRetriever(
59
  [vector_retriever, bm25_retriever],
60
- similarity_top_k=70,
61
  num_queries=1
62
  )
63
 
 
10
 
11
def create_vector_index(documents):
    """Build the vector index over *documents*.

    Before indexing, logs diagnostic coverage of table chunks: how many
    there are, which connection types they carry, and which source tables
    each connection type came from.
    """
    log_message("Строю векторный индекс")

    sources_by_type = {}
    table_count = 0

    for doc in documents:
        if doc.metadata.get('type') != 'table':
            continue
        table_count += 1
        conn_type = doc.metadata.get('connection_type', '')
        if not conn_type:
            continue
        table_id = f"{doc.metadata.get('document_id', 'unknown')} Table {doc.metadata.get('table_number', 'N/A')}"
        sources_by_type.setdefault(conn_type, []).append(table_id)

    log_message("="*60)
    log_message(f"INDEXING {table_count} TABLE CHUNKS")
    log_message("CONNECTION TYPES IN INDEX WITH SOURCES:")
    for conn_type in sorted(sources_by_type.keys()):
        chunk_ids = sources_by_type[conn_type]
        unique_tables = list(set(chunk_ids))  # one entry per source table
        log_message(f" {conn_type}: {len(chunk_ids)} chunks from {len(unique_tables)} tables")
        for src in unique_tables:
            log_message(f" - {src}")
    log_message("="*60)

    return VectorStoreIndex.from_documents(documents)
38
 
39
+
40
  def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
41
  if not nodes or not reranker:
42
  return nodes[:top_k]
 
71
 
72
  bm25_retriever = BM25Retriever.from_defaults(
73
  docstore=vector_index.docstore,
74
+ similarity_top_k=100
75
  )
76
 
77
  vector_retriever = VectorIndexRetriever(
78
  index=vector_index,
79
+ similarity_top_k=100,
80
  similarity_cutoff=0.55
81
  )
82
 
83
  hybrid_retriever = QueryFusionRetriever(
84
  [vector_retriever, bm25_retriever],
85
+ similarity_top_k=100,
86
  num_queries=1
87
  )
88
 
table_prep.py CHANGED
@@ -95,135 +95,135 @@ def chunk_table_document(doc, max_chunk_size=MAX_CHARS_TABLE, max_rows_per_chunk
95
  return chunked_docs
96
 
97
 
98
- def table_to_document(table_data, document_id=None):
99
- if not isinstance(table_data, dict):
100
- return []
101
 
102
- doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
103
- table_num = table_data.get('table_number', 'Неизвестно')
104
- table_title = table_data.get('table_title', 'Неизвестно')
105
- section = table_data.get('section', 'Неизвестно')
106
- table_rows = table_data.get('data', [])
107
 
108
- if not table_rows:
109
- return []
110
 
111
- # Build table content
112
- content = f"Таблица: {table_num}\n"
113
- content += f"Название: {table_title}\n"
114
- content += f"Документ: {doc_id}\n"
115
- content += f"Раздел: {section}\n"
116
 
117
- headers = table_data.get('headers', [])
118
- if headers:
119
- content += f"\nЗаголовки: {' | '.join(headers)}\n"
120
 
121
- content += "\nДанные таблицы:\n"
122
- for row_idx, row in enumerate(table_rows, start=1):
123
- if isinstance(row, dict):
124
- row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
125
- content += f"Строка {row_idx}: {row_text}\n"
126
 
127
- # Create base document
128
- base_doc = Document(
129
- text=content,
130
- metadata={
131
- "type": "table",
132
- "table_number": table_num,
133
- "document_id": doc_id,
134
- "section": section
135
- }
136
- )
137
- if len(content) > 4000:
138
- chunks = chunk_table_document(base_doc)
139
- log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
140
- return chunk_table_document(base_doc)
141
- return [base_doc]
142
 
143
 
144
- def load_table_data(repo_id, hf_token, table_data_dir):
145
- try:
146
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
147
- table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
148
 
149
- log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
150
 
151
- table_documents = []
152
- stats = {
153
- 'total_tables': 0,
154
- 'total_size': 0,
155
- 'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
156
- }
157
 
158
- for file_path in table_files:
159
- try:
160
- local_path = hf_hub_download(
161
- repo_id=repo_id,
162
- filename=file_path,
163
- local_dir='',
164
- repo_type="dataset",
165
- token=hf_token
166
- )
167
 
168
- log_message(f"\nОбработка файла: {file_path}")
169
 
170
- with open(local_path, 'r', encoding='utf-8') as f:
171
- table_data = json.load(f)
172
 
173
- if isinstance(table_data, dict):
174
- document_id = table_data.get('document', 'unknown')
175
 
176
- if 'sheets' in table_data:
177
- sorted_sheets = sorted(
178
- table_data['sheets'],
179
- key=lambda sheet: sheet.get('table_number', '') # or use 'table_number'
180
- )
181
 
182
- for sheet in sorted_sheets:
183
- sheet['document'] = document_id
184
- docs_list = table_to_document(sheet, document_id)
185
- table_documents.extend(docs_list)
186
 
187
- for doc in docs_list:
188
- stats['total_tables'] += 1
189
- size = doc.metadata.get('content_size', 0)
190
- stats['total_size'] += size
191
- stats['by_document'][document_id]['count'] += 1
192
- stats['by_document'][document_id]['size'] += size
193
- log_message(f"Добавлена таблица {sheet.get('table_number', 'Неизвестно')} из документа {document_id}, размер {size} символов")
194
- else:
195
- docs_list = table_to_document(table_data, document_id)
196
- table_documents.extend(docs_list)
197
 
198
- for doc in docs_list:
199
- stats['total_tables'] += 1
200
- size = doc.metadata.get('content_size', 0)
201
- stats['total_size'] += size
202
- stats['by_document'][document_id]['count'] += 1
203
- stats['by_document'][document_id]['size'] += size
204
 
205
 
206
- except Exception as e:
207
- log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
208
- continue
209
 
210
- # Log summary statistics
211
- log_message("\n" + "=" * 60)
212
- log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
213
- log_message("=" * 60)
214
- log_message(f"Всего таблиц добавлено: {stats['total_tables']}")
215
- log_message(f"Общий размер: {stats['total_size']:,} символов")
216
- log_message(f"Средний размер таблицы: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} символов")
217
 
218
- log_message("\nПо документам:")
219
- for doc_id, doc_stats in sorted(stats['by_document'].items()):
220
- log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
221
- f"{doc_stats['size']:,} символов")
222
 
223
- log_message("=" * 60)
224
 
225
- return table_documents
226
 
227
- except Exception as e:
228
- log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
229
- return []
 
95
  return chunked_docs
96
 
97
 
98
+ # def table_to_document(table_data, document_id=None):
99
+ # if not isinstance(table_data, dict):
100
+ # return []
101
 
102
+ # doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
103
+ # table_num = table_data.get('table_number', 'Неизвестно')
104
+ # table_title = table_data.get('table_title', 'Неизвестно')
105
+ # section = table_data.get('section', 'Неизвестно')
106
+ # table_rows = table_data.get('data', [])
107
 
108
+ # if not table_rows:
109
+ # return []
110
 
111
+ # # Build table content
112
+ # content = f"Таблица: {table_num}\n"
113
+ # content += f"Название: {table_title}\n"
114
+ # content += f"Документ: {doc_id}\n"
115
+ # content += f"Раздел: {section}\n"
116
 
117
+ # headers = table_data.get('headers', [])
118
+ # if headers:
119
+ # content += f"\nЗаголовки: {' | '.join(headers)}\n"
120
 
121
+ # content += "\nДанные таблицы:\n"
122
+ # for row_idx, row in enumerate(table_rows, start=1):
123
+ # if isinstance(row, dict):
124
+ # row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
125
+ # content += f"Строка {row_idx}: {row_text}\n"
126
 
127
+ # # Create base document
128
+ # base_doc = Document(
129
+ # text=content,
130
+ # metadata={
131
+ # "type": "table",
132
+ # "table_number": table_num,
133
+ # "document_id": doc_id,
134
+ # "section": section
135
+ # }
136
+ # )
137
+ # if len(content) > 4000:
138
+ # chunks = chunk_table_document(base_doc)
139
+ # log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
140
+ # return chunk_table_document(base_doc)
141
+ # return [base_doc]
142
 
143
 
144
+ # def load_table_data(repo_id, hf_token, table_data_dir):
145
+ # try:
146
+ # files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
147
+ # table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
148
 
149
+ # log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
150
 
151
+ # table_documents = []
152
+ # stats = {
153
+ # 'total_tables': 0,
154
+ # 'total_size': 0,
155
+ # 'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
156
+ # }
157
 
158
+ # for file_path in table_files:
159
+ # try:
160
+ # local_path = hf_hub_download(
161
+ # repo_id=repo_id,
162
+ # filename=file_path,
163
+ # local_dir='',
164
+ # repo_type="dataset",
165
+ # token=hf_token
166
+ # )
167
 
168
+ # log_message(f"\nОбработка файла: {file_path}")
169
 
170
+ # with open(local_path, 'r', encoding='utf-8') as f:
171
+ # table_data = json.load(f)
172
 
173
+ # if isinstance(table_data, dict):
174
+ # document_id = table_data.get('document', 'unknown')
175
 
176
+ # if 'sheets' in table_data:
177
+ # sorted_sheets = sorted(
178
+ # table_data['sheets'],
179
+ # key=lambda sheet: sheet.get('table_number', '') # or use 'table_number'
180
+ # )
181
 
182
+ # for sheet in sorted_sheets:
183
+ # sheet['document'] = document_id
184
+ # docs_list = table_to_document(sheet, document_id)
185
+ # table_documents.extend(docs_list)
186
 
187
+ # for doc in docs_list:
188
+ # stats['total_tables'] += 1
189
+ # size = doc.metadata.get('content_size', 0)
190
+ # stats['total_size'] += size
191
+ # stats['by_document'][document_id]['count'] += 1
192
+ # stats['by_document'][document_id]['size'] += size
193
+ # log_message(f"Добавлена таблица {sheet.get('table_number', 'Неизвестно')} из документа {document_id}, размер {size} символов")
194
+ # else:
195
+ # docs_list = table_to_document(table_data, document_id)
196
+ # table_documents.extend(docs_list)
197
 
198
+ # for doc in docs_list:
199
+ # stats['total_tables'] += 1
200
+ # size = doc.metadata.get('content_size', 0)
201
+ # stats['total_size'] += size
202
+ # stats['by_document'][document_id]['count'] += 1
203
+ # stats['by_document'][document_id]['size'] += size
204
 
205
 
206
+ # except Exception as e:
207
+ # log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
208
+ # continue
209
 
210
+ # # Log summary statistics
211
+ # log_message("\n" + "=" * 60)
212
+ # log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
213
+ # log_message("=" * 60)
214
+ # log_message(f"Всего таблиц добавлено: {stats['total_tables']}")
215
+ # log_message(f"Общий размер: {stats['total_size']:,} символов")
216
+ # log_message(f"Средний размер таблицы: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} символов")
217
 
218
+ # log_message("\nПо документам:")
219
+ # for doc_id, doc_stats in sorted(stats['by_document'].items()):
220
+ # log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
221
+ # f"{doc_stats['size']:,} символов")
222
 
223
+ # log_message("=" * 60)
224
 
225
+ # return table_documents
226
 
227
+ # except Exception as e:
228
+ # log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
229
+ # return []
utils.py CHANGED
@@ -9,6 +9,7 @@ import time
9
  from index_retriever import rerank_nodes
10
  from my_logging import log_message
11
  from config import PROMPT_SIMPLE_POISK
 
12
 
13
  def get_llm_model(model_name):
14
  try:
@@ -172,6 +173,14 @@ def deduplicate_nodes(nodes):
172
 
173
  return unique_nodes
174
 
 
 
 
 
 
 
 
 
175
 
176
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
177
  if query_engine is None:
@@ -179,18 +188,58 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
179
 
180
  try:
181
  start_time = time.time()
182
- retrieved_nodes = query_engine.retriever.retrieve(question)
 
 
 
 
 
 
 
 
183
 
184
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
185
 
186
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
187
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
188
 
189
- # Simple reranking
190
- reranked_nodes = rerank_nodes(question, unique_retrieved, reranker, top_k=20)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
- # Direct query without formatting
193
- response = query_engine.query(question)
194
 
195
  end_time = time.time()
196
  processing_time = end_time - start_time
 
9
  from index_retriever import rerank_nodes
10
  from my_logging import log_message
11
  from config import PROMPT_SIMPLE_POISK
12
+ import re
13
 
14
  def get_llm_model(model_name):
15
  try:
 
173
 
174
  return unique_nodes
175
 
176
def normalize_query(query):
    """Normalize connection-type designations in a user query.

    Rewrites patterns like 'С-25', 'с 25' or 'C-25' to the canonical
    hyphen-free Latin form 'C25' so the query matches the normalized
    document text.  Cyrillic letters (both cases) are mapped to
    UPPER-case Latin, matching the document-side metadata.
    """
    # Hoisted out of the replacement callback: the original rebuilt this
    # dict on every regex match.
    cyr_to_lat = {'С': 'C', 'с': 'C', 'Т': 'T', 'т': 'T', 'У': 'U', 'у': 'U'}

    def repl(m):
        letter = cyr_to_lat.get(m.group(1), m.group(1))
        return f"{letter}{m.group(2)}"

    # Letter (Cyrillic С/Т/У either case, or Latin C/T/U), optional
    # hyphen or whitespace, then digits.
    return re.sub(r'\b([СсТтУуCTU])[-\s]?(\d+)\b', repl, query)
184
 
185
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
186
  if query_engine is None:
 
188
 
189
  try:
190
  start_time = time.time()
191
+
192
+ # NORMALIZE QUERY: Convert Cyrillic to Latin and remove hyphens
193
+ normalized_question = normalize_query(question)
194
+ log_message(f"Original query: {question}")
195
+ log_message(f"Normalized query: {normalized_question}")
196
+
197
+ # Use normalized query for retrieval
198
+ retrieved_nodes = query_engine.retriever.retrieve(normalized_question)
199
+ log_message(f"user query: {question}")
200
 
201
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
202
 
203
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
204
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
205
 
206
+ # Check for connection types
207
+ conn_types_retrieved = {}
208
+ for node in unique_retrieved:
209
+ if node.metadata.get('type') == 'table':
210
+ conn_type = node.metadata.get('connection_type', '')
211
+ if conn_type:
212
+ conn_types_retrieved[conn_type] = conn_types_retrieved.get(conn_type, 0) + 1
213
+
214
+ if conn_types_retrieved:
215
+ log_message("CONNECTION TYPES IN RETRIEVED:")
216
+ for ct, cnt in sorted(conn_types_retrieved.items()):
217
+ log_message(f" {ct}: {cnt} chunks")
218
+
219
+ # Check if target type was retrieved
220
+ # Normalize the check as well
221
+ normalized_check = normalize_query('С-25') # Will become C25
222
+ if normalized_check in question or 'С-25' in question or 'C-25' in question:
223
+ if 'C25' in conn_types_retrieved:
224
+ log_message(f"✓ C25 RETRIEVED: {conn_types_retrieved['C25']} chunks")
225
+ else:
226
+ log_message("✗ C25 NOT RETRIEVED despite being in query!")
227
+
228
+ # Sample of retrieved tables
229
+ log_message("SAMPLE OF RETRIEVED TABLES:")
230
+ for i, node in enumerate(unique_retrieved[:10]):
231
+ if node.metadata.get('type') == 'table':
232
+ table_num = node.metadata.get('table_number', 'N/A')
233
+ table_title = node.metadata.get('table_title', 'N/A')
234
+ conn_type = node.metadata.get('connection_type', 'N/A')
235
+ doc_id = node.metadata.get('document_id', 'N/A')
236
+ log_message(f" [{i+1}] {doc_id} - Table {table_num} - Type: {conn_type}")
237
+
238
+ # Rerank - use normalized query for consistency
239
+ reranked_nodes = rerank_nodes(normalized_question, unique_retrieved, reranker, top_k=20)
240
 
241
+ # CRITICAL FIX: Use normalized query for LLM as well
242
+ response = query_engine.query(normalized_question)
243
 
244
  end_time = time.time()
245
  processing_time = end_time - start_time