MrSimple07 commited on
Commit
9c77451
·
1 Parent(s): 60178fd

fix normalization of hyphens

Browse files
Files changed (3) hide show
  1. documents_prep.py +28 -24
  2. index_retriever.py +13 -6
  3. utils.py +18 -2
documents_prep.py CHANGED
@@ -39,16 +39,17 @@ def normalize_connection_type(s):
39
  s = s.replace('С', 'C').replace('с', 'c')
40
  s = s.replace('У', 'U').replace('у', 'u')
41
  s = s.replace('Т', 'T').replace('т', 't')
42
- # REMOVE HYPHENS for consistent tokenization
43
  s = s.replace('-', '')
44
  return s
45
 
46
  def extract_connection_type(text):
47
  import re
48
- # Match with or without hyphen
49
- match = re.search(r'[СCс]-?\d+(?:-\d+)?', text)
50
  if match:
51
- return normalize_connection_type(match.group(0))
 
52
  return ''
53
 
54
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
@@ -181,23 +182,17 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
181
  return chunks
182
 
183
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
184
- # Start with clear identification
185
  content = f"ДОКУМЕНТ: {doc_id}\n"
186
  content += f"ТАБЛИЦА: {table_identifier}\n"
187
 
188
- # Extract and emphasize the connection type if present
189
  if table_title:
190
  content += f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n"
191
 
192
- # Parse type from title (e.g., "С-25" from "Тип сварного соединения С-25")
193
- import re
194
- type_match = re.search(r'[СУUTC]-?\d+(?:-\d+)?', table_title)
195
- if type_match:
196
- connection_type = type_match.group(0)
197
- # NORMALIZE: Convert Cyrillic to Latin for consistency
198
- connection_type_normalized = connection_type.replace('С', 'C').replace('У', 'U').replace('Т', 'T')
199
- # Show BOTH in content for searchability
200
- content += f"ТИП СОЕДИНЕНИЯ: {connection_type} ({connection_type_normalized})\n"
201
 
202
  if table_num and table_num != table_identifier:
203
  content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
@@ -207,7 +202,6 @@ def format_table_header(doc_id, table_identifier, table_num, table_title, sectio
207
 
208
  content += f"\n{'='*70}\n"
209
 
210
- # Add headers with better formatting
211
  if headers:
212
  content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
213
  for i, h in enumerate(headers, 1):
@@ -432,6 +426,8 @@ def load_table_documents(repo_id, hf_token, table_dir):
432
  table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
433
 
434
  all_chunks = []
 
 
435
  for file_path in table_files:
436
  try:
437
  local_path = hf_hub_download(
@@ -448,27 +444,35 @@ def load_table_documents(repo_id, hf_token, table_dir):
448
 
449
  for sheet in data.get('sheets', []):
450
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
 
 
451
 
452
- # Use the consistent MAX_CHARS_TABLE from config
453
  chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
454
  all_chunks.extend(chunks)
455
 
 
 
 
 
 
 
 
456
  except Exception as e:
457
  log_message(f"Error loading {file_path}: {e}")
458
 
459
  log_message(f"✓ Loaded {len(all_chunks)} table chunks")
460
 
461
  log_message("="*60)
462
- log_message("CONNECTION TYPE ENCODING CHECK:")
463
- for chunk in all_chunks[:50]: # Check first 50
464
- conn_type = chunk.metadata.get('connection_type', '')
465
- if 'C' in conn_type or 'С' in conn_type:
466
- # Show both representations
467
- log_message(f" Original: '{conn_type}' | Bytes: {conn_type.encode('utf-8')}")
468
  log_message("="*60)
 
469
  return all_chunks
470
 
471
-
472
  def load_image_documents(repo_id, hf_token, image_dir):
473
  """Load image descriptions"""
474
  log_message("Loading images...")
 
39
  s = s.replace('С', 'C').replace('с', 'c')
40
  s = s.replace('У', 'U').replace('у', 'u')
41
  s = s.replace('Т', 'T').replace('т', 't')
42
+ # REMOVE ALL HYPHENS for consistent tokenization
43
  s = s.replace('-', '')
44
  return s
45
 
46
  def extract_connection_type(text):
47
  import re
48
+ # Match pattern with or without hyphens: C-25, C-25-1, С25, etc.
49
+ match = re.search(r'[СCс]-?\d+(?:-\d+)*', text)
50
  if match:
51
+ normalized = normalize_connection_type(match.group(0))
52
+ return normalized
53
  return ''
54
 
55
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
 
182
  return chunks
183
 
184
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
 
185
  content = f"ДОКУМЕНТ: {doc_id}\n"
186
  content += f"ТАБЛИЦА: {table_identifier}\n"
187
 
 
188
  if table_title:
189
  content += f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n"
190
 
191
+ # Extract and normalize connection type
192
+ connection_type = extract_connection_type(table_title)
193
+ if connection_type:
194
+ # Show normalized version for searchability
195
+ content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
 
 
 
 
196
 
197
  if table_num and table_num != table_identifier:
198
  content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
 
202
 
203
  content += f"\n{'='*70}\n"
204
 
 
205
  if headers:
206
  content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
207
  for i, h in enumerate(headers, 1):
 
426
  table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
427
 
428
  all_chunks = []
429
+ connection_type_sources = {} # Track which table each type comes from
430
+
431
  for file_path in table_files:
432
  try:
433
  local_path = hf_hub_download(
 
444
 
445
  for sheet in data.get('sheets', []):
446
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
447
+ table_num = sheet.get('table_number', 'unknown')
448
+ table_title = sheet.get('table_title', '')
449
 
 
450
  chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
451
  all_chunks.extend(chunks)
452
 
453
+ # Track connection type source
454
+ conn_type = extract_connection_type(table_title)
455
+ if conn_type:
456
+ if conn_type not in connection_type_sources:
457
+ connection_type_sources[conn_type] = []
458
+ connection_type_sources[conn_type].append(f"{sheet_doc_id} Table {table_num}")
459
+
460
  except Exception as e:
461
  log_message(f"Error loading {file_path}: {e}")
462
 
463
  log_message(f"✓ Loaded {len(all_chunks)} table chunks")
464
 
465
  log_message("="*60)
466
+ log_message("CONNECTION TYPES AND THEIR SOURCES:")
467
+ for conn_type in sorted(connection_type_sources.keys()):
468
+ sources = connection_type_sources[conn_type]
469
+ log_message(f" {conn_type}: {len(sources)} tables")
470
+ for src in sources:
471
+ log_message(f" - {src}")
472
  log_message("="*60)
473
+
474
  return all_chunks
475
 
 
476
  def load_image_documents(repo_id, hf_token, image_dir):
477
  """Load image descriptions"""
478
  log_message("Loading images...")
index_retriever.py CHANGED
@@ -11,25 +11,32 @@ from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
11
  def create_vector_index(documents):
12
  log_message("Строю векторный индекс")
13
 
14
- # NEW: Analyze connection types before indexing
15
- connection_types = {}
16
  table_count = 0
 
17
  for doc in documents:
18
  if doc.metadata.get('type') == 'table':
19
  table_count += 1
20
  conn_type = doc.metadata.get('connection_type', '')
21
  if conn_type:
22
- connection_types[conn_type] = connection_types.get(conn_type, 0) + 1
 
 
 
23
 
24
  log_message("="*60)
25
  log_message(f"INDEXING {table_count} TABLE CHUNKS")
26
- log_message("CONNECTION TYPES IN INDEX:")
27
- for conn_type, count in sorted(connection_types.items()):
28
- log_message(f" {conn_type}: {count} chunks")
 
 
 
29
  log_message("="*60)
30
 
31
  return VectorStoreIndex.from_documents(documents)
32
 
 
33
  def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
34
  if not nodes or not reranker:
35
  return nodes[:top_k]
 
11
  def create_vector_index(documents):
12
  log_message("Строю векторный индекс")
13
 
14
+ connection_type_sources = {}
 
15
  table_count = 0
16
+
17
  for doc in documents:
18
  if doc.metadata.get('type') == 'table':
19
  table_count += 1
20
  conn_type = doc.metadata.get('connection_type', '')
21
  if conn_type:
22
+ table_id = f"{doc.metadata.get('document_id', 'unknown')} Table {doc.metadata.get('table_number', 'N/A')}"
23
+ if conn_type not in connection_type_sources:
24
+ connection_type_sources[conn_type] = []
25
+ connection_type_sources[conn_type].append(table_id)
26
 
27
  log_message("="*60)
28
  log_message(f"INDEXING {table_count} TABLE CHUNKS")
29
+ log_message("CONNECTION TYPES IN INDEX WITH SOURCES:")
30
+ for conn_type in sorted(connection_type_sources.keys()):
31
+ sources = list(set(connection_type_sources[conn_type])) # Unique sources
32
+ log_message(f" {conn_type}: {len(connection_type_sources[conn_type])} chunks from {len(sources)} tables")
33
+ for src in sources:
34
+ log_message(f" - {src}")
35
  log_message("="*60)
36
 
37
  return VectorStoreIndex.from_documents(documents)
38
 
39
+
40
  def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
41
  if not nodes or not reranker:
42
  return nodes[:top_k]
utils.py CHANGED
@@ -173,6 +173,16 @@ def deduplicate_nodes(nodes):
173
  return unique_nodes
174
 
175
 
 
 
 
 
 
 
 
 
 
 
176
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
177
  if query_engine is None:
178
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
@@ -180,8 +190,14 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
180
  try:
181
  start_time = time.time()
182
 
183
- # DON'T normalize - use original query directly
184
- retrieved_nodes = query_engine.retriever.retrieve(question)
 
 
 
 
 
 
185
  log_message(f"user query: {question}")
186
 
187
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
 
173
  return unique_nodes
174
 
175
 
176
+ def normalize_query(query):
177
+ """Normalize query to match stored format"""
178
+ import re
179
+ # Replace Cyrillic connection types with Latin
180
+ query = query.replace('С-', 'C-').replace('с-', 'c-')
181
+ query = query.replace('У-', 'U-').replace('у-', 'u-')
182
+ query = query.replace('Т-', 'T-').replace('т-', 't-')
183
+ return query
184
+
185
+
186
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
187
  if query_engine is None:
188
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
 
190
  try:
191
  start_time = time.time()
192
 
193
+ # NORMALIZE QUERY: Convert Cyrillic to Latin
194
+ normalized_question = normalize_query(question)
195
+ log_message(f"Original query: {question}")
196
+ if normalized_question != question:
197
+ log_message(f"Normalized query: {normalized_question}")
198
+
199
+ # Use normalized query for retrieval
200
+ retrieved_nodes = query_engine.retriever.retrieve(normalized_question)
201
  log_message(f"user query: {question}")
202
 
203
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")