MrSimple07 commited on
Commit
8d6a517
·
1 Parent(s): 6ff1953

Added debugging functions for the С-25 connection-type lookup

Browse files
Files changed (2) hide show
  1. documents_prep.py +38 -14
  2. utils.py +30 -0
documents_prep.py CHANGED
@@ -34,6 +34,11 @@ def chunk_text_documents(documents):
34
 
35
  return chunked
36
 
 
 
 
 
 
37
 
38
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
39
  headers = table_data.get('headers', [])
@@ -41,6 +46,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
41
  table_num = table_data.get('table_number', 'unknown')
42
  table_title = table_data.get('table_title', '')
43
  section = table_data.get('section', '')
 
44
 
45
  table_num_clean = str(table_num).strip()
46
 
@@ -60,8 +66,13 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
60
 
61
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
62
 
63
- # Calculate base metadata size
64
  base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
 
 
 
 
 
65
  base_size = len(base_content)
66
  available_space = max_chars - base_size - 200
67
 
@@ -79,7 +90,9 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
79
  'section': section,
80
  'total_rows': len(rows),
81
  'chunk_size': len(content),
82
- 'is_complete_table': True
 
 
83
  }
84
 
85
  log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
@@ -155,27 +168,38 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
155
 
156
  return chunks
157
 
158
-
159
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
160
- content = f"ТАБЛИЦА {table_identifier} из документа {doc_id}\n"
161
-
162
- # Add table type/number prominently for matching
163
- if table_num:
164
- content += f"ТИП: {table_num}\n"
165
 
 
166
  if table_title:
167
- content += f"НАЗВАНИЕ: {table_title}\n"
 
 
 
 
 
 
 
 
 
 
168
 
169
  if section:
170
- content += f"РАЗДЕЛ: {section}\n"
171
 
172
- content += f"{'='*70}\n"
173
 
 
174
  if headers:
175
- header_str = ' | '.join(str(h) for h in headers)
176
- content += f"ЗАГОЛОВКИ: {header_str}\n\n"
 
 
177
 
178
- content += "ДАННЫЕ:\n"
179
  return content
180
 
181
 
 
34
 
35
  return chunked
36
 
37
def extract_connection_type(text):
    """Return the first welded-connection type designation found in *text*.

    Matches designations such as "С-25", "У-14" or "С25": one letter from
    the class [СУUTC], an optional hyphen, digits, and an optional "-NN"
    suffix (e.g. "У-14-2").

    NOTE: the character class deliberately mixes alphabets — С/У are
    Cyrillic, U/T/C are Latin — because the extracted source data contains
    both. A Latin "C-25" and a Cyrillic "С-25" are different strings.

    Args:
        text: string to scan; None or '' is tolerated.

    Returns:
        The matched designation, or '' when *text* is empty/None or has
        no match.
    """
    import re  # local import keeps this helper self-contained

    # Robustness: callers pass table titles that may be missing (None/'').
    if not text:
        return ''
    match = re.search(r'[СУUTC]-?\d+(?:-\d+)?', text)
    return match.group(0) if match else ''
42
 
43
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
44
  headers = table_data.get('headers', [])
 
46
  table_num = table_data.get('table_number', 'unknown')
47
  table_title = table_data.get('table_title', '')
48
  section = table_data.get('section', '')
49
+ table_description = table_data.get('table_description', '') # NEW
50
 
51
  table_num_clean = str(table_num).strip()
52
 
 
66
 
67
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
68
 
69
+ # Calculate base metadata size - NOW INCLUDING DESCRIPTION
70
  base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
71
+
72
+ # ADD DESCRIPTION HERE if it exists
73
+ if table_description:
74
+ base_content += f"ОПИСАНИЕ: {table_description}\n\n"
75
+
76
  base_size = len(base_content)
77
  available_space = max_chars - base_size - 200
78
 
 
90
  'section': section,
91
  'total_rows': len(rows),
92
  'chunk_size': len(content),
93
+ 'is_complete_table': True,
94
+ 'connection_type': extract_connection_type(table_title) if table_title else '' # NEW
95
+
96
  }
97
 
98
  log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
 
168
 
169
  return chunks
170
 
 
171
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
    """Build the metadata preamble that precedes a table's data rows.

    The header uses Russian field labels (ДОКУМЕНТ, ТАБЛИЦА, ТИП
    СОЕДИНЕНИЯ, ...) so the retrieval layer can match queries like
    "С-25" against the prominently-placed connection type.

    Args:
        doc_id: source document identifier.
        table_identifier: human-readable table id (may equal table_num).
        table_num: raw table number from extraction ('unknown' when absent).
        table_title: table caption; scanned for a connection-type code.
        section: document section the table belongs to (may be '').
        headers: list of column headers (may be empty).

    Returns:
        The formatted header string, ending with "ДАННЫЕ ТАБЛИЦЫ:\n".
    """
    # Start with clear identification
    content = f"ДОКУМЕНТ: {doc_id}\n"
    content += f"ТАБЛИЦА: {table_identifier}\n"

    # Extract and emphasize the connection type if present
    if table_title:
        content += f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n"

        # Reuse the shared helper instead of duplicating its regex inline,
        # so the connection-type pattern is maintained in exactly one place.
        connection_type = extract_connection_type(table_title)
        if connection_type:
            content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"

    # Avoid printing the number twice when it already equals the identifier.
    if table_num and table_num != table_identifier:
        content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"

    if section:
        content += f"РАЗДЕЛ ДОКУМЕНТА: {section}\n"

    content += f"\n{'='*70}\n"

    # Add headers as a numbered column list for readability.
    if headers:
        content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
        for i, h in enumerate(headers, 1):
            content += f" {i}. {h}\n"
        content += "\n"

    content += "ДАННЫЕ ТАБЛИЦЫ:\n"
    return content
204
 
205
 
utils.py CHANGED
@@ -172,6 +172,28 @@ def deduplicate_nodes(nodes):
172
 
173
  return unique_nodes
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
177
  if query_engine is None:
@@ -186,6 +208,14 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
186
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
187
 
188
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
 
 
 
 
 
 
 
 
189
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
190
 
191
  # Simple reranking
 
172
 
173
  return unique_nodes
174
 
175
def debug_search_tables(vector_index, search_term="С-25"):
    """Log every table node in *vector_index* whose text or title contains *search_term*.

    Debug helper only — it scans the entire docstore linearly, so keep it
    out of the production query path.

    Args:
        vector_index: index exposing .docstore.docs (node-id -> node mapping).
        search_term: substring to look for. Default "С-25" uses a Cyrillic
            С — a Latin "C-25" in the data will NOT match it.

    Returns:
        A list of dicts with 'doc_id', 'table_num' and 'title' (title
        truncated to 100 chars) for each matching table node.
    """
    all_nodes = list(vector_index.docstore.docs.values())

    matching = []
    for node in all_nodes:
        if node.metadata.get('type') != 'table':
            continue
        # `or ''` guards against a 'table_title' key that exists but holds
        # None — .get(key, '') does NOT protect against that, and the `in`
        # test / [:100] slice below would raise TypeError.
        title = node.metadata.get('table_title') or ''
        text = node.get_content()
        if search_term in text or search_term in title:
            matching.append({
                'doc_id': node.metadata.get('document_id'),
                'table_num': node.metadata.get('table_number'),
                'title': title[:100]
            })

    log_message(f"\n{'='*60}")
    log_message(f"DEBUG: Found {len(matching)} tables containing '{search_term}'")
    for m in matching:
        log_message(f" • {m['doc_id']} - Table {m['table_num']}: {m['title']}")
    log_message(f"{'='*60}\n")

    return matching
197
 
198
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
199
  if query_engine is None:
 
208
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
209
 
210
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
211
+
212
+ # DEBUG: Log what was retrieved
213
+ log_message(f"RETRIEVED: {len(unique_retrieved)} nodes")
214
+ for i, node in enumerate(unique_retrieved): # All debug
215
+ table_num = node.metadata.get('table_number', 'N/A')
216
+ table_title = node.metadata.get('table_title', 'N/A')
217
+ doc_id = node.metadata.get('document_id', 'N/A')
218
+ log_message(f" [{i+1}] {doc_id} - Table {table_num}: {table_title[:50]}")
219
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
220
 
221
  # Simple reranking