MrSimple07 committed on
Commit
379f6e4
·
1 Parent(s): 15ae02f

new keyword-based approach

Browse files
Files changed (2) hide show
  1. documents_prep.py +43 -13
  2. utils.py +18 -2
documents_prep.py CHANGED
@@ -195,6 +195,12 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
195
  normalized_rows.append(row)
196
 
197
  # Log normalization stats with examples
 
 
 
 
 
 
198
  if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
199
  log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
200
  f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
@@ -221,19 +227,43 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
221
  if base_size + len(full_rows_content) <= max_chars and len(normalized_rows) <= max_rows:
222
  content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
223
 
224
- metadata = {
225
- 'type': 'table',
226
- 'document_id': doc_id,
227
- 'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
228
- 'table_identifier': table_identifier,
229
- 'table_title': table_title,
230
- 'section': section,
231
- 'sheet_name': sheet_name,
232
- 'total_rows': len(normalized_rows),
233
- 'chunk_size': len(content),
234
- 'is_complete_table': True,
235
- 'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
236
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
 
238
  log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
239
  return [Document(text=content, metadata=metadata)]
 
195
  normalized_rows.append(row)
196
 
197
  # Log normalization stats with examples
198
+ if total_row_changes == 0 and title_changes == 0 and section_changes == 0:
199
+ sample_text = str(table_title) + ' ' + str(rows[0] if rows else '')
200
+ cyrillic_chars = [c for c in sample_text if '\u0400' <= c <= '\u04FF']
201
+ if cyrillic_chars:
202
+ log_message(f" ⚠️ WARNING: Found Cyrillic chars but no normalization: {cyrillic_chars[:10]}")
203
+
204
  if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
205
  log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
206
  f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
 
227
  if base_size + len(full_rows_content) <= max_chars and len(normalized_rows) <= max_rows:
228
  content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
229
 
230
+ metadata = {
231
+ 'type': 'table',
232
+ 'document_id': doc_id,
233
+ 'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
234
+ 'table_identifier': table_identifier,
235
+ 'table_title': table_title,
236
+ 'section': section,
237
+ 'sheet_name': sheet_name,
238
+ 'total_rows': len(normalized_rows),
239
+ 'chunk_size': len(content),
240
+ 'is_complete_table': True,
241
+
242
+ # ADD THESE - extracted steel grades for better matching
243
+ 'steel_grades': extract_steel_grades_from_table(normalized_rows, table_title),
244
+ 'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал стандарт {' '.join(extract_steel_grades_from_table(normalized_rows, table_title))}"
245
+ }
246
+
247
+ # Helper: collect steel grade mentions from the table for chunk metadata
248
def extract_steel_grades_from_table(rows, title):
    """Collect steel-grade designations found in a table's title and rows.

    The title is scanned first, then every value of the first 20 rows that
    are dicts (rows of any other type are ignored). Matches are
    de-duplicated and returned in first-seen order, so the result (and any
    metadata string built from it) is identical from run to run.

    Args:
        rows: Sequence of table rows; only ``dict`` rows are inspected.
            ``None`` is treated as an empty table.
        title: Table title; skipped when falsy, otherwise converted with
            ``str()`` before matching.

    Returns:
        A list of at most 50 distinct matched strings, exactly as they
        appear in the text (matching is case-insensitive, but the original
        spelling is kept).
    """
    import re

    # NOTE(review): in the visible diff this def is placed *after* the
    # metadata dict that calls it. If both live in the same function body,
    # the call will fail with UnboundLocalError -- the helper should sit at
    # module level (or at least above its first use). Confirm placement.

    # Digits, then a letter from the Latin/Cyrillic look-alike set, then any
    # mix of further digits and letters from that set. Compiled once per
    # call instead of being re-resolved for every cell.
    grade_re = re.compile(
        r'\b\d{1,3}[XHТCВKMAPХНТСВКМАР]\d*[XHТCВKMAPХНТСВКМАР\d]*\b',
        re.IGNORECASE,
    )

    # A dict is used as an insertion-ordered set: slicing a plain ``set``
    # would keep an arbitrary, hash-dependent subset of the grades.
    found = {}

    if title:
        found.update(dict.fromkeys(grade_re.findall(str(title))))

    # Only the first 20 rows are scanned to keep the metadata small.
    for row in (rows or [])[:20]:
        if isinstance(row, dict):
            for value in row.values():
                found.update(dict.fromkeys(grade_re.findall(str(value))))

    return list(found)[:50]
267
 
268
  log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
269
  return [Document(text=content, metadata=metadata)]
utils.py CHANGED
@@ -197,12 +197,28 @@ def debug_search_tables(vector_index, search_term="С-25"):
197
 
198
  from documents_prep import normalize_text, normalize_steel_designations
199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
201
 
202
  normalized_question = normalize_text(question)
203
- log_message(f"Normalized question: {normalized_question}")
204
  normalized_question_2, query_changes, change_list = normalize_steel_designations(question) # FIX: 3 values
205
- log_message(f"After steel normalization: {normalized_question_2}")
206
  if change_list:
207
  log_message(f"Query changes: {', '.join(change_list)}")
208
  if query_engine is None:
 
197
 
198
  from documents_prep import normalize_text, normalize_steel_designations
199
 
200
def enhance_query_for_steel_grades(query):
    """Append steel-related context terms to a query that mentions a steel grade.

    Args:
        query: The user query text. ``None`` or an empty string is returned
            unchanged.

    Returns:
        ``query`` followed by a fixed set of Russian context terms when the
        query contains at least one steel-grade-like token; otherwise
        ``query`` itself. The expanded query is logged via ``log_message``.
    """
    import re

    # Nothing to inspect; also avoids a TypeError from ``re`` on None.
    if not query:
        return query

    # Digits, then a letter from the Latin/Cyrillic look-alike set, then any
    # mix of further digits and letters from that set.
    steel_pattern = r'\b\d{1,3}[XHТCВKMAPХНТСВКМАР]\d*[XHТCВKMAPХНТСВКМАР\d]*\b'

    # Only the presence of a grade matters, so stop at the first match
    # instead of collecting every occurrence.
    if re.search(steel_pattern, query, re.IGNORECASE) is None:
        return query

    enhanced = query + " стандарт материал марка стали применение"
    log_message(f"Enhanced query with steel context: {enhanced}")
    return enhanced
215
+
216
+
217
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
218
 
219
  normalized_question = normalize_text(question)
 
220
  normalized_question_2, query_changes, change_list = normalize_steel_designations(question) # FIX: 3 values
221
+ normalized_question_2 = enhance_query_for_steel_grades(normalized_question_2)
222
  if change_list:
223
  log_message(f"Query changes: {', '.join(change_list)}")
224
  if query_engine is None: