Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 15, 2025

Commit

379f6e4

1 Parent(s): 15ae02f

new keyboard-based approach

Browse files

Files changed (2) hide show

documents_prep.py +43 -13
utils.py +18 -2

documents_prep.py CHANGED Viewed

@@ -195,6 +195,12 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
             normalized_rows.append(row)
     # Log normalization stats with examples
     if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
         log_message(f"    Steel normalization: title={title_changes}, section={section_changes}, "
                    f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
@@ -221,19 +227,43 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     if base_size + len(full_rows_content) <= max_chars and len(normalized_rows) <= max_rows:
         content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
-        metadata = {
-            'type': 'table',
-            'document_id': doc_id,
-            'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
-            'table_identifier': table_identifier,
-            'table_title': table_title,
-            'section': section,
-            'sheet_name': sheet_name,
-            'total_rows': len(normalized_rows),
-            'chunk_size': len(content),
-            'is_complete_table': True,
-            'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
-        }
         log_message(f"    Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
         return [Document(text=content, metadata=metadata)]

             normalized_rows.append(row)
     # Log normalization stats with examples
+    if total_row_changes == 0 and title_changes == 0 and section_changes == 0:
+        sample_text = str(table_title) + ' ' + str(rows[0] if rows else '')
+        cyrillic_chars = [c for c in sample_text if '\u0400' <= c <= '\u04FF']
+        if cyrillic_chars:
+            log_message(f"    ⚠️ WARNING: Found Cyrillic chars but no normalization: {cyrillic_chars[:10]}")
     if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
         log_message(f"    Steel normalization: title={title_changes}, section={section_changes}, "
                    f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
     if base_size + len(full_rows_content) <= max_chars and len(normalized_rows) <= max_rows:
         content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
+    metadata = {
+        'type': 'table',
+        'document_id': doc_id,
+        'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
+        'table_identifier': table_identifier,
+        'table_title': table_title,
+        'section': section,
+        'sheet_name': sheet_name,
+        'total_rows': len(normalized_rows),
+        'chunk_size': len(content),
+        'is_complete_table': True,
+        # ADD THESE - extracted steel grades for better matching
+        'steel_grades': extract_steel_grades_from_table(normalized_rows, table_title),
+        'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал стандарт {' '.join(extract_steel_grades_from_table(normalized_rows, table_title))}"
+    }
+    # Add this helper function:
+    def extract_steel_grades_from_table(rows, title):
+        """Extract all steel grade mentions for metadata"""
+        import re
+        grades = set()
+        # Pattern for steel grades (both normalized and original)
+        pattern = r'\b\d{1,3}[XHТCВKMAPХНТСВКМАР]\d*[XHТCВKMAPХНТСВКМАР\d]*\b'
+        # Check title
+        if title:
+            grades.update(re.findall(pattern, str(title), re.IGNORECASE))
+        # Check rows (limit to first 20 rows to avoid bloat)
+        for row in rows[:20]:
+            if isinstance(row, dict):
+                for v in row.values():
+                    grades.update(re.findall(pattern, str(v), re.IGNORECASE))
+        return list(grades)[:50]
         log_message(f"    Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
         return [Document(text=content, metadata=metadata)]

utils.py CHANGED Viewed

@@ -197,12 +197,28 @@ def debug_search_tables(vector_index, search_term="С-25"):
 from documents_prep import normalize_text, normalize_steel_designations
 def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
     normalized_question = normalize_text(question)
-    log_message(f"Normalized question: {normalized_question}")
     normalized_question_2, query_changes, change_list = normalize_steel_designations(question)  # FIX: 3 values
-    log_message(f"After steel normalization: {normalized_question_2}")
     if change_list:
         log_message(f"Query changes: {', '.join(change_list)}")
     if query_engine is None:

 from documents_prep import normalize_text, normalize_steel_designations
def enhance_query_for_steel_grades(query):
    """Append steel-related context terms to a query that mentions a steel grade.

    A query counts as mentioning a steel grade when it contains a token that
    matches the steel-grade pattern below (case-insensitive).  In that case a
    fixed suffix of Russian context terms is appended and the enhanced query
    is logged through ``log_message``.

    Args:
        query: The user query string.

    Returns:
        The query with the context suffix appended when a grade-like token is
        present; otherwise the query unchanged.  Empty or non-string input is
        returned as-is, and a query that already ends with the suffix is not
        extended a second time.
    """
    import re

    # The character class mixes look-alike Latin and Cyrillic letters so that
    # both spellings of a grade are detected.
    steel_pattern = r'\b\d{1,3}[XHТCВKMAPХНТСВКМАР]\d*[XHТCВKMAPХНТСВКМАР\d]*\b'
    context_suffix = " стандарт материал марка стали применение"

    # Nothing to inspect: hand the value back instead of raising in ``re``.
    if not query or not isinstance(query, str):
        return query

    # Keep the function idempotent so repeated calls do not stack the suffix.
    if query.endswith(context_suffix):
        return query

    # Only existence matters here, so stop at the first match.
    if re.search(steel_pattern, query, re.IGNORECASE) is None:
        return query

    enhanced = query + context_suffix
    log_message(f"Enhanced query with steel context: {enhanced}")
    return enhanced
 def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
     normalized_question = normalize_text(question)
     normalized_question_2, query_changes, change_list = normalize_steel_designations(question)  # FIX: 3 values
+    normalized_question_2 = enhance_query_for_steel_grades(normalized_question_2)
     if change_list:
         log_message(f"Query changes: {', '.join(change_list)}")
     if query_engine is None: