Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 15, 2025

Commit

0d6b2c5

1 Parent(s): 3dcab53

old version with fixed, 3000, 30

Browse files

Files changed (3) hide show

config.py +1 -1
documents_prep.py +31 -61
utils.py +25 -47

config.py CHANGED Viewed

@@ -51,7 +51,7 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
 CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 128
-MAX_CHARS_TABLE = 4000
 MAX_ROWS_TABLE = 30

 CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 128
+MAX_CHARS_TABLE = 3000
 MAX_ROWS_TABLE = 30

documents_prep.py CHANGED Viewed

@@ -26,9 +26,9 @@ import re
 def normalize_steel_designations(text):
     """
-    Normalize steel designations by converting Latin letters to Cyrillic.
-    Handles patterns like 08X18H10T → 08Х18Н10Т.
-    Useful when aligning with Russian technical documentation.
     Returns: (normalized_text, changes_count, changes_list)
     """
     if not text:
@@ -39,24 +39,25 @@ def normalize_steel_designations(text):
     changes_count = 0
     changes_list = []
-    # Mapping of Latin to Cyrillic for steel designations
     replacements = {
-        'X': 'Х',  # Latin X → Cyrillic Х
-        'H': 'Н',  # Latin H → Cyrillic Н
-        'T': 'Т',  # Latin T → Cyrillic Т
-        'C': 'С',  # Latin C → Cyrillic С
-        'B': 'В',  # Latin B → Cyrillic В
-        'K': 'К',  # Latin K → Cyrillic К
-        'M': 'М',  # Latin M → Cyrillic М
-        'A': 'А',  # Latin A → Cyrillic А
-        'P': 'Р',  # Latin P → Cyrillic Р
     }
-    # Pattern for steel grades (digits + letters)
     pattern = r'\b\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
-    # Pattern for welding wire designations (e.g. CB-08X19H10)
-    pattern_wire = r'\b[CSС][BVВ]-\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
     def replace_in_steel_grade(match):
         nonlocal changes_count, changes_list
@@ -75,7 +76,6 @@ def normalize_steel_designations(text):
 def chunk_text_documents(documents):
     text_splitter = SentenceSplitter(
         chunk_size=CHUNK_SIZE,
@@ -195,12 +195,6 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
             normalized_rows.append(row)
     # Log normalization stats with examples
-    if total_row_changes == 0 and title_changes == 0 and section_changes == 0:
-        sample_text = str(table_title) + ' ' + str(rows[0] if rows else '')
-        cyrillic_chars = [c for c in sample_text if '\u0400' <= c <= '\u04FF']
-        if cyrillic_chars:
-            log_message(f"    ⚠️ WARNING: Found Cyrillic chars but no normalization: {cyrillic_chars[:10]}")
     if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
         log_message(f"    Steel normalization: title={title_changes}, section={section_changes}, "
                    f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
@@ -227,43 +221,19 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     if base_size + len(full_rows_content) <= max_chars and len(normalized_rows) <= max_rows:
         content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
-    metadata = {
-        'type': 'table',
-        'document_id': doc_id,
-        'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
-        'table_identifier': table_identifier,
-        'table_title': table_title,
-        'section': section,
-        'sheet_name': sheet_name,
-        'total_rows': len(normalized_rows),
-        'chunk_size': len(content),
-        'is_complete_table': True,
-        # ADD THESE - extracted steel grades for better matching
-        'steel_grades': extract_steel_grades_from_table(normalized_rows, table_title),
-        'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал стандарт {' '.join(extract_steel_grades_from_table(normalized_rows, table_title))}"
-    }
-    # Add this helper function:
-    def extract_steel_grades_from_table(rows, title):
-        """Extract all steel grade mentions for metadata"""
-        import re
-        grades = set()
-        # Pattern for steel grades (both normalized and original)
-        pattern = r'\b\d{1,3}[XHТCВKMAPХНТСВКМАР]\d*[XHТCВKMAPХНТСВКМАР\d]*\b'
-        # Check title
-        if title:
-            grades.update(re.findall(pattern, str(title), re.IGNORECASE))
-        # Check rows (limit to first 20 rows to avoid bloat)
-        for row in rows[:20]:
-            if isinstance(row, dict):
-                for v in row.values():
-                    grades.update(re.findall(pattern, str(v), re.IGNORECASE))
-        return list(grades)[:50]
         log_message(f"    Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
         return [Document(text=content, metadata=metadata)]

 def normalize_steel_designations(text):
     """
+    Normalize steel designations by converting Cyrillic letters to Latin.
+    This improves search/retrieval since embedding models work better with Latin.
+    Handles patterns like 08Х18Н10Т → 08X18H10T
     Returns: (normalized_text, changes_count, changes_list)
     """
     if not text:
     changes_count = 0
     changes_list = []
+    # Mapping of Cyrillic to Latin for steel designations
     replacements = {
+        'Х': 'X',  # Cyrillic Kha → Latin X
+        'Н': 'H',  # Cyrillic En → Latin H
+        'Т': 'T',  # Cyrillic Te → Latin T
+        'С': 'C',  # Cyrillic Es → Latin C
+        'В': 'B',  # Cyrillic Ve → Latin B
+        'К': 'K',  # Cyrillic Ka → Latin K
+        'М': 'M',  # Cyrillic Em → Latin M
+        'А': 'A',  # Cyrillic A → Latin A
+        'Р': 'P',  # Cyrillic Er → Latin P
     }
+    # Pattern: starts with digits, then letters+digits (steel grade pattern)
+    # Examples: 08Х18Н10Т, 12Х18Н9, 10Х17Н13М2Т, СВ-08Х19Н10
     pattern = r'\b\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
+    # Also match welding wire patterns like СВ-08Х19Н10
+    pattern_wire = r'\b[СC][ВB]-\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
     def replace_in_steel_grade(match):
         nonlocal changes_count, changes_list
 def chunk_text_documents(documents):
     text_splitter = SentenceSplitter(
         chunk_size=CHUNK_SIZE,
             normalized_rows.append(row)
     # Log normalization stats with examples
     if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
         log_message(f"    Steel normalization: title={title_changes}, section={section_changes}, "
                    f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
     if base_size + len(full_rows_content) <= max_chars and len(normalized_rows) <= max_rows:
         content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
+        metadata = {
+            'type': 'table',
+            'document_id': doc_id,
+            'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
+            'table_identifier': table_identifier,
+            'table_title': table_title,
+            'section': section,
+            'sheet_name': sheet_name,
+            'total_rows': len(normalized_rows),
+            'chunk_size': len(content),
+            'is_complete_table': True,
+            'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
+        }
         log_message(f"    Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
         return [Document(text=content, metadata=metadata)]

utils.py CHANGED Viewed

@@ -197,71 +197,47 @@ def debug_search_tables(vector_index, search_term="С-25"):
 from documents_prep import normalize_text, normalize_steel_designations
-def expand_query_with_llm(query, llm_model):
-    """Generate 5 alternative query formulations using LLM"""
-    try:
-        from config import QUERY_EXPANSION_PROMPT
-        expansion_prompt = QUERY_EXPANSION_PROMPT.format(original_query=query)
-        log_message(f"Generating query variations for: {query}")
-        response = llm_model.complete(expansion_prompt)
-        # Parse response - split by newlines and filter empty
-        variations = [line.strip() for line in response.text.split('\n') if line.strip()]
-        variations = variations[:5]  # Take only first 5
-        if variations:
-            log_message(f"Generated {len(variations)} query variations:")
-            for i, var in enumerate(variations, 1):
-                log_message(f"  {i}. {var}")
-            # Combine original + variations
-            combined_query = query + " " + " ".join(variations)
-            return combined_query
-        else:
-            log_message("No variations generated, using original query")
-            return query
-    except Exception as e:
-        log_message(f"Error generating query variations: {e}")
-        return query
 def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
-    # Apply normalizations
     normalized_question = normalize_text(question)
-    normalized_question_2, query_changes, change_list = normalize_steel_designations(normalized_question)
     if change_list:
-        log_message(f"Query changes: {', '.join(change_list)}")
     if query_engine is None:
         return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
     try:
         start_time = time.time()
-        # EXPAND QUERY USING LLM
-        from utils import get_llm_model
-        llm = get_llm_model(current_model)
-        expanded_query = expand_query_with_llm(normalized_question_2, llm)
-        # Use expanded query for retrieval
-        retrieved_nodes = query_engine.retriever.retrieve(expanded_query)
         log_message(f"user query: {question}")
         log_message(f"normalized query: {normalized_question}")
         log_message(f"after steel normalization: {normalized_question_2}")
-        log_message(f"expanded query length: {len(expanded_query)} chars")
         log_message(f"Steel grades normalized in query: {query_changes}")
         log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
         unique_retrieved = deduplicate_nodes(retrieved_nodes)
         log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
         for i, node in enumerate(unique_retrieved):
             node_type = node.metadata.get('type', 'text')
             doc_id = node.metadata.get('document_id', 'N/A')
@@ -270,6 +246,7 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
                 table_num = node.metadata.get('table_number', 'N/A')
                 table_id = node.metadata.get('table_identifier', 'N/A')
                 table_title = node.metadata.get('table_title', 'N/A')
                 content_preview = node.text[:200].replace('\n', ' ')
                 log_message(f"  [{i+1}] {doc_id} - Table {table_num} | ID: {table_id}")
                 log_message(f"      Title: {table_title[:80]}")
@@ -280,10 +257,11 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
         log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
         reranked_nodes = rerank_nodes(normalized_question_2, unique_retrieved, reranker,
-                                     top_k=rerank_top_k)
-        # Use ORIGINAL normalized question for final answer generation
         response = query_engine.query(normalized_question_2)
         end_time = time.time()

 from documents_prep import normalize_text, normalize_steel_designations
+def enhance_query_for_steel_grades(query):
+    """Expand query with related terms for better steel grade retrieval"""
+    import re
+    # Detect if query contains steel grades
+    steel_pattern = r'\b\d{1,3}[XHТCВKMAPХНТСВКМАР]\d*[XHТCВKMAPХНТСВКМАР\d]*\b'
+    matches = re.findall(steel_pattern, query, re.IGNORECASE)
+    if matches:
+        # Add contextual terms
+        enhanced = query + " стандарт материал марка стали применение"
+        log_message(f"Enhanced query with steel context: {enhanced}")
+        return enhanced
+    return query
 def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
     normalized_question = normalize_text(question)
+    normalized_question_2, query_changes, change_list = normalize_steel_designations(question)  # FIX: 3 values
     if change_list:
+        log_message(f"Query changes: {', '.join(change_list)}")
     if query_engine is None:
         return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
     try:
         start_time = time.time()
+        retrieved_nodes = query_engine.retriever.retrieve(normalized_question_2)
         log_message(f"user query: {question}")
         log_message(f"normalized query: {normalized_question}")
         log_message(f"after steel normalization: {normalized_question_2}")
         log_message(f"Steel grades normalized in query: {query_changes}")
         log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
         unique_retrieved = deduplicate_nodes(retrieved_nodes)
+        # IMPROVED DEBUG: Log what was actually retrieved with FULL metadata
         log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
         for i, node in enumerate(unique_retrieved):
             node_type = node.metadata.get('type', 'text')
             doc_id = node.metadata.get('document_id', 'N/A')
                 table_num = node.metadata.get('table_number', 'N/A')
                 table_id = node.metadata.get('table_identifier', 'N/A')
                 table_title = node.metadata.get('table_title', 'N/A')
+                # Show first 200 chars of content to verify it's the right table
                 content_preview = node.text[:200].replace('\n', ' ')
                 log_message(f"  [{i+1}] {doc_id} - Table {table_num} | ID: {table_id}")
                 log_message(f"      Title: {table_title[:80]}")
         log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
+        # Simple reranking with NORMALIZED question and PARAMETERIZED top_k
         reranked_nodes = rerank_nodes(normalized_question_2, unique_retrieved, reranker,
+                                     top_k=rerank_top_k)  # NOW PARAMETERIZED
+        # Direct query without formatting - use normalized question
         response = query_engine.query(normalized_question_2)
         end_time = time.time()