Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 8, 2025

Commit

30336c3

1 Parent(s): 4834e86

normalize anyways + max row = 15 + max chars = 3000

Browse files

Files changed (2) hide show

config.py +2 -2
documents_prep.py +21 -4

config.py CHANGED Viewed

@@ -52,8 +52,8 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
 CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 128
-MAX_CHARS_TABLE = 2000
-MAX_ROWS_TABLE = 40
 CUSTOM_PROMPT = """
 Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.

 CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 128
+MAX_CHARS_TABLE = 3000
+MAX_ROWS_TABLE = 15
 CUSTOM_PROMPT = """
 Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.

documents_prep.py CHANGED Viewed

@@ -34,11 +34,16 @@ def chunk_text_documents(documents):
     return chunked
 def extract_connection_type(text):
-    """Extract connection type like С-25, У-14, etc. from text"""
     import re
-    match = re.search(r'[СУUTC]-?\d+(?:-\d+)?', text)
-    return match.group(0) if match else ''
 def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
     headers = table_data.get('headers', [])
@@ -183,7 +188,10 @@ def format_table_header(doc_id, table_identifier, table_num, table_title, sectio
         type_match = re.search(r'[СУUTC]-?\d+(?:-\d+)?', table_title)
         if type_match:
             connection_type = type_match.group(0)
-            content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
     if table_num and table_num != table_identifier:
         content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
@@ -443,6 +451,15 @@ def load_table_documents(repo_id, hf_token, table_dir):
             log_message(f"Error loading {file_path}: {e}")
     log_message(f"✓ Loaded {len(all_chunks)} table chunks")
     return all_chunks

     return chunked
+def normalize_connection_type(s):
+    # Replace Cyrillic С/с with Latin C/c
+    return s.replace('С', 'C').replace('с', 'c')
 def extract_connection_type(text):
     import re
+    match = re.search(r'[СCс]-?\d+(?:-\d+)?', text)
+    if match:
+        return normalize_connection_type(match.group(0))
+    return ''
 def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
     headers = table_data.get('headers', [])
         type_match = re.search(r'[СУUTC]-?\d+(?:-\d+)?', table_title)
         if type_match:
             connection_type = type_match.group(0)
+            # NORMALIZE: Convert Cyrillic to Latin for consistency
+            connection_type_normalized = connection_type.replace('С', 'C').replace('У', 'U').replace('Т', 'T')
+            # Show BOTH in content for searchability
+            content += f"ТИП СОЕДИНЕНИЯ: {connection_type} ({connection_type_normalized})\n"
     if table_num and table_num != table_identifier:
         content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
             log_message(f"Error loading {file_path}: {e}")
     log_message(f"✓ Loaded {len(all_chunks)} table chunks")
+    log_message("="*60)
+    log_message("CONNECTION TYPE ENCODING CHECK:")
+    for chunk in all_chunks[:50]:  # Check first 50
+        conn_type = chunk.metadata.get('connection_type', '')
+        if 'C' in conn_type or 'С' in conn_type:
+            # Show both representations
+            log_message(f"  Original: '{conn_type}' | Bytes: {conn_type.encode('utf-8')}")
+    log_message("="*60)
     return all_chunks