Spaces:

MrSimple01
/

RAG_AIEXP_1

Sleeping

App Files Files Community

MrSimple07 committed on Oct 14, 2025

Commit

e4fd158

1 Parent(s): a90618e

new version top k 60, 0.6, chunk size 4500, chunkrow 50

Browse files

Files changed (4) hide show

config.py +2 -2
documents_prep.py +47 -108
index_retriever.py +4 -4
utils.py +41 -49

config.py CHANGED Viewed

@@ -52,8 +52,8 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
 CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 128
-MAX_CHARS_TABLE = 2500
-MAX_ROWS_TABLE = 15
 CUSTOM_PROMPT = """
 Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.

 CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 128
+MAX_CHARS_TABLE = 4500
+MAX_ROWS_TABLE = 50
 CUSTOM_PROMPT = """
 Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.

documents_prep.py CHANGED Viewed

@@ -34,26 +34,20 @@ def chunk_text_documents(documents):
     return chunked
-def normalize_connection_type(s):
-    # Replace Cyrillic with Latin
-    s = s.replace('С', 'C').replace('с', 'c')
-    s = s.replace('У', 'U').replace('у', 'u')
-    s = s.replace('Т', 'T').replace('т', 't')
-    s= s.replace('С-', 'C-').replace('с-', 'c-')
-    s = s.replace('У-', 'U-').replace('у-', 'u-')
-    s = s.replace('Т-', 'T-').replace('т-', 't-')
-    # REMOVE ALL HYPHENS for consistent tokenization
-    s = s.replace('-', '')
-    return s
-def extract_connection_type(text):
     import re
-    # Match pattern with or without hyphens: C-25, C-25-1, С25, etc.
-    match = re.search(r'[СCс]-?\d+(?:-\d+)*', text)
-    if match:
-        normalized = normalize_connection_type(match.group(0))
-        return normalized
-    return ''
 def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
     headers = table_data.get('headers', [])
@@ -61,9 +55,9 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     table_num = table_data.get('table_number', 'unknown')
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
-    table_description = table_data.get('table_description', '')
     table_num_clean = str(table_num).strip()
     import re
     if 'приложени' in section.lower():
@@ -81,13 +75,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     log_message(f"  📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
-    # Calculate base metadata size - NOW INCLUDING DESCRIPTION
-    base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
-    # ADD DESCRIPTION HERE if it exists
-    if table_description:
-        base_content += f"ОПИСАНИЕ: {table_description}\n\n"
     base_size = len(base_content)
     available_space = max_chars - base_size - 200
@@ -100,14 +89,12 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
             'type': 'table',
             'document_id': doc_id,
             'table_number': table_num_clean,
-            'table_identifier': table_identifier,
-            'table_title': table_title,
             'section': section,
             'total_rows': len(rows),
             'chunk_size': len(content),
-            'is_complete_table': True,
-            'connection_type': extract_connection_type(table_title) if table_title else ''  # NEW
         }
         log_message(f"    Single chunk: {len(content)} chars, {len(rows)} rows")
@@ -133,16 +120,15 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
                 'type': 'table',
                 'document_id': doc_id,
                 'table_number': table_num_clean,
-                'table_identifier': table_identifier,
-                'table_title': table_title,
                 'section': section,
                 'chunk_id': chunk_num,
                 'row_start': current_rows[0]['_idx'] - 1,
                 'row_end': current_rows[-1]['_idx'],
                 'total_rows': len(rows),
                 'chunk_size': len(content),
-                'is_complete_table': False,
-                'connection_type': extract_connection_type(table_title) if table_title else ''  # NEW
             }
             chunks.append(Document(text=content, metadata=metadata))
@@ -168,8 +154,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
             'type': 'table',
             'document_id': doc_id,
             'table_number': table_num_clean,
-            'table_identifier': table_identifier,
-            'table_title': table_title,
             'section': section,
             'chunk_id': chunk_num,
             'row_start': current_rows[0]['_idx'] - 1,
@@ -184,62 +170,45 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     return chunks
 def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
-    content = f"ДОКУМЕНТ: {doc_id}\n"
-    content += f"ТАБЛИЦА: {table_identifier}\n"
-    if table_title:
-        # Normalize the title text itself for better searchability
-        normalized_title = normalize_connection_type(table_title)
-        content += f"НАЗВАНИЕ ТАБЛИЦЫ: {normalized_title}\n"
-        # Extract and store the normalized connection type
-        connection_type = extract_connection_type(table_title)
-        if connection_type:
-            content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
-    if table_num and table_num != table_identifier:
-        content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
     if section:
-        content += f"РАЗДЕЛ ДОКУМЕНТА: {section}\n"
-    content += f"\n{'='*70}\n"
     if headers:
-        content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
-        for i, h in enumerate(headers, 1):
-            # NORMALIZE HEADERS TOO
-            normalized_header = normalize_connection_type(h)
-            content += f"  {i}. {normalized_header}\n"
-        content += "\n"
-    content += "ДАННЫЕ ТАБЛИЦЫ:\n"
     return content
 def format_single_row(row, idx):
-    """Format a single row with normalization"""
     if isinstance(row, dict):
-        # NORMALIZE VALUES IN ROWS
-        parts = []
-        for k, v in row.items():
-            if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']:
-                normalized_v = normalize_connection_type(str(v))
-                parts.append(f"{k}: {normalized_v}")
         if parts:
             return f"{idx}. {' | '.join(parts)}\n"
     elif isinstance(row, list):
-        # NORMALIZE LIST VALUES
-        parts = []
-        for v in row:
-            if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']:
-                normalized_v = normalize_connection_type(str(v))
-                parts.append(normalized_v)
         if parts:
             return f"{idx}. {' | '.join(parts)}\n"
     return ""
 def format_table_rows(rows):
     """Format multiple rows"""
     content = ""
@@ -440,8 +409,6 @@ def load_table_documents(repo_id, hf_token, table_dir):
     table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
     all_chunks = []
-    connection_type_sources = {}  # Track which table each type comes from
     for file_path in table_files:
         try:
             local_path = hf_hub_download(
@@ -458,35 +425,18 @@ def load_table_documents(repo_id, hf_token, table_dir):
             for sheet in data.get('sheets', []):
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
-                table_num = sheet.get('table_number', 'unknown')
-                table_title = sheet.get('table_title', '')
                 chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
                 all_chunks.extend(chunks)
-                # Track connection type source
-                conn_type = extract_connection_type(table_title)
-                if conn_type:
-                    if conn_type not in connection_type_sources:
-                        connection_type_sources[conn_type] = []
-                    connection_type_sources[conn_type].append(f"{sheet_doc_id} Table {table_num}")
         except Exception as e:
             log_message(f"Error loading {file_path}: {e}")
     log_message(f"✓ Loaded {len(all_chunks)} table chunks")
-    log_message("="*60)
-    log_message("CONNECTION TYPES AND THEIR SOURCES:")
-    for conn_type in sorted(connection_type_sources.keys()):
-        sources = connection_type_sources[conn_type]
-        log_message(f"  {conn_type}: {len(sources)} tables")
-        for src in sources:
-            log_message(f"    - {src}")
-    log_message("="*60)
     return all_chunks
 def load_image_documents(repo_id, hf_token, image_dir):
     """Load image descriptions"""
     log_message("Loading images...")
@@ -534,7 +484,9 @@ def load_image_documents(repo_id, hf_token, image_dir):
     return documents
 def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
     log_message("="*60)
     log_message("STARTING DOCUMENT LOADING")
     log_message("="*60)
@@ -546,19 +498,6 @@ def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
     # Load tables (already chunked)
     table_chunks = load_table_documents(repo_id, hf_token, table_dir)
-    # NEW: Analyze connection types in tables
-    connection_types = {}
-    for chunk in table_chunks:
-        conn_type = chunk.metadata.get('connection_type', '')
-        if conn_type:
-            connection_types[conn_type] = connection_types.get(conn_type, 0) + 1
-    log_message("="*60)
-    log_message("CONNECTION TYPES FOUND IN TABLES:")
-    for conn_type, count in sorted(connection_types.items()):
-        log_message(f"  {conn_type}: {count} chunks")
-    log_message("="*60)
     # Load images (no chunking needed)
     image_docs = load_image_documents(repo_id, hf_token, image_dir)

     return chunked
def normalize_text(text):
    """Normalize welding-type designators to one canonical spelling.

    A Latin capital "C" (U+0043) that starts a word and is followed by a
    digit, optionally after a hyphen, is replaced with the Cyrillic capital
    Es (U+0421). The hyphen and the digits are kept, so a Latin-spelled
    "C-25" and a Cyrillic-spelled one end up identical. The same function
    is applied to table titles/identifiers and to the user question, so
    both sides of the search use the same spelling.

    Args:
        text: Text to normalize. Falsy values (None, "") are returned
            unchanged; other non-string values are converted with str().

    Returns:
        The normalized string, or the original falsy value.
    """
    # Function-scope import: the module's top-level imports are not visible
    # from here, so keep the dependency local to the function.
    import re

    if not text:
        return text
    if not isinstance(text, str):
        # Table numbers may arrive as ints; str.replace/re.sub need a string.
        text = str(text)
    # Look-ahead keeps the optional hyphen and the digit in place; \b stops
    # the match from firing inside longer Latin words such as "ABC-25".
    return re.sub(r'\bC(?=-?\d)', '\u0421', text)
 def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
     headers = table_data.get('headers', [])
     table_num = table_data.get('table_number', 'unknown')
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
     table_num_clean = str(table_num).strip()
+    table_title_normalized = normalize_text(str(table_title))  # NORMALIZE TITLE
     import re
     if 'приложени' in section.lower():
     log_message(f"  📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
+    # Calculate base metadata size with NORMALIZED title
+    base_content = format_table_header(doc_id, table_identifier, table_num, table_title_normalized, section, headers)
     base_size = len(base_content)
     available_space = max_chars - base_size - 200
             'type': 'table',
             'document_id': doc_id,
             'table_number': table_num_clean,
+            'table_identifier': normalize_text(table_identifier),  # NORMALIZE identifier
+            'table_title': table_title_normalized,  # NORMALIZED
             'section': section,
             'total_rows': len(rows),
             'chunk_size': len(content),
+            'is_complete_table': True
         }
         log_message(f"    Single chunk: {len(content)} chars, {len(rows)} rows")
                 'type': 'table',
                 'document_id': doc_id,
                 'table_number': table_num_clean,
+                'table_identifier': normalize_text(table_identifier),  # NORMALIZE
+                'table_title': table_title_normalized,  # NORMALIZED
                 'section': section,
                 'chunk_id': chunk_num,
                 'row_start': current_rows[0]['_idx'] - 1,
                 'row_end': current_rows[-1]['_idx'],
                 'total_rows': len(rows),
                 'chunk_size': len(content),
+                'is_complete_table': False
             }
             chunks.append(Document(text=content, metadata=metadata))
             'type': 'table',
             'document_id': doc_id,
             'table_number': table_num_clean,
+            'table_identifier': normalize_text(table_identifier),  # NORMALIZE
+            'table_title': table_title_normalized,  # NORMALIZED
             'section': section,
             'chunk_id': chunk_num,
             'row_start': current_rows[0]['_idx'] - 1,
     return chunks
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
    """Build the text header placed in front of a table chunk's rows.

    Args:
        doc_id: Identifier of the document the table belongs to.
        table_identifier: Display identifier of the table; normalized
            before being written.
        table_num: Table number; written on its own line when truthy.
            May be a non-string value, so it is converted with str()
            before normalization.
        table_title: Table title; written when truthy, normalized.
        section: Section name; written verbatim when truthy.
        headers: Column headers; joined with " | " when non-empty.

    Returns:
        The header text, terminated by a data marker line and a newline.
    """
    # str() guards normalize_text against non-string identifiers/numbers,
    # which would otherwise fail on its string operations.
    lines = [f"ТАБЛИЦА {normalize_text(str(table_identifier))} из документа {doc_id}"]
    # The table type/number gets its own line so it is easy to match on.
    if table_num:
        lines.append(f"ТИП: {normalize_text(str(table_num))}")
    if table_title:
        lines.append(f"НАЗВАНИЕ: {normalize_text(str(table_title))}")
    if section:
        lines.append(f"РАЗДЕЛ: {section}")
    lines.append('=' * 70)
    if headers:
        header_str = ' | '.join(str(h) for h in headers)
        lines.append(f"ЗАГОЛОВКИ: {header_str}")
        lines.append("")  # blank line between the column list and the data marker
    lines.append("ДАННЫЕ:")
    return "\n".join(lines) + "\n"
 # String forms (after strip + lower) that mark a cell as carrying no data.
 _EMPTY_CELL_MARKERS = frozenset({'', 'nan', 'none'})


 def _is_empty_cell(value):
     """Return True when a table cell holds nothing worth printing."""
     # Numeric zero is real data and must not be discarded by a truthiness
     # test; bool is excluded so False keeps being treated as empty.
     is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
     if not value and not is_number:
         return True
     return str(value).strip().lower() in _EMPTY_CELL_MARKERS


 def format_single_row(row, idx):
     """Format one table row as a numbered, pipe-separated line.

     Args:
         row: A dict (column name -> value) or a list of cell values.
             Any other type yields an empty string.
         idx: Row number printed in front of the row.

     Returns:
         "<idx>. <cells joined by ' | '>" followed by a newline, or "" when
         the row has no non-empty cells or is of an unsupported type. Cells
         that are None, blank, or spell "nan"/"none" (any case, surrounding
         whitespace ignored) are skipped; numeric zeros are kept.
     """
     if isinstance(row, dict):
         parts = [f"{k}: {v}" for k, v in row.items() if not _is_empty_cell(v)]
     elif isinstance(row, list):
         parts = [str(v) for v in row if not _is_empty_cell(v)]
     else:
         parts = []
     if not parts:
         return ""
     return f"{idx}. {' | '.join(parts)}\n"
 def format_table_rows(rows):
     """Format multiple rows"""
     content = ""
     table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
     all_chunks = []
     for file_path in table_files:
         try:
             local_path = hf_hub_download(
             for sheet in data.get('sheets', []):
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
+                # Use the consistent MAX_CHARS_TABLE from config
                 chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
                 all_chunks.extend(chunks)
         except Exception as e:
             log_message(f"Error loading {file_path}: {e}")
     log_message(f"✓ Loaded {len(all_chunks)} table chunks")
     return all_chunks
 def load_image_documents(repo_id, hf_token, image_dir):
     """Load image descriptions"""
     log_message("Loading images...")
     return documents
 def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
+    """Main loader - combines all document types"""
     log_message("="*60)
     log_message("STARTING DOCUMENT LOADING")
     log_message("="*60)
     # Load tables (already chunked)
     table_chunks = load_table_documents(repo_id, hf_token, table_dir)
     # Load images (no chunking needed)
     image_docs = load_image_documents(repo_id, hf_token, image_dir)

index_retriever.py CHANGED Viewed

@@ -71,18 +71,18 @@ def create_query_engine(vector_index):
         bm25_retriever = BM25Retriever.from_defaults(
             docstore=vector_index.docstore,
-            similarity_top_k=100
         )
         vector_retriever = VectorIndexRetriever(
             index=vector_index,
-            similarity_top_k=100,
-            similarity_cutoff=0.55
         )
         hybrid_retriever = QueryFusionRetriever(
             [vector_retriever, bm25_retriever],
-            similarity_top_k=100,
             num_queries=1
         )

         bm25_retriever = BM25Retriever.from_defaults(
             docstore=vector_index.docstore,
+            similarity_top_k=60
         )
         vector_retriever = VectorIndexRetriever(
             index=vector_index,
+            similarity_top_k=60,
+            similarity_cutoff=0.6
         )
         hybrid_retriever = QueryFusionRetriever(
             [vector_retriever, bm25_retriever],
+            similarity_top_k=120,
             num_queries=1
         )

utils.py CHANGED Viewed

@@ -9,7 +9,6 @@ import time
 from index_retriever import rerank_nodes
 from my_logging import log_message
 from config import PROMPT_SIMPLE_POISK
-import re
 def get_llm_model(model_name):
     try:
@@ -173,72 +172,64 @@ def deduplicate_nodes(nodes):
     return unique_nodes
-def normalize_query(query):
-    def repl(m):
-        cyr_to_lat = {'С': 'C', 'с': 'C', 'Т': 'T', 'т': 'T', 'У': 'U', 'у': 'U'}
-        letter = cyr_to_lat.get(m.group(1), m.group(1))
-        return f"{letter}{m.group(2)}"
-    return re.sub(r'\b([СсТтУуCTU])[-\s]?(\d+)\b', repl, query)
 def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
     if query_engine is None:
         return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
     try:
         start_time = time.time()
-        # NORMALIZE QUERY: Convert Cyrillic to Latin and remove hyphens
-        normalized_question = normalize_query(question)
-        log_message(f"Original query: {question}")
-        log_message(f"Normalized query: {normalized_question}")
-        # Use normalized query for retrieval
         retrieved_nodes = query_engine.retriever.retrieve(normalized_question)
         log_message(f"user query: {question}")
         log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
         unique_retrieved = deduplicate_nodes(retrieved_nodes)
         log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
-        # Check for connection types
-        conn_types_retrieved = {}
-        for node in unique_retrieved:
-            if node.metadata.get('type') == 'table':
-                conn_type = node.metadata.get('connection_type', '')
-                if conn_type:
-                    conn_types_retrieved[conn_type] = conn_types_retrieved.get(conn_type, 0) + 1
-        if conn_types_retrieved:
-            log_message("CONNECTION TYPES IN RETRIEVED:")
-            for ct, cnt in sorted(conn_types_retrieved.items()):
-                log_message(f"  {ct}: {cnt} chunks")
-        # Check if target type was retrieved
-        # Normalize the check as well
-        normalized_check = normalize_query('С-25')  # Will become C25
-        if normalized_check in question or 'С-25' in question or 'C-25' in question:
-            if 'C25' in conn_types_retrieved:
-                log_message(f"✓ C25 RETRIEVED: {conn_types_retrieved['C25']} chunks")
-            else:
-                log_message("✗ C25 NOT RETRIEVED despite being in query!")
-        # Sample of retrieved tables
-        log_message("SAMPLE OF RETRIEVED TABLES:")
-        for i, node in enumerate(unique_retrieved[:10]):
-            if node.metadata.get('type') == 'table':
-                table_num = node.metadata.get('table_number', 'N/A')
-                table_title = node.metadata.get('table_title', 'N/A')
-                conn_type = node.metadata.get('connection_type', 'N/A')
-                doc_id = node.metadata.get('document_id', 'N/A')
-                log_message(f"  [{i+1}] {doc_id} - Table {table_num} - Type: {conn_type}")
-        # Rerank - use normalized query for consistency
         reranked_nodes = rerank_nodes(normalized_question, unique_retrieved, reranker, top_k=20)
-        # CRITICAL FIX: Use normalized query for LLM as well
         response = query_engine.query(normalized_question)
         end_time = time.time()
@@ -255,6 +246,7 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
         Время обработки: {processing_time:.2f} секунд
         </div>
         </div>"""
         chunk_info = []
         for node in reranked_nodes:

 from index_retriever import rerank_nodes
 from my_logging import log_message
 from config import PROMPT_SIMPLE_POISK
 def get_llm_model(model_name):
     try:
     return unique_nodes
def debug_search_tables(vector_index, search_term="С-25"):
    """Log and return every table node whose text or title contains a term.

    Walks all nodes in ``vector_index.docstore.docs``, keeps those whose
    metadata ``type`` is ``'table'`` and whose content or ``table_title``
    metadata contains ``search_term`` (plain substring test), then writes a
    summary through ``log_message``.

    Args:
        vector_index: Index object exposing ``docstore.docs``.
        search_term: Substring to look for.

    Returns:
        A list of dicts with keys ``doc_id``, ``table_num`` and ``title``
        (title cut to its first 100 characters), one per matching node.
    """
    hits = []
    # Snapshot the stored nodes before scanning them.
    for entry in list(vector_index.docstore.docs.values()):
        meta = entry.metadata
        if meta.get('type') != 'table':
            continue
        # The title is only consulted when the body text does not match.
        if not (search_term in entry.get_content()
                or search_term in meta.get('table_title', '')):
            continue
        hits.append({
            'doc_id': meta.get('document_id'),
            'table_num': meta.get('table_number'),
            'title': meta.get('table_title', '')[:100]
        })

    rule = '=' * 60
    log_message(f"\n{rule}")
    log_message(f"DEBUG: Found {len(hits)} tables containing '{search_term}'")
    for hit in hits:
        log_message(f"  • {hit['doc_id']} - Table {hit['table_num']}: {hit['title']}")
    log_message(f"{rule}\n")
    return hits
+from documents_prep import normalize_text
+# MODIFIED: Update answer_question function
 def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
+    # NORMALIZE the question to convert C to С
+    normalized_question = normalize_text(question)
     if query_engine is None:
         return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
     try:
         start_time = time.time()
+        # Use NORMALIZED question for retrieval
         retrieved_nodes = query_engine.retriever.retrieve(normalized_question)
         log_message(f"user query: {question}")
+        log_message(f"normalized query: {normalized_question}")
         log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
         unique_retrieved = deduplicate_nodes(retrieved_nodes)
+        # DEBUG: Log what was retrieved
+        log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
+        for i, node in enumerate(unique_retrieved):  # All debug
+            table_num = node.metadata.get('table_number', 'N/A')
+            table_title = node.metadata.get('table_title', 'N/A')
+            doc_id = node.metadata.get('document_id', 'N/A')
+            log_message(f"  [{i+1}] {doc_id} - Table {table_num}: {table_title[:50]}")
         log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
+        # Simple reranking with NORMALIZED question
         reranked_nodes = rerank_nodes(normalized_question, unique_retrieved, reranker, top_k=20)
+        # Direct query without formatting - use normalized question
         response = query_engine.query(normalized_question)
         end_time = time.time()
         Время обработки: {processing_time:.2f} секунд
         </div>
         </div>"""
+        log_message(f"Model Answer: {response.response}")
         chunk_info = []
         for node in reranked_nodes: