Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 5, 2025

Commit

7565a55

1 Parent(s): 2edec29

max chars = 2000 for tables + new answer_question

Browse files

Files changed (2) hide show

documents_prep.py +137 -63
utils.py +20 -7

documents_prep.py CHANGED Viewed

@@ -53,7 +53,8 @@ def normalize_doc_id(doc_id):
     return doc_id
-def chunk_table_by_rows(table_data, doc_id, max_rows=10):
     headers = table_data.get('headers', [])
     rows = table_data.get('data', [])
     table_num = table_data.get('table_number', 'unknown')
@@ -62,7 +63,6 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=10):
     # NORMALIZE document ID
     doc_id = normalize_doc_id(doc_id)
     table_num_clean = str(table_num).strip()
     # Create section-aware identifier
@@ -82,9 +82,15 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=10):
     log_message(f"  📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
-    if len(rows) <= max_rows:
-        content = format_table_content(table_data, headers, rows, doc_id, table_identifier)
-        chunk_size = len(content)
         metadata = {
             'type': 'table',
@@ -94,30 +100,62 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=10):
             'table_title': table_title,
             'section': section,
             'total_rows': len(rows),
-            'chunk_size': chunk_size,
             'is_complete_table': True
         }
-        log_message(f"    Chunk: 1/1, {chunk_size} chars, doc={doc_id}, table={table_identifier}")
         return [Document(text=content, metadata=metadata)]
     chunks = []
-    overlap = 1
-    for i in range(0, len(rows), max_rows - overlap):
-        chunk_rows = rows[i:min(i+max_rows, len(rows))]
-        chunk_num = i // (max_rows - overlap)
-        content = format_table_content(
-            table_data,
-            headers,
-            chunk_rows,
-            table_identifier,
-            chunk_info=f"Строки {i+1}-{i+len(chunk_rows)} из {len(rows)}"
-        )
-        chunk_size = len(content)
         metadata = {
             'type': 'table',
@@ -127,28 +165,21 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=10):
             'table_title': table_title,
             'section': section,
             'chunk_id': chunk_num,
-            'row_start': i,
-            'row_end': i + len(chunk_rows),
             'total_rows': len(rows),
-            'chunk_size': chunk_size,
-            'total_chunks': (len(rows) + max_rows - overlap - 1) // (max_rows - overlap),
             'is_complete_table': False
         }
-        log_message(f"    Chunk: {chunk_num+1}, rows {i}-{i+len(chunk_rows)}, {chunk_size} chars")
         chunks.append(Document(text=content, metadata=metadata))
     return chunks
-def format_table_content(table_data, headers, rows, table_identifier, chunk_info=""):
-    doc_id = table_data.get('document_id', table_data.get('document', 'unknown'))
-    table_num = table_data.get('table_number', 'unknown')
-    table_title = table_data.get('table_title', '')
-    section = table_data.get('section', '')
-    # Use enhanced identifier
     content = f"ДОКУМЕНТ: {doc_id}\n"
     content += f"ТАБЛИЦА: {table_identifier}\n"
     content += f"ПОЛНОЕ НАЗВАНИЕ: {table_identifier}\n"
@@ -161,48 +192,91 @@ def format_table_content(table_data, headers, rows, table_identifier, chunk_info
     # Enhanced search keywords
     content += f"Это таблица {table_identifier} из документа {doc_id}. "
-    content += f"Идентификатор таблицы: {table_identifier}. "
-    content += f"Номер: {table_num}. "
-    content += f"Документ: {doc_id}. "
     if section:
-        content += f"Находится в разделе: {section}. "
         if 'приложени' in section.lower():
             content += f"Таблица из приложения. "
     if table_title:
-        content += f"Название таблицы: {table_title}. "
-        content += f"Таблица о: {table_title}. "
-    content += f"Поиск: таблица {table_identifier} {doc_id}. "
-    if chunk_info:
-        content += f"\n{chunk_info}\n"
-    content += f"\n\nСОДЕРЖИМОЕ ТАБЛИЦЫ {table_identifier}:\n"
-    content += f"="*70 + "\n\n"
     if headers:
         header_str = ' | '.join(str(h) for h in headers)
-        content += f"ЗАГОЛОВКИ СТОЛБЦОВ:\n{header_str}\n\n"
-    content += f"ДАННЫЕ ТАБЛИЦЫ:\n"
-    for idx, row in enumerate(rows, 1):
-        if isinstance(row, dict):
-            parts = [f"{k}: {v}" for k, v in row.items()
-                    if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
-            if parts:
-                content += f"{idx}. {' | '.join(parts)}\n"
-        elif isinstance(row, list):
-            parts = [str(v) for v in row if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
-            if parts:
-                content += f"{idx}. {' | '.join(parts)}\n"
-    content += f"\n{'='*70}\n"
-    content += f"КОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
     return content
 def load_json_documents(repo_id, hf_token, json_dir):
     import zipfile
     import tempfile
@@ -411,7 +485,7 @@ def load_table_documents(repo_id, hf_token, table_dir):
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
                 # CRITICAL: Pass document_id to chunk function
-                chunks = chunk_table_by_rows(sheet, sheet_doc_id)
                 all_chunks.extend(chunks)
         except Exception as e:

     return doc_id
+def chunk_table_by_content(table_data, doc_id, max_chars=2000):
+    """Chunk tables by content size instead of rows"""
     headers = table_data.get('headers', [])
     rows = table_data.get('data', [])
     table_num = table_data.get('table_number', 'unknown')
     # NORMALIZE document ID
     doc_id = normalize_doc_id(doc_id)
     table_num_clean = str(table_num).strip()
     # Create section-aware identifier
     log_message(f"  📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
+    # Calculate base metadata size (everything except row data)
+    base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
+    base_size = len(base_content)
+    available_space = max_chars - base_size - 200  # Reserve 200 chars for footer
+    # If entire table fits, return as one chunk
+    full_rows_content = format_table_rows(rows)
+    if base_size + len(full_rows_content) <= max_chars:
+        content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
         metadata = {
             'type': 'table',
             'table_title': table_title,
             'section': section,
             'total_rows': len(rows),
+            'chunk_size': len(content),
             'is_complete_table': True
         }
+        log_message(f"    Single chunk: {len(content)} chars, {len(rows)} rows")
         return [Document(text=content, metadata=metadata)]
+    # Otherwise, chunk by content size
     chunks = []
+    current_rows = []
+    current_size = 0
+    chunk_num = 0
+    for i, row in enumerate(rows):
+        row_text = format_single_row(row, i + 1)
+        row_size = len(row_text)
+        # If adding this row exceeds limit, save current chunk
+        if current_size + row_size > available_space and current_rows:
+            content = base_content + format_table_rows(current_rows)
+            content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
+            content += format_table_footer(table_identifier, doc_id)
+            metadata = {
+                'type': 'table',
+                'document_id': doc_id,
+                'table_number': table_num_clean,
+                'table_identifier': table_identifier,
+                'table_title': table_title,
+                'section': section,
+                'chunk_id': chunk_num,
+                'row_start': current_rows[0]['_idx'] - 1,
+                'row_end': current_rows[-1]['_idx'],
+                'total_rows': len(rows),
+                'chunk_size': len(content),
+                'is_complete_table': False
+            }
+            chunks.append(Document(text=content, metadata=metadata))
+            log_message(f"    Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
+            chunk_num += 1
+            current_rows = []
+            current_size = 0
+        # Add row index for tracking
+        row_copy = row.copy() if isinstance(row, dict) else {'data': row}
+        row_copy['_idx'] = i + 1
+        current_rows.append(row_copy)
+        current_size += row_size
+    # Add final chunk if rows remain
+    if current_rows:
+        content = base_content + format_table_rows(current_rows)
+        content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
+        content += format_table_footer(table_identifier, doc_id)
         metadata = {
             'type': 'table',
             'table_title': table_title,
             'section': section,
             'chunk_id': chunk_num,
+            'row_start': current_rows[0]['_idx'] - 1,
+            'row_end': current_rows[-1]['_idx'],
             'total_rows': len(rows),
+            'chunk_size': len(content),
             'is_complete_table': False
         }
         chunks.append(Document(text=content, metadata=metadata))
+        log_message(f"    Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
     return chunks
+def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
+    """Format consistent table header"""
     content = f"ДОКУМЕНТ: {doc_id}\n"
     content += f"ТАБЛИЦА: {table_identifier}\n"
     content += f"ПОЛНОЕ НАЗВАНИЕ: {table_identifier}\n"
     # Enhanced search keywords
     content += f"Это таблица {table_identifier} из документа {doc_id}. "
+    content += f"Идентификатор: {table_identifier}. Номер: {table_num}. Документ: {doc_id}. "
     if section:
+        content += f"Раздел: {section}. "
         if 'приложени' in section.lower():
             content += f"Таблица из приложения. "
     if table_title:
+        content += f"Название: {table_title}. "
+    content += f"\n\nСОДЕРЖИМОЕ ТАБЛИЦЫ {table_identifier}:\n{'='*70}\n\n"
     if headers:
         header_str = ' | '.join(str(h) for h in headers)
+        content += f"ЗАГОЛОВКИ: {header_str}\n\n"
+    content += "ДАННЫЕ:\n"
     return content
def format_single_row(row, idx):
    """Format one table row as a numbered, pipe-separated text line.

    Args:
        row: A dict (rendered as ``key: value`` pairs) or a list/tuple
            (rendered as bare values). Any other type yields ``""``.
        idx: The row number printed at the start of the line.

    Returns:
        ``"<idx>. <cell> | <cell> ...\\n"``, or an empty string when the row
        has no printable cells.
    """
    def _is_blank(value):
        # Numeric zeros are real table data and must be kept; only None,
        # empty strings/containers and textual "nan"/"none" count as blank.
        if value is None:
            return True
        if not value and not isinstance(value, (int, float)):
            return True
        text = str(value).strip()
        return text == '' or text.lower() in ('nan', 'none')

    if isinstance(row, dict):
        # '_idx' is the bookkeeping key the chunker attaches to row copies;
        # it is not a table column and must not leak into the text.
        parts = [f"{k}: {v}" for k, v in row.items()
                 if k != '_idx' and not _is_blank(v)]
    elif isinstance(row, (list, tuple)):
        parts = [str(v) for v in row if not _is_blank(v)]
    else:
        return ""

    if not parts:
        return ""
    return f"{idx}. {' | '.join(parts)}\n"
def format_table_rows(rows):
    """Format a sequence of rows into one text block, one line per row.

    Accepts both raw rows (dicts or lists straight from the table data) and
    the tracked copies built by the chunker, which carry the original row
    number under the ``'_idx'`` key.

    Args:
        rows: Iterable of rows.

    Returns:
        The concatenation of ``format_single_row`` for every row.
    """
    lines = []
    for position, row in enumerate(rows, 1):
        if isinstance(row, dict):
            # Fall back to the 1-based position when no tracked index is
            # present, so untracked rows are not all numbered 0.
            idx = row.get('_idx', position)
            payload = {k: v for k, v in row.items() if k != '_idx'}
            # Non-dict rows are tracked as {'data': <row>, '_idx': n};
            # unwrap them so the cells are printed, not the wrapper.
            if ('_idx' in row and set(payload) == {'data'}
                    and isinstance(payload['data'], (list, tuple))):
                payload = payload['data']
        else:
            # Raw list rows have no .get(); number them by position.
            idx, payload = position, row
        lines.append(format_single_row(payload, idx))
    return "".join(lines)
def format_table_footer(table_identifier, doc_id):
    """Return the closing block placed after a table's rows.

    The block is a 70-character ``=`` rule followed by an end-of-table
    marker naming the table and its document.
    """
    rule = '=' * 70
    closing = f"КОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}"
    return "\n" + rule + "\n" + closing + "\n"
+# Update load_table_documents to use new function
def load_table_documents(repo_id, hf_token, table_dir, max_chars=2500):
    """Download the table JSON files of a dataset repo and chunk them.

    Every ``.json`` file whose path starts with ``table_dir`` is downloaded,
    parsed, and each entry of its ``'sheets'`` list is passed to
    ``chunk_table_by_content``.

    Args:
        repo_id: Repository identifier passed to ``list_repo_files`` and
            ``hf_hub_download``.
        hf_token: Access token passed to the same calls.
        table_dir: Path prefix selecting the table files.
        max_chars: Size limit forwarded to ``chunk_table_by_content``.
            NOTE(review): the default stays at 2500 to preserve existing
            behaviour, although the chunker's own default is 2000 — confirm
            which value is intended.

    Returns:
        A flat list with the chunks of all successfully processed files.
    """
    log_message("Loading tables...")

    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]

    all_chunks = []
    for file_path in table_files:
        # Best-effort per file: a failure is logged and the remaining files
        # are still processed.
        try:
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                repo_type="dataset",
                token=hf_token
            )

            with open(local_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # File-level id is the fallback for sheets that carry none.
            file_doc_id = data.get('document_id', data.get('document', 'unknown'))

            for sheet in data.get('sheets', []):
                sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))

                # Chunk by content size rather than by row count.
                chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=max_chars)
                all_chunks.extend(chunks)

        except Exception as e:
            log_message(f"Error loading {file_path}: {e}")

    log_message(f"✓ Loaded {len(all_chunks)} table chunks")
    return all_chunks
 def load_json_documents(repo_id, hf_token, json_dir):
     import zipfile
     import tempfile
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
                 # CRITICAL: Pass document_id to chunk function
+                chunks = chunk_table_by_content(sheet, sheet_doc_id)
                 all_chunks.extend(chunks)
         except Exception as e:

utils.py CHANGED Viewed

@@ -62,20 +62,33 @@ def answer_question(question, query_engine, reranker):
                     source_label += f" {title}"
             else:
                 source_label = f"[{doc_id}]"
-            context_parts.append(f"{source_label}\n{n.text[:500]}")  # Limit context per chunk
         context = "\n\n" + ("="*50 + "\n\n").join(context_parts)
-        # Use only CUSTOM_PROMPT from config
         from config import CUSTOM_PROMPT
         prompt = CUSTOM_PROMPT.format(context_str=context, query_str=question)
-        log_message(f"\nPROMPT:\n{prompt[:300]}...\n")  # Log first 1000 chars of prompt
-        response = query_engine.query(prompt)
         sources = format_sources(reranked)
-        for i in reranked:
-            log_message(f"---\n{i.text[:500]}\n...")
-        return response.response, sources
     except Exception as e:
         log_message(f"Error: {e}")

                     source_label += f" {title}"
             else:
                 source_label = f"[{doc_id}]"
+            context_parts.append(f"{source_label}\n{n.text}")  # Use FULL text, not [:500]
         context = "\n\n" + ("="*50 + "\n\n").join(context_parts)
+        # Use CUSTOM_PROMPT from config
         from config import CUSTOM_PROMPT
         prompt = CUSTOM_PROMPT.format(context_str=context, query_str=question)
+        log_message(f"\nPROMPT LENGTH: {len(prompt)} chars\n")
+        # CRITICAL FIX: Call LLM directly instead of query_engine.query()
+        from llama_index.core import Settings
+        response = Settings.llm.complete(prompt)
         sources = format_sources(reranked)
+        # Log retrieved chunks
+        log_message(f"\n{'='*70}")
+        log_message("RETRIEVED CHUNKS:")
+        for i, node in enumerate(reranked, 1):
+            log_message(f"\n--- Chunk {i} ---")
+            log_message(f"Document: {node.metadata.get('document_id', 'unknown')}")
+            log_message(f"Type: {node.metadata.get('type', 'unknown')}")
+            if node.metadata.get('type') == 'table':
+                log_message(f"Table: {node.metadata.get('table_identifier', 'unknown')}")
+            log_message(f"Text preview: {node.text[:500]}...")
+        return response.text, sources
     except Exception as e:
         log_message(f"Error: {e}")