Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 5, 2025

Commit

0b6ee4f

1 Parent(s): c7a9dbd

simplest version

Browse files

Files changed (2) hide show

documents_prep.py +86 -72
utils.py +12 -28

documents_prep.py CHANGED Viewed

@@ -38,6 +38,21 @@ def chunk_text_documents(documents):
     return chunked
 def chunk_table_by_rows(table_data, doc_id, max_rows=30):
     headers = table_data.get('headers', [])
     rows = table_data.get('data', [])
@@ -45,38 +60,37 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
-    # Enhanced table identification
     table_num_clean = str(table_num).strip()
-    # Create unique table identifier with section context
     if 'приложени' in section.lower():
-        # Extract appendix number
-        import re
         appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
         if appendix_match:
             appendix_num = appendix_match.group(1).upper()
-            table_identifier = f"{table_num_clean} (Приложение {appendix_num})"
         else:
-            table_identifier = f"{table_num_clean} ({section[:30]})"
     else:
         table_identifier = table_num_clean
     if not rows:
-        log_message(f"  ⚠️  Table {table_identifier} ({doc_id}): Empty table, skipping")
         return []
-    log_message(f"  📊 Processing Table {table_identifier} ({doc_id}): {len(rows)} rows, {len(headers)} columns")
-    # For small tables
     if len(rows) <= max_rows:
-        content = format_table_content(table_data, headers, rows, table_identifier)
         chunk_size = len(content)
         metadata = {
             'type': 'table',
             'document_id': doc_id,
             'table_number': table_num_clean,
-            'table_identifier': table_identifier,  # NEW: unique identifier
             'table_title': table_title,
             'section': section,
             'total_rows': len(rows),
@@ -84,27 +98,24 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
             'is_complete_table': True
         }
-        log_message(f"    ✓ Single chunk created:")
-        log_message(f"      Metadata: {metadata}")
         return [Document(text=content, metadata=metadata)]
-    # For large tables with chunking
     chunks = []
     overlap = 3
-    chunk_num = 0
     for i in range(0, len(rows), max_rows - overlap):
         chunk_rows = rows[i:min(i+max_rows, len(rows))]
-        chunk_info = f"Часть {chunk_num+1}: строки {i+1}-{i+len(chunk_rows)} из {len(rows)}"
         content = format_table_content(
             table_data,
             headers,
-            chunk_rows,
             table_identifier,
-            chunk_info=chunk_info
         )
         chunk_size = len(content)
@@ -113,7 +124,7 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
             'type': 'table',
             'document_id': doc_id,
             'table_number': table_num_clean,
-            'table_identifier': table_identifier,  # NEW
             'table_title': table_title,
             'section': section,
             'chunk_id': chunk_num,
@@ -121,82 +132,77 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
             'row_end': i + len(chunk_rows),
             'total_rows': len(rows),
             'chunk_size': chunk_size,
-            'total_chunks': ((len(rows) - overlap) // (max_rows - overlap)) + 1,
             'is_complete_table': False
         }
-        chunks.append(Document(text=content, metadata=metadata))
-        log_message(f"    Chunk {chunk_num+1} created:")
-        log_message(f"      Rows: {i}-{i+len(chunk_rows)}, Size: {chunk_size} chars")
-        log_message(f"      Metadata: {metadata}")
-        chunk_num += 1
-    log_message(f"  ✓ Table {table_identifier} ({doc_id}): {len(rows)} rows → {len(chunks)} chunks")
     return chunks
-def format_table_content(table_data, headers, rows, table_identifier, chunk_info=""):
-    doc_id = table_data.get('document_id', table_data.get('document', 'unknown'))
     table_num = table_data.get('table_number', 'unknown')
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
-    # Use enhanced identifier
     content = f"ДОКУМЕНТ: {doc_id}\n"
     content += f"ТАБЛИЦА: {table_identifier}\n"
-    content += f"ПОЛНОЕ НАЗВАНИЕ: {table_identifier}\n"
-    content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
     if table_title:
         content += f"НАЗВАНИЕ: {table_title}\n"
     if section:
         content += f"РАЗДЕЛ: {section}\n"
-    content += f"{'='*70}\n\n"
-    # Enhanced search keywords
-    content += f"Это таблица {table_identifier} из документа {doc_id}. "
-    content += f"Идентификатор таблицы: {table_identifier}. "
-    content += f"Номер: {table_num}. "
-    content += f"Документ: {doc_id}. "
-    if section:
-        content += f"Находится в разделе: {section}. "
-        if 'приложени' in section.lower():
-            content += f"Таблица из приложения. "
     if table_title:
-        content += f"Название таблицы: {table_title}. "
-        content += f"Таблица о: {table_title}. "
-    content += f"Поиск: таблица {table_identifier} {doc_id}. "
     if chunk_info:
-        content += f"\n{chunk_info}\n"
-    content += f"\n\nСОДЕРЖИМОЕ ТАБЛИЦЫ {table_identifier}:\n"
-    content += f"="*70 + "\n\n"
     if headers:
-        header_str = ' | '.join(str(h) for h in headers)
-        content += f"ЗАГОЛОВКИ СТОЛБЦОВ:\n{header_str}\n\n"
-    content += f"ДАННЫЕ ТАБЛИЦЫ:\n"
     for idx, row in enumerate(rows, 1):
         if isinstance(row, dict):
             parts = [f"{k}: {v}" for k, v in row.items()
-                    if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
             if parts:
                 content += f"{idx}. {' | '.join(parts)}\n"
         elif isinstance(row, list):
-            parts = [str(v) for v in row if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
             if parts:
                 content += f"{idx}. {' | '.join(parts)}\n"
-    content += f"\n{'='*70}\n"
-    content += f"КОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
     return content
 def load_json_documents(repo_id, hf_token, json_dir):
@@ -328,7 +334,6 @@ def load_json_documents(repo_id, hf_token, json_dir):
     return documents
 def extract_sections_from_json(json_path):
-    """Extract sections from a single JSON file"""
     documents = []
     try:
@@ -336,8 +341,8 @@ def extract_sections_from_json(json_path):
             data = json.load(f)
         doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')
-        # Extract all section levels
         for section in data.get('sections', []):
             if section.get('section_text', '').strip():
                 documents.append(Document(
@@ -345,11 +350,11 @@ def extract_sections_from_json(json_path):
                     metadata={
                         'type': 'text',
                         'document_id': doc_id,
-                        'section_id': section.get('section_id', '')
                     }
                 ))
-            # Subsections
             for subsection in section.get('subsections', []):
                 if subsection.get('subsection_text', '').strip():
                     documents.append(Document(
@@ -357,11 +362,11 @@ def extract_sections_from_json(json_path):
                         metadata={
                             'type': 'text',
                             'document_id': doc_id,
-                            'section_id': subsection.get('subsection_id', '')
                         }
                     ))
-                # Sub-subsections
                 for sub_sub in subsection.get('sub_subsections', []):
                     if sub_sub.get('sub_subsection_text', '').strip():
                         documents.append(Document(
@@ -369,7 +374,8 @@ def extract_sections_from_json(json_path):
                             metadata={
                                 'type': 'text',
                                 'document_id': doc_id,
-                                'section_id': sub_sub.get('sub_subsection_id', '')
                             }
                         ))
@@ -380,13 +386,14 @@ def extract_sections_from_json(json_path):
 def load_table_documents(repo_id, hf_token, table_dir):
-    """Load and chunk tables"""
     log_message("Loading tables...")
     files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
     table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
     all_chunks = []
     for file_path in table_files:
         try:
             local_path = hf_hub_download(
@@ -399,21 +406,28 @@ def load_table_documents(repo_id, hf_token, table_dir):
             with open(local_path, 'r', encoding='utf-8') as f:
                 data = json.load(f)
-            # Extract file-level document_id
-            file_doc_id = data.get('document_id', data.get('document', 'unknown'))
             for sheet in data.get('sheets', []):
-                # Use sheet-level document_id if available, otherwise use file-level
-                sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
-                # CRITICAL: Pass document_id to chunk function
                 chunks = chunk_table_by_rows(sheet, sheet_doc_id)
                 all_chunks.extend(chunks)
         except Exception as e:
             log_message(f"Error loading {file_path}: {e}")
-    log_message(f"✓ Loaded {len(all_chunks)} table chunks")
     return all_chunks

     return chunked
def normalize_doc_id(doc_id):
    """Normalize a document ID so different spellings compare equal.

    Falsy values and the literal ``'unknown'`` are returned unchanged.
    Anything else is converted to ``str``, stripped, and rewritten so that:

    * ``ГОСТ`` + optional whitespace + ``Р`` becomes ``ГОСТ Р`` (any case),
      and a number following it is separated by exactly one space
      (``ГОСТР52`` / ``ГОСТ Р52`` / ``гост  р 52`` -> ``ГОСТ Р 52``);
    * ``НП`` + optional whitespace + ``-`` + optional whitespace becomes
      ``НП-`` (``НП - 001`` -> ``НП-001``).

    Args:
        doc_id: Raw document identifier (usually a string).

    Returns:
        The normalized identifier, or ``doc_id`` itself when it is falsy
        or equal to ``'unknown'``.
    """
    import re

    if not doc_id or doc_id == 'unknown':
        return doc_id

    doc_id = str(doc_id).strip()

    # The prefix must start at a word boundary and "Р" must not be the first
    # letter of a longer word; otherwise a case-insensitive match would
    # rewrite e.g. "гост ростех" into "ГОСТ Ростех".
    doc_id = re.sub(
        r'\bГОСТ\s*Р(?![^\W\d_])', 'ГОСТ Р', doc_id, flags=re.IGNORECASE
    )
    # Exactly one space between "ГОСТ Р" and the number that follows it, so
    # "ГОСТ Р52" and "ГОСТ Р 52" end up as the same ID.
    doc_id = re.sub(r'(?<=ГОСТ Р)\s*(?=\d)', ' ', doc_id)
    # Drop whitespace on both sides of the dash; the word boundary keeps
    # longer abbreviations that merely end in "НП" untouched.
    doc_id = re.sub(r'\bНП\s*-\s*', 'НП-', doc_id, flags=re.IGNORECASE)
    return doc_id
 def chunk_table_by_rows(table_data, doc_id, max_rows=30):
     headers = table_data.get('headers', [])
     rows = table_data.get('data', [])
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
+    # NORMALIZE document ID
+    doc_id = normalize_doc_id(doc_id)
     table_num_clean = str(table_num).strip()
+    # Create section-aware identifier
+    import re
     if 'приложени' in section.lower():
         appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
         if appendix_match:
             appendix_num = appendix_match.group(1).upper()
+            table_identifier = f"{table_num_clean} Приложение {appendix_num}"
         else:
+            table_identifier = table_num_clean
     else:
         table_identifier = table_num_clean
     if not rows:
         return []
+    log_message(f"  📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
     if len(rows) <= max_rows:
+        content = format_table_content(table_data, headers, rows, doc_id, table_identifier)
         chunk_size = len(content)
         metadata = {
             'type': 'table',
             'document_id': doc_id,
             'table_number': table_num_clean,
+            'table_identifier': table_identifier,
             'table_title': table_title,
             'section': section,
             'total_rows': len(rows),
             'is_complete_table': True
         }
+        log_message(f"    Chunk: 1/1, {chunk_size} chars, doc={doc_id}, table={table_identifier}")
         return [Document(text=content, metadata=metadata)]
     chunks = []
     overlap = 3
     for i in range(0, len(rows), max_rows - overlap):
         chunk_rows = rows[i:min(i+max_rows, len(rows))]
+        chunk_num = i // (max_rows - overlap)
         content = format_table_content(
             table_data,
             headers,
+            chunk_rows,
+            doc_id,
             table_identifier,
+            chunk_info=f"Строки {i+1}-{i+len(chunk_rows)} из {len(rows)}"
         )
         chunk_size = len(content)
             'type': 'table',
             'document_id': doc_id,
             'table_number': table_num_clean,
+            'table_identifier': table_identifier,
             'table_title': table_title,
             'section': section,
             'chunk_id': chunk_num,
             'row_end': i + len(chunk_rows),
             'total_rows': len(rows),
             'chunk_size': chunk_size,
+            'total_chunks': (len(rows) + max_rows - overlap - 1) // (max_rows - overlap),
             'is_complete_table': False
         }
+        log_message(f"    Chunk: {chunk_num+1}, rows {i}-{i+len(chunk_rows)}, {chunk_size} chars")
+        chunks.append(Document(text=content, metadata=metadata))
     return chunks
def format_table_content(table_data, headers, rows, doc_id, table_identifier, chunk_info=""):
    """Render a table (or a slice of its rows) as searchable plain text.

    The text consists of a header block (document, table identifier,
    spelling variants of the document ID, optional title and section), a
    one-paragraph summary repeating the same identifiers, and then the
    numbered rows.

    Args:
        table_data: Mapping describing the table; ``table_number``,
            ``table_title`` and ``section`` are read from it.
        headers: Column headers; skipped when empty.
        rows: Rows to render. Each row is a ``dict`` (rendered as
            ``key: value`` pairs) or a ``list`` (rendered as values).
            Rows of any other type are ignored.
        doc_id: Document identifier printed in the header and summary.
        table_identifier: Table label printed in the header and summary.
        chunk_info: Optional description of the row range, appended to the
            summary when non-empty.

    Returns:
        The formatted table text as a single string.
    """
    table_num = table_data.get('table_number', 'unknown')
    table_title = table_data.get('table_title', '')
    section = table_data.get('section', '')

    empty_markers = {'nan', 'none', '', 'null'}

    def has_value(cell):
        """Return True when a cell carries real data."""
        # Numeric zero is falsy but is a legitimate cell value; every other
        # falsy value (None, "", empty containers) is an empty cell.
        is_zero = isinstance(cell, (int, float)) and cell == 0
        if not cell and not is_zero:
            return False
        return str(cell).strip().lower() not in empty_markers

    # Work on a string copy so a non-string doc_id (e.g. None) cannot raise
    # in the membership test / replace calls below.
    doc_text = str(doc_id)
    doc_variations = [doc_text]
    if 'Р' in doc_text:
        doc_variations.append(doc_text.replace(' Р ', ' Р'))
        doc_variations.append(doc_text.replace(' Р ', 'Р'))

    pieces = [
        f"ДОКУМЕНТ: {doc_id}\n",
        f"ТАБЛИЦА: {table_identifier}\n",
    ]
    # dict.fromkeys removes duplicates while keeping insertion order, so the
    # generated text is identical from run to run (set order is not).
    pieces.extend(
        f"ДОКУМЕНТ_ВАРИАНТ: {var}\n" for var in dict.fromkeys(doc_variations)
    )
    if table_title:
        pieces.append(f"НАЗВАНИЕ: {table_title}\n")
    if section:
        pieces.append(f"РАЗДЕЛ: {section}\n")
    pieces.append(f"{'='*70}\n\n")

    # Free-text summary repeating the identifiers in sentence form.
    pieces.append(f"Документ {doc_id}. ")
    pieces.append(f"Таблица {table_identifier}. ")
    pieces.append(f"Номер таблицы {table_num}. ")
    if table_title:
        pieces.append(f"Название: {table_title}. ")
    if section:
        pieces.append(f"Раздел: {section}. ")
    pieces.append(f"Таблицы документа {doc_id}. ")
    pieces.append(f"Содержание {doc_id}. ")
    if chunk_info:
        pieces.append(f"{chunk_info}. ")

    pieces.append(f"\n\nДАННЫЕ ТАБЛИЦЫ {table_identifier}:\n{'='*70}\n\n")
    if headers:
        pieces.append(f"СТОЛБЦЫ: {' | '.join(str(h) for h in headers)}\n\n")

    for idx, row in enumerate(rows, 1):
        if isinstance(row, dict):
            cells = [f"{k}: {v}" for k, v in row.items() if has_value(v)]
        elif isinstance(row, list):
            cells = [str(v) for v in row if has_value(v)]
        else:
            continue
        # Rows with no usable cells are omitted, but still consume their
        # index so numbering reflects the position within `rows`.
        if cells:
            pieces.append(f"{idx}. {' | '.join(cells)}\n")

    return ''.join(pieces)
 def load_json_documents(repo_id, hf_token, json_dir):
     return documents
 def extract_sections_from_json(json_path):
     documents = []
     try:
             data = json.load(f)
         doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')
+        doc_id = normalize_doc_id(doc_id)  # NORMALIZE
         for section in data.get('sections', []):
             if section.get('section_text', '').strip():
                 documents.append(Document(
                     metadata={
                         'type': 'text',
                         'document_id': doc_id,
+                        'section_id': section.get('section_id', ''),
+                        'chunk_size': len(section['section_text'])
                     }
                 ))
             for subsection in section.get('subsections', []):
                 if subsection.get('subsection_text', '').strip():
                     documents.append(Document(
                         metadata={
                             'type': 'text',
                             'document_id': doc_id,
+                            'section_id': subsection.get('subsection_id', ''),
+                            'chunk_size': len(subsection['subsection_text'])
                         }
                     ))
                 for sub_sub in subsection.get('sub_subsections', []):
                     if sub_sub.get('sub_subsection_text', '').strip():
                         documents.append(Document(
                             metadata={
                                 'type': 'text',
                                 'document_id': doc_id,
+                                'section_id': sub_sub.get('sub_subsection_id', ''),
+                                'chunk_size': len(sub_sub['sub_subsection_text'])
                             }
                         ))
 def load_table_documents(repo_id, hf_token, table_dir):
     log_message("Loading tables...")
     files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
     table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
     all_chunks = []
+    doc_table_count = {}
     for file_path in table_files:
         try:
             local_path = hf_hub_download(
             with open(local_path, 'r', encoding='utf-8') as f:
                 data = json.load(f)
+            file_doc_id = normalize_doc_id(data.get('document_id', data.get('document', 'unknown')))
             for sheet in data.get('sheets', []):
+                sheet_doc_id = normalize_doc_id(sheet.get('document_id', sheet.get('document', file_doc_id)))
                 chunks = chunk_table_by_rows(sheet, sheet_doc_id)
                 all_chunks.extend(chunks)
+                if sheet_doc_id not in doc_table_count:
+                    doc_table_count[sheet_doc_id] = 0
+                doc_table_count[sheet_doc_id] += len(chunks)
         except Exception as e:
             log_message(f"Error loading {file_path}: {e}")
+    log_message(f"\n{'='*60}")
+    log_message("TABLE LOADING SUMMARY:")
+    for doc_id, count in sorted(doc_table_count.items()):
+        log_message(f"  {doc_id}: {count} table chunks")
+    log_message(f"TOTAL: {len(all_chunks)} table chunks")
+    log_message(f"{'='*60}\n")
     return all_chunks

utils.py CHANGED Viewed

@@ -41,33 +41,19 @@ def preprocess_query(question):
     question_lower = question.lower()
-    # Enhanced table detection with appendix
-    table_patterns = [
-        r'табли[цу]\w*\s+([а-яa-z0-9\.]+)(?:\s+(?:из\s+)?приложени[яеий]\s+(\d+|[а-я]))?',
-        r'табли[цу]\w*\s+(?:№|номер)?\s*([а-яa-z0-9\.]+)',
-    ]
-    doc_match = re.search(r'(гост|нп|му)[^\s]*\s*[рp№-]*\s*([0-9\.-]+)', question_lower)
     enhanced_query = question
-    for pattern in table_patterns:
-        table_match = re.search(pattern, question_lower)
-        if table_match:
-            table_num = table_match.group(1).upper()
-            enhanced_query += f" таблица номер {table_num}"
-            # Add appendix context if mentioned
-            if len(table_match.groups()) > 1 and table_match.group(2):
-                appendix_num = table_match.group(2).upper()
-                enhanced_query += f" приложение {appendix_num}"
-            break
     if doc_match:
-        doc_id = f"{doc_match.group(1).upper()} {doc_match.group(2)}"
-        enhanced_query += f" документ {doc_id}"
-        # Add variations for better matching
-        enhanced_query += f" {doc_match.group(1).upper()}Р {doc_match.group(2)}"
     return enhanced_query
@@ -119,7 +105,7 @@ def answer_question(question, query_engine, reranker):
             context_parts.append(f"{source_label}\n{n.text}")
         context = "\n\n" + ("="*70 + "\n\n").join(context_parts)
         prompt = f"""Ты эксперт по технической документации.
 КОНТЕКСТ:
@@ -129,10 +115,8 @@ def answer_question(question, query_engine, reranker):
 ИНСТРУКЦИИ:
 1. Используй ТОЛЬКО контекст выше
-2. Если спрашивают содержание таблицы - ОБЯЗАТЕЛЬНО приведи ВСЕ данные из таблицы
-3. Укажи источник: документ и номер таблицы
-4. Если таблица разбита на части - объедини информацию
-5. Если информации нет - четко скажи об этом
 ОТВЕТ:"""

     question_lower = question.lower()
+    # Extract document ID and normalize
+    doc_match = re.search(r'(гост|нп|му)\s*р?\s*[№-]*\s*([0-9\.-]+)', question_lower)
     enhanced_query = question
     if doc_match:
+        doc_type = doc_match.group(1).upper()
+        doc_num = doc_match.group(2)
+        # Add normalized versions
+        enhanced_query += f" {doc_type} Р {doc_num}"
+        enhanced_query += f" {doc_type}Р {doc_num}"
+        enhanced_query += f" {doc_type} {doc_num}"
     return enhanced_query
             context_parts.append(f"{source_label}\n{n.text}")
         context = "\n\n" + ("="*70 + "\n\n").join(context_parts)
+        from config import CUSTOM_PROMPT
         prompt = f"""Ты эксперт по технической документации.
 КОНТЕКСТ:
 ИНСТРУКЦИИ:
 1. Используй ТОЛЬКО контекст выше
+2. Укажи источник: документ и номер таблицы
+3. Если информации нет - четко скажи об этом
 ОТВЕТ:"""