Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 13, 2025

Commit

0fa3553

1 Parent(s): affe7a3

old version of documents prep

Browse files

Files changed (1) hide show

documents_prep.py +25 -102

documents_prep.py CHANGED Viewed

@@ -34,26 +34,6 @@ def chunk_text_documents(documents):
     return chunked
-def normalize_connection_type(s):
-    # Replace Cyrillic with Latin
-    s = s.replace('С', 'C').replace('с', 'c')
-    s = s.replace('У', 'U').replace('у', 'u')
-    s = s.replace('Т', 'T').replace('т', 't')
-    s= s.replace('С-', 'C-').replace('с-', 'c-')
-    s = s.replace('У-', 'U-').replace('у-', 'u-')
-    s = s.replace('Т-', 'T-').replace('т-', 't-')
-    # REMOVE ALL HYPHENS for consistent tokenization
-    s = s.replace('-', '')
-    return s
-def extract_connection_type(text):
-    import re
-    # Match pattern with or without hyphens: C-25, C-25-1, С25, etc.
-    match = re.search(r'[СCс]-?\d+(?:-\d+)*', text)
-    if match:
-        normalized = normalize_connection_type(match.group(0))
-        return normalized
-    return ''
 def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
     headers = table_data.get('headers', [])
@@ -61,7 +41,6 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     table_num = table_data.get('table_number', 'unknown')
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
-    table_description = table_data.get('table_description', '')
     table_num_clean = str(table_num).strip()
@@ -81,13 +60,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     log_message(f"  📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
-    # Calculate base metadata size - NOW INCLUDING DESCRIPTION
     base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
-    # ADD DESCRIPTION HERE if it exists
-    if table_description:
-        base_content += f"ОПИСАНИЕ: {table_description}\n\n"
     base_size = len(base_content)
     available_space = max_chars - base_size - 200
@@ -105,9 +79,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
             'section': section,
             'total_rows': len(rows),
             'chunk_size': len(content),
-            'is_complete_table': True,
-            'connection_type': extract_connection_type(table_title) if table_title else ''  # NEW
         }
         log_message(f"    Single chunk: {len(content)} chars, {len(rows)} rows")
@@ -141,8 +113,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
                 'row_end': current_rows[-1]['_idx'],
                 'total_rows': len(rows),
                 'chunk_size': len(content),
-                'is_complete_table': False,
-                'connection_type': extract_connection_type(table_title) if table_title else ''  # NEW
             }
             chunks.append(Document(text=content, metadata=metadata))
@@ -184,62 +155,44 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     return chunks
 def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
-    content = f"ДОКУМЕНТ: {doc_id}\n"
-    content += f"ТАБЛИЦА: {table_identifier}\n"
-    if table_title:
-        # Normalize the title text itself for better searchability
-        normalized_title = normalize_connection_type(table_title)
-        content += f"НАЗВАНИЕ ТАБЛИЦЫ: {normalized_title}\n"
-        # Extract and store the normalized connection type
-        connection_type = extract_connection_type(table_title)
-        if connection_type:
-            content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
-    if table_num and table_num != table_identifier:
-        content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
     if section:
-        content += f"РАЗДЕЛ ДОКУМЕНТА: {section}\n"
-    content += f"\n{'='*70}\n"
     if headers:
-        content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
-        for i, h in enumerate(headers, 1):
-            # NORMALIZE HEADERS TOO
-            normalized_header = normalize_connection_type(h)
-            content += f"  {i}. {normalized_header}\n"
-        content += "\n"
-    content += "ДАННЫЕ ТАБЛИЦЫ:\n"
     return content
 def format_single_row(row, idx):
-    """Format a single row with normalization"""
     if isinstance(row, dict):
-        # NORMALIZE VALUES IN ROWS
-        parts = []
-        for k, v in row.items():
-            if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']:
-                normalized_v = normalize_connection_type(str(v))
-                parts.append(f"{k}: {normalized_v}")
         if parts:
             return f"{idx}. {' | '.join(parts)}\n"
     elif isinstance(row, list):
-        # NORMALIZE LIST VALUES
-        parts = []
-        for v in row:
-            if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']:
-                normalized_v = normalize_connection_type(str(v))
-                parts.append(normalized_v)
         if parts:
             return f"{idx}. {' | '.join(parts)}\n"
     return ""
 def format_table_rows(rows):
     """Format multiple rows"""
     content = ""
@@ -440,8 +393,6 @@ def load_table_documents(repo_id, hf_token, table_dir):
     table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
     all_chunks = []
-    connection_type_sources = {}  # Track which table each type comes from
     for file_path in table_files:
         try:
             local_path = hf_hub_download(
@@ -458,35 +409,18 @@ def load_table_documents(repo_id, hf_token, table_dir):
             for sheet in data.get('sheets', []):
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
-                table_num = sheet.get('table_number', 'unknown')
-                table_title = sheet.get('table_title', '')
                 chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
                 all_chunks.extend(chunks)
-                # Track connection type source
-                conn_type = extract_connection_type(table_title)
-                if conn_type:
-                    if conn_type not in connection_type_sources:
-                        connection_type_sources[conn_type] = []
-                    connection_type_sources[conn_type].append(f"{sheet_doc_id} Table {table_num}")
         except Exception as e:
             log_message(f"Error loading {file_path}: {e}")
     log_message(f"✓ Loaded {len(all_chunks)} table chunks")
-    log_message("="*60)
-    log_message("CONNECTION TYPES AND THEIR SOURCES:")
-    for conn_type in sorted(connection_type_sources.keys()):
-        sources = connection_type_sources[conn_type]
-        log_message(f"  {conn_type}: {len(sources)} tables")
-        for src in sources:
-            log_message(f"    - {src}")
-    log_message("="*60)
     return all_chunks
 def load_image_documents(repo_id, hf_token, image_dir):
     """Load image descriptions"""
     log_message("Loading images...")
@@ -534,7 +468,9 @@ def load_image_documents(repo_id, hf_token, image_dir):
     return documents
 def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
     log_message("="*60)
     log_message("STARTING DOCUMENT LOADING")
     log_message("="*60)
@@ -546,19 +482,6 @@ def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
     # Load tables (already chunked)
     table_chunks = load_table_documents(repo_id, hf_token, table_dir)
-    # NEW: Analyze connection types in tables
-    connection_types = {}
-    for chunk in table_chunks:
-        conn_type = chunk.metadata.get('connection_type', '')
-        if conn_type:
-            connection_types[conn_type] = connection_types.get(conn_type, 0) + 1
-    log_message("="*60)
-    log_message("CONNECTION TYPES FOUND IN TABLES:")
-    for conn_type, count in sorted(connection_types.items()):
-        log_message(f"  {conn_type}: {count} chunks")
-    log_message("="*60)
     # Load images (no chunking needed)
     image_docs = load_image_documents(repo_id, hf_token, image_dir)

     return chunked
 def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
     headers = table_data.get('headers', [])
     table_num = table_data.get('table_number', 'unknown')
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
     table_num_clean = str(table_num).strip()
     log_message(f"  📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
+    # Calculate base metadata size
     base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
     base_size = len(base_content)
     available_space = max_chars - base_size - 200
             'section': section,
             'total_rows': len(rows),
             'chunk_size': len(content),
+            'is_complete_table': True
         }
         log_message(f"    Single chunk: {len(content)} chars, {len(rows)} rows")
                 'row_end': current_rows[-1]['_idx'],
                 'total_rows': len(rows),
                 'chunk_size': len(content),
+                'is_complete_table': False
             }
             chunks.append(Document(text=content, metadata=metadata))
     return chunks
 def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
     """Build the text preamble placed in front of a table's rows.

     The preamble names the table and its document, then lists the table
     number, title and section when each is truthy, a 70-character ``=``
     rule, the column headers joined with `` | `` (when any are given),
     and finally the data marker line.

     Args:
         doc_id: Identifier of the document the table belongs to.
         table_identifier: Label used for the table in the first line.
         table_num: Table number; emitted only when truthy.
         table_title: Table title; emitted only when truthy.
         section: Document section; emitted only when truthy.
         headers: Iterable of column headers; each is converted with ``str``.

     Returns:
         The assembled header text, always ending with the data marker line.
     """
     pieces = [f"ТАБЛИЦА {table_identifier} из документа {doc_id}\n"]

     # The table number is surfaced near the top so it is prominent for matching.
     if table_num:
         pieces.append(f"ТИП: {table_num}\n")
     if table_title:
         pieces.append(f"НАЗВАНИЕ: {table_title}\n")
     if section:
         pieces.append(f"РАЗДЕЛ: {section}\n")

     pieces.append(f"{'='*70}\n")

     if headers:
         header_str = ' | '.join(map(str, headers))
         pieces.append(f"ЗАГОЛОВКИ: {header_str}\n\n")

     pieces.append("ДАННЫЕ:\n")
     return "".join(pieces)
 def format_single_row(row, idx):
     """Format one table row as a numbered, pipe-separated line.

     Args:
         row: A dict (rendered as ``key: value`` pairs) or a list (rendered
             as bare values). Any other type produces an empty string.
         idx: Row number written at the start of the line.

     Returns:
         ``"{idx}. part | part\\n"``, or ``""`` when the row has no usable
         cells or is neither a dict nor a list.
     """
     placeholders = ('nan', 'none', '')

     def has_value(cell):
         # Numeric zero is legitimate table data, so a plain truthiness test
         # must not discard it; only falsy values that do not equal zero
         # (None, "", empty containers) are treated as missing.
         if not cell and cell != 0:
             return False
         # Strip before comparing so padded placeholders such as " nan "
         # are recognised as missing too.
         return str(cell).strip().lower() not in placeholders

     if isinstance(row, dict):
         parts = [f"{k}: {v}" for k, v in row.items() if has_value(v)]
     elif isinstance(row, list):
         parts = [str(v) for v in row if has_value(v)]
     else:
         return ""

     if parts:
         return f"{idx}. {' | '.join(parts)}\n"
     return ""
 def format_table_rows(rows):
     """Format multiple rows"""
     content = ""
     table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
     all_chunks = []
     for file_path in table_files:
         try:
             local_path = hf_hub_download(
             for sheet in data.get('sheets', []):
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
+                # Use the consistent MAX_CHARS_TABLE from config
                 chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
                 all_chunks.extend(chunks)
         except Exception as e:
             log_message(f"Error loading {file_path}: {e}")
     log_message(f"✓ Loaded {len(all_chunks)} table chunks")
     return all_chunks
 def load_image_documents(repo_id, hf_token, image_dir):
     """Load image descriptions"""
     log_message("Loading images...")
     return documents
 def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
+    """Main loader - combines all document types"""
     log_message("="*60)
     log_message("STARTING DOCUMENT LOADING")
     log_message("="*60)
     # Load tables (already chunked)
     table_chunks = load_table_documents(repo_id, hf_token, table_dir)
     # Load images (no chunking needed)
     image_docs = load_image_documents(repo_id, hf_token, image_dir)