Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Community

MrSimple07 committed on Oct 5, 2025

Commit

05822e9

1 Parent(s): 7565a55

max chars = 2000 + removed normalize_doc_id

Browse files

Files changed (1) hide show

documents_prep.py +4 -25

documents_prep.py CHANGED Viewed

@@ -7,7 +7,7 @@ from llama_index.core.text_splitter import SentenceSplitter
 from my_logging import log_message
 # Configuration
-CHUNK_SIZE = 1024
 CHUNK_OVERLAP = 256
 def chunk_text_documents(documents):
@@ -38,21 +38,6 @@ def chunk_text_documents(documents):
     return chunked
-def normalize_doc_id(doc_id):
-    """Normalize document ID for consistent matching"""
-    if not doc_id or doc_id == 'unknown':
-        return doc_id
-    doc_id = str(doc_id).strip()
-    # Normalize spacing: "ГОСТ Р" variations
-    import re
-    doc_id = re.sub(r'ГОСТ\s*Р', 'ГОСТ Р', doc_id, flags=re.IGNORECASE)
-    doc_id = re.sub(r'НП\s*-', 'НП-', doc_id, flags=re.IGNORECASE)
-    return doc_id
 def chunk_table_by_content(table_data, doc_id, max_chars=2000):
     """Chunk tables by content size instead of rows"""
     headers = table_data.get('headers', [])
@@ -60,9 +45,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=2000):
     table_num = table_data.get('table_number', 'unknown')
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
-    # NORMALIZE document ID
-    doc_id = normalize_doc_id(doc_id)
     table_num_clean = str(table_num).strip()
     # Create section-aware identifier
@@ -85,7 +68,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=2000):
     # Calculate base metadata size (everything except row data)
     base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
     base_size = len(base_content)
-    available_space = max_chars - base_size - 200  # Reserve 200 chars for footer
     # If entire table fits, return as one chunk
     full_rows_content = format_table_rows(rows)
@@ -239,10 +222,7 @@ def format_table_footer(table_identifier, doc_id):
     """Format table footer"""
     return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
-# Update load_table_documents to use new function
 def load_table_documents(repo_id, hf_token, table_dir):
-    """Load and chunk tables by content size"""
     log_message("Loading tables...")
     files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
@@ -266,8 +246,7 @@ def load_table_documents(repo_id, hf_token, table_dir):
             for sheet in data.get('sheets', []):
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
-                # Use content-based chunking instead of row-based
-                chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=2500)
                 all_chunks.extend(chunks)
         except Exception as e:

 from my_logging import log_message
 # Configuration
+CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 256
 def chunk_text_documents(documents):
     return chunked
 def chunk_table_by_content(table_data, doc_id, max_chars=2000):
     """Chunk tables by content size instead of rows"""
     headers = table_data.get('headers', [])
     table_num = table_data.get('table_number', 'unknown')
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
     table_num_clean = str(table_num).strip()
     # Create section-aware identifier
     # Calculate base metadata size (everything except row data)
     base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
     base_size = len(base_content)
+    available_space = max_chars - base_size - 200
     # If entire table fits, return as one chunk
     full_rows_content = format_table_rows(rows)
     """Format table footer"""
     return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
 def load_table_documents(repo_id, hf_token, table_dir):
     log_message("Loading tables...")
     files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
             for sheet in data.get('sheets', []):
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
+                chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=2000)
                 all_chunks.extend(chunks)
         except Exception as e: