Commit 0b34162 · Parent(s): 6b0d121
simplest version

documents_prep.py CHANGED (+56 -27)
@@ -23,54 +23,64 @@ def chunk_text_documents(documents):
     for i, chunk in enumerate(chunks):
         chunk.metadata.update({
             'chunk_id': i,
-            'total_chunks': len(chunks)
+            'total_chunks': len(chunks),
+            'chunk_size': len(chunk.text)  # Add chunk size
         })
         chunked.append(chunk)
 
-    …
+    # Log statistics
+    if chunked:
+        avg_size = sum(len(c.text) for c in chunked) / len(chunked)
+        min_size = min(len(c.text) for c in chunked)
+        max_size = max(len(c.text) for c in chunked)
+        log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
+        log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
+
     return chunked
 
 
-def chunk_table_by_rows(table_data, doc_id, max_rows=50):
+def chunk_table_by_rows(table_data, doc_id, max_rows=10):  # Reduced from 30
     headers = table_data.get('headers', [])
     rows = table_data.get('data', [])
     table_num = table_data.get('table_number', 'unknown')
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
 
-    …
+    # Keep original format
+    table_num_clean = str(table_num).strip()
 
     if not rows:
         return []
 
-    …
-    table_data['document_id'] = doc_id
-
+    # For small tables, keep as single chunk
     if len(rows) <= max_rows:
         content = format_table_content(table_data, headers, rows)
+        chunk_size = len(content)
+        log_message(f" 📊 Table {table_num_clean} ({doc_id}): {len(rows)} rows → 1 chunk ({chunk_size} chars)")
+
         return [Document(
             text=content,
             metadata={
                 'type': 'table',
                 'document_id': doc_id,
                 'table_number': table_num_clean,
-                'table_number_original': table_num,
                 'table_title': table_title,
                 'section': section,
                 'total_rows': len(rows),
+                'chunk_size': chunk_size,
                 'is_complete_table': True
             }
         )]
 
+    # For large tables, chunk with overlap
     chunks = []
-    overlap = …
+    overlap = 3  # Reduced overlap
+    chunk_num = 0
 
     for i in range(0, len(rows), max_rows - overlap):
         chunk_rows = rows[i:min(i+max_rows, len(rows))]
 
-        chunk_info = f"Часть …
-        if i > 0:
-            chunk_info += " (с перекрытием для контекста)"
+        chunk_info = f"Часть {chunk_num+1}: строки {i+1}-{i+len(chunk_rows)} из {len(rows)}"
 
         content = format_table_content(
             table_data,
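The chunk_size field added above makes each chunk's length available from metadata alone, without re-reading the text. A minimal sketch of a downstream use (the helper name and the 2000-char budget are assumptions, not part of this commit):

# Hypothetical helper: flag chunks that exceed an embedding-model
# character budget, using only the new 'chunk_size' metadata field.
def oversized_chunks(chunked, limit=2000):
    return [c for c in chunked if c.metadata.get('chunk_size', 0) > limit]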
@@ -79,25 +89,31 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=50):
             chunk_info=chunk_info
         )
 
+        chunk_size = len(content)
+
         chunks.append(Document(
             text=content,
             metadata={
                 'type': 'table',
                 'document_id': doc_id,
                 'table_number': table_num_clean,
-                'table_number_original': table_num,
                 'table_title': table_title,
                 'section': section,
-                'chunk_id': …
+                'chunk_id': chunk_num,
                 'row_start': i,
                 'row_end': i + len(chunk_rows),
                 'total_rows': len(rows),
+                'chunk_size': chunk_size,
                 'total_chunks': ((len(rows) - overlap) // (max_rows - overlap)) + 1,
                 'is_complete_table': False
             }
         ))
+        chunk_num += 1
 
     log_message(f" 📊 Table {table_num_clean} ({doc_id}): {len(rows)} rows → {len(chunks)} chunks")
+    for idx, chunk in enumerate(chunks):
+        log_message(f" Chunk {idx+1}: rows {chunk.metadata['row_start']}-{chunk.metadata['row_end']} ({chunk.metadata['chunk_size']} chars)")
+
     return chunks
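With the new defaults (max_rows=10, overlap=3) the loop steps by max_rows - overlap = 7 rows, so consecutive chunks share their 3 boundary rows. A worked check of the arithmetic (numbers only, not part of the commit):

# range(0, 25, 7) yields starts [0, 7, 14, 21], i.e. windows
# rows[0:10], rows[7:17], rows[14:24], rows[21:25] -> 4 chunks.
stored = ((25 - 3) // 7) + 1   # 4, agrees with the loop here
# The stored formula can undercount: for 15 rows the loop yields
# starts [0, 7, 14] (3 chunks) but ((15 - 3) // 7) + 1 == 2.
# The exact count is the length of the range:
import math
exact = math.ceil(15 / 7)     # 3, always matches the loop

This is only an observation about the stored 'total_chunks' value; the commit itself keeps the floor-division formula.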
@@ -220,16 +236,13 @@ def load_json_documents(repo_id, hf_token, json_dir):
         try:
             file_content = zf.read(json_file)
 
-            if …
-                …
-                …
-                continue
-
-            if not file_content.strip().startswith(b'{'):
-                log_message(f" ✗ Skipping: {json_file} (not valid JSON)")
+            # Skip if file is too small
+            if len(file_content) < 10:
+                log_message(f" ✗ Skipping: {json_file} (file too small)")
                 stats['failed'] += 1
                 continue
 
+            # Try UTF-8 first (most common)
             try:
                 text_content = file_content.decode('utf-8')
             except UnicodeDecodeError:
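The old bytes-level startswith(b'{') test is dropped here and redone on the decoded text in the next hunk. A byte-order mark is one plausible reason: it defeats the bytes test even for valid JSON. An illustrative check (values invented):

# A UTF-16 encoded JSON file starts with a BOM, not with b'{'.
raw = '{"a": 1}'.encode('utf-16')
print(raw[:4])                                        # b'\xff\xfe{\x00'
print(raw.strip().startswith(b'{'))                   # False
print(raw.decode('utf-16').strip().startswith('{'))   # True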
@@ -237,11 +250,21 @@
                     text_content = file_content.decode('utf-8-sig')
                 except UnicodeDecodeError:
                     try:
-                        …
+                        # Try UTF-16 (the issue you're seeing)
+                        text_content = file_content.decode('utf-16')
                     except UnicodeDecodeError:
-                        …
-                        …
-                        …
+                        try:
+                            text_content = file_content.decode('windows-1251')
+                        except UnicodeDecodeError:
+                            log_message(f" ✗ Skipping: {json_file} (encoding failed)")
+                            stats['failed'] += 1
+                            continue
+
+            # Validate JSON structure
+            if not text_content.strip().startswith('{') and not text_content.strip().startswith('['):
+                log_message(f" ✗ Skipping: {json_file} (not valid JSON)")
+                stats['failed'] += 1
+                continue
 
             with tempfile.NamedTemporaryFile(mode='w', delete=False,
                                              suffix='.json', encoding='utf-8') as tmp:
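The nested try/except chain above is equivalent to walking an ordered list of candidate encodings. A flattened sketch (the helper name is an assumption; the candidate order is the commit's):

# Sketch only: same fallback order as the nested version above.
def decode_with_fallback(raw):
    for enc in ('utf-8', 'utf-8-sig', 'utf-16', 'windows-1251'):
        try:
            return raw.decode(enc)
        except UnicodeDecodeError:
            continue
    return None  # caller logs the skip and increments stats['failed']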
@@ -395,19 +418,25 @@ def load_image_documents(repo_id, hf_token, image_dir):
                 content += f"Описание: {row.get('Описание изображение', '')}\n"
                 content += f"Раздел: {row.get('Раздел документа', '')}\n"
 
+                chunk_size = len(content)
+
                 documents.append(Document(
                     text=content,
                     metadata={
                         'type': 'image',
                         'document_id': str(row.get('Обозначение документа', 'unknown')),
                         'image_number': str(row.get('№ Изображения', 'unknown')),
-                        'section': str(row.get('Раздел документа', ''))
+                        'section': str(row.get('Раздел документа', '')),
+                        'chunk_size': chunk_size
                     }
                 ))
         except Exception as e:
             log_message(f"Error loading {file_path}: {e}")
 
-    …
+    if documents:
+        avg_size = sum(d.metadata['chunk_size'] for d in documents) / len(documents)
+        log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")
+
     return documents
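The closing summary reads sizes back from metadata rather than from the text; the two agree by construction, since 'chunk_size' is set to len(content) just before each Document is built. For illustration:

# Both averages agree for the documents built above.
avg_from_meta = sum(d.metadata['chunk_size'] for d in documents) / len(documents)
avg_from_text = sum(len(d.text) for d in documents) / len(documents)
assert avg_from_meta == avg_from_text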