Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 5, 2025

Commit

0b28542

1 Parent(s): 9985d37

simplest version

Browse files

Files changed (1) hide show

documents_prep.py +131 -23

documents_prep.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import json
 import pandas as pd
 from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
@@ -41,6 +42,10 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
     if not rows:
         return []
     # Small table: keep whole
     if len(rows) <= max_rows:
         content = format_table_content(table_data, headers, rows)
@@ -60,8 +65,12 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
     chunks = []
     for i in range(0, len(rows), max_rows):
         chunk_rows = rows[i:i+max_rows]
-        content = format_table_content(table_data, headers, chunk_rows,
-                                       chunk_info=f"Rows {i+1}-{i+len(chunk_rows)}")
         chunks.append(Document(
             text=content,
@@ -78,18 +87,24 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
             }
         ))
-    log_message(f"  📊 Table {table_num}: {len(rows)} rows → {len(chunks)} chunks")
     return chunks
 def format_table_content(table_data, headers, rows, chunk_info=""):
     """Format table for semantic search"""
-    doc_id = table_data.get('document_id', 'unknown')
     table_num = table_data.get('table_number', 'unknown')
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
-    content = f"Документ: {doc_id}\n"
     content += f"Таблица: {table_num}\n"
     if table_title:
         content += f"Название: {table_title}\n"
@@ -97,29 +112,52 @@ def format_table_content(table_data, headers, rows, chunk_info=""):
         content += f"Раздел: {section}\n"
     if chunk_info:
         content += f"{chunk_info}\n"
-    content += f"\nКолонки: {' | '.join(str(h) for h in headers)}\n\n"
-    # Add rows
     for row in rows:
         if isinstance(row, dict):
             parts = [f"{k}: {v}" for k, v in row.items()
                     if v and str(v).strip() and str(v) != 'nan']
-            content += ' | '.join(parts) + "\n"
         elif isinstance(row, list):
             parts = [str(v) for v in row if v and str(v).strip() and str(v) != 'nan']
-            content += ' | '.join(parts) + "\n"
     return content
 def load_json_documents(repo_id, hf_token, json_dir):
-    """Load text sections from JSON"""
     log_message("Loading JSON documents...")
     files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
     json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
     documents = []
     for file_path in json_files:
         try:
             local_path = hf_hub_download(
@@ -129,26 +167,91 @@ def load_json_documents(repo_id, hf_token, json_dir):
                 token=hf_token
             )
-            with open(local_path, 'r', encoding='utf-8') as f:
-                data = json.load(f)
-            doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')
-            # Extract sections
-            for section in data.get('sections', []):
-                if section.get('section_text', '').strip():
                     documents.append(Document(
-                        text=section['section_text'],
                         metadata={
                             'type': 'text',
                             'document_id': doc_id,
-                            'section_id': section.get('section_id', '')
                         }
                     ))
-        except Exception as e:
-            log_message(f"Error loading {file_path}: {e}")
-    log_message(f"✓ Loaded {len(documents)} text sections")
     return documents
@@ -172,10 +275,15 @@ def load_table_documents(repo_id, hf_token, table_dir):
             with open(local_path, 'r', encoding='utf-8') as f:
                 data = json.load(f)
-            doc_id = data.get('document_id', 'unknown')
             for sheet in data.get('sheets', []):
-                chunks = chunk_table_by_rows(sheet, doc_id)
                 all_chunks.extend(chunks)
         except Exception as e:

 import json
+import zipfile
 import pandas as pd
 from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
     if not rows:
         return []
+    # Ensure table_data has document_id for format_table_content
+    if 'document_id' not in table_data:
+        table_data['document_id'] = doc_id
     # Small table: keep whole
     if len(rows) <= max_rows:
         content = format_table_content(table_data, headers, rows)
     chunks = []
     for i in range(0, len(rows), max_rows):
         chunk_rows = rows[i:i+max_rows]
+        content = format_table_content(
+            table_data,
+            headers,
+            chunk_rows,
+            chunk_info=f"Строки {i+1}-{i+len(chunk_rows)} из {len(rows)}"
+        )
         chunks.append(Document(
             text=content,
             }
         ))
+    log_message(f"  📊 Table {table_num} ({doc_id}): {len(rows)} rows → {len(chunks)} chunks")
     return chunks
 def format_table_content(table_data, headers, rows, chunk_info=""):
     """Format table for semantic search"""
+    doc_id = table_data.get('document_id', table_data.get('document', 'unknown'))
     table_num = table_data.get('table_number', 'unknown')
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
+    # Normalize table number
+    if table_num and table_num != 'unknown':
+        if not str(table_num).startswith('№'):
+            table_num = f"№{table_num}"
+    content = f"=== ТАБЛИЦА ===\n"
+    content += f"Документ: {doc_id}\n"
     content += f"Таблица: {table_num}\n"
     if table_title:
         content += f"Название: {table_title}\n"
         content += f"Раздел: {section}\n"
     if chunk_info:
         content += f"{chunk_info}\n"
+    content += f"================\n\n"
+    # Add searchable description
+    content += f"Это таблица {table_num} из документа {doc_id}. "
+    if table_title:
+        content += f"{table_title}. "
+    if section:
+        content += f"Находится в разделе: {section}. "
+    content += f"\n\n"
+    # Headers
+    if headers:
+        header_str = ' | '.join(str(h) for h in headers)
+        content += f"Колонки: {header_str}\n\n"
+    # Rows
     for row in rows:
         if isinstance(row, dict):
             parts = [f"{k}: {v}" for k, v in row.items()
                     if v and str(v).strip() and str(v) != 'nan']
+            if parts:
+                content += ' | '.join(parts) + "\n"
         elif isinstance(row, list):
             parts = [str(v) for v in row if v and str(v).strip() and str(v) != 'nan']
+            if parts:
+                content += ' | '.join(parts) + "\n"
     return content
 def load_json_documents(repo_id, hf_token, json_dir):
+    """Load text sections from JSON (including ZIPs)"""
+    import zipfile
+    import tempfile
     log_message("Loading JSON documents...")
     files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
     json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
+    zip_files = [f for f in files if f.startswith(json_dir) and f.endswith('.zip')]
+    log_message(f"Found {len(json_files)} JSON files and {len(zip_files)} ZIP files")
     documents = []
+    # Load direct JSON files
     for file_path in json_files:
         try:
             local_path = hf_hub_download(
                 token=hf_token
             )
+            docs = extract_sections_from_json(local_path)
+            documents.extend(docs)
+        except Exception as e:
+            log_message(f"Error loading {file_path}: {e}")
+    # Extract and load ZIP files
+    for zip_path in zip_files:
+        try:
+            local_zip = hf_hub_download(
+                repo_id=repo_id,
+                filename=zip_path,
+                repo_type="dataset",
+                token=hf_token
+            )
+            with zipfile.ZipFile(local_zip, 'r') as zf:
+                for json_file in zf.namelist():
+                    if json_file.endswith('.json') and not json_file.startswith('__MACOSX'):
+                        with zf.open(json_file) as f:
+                            with tempfile.NamedTemporaryFile(delete=False, suffix='.json') as tmp:
+                                tmp.write(f.read())
+                                tmp_path = tmp.name
+                            docs = extract_sections_from_json(tmp_path)
+                            documents.extend(docs)
+                            import os
+                            os.unlink(tmp_path)
+        except Exception as e:
+            log_message(f"Error loading ZIP {zip_path}: {e}")
+    log_message(f"✓ Loaded {len(documents)} text sections")
+    return documents
# (key prefix, key of the nested child list) for each nesting level.
# "<prefix>_text" and "<prefix>_id" are the keys read on a node of that level.
_SECTION_LEVELS = (
    ('section', 'subsections'),
    ('subsection', 'sub_subsections'),
    ('sub_subsection', None),
)


def _collect_section_documents(node, doc_id, depth, documents):
    """Append a Document for *node* if it has text, then recurse into its children."""
    prefix, children_key = _SECTION_LEVELS[depth]
    text = node.get(f'{prefix}_text')
    # A missing, null or non-string text is treated as "no text" rather than
    # raising, so one malformed node does not abort the rest of the file.
    if isinstance(text, str) and text.strip():
        documents.append(Document(
            text=text,
            metadata={
                'type': 'text',
                'document_id': doc_id,
                'section_id': node.get(f'{prefix}_id', '')
            }
        ))
    if children_key is not None:
        for child in node.get(children_key) or []:
            _collect_section_documents(child, doc_id, depth + 1, documents)


def extract_sections_from_json(json_path):
    """Extract sections from a single JSON file.

    Reads the file as UTF-8 JSON and creates one ``Document`` for every
    section, subsection and sub-subsection that has non-blank text. Documents
    appear in depth-first order: a section, then each of its subsections
    followed by that subsection's sub-subsections.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        A list of ``Document`` objects with metadata keys ``'type'``
        (always ``'text'``), ``'document_id'`` (from
        ``document_metadata.document_id``, default ``'unknown'``) and
        ``'section_id'``. On any error the problem is logged and the
        documents collected so far are returned.
    """
    documents = []
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # "or {}" / "or []" also cover keys that are present but null.
        doc_id = (data.get('document_metadata') or {}).get('document_id', 'unknown')
        for section in data.get('sections') or []:
            _collect_section_documents(section, doc_id, 0, documents)
    except Exception as e:
        # Best-effort loader: report the problem and return what was collected.
        log_message(f"Error extracting from {json_path}: {e}")
    return documents
             with open(local_path, 'r', encoding='utf-8') as f:
                 data = json.load(f)
+            # Extract file-level document_id
+            file_doc_id = data.get('document_id', data.get('document', 'unknown'))
             for sheet in data.get('sheets', []):
+                # Use sheet-level document_id if available, otherwise use file-level
+                sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
+                # CRITICAL: Pass document_id to chunk function
+                chunks = chunk_table_by_rows(sheet, sheet_doc_id)
                 all_chunks.extend(chunks)
         except Exception as e: