Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Community

MrSimple07 committed on Oct 8, 2025

Commit

9c9aff4

1 Parent(s): 04f5154

big debug change

Browse files

Files changed (2) hide show

app.py +26 -23
documents_prep.py +3 -1

app.py CHANGED Viewed

@@ -149,37 +149,39 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
         all_documents = []
         chunks_df = None
         if use_json_instead_csv and json_files_dir:
             log_message("Используем JSON файлы вместо CSV")
-            from documents_prep import load_json_documents, chunk_text_documents
-            # Load JSON docs (returns list of Documents)
-            json_documents = load_json_documents(repo_id, hf_token, json_files_dir)
-            # Chunk them
-            json_chunks = chunk_text_documents(json_documents)
-            all_documents.extend(json_chunks)
         else:
             if chunks_filename:
                 log_message("Загружаем данные из CSV")
-        if table_data_dir:
-            log_message("Добавляю табличные данные")
-            from documents_prep import load_table_documents
-            # load_table_documents already returns chunked documents
-            table_chunks = load_table_documents(repo_id, hf_token, table_data_dir)
-            log_message(f"Загружено {len(table_chunks)} табличных чанков")
-            all_documents.extend(table_chunks)
-        if image_data_dir:
-            log_message("Добавляю данные изображений")
-            from documents_prep import load_image_documents
-            # load_image_documents returns documents (no chunking needed)
-            image_documents = load_image_documents(repo_id, hf_token, image_data_dir)
-            log_message(f"Загружено {len(image_documents)} документов изображений")
-            all_documents.extend(image_documents)
         log_message(f"Всего документов после всей обработки: {len(all_documents)}")
@@ -197,6 +199,7 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                 'table_number': doc.metadata.get('table_number', ''),
                 'image_number': doc.metadata.get('image_number', ''),
                 'section': doc.metadata.get('section', ''),
             })
         log_message(f"Система успешно инициализирована")

         all_documents = []
         chunks_df = None
+        # CHANGED: Use load_all_documents instead of loading separately
         if use_json_instead_csv and json_files_dir:
             log_message("Используем JSON файлы вместо CSV")
+            from documents_prep import load_all_documents
+            # This will handle text, tables, and images all together with proper logging
+            all_documents = load_all_documents(
+                repo_id=repo_id,
+                hf_token=hf_token,
+                json_dir=json_files_dir,
+                table_dir=table_data_dir if table_data_dir else "",
+                image_dir=image_data_dir if image_data_dir else ""
+            )
         else:
+            # OLD PATH: Loading separately (fallback)
             if chunks_filename:
                 log_message("Загружаем данные из CSV")
+            if table_data_dir:
+                log_message("Добавляю табличные данные")
+                from documents_prep import load_table_documents
+                table_chunks = load_table_documents(repo_id, hf_token, table_data_dir)
+                log_message(f"Загружено {len(table_chunks)} табличных чанков")
+                all_documents.extend(table_chunks)
+            if image_data_dir:
+                log_message("Добавляю данные изображений")
+                from documents_prep import load_image_documents
+                image_documents = load_image_documents(repo_id, hf_token, image_data_dir)
+                log_message(f"Загружено {len(image_documents)} документов изображений")
+                all_documents.extend(image_documents)
         log_message(f"Всего документов после всей обработки: {len(all_documents)}")
                 'table_number': doc.metadata.get('table_number', ''),
                 'image_number': doc.metadata.get('image_number', ''),
                 'section': doc.metadata.get('section', ''),
+                'connection_type': doc.metadata.get('connection_type', '')  # ADD THIS
             })
         log_message(f"Система успешно инициализирована")

documents_prep.py CHANGED Viewed

@@ -126,7 +126,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
                 'row_end': current_rows[-1]['_idx'],
                 'total_rows': len(rows),
                 'chunk_size': len(content),
-                'is_complete_table': False
             }
             chunks.append(Document(text=content, metadata=metadata))
@@ -491,6 +492,7 @@ def load_image_documents(repo_id, hf_token, image_dir):
         log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")
     return documents
 def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
     """Main loader - combines all document types"""
     log_message("="*60)

                 'row_end': current_rows[-1]['_idx'],
                 'total_rows': len(rows),
                 'chunk_size': len(content),
+                'is_complete_table': False,
+                'connection_type': extract_connection_type(table_title) if table_title else ''  # NEW
             }
             chunks.append(Document(text=content, metadata=metadata))
         log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")
     return documents
 def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
     """Main loader - combines all document types"""
     log_message("="*60)