Spaces:

MrSimple01
/

RAG_AIEXP_1

Sleeping

App Files Community

MrSimple07 committed on Oct 19, 2025

Commit

db48355

1 Parent(s): af31260

new info about uploaded files + new main_utils + ui buttons

Browse files

Files changed (4) hide show

app.py +117 -117
converters/converter.py +103 -14
documents_prep.py +23 -2
utils.py → main_utils.py +0 -0

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import gradio as gr
 import os
 from llama_index.core import Settings
 from documents_prep import load_json_documents, load_table_documents, load_image_documents
-from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
 from my_logging import log_message
 from index_retriever import create_vector_index, create_query_engine
 import sys
@@ -10,116 +10,37 @@ from config import (
     HF_REPO_ID, HF_TOKEN, DOWNLOAD_DIR, CHUNKS_FILENAME,
     JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
 )
-from converters.converter import convert_single_excel_to_json, convert_single_excel_to_csv
-def merge_table_chunks(chunk_info):
-    merged = {}
-    for chunk in chunk_info:
-        doc_type = chunk.get('type', 'text')
-        doc_id = chunk.get('document_id', 'unknown')
-        if doc_type == 'table' or doc_type == 'table_row':
-            table_num = chunk.get('table_number', '')
-            key = f"{doc_id}_{table_num}"
-            if key not in merged:
-                merged[key] = {
-                    'document_id': doc_id,
-                    'type': 'table',
-                    'table_number': table_num,
-                    'section_id': chunk.get('section_id', 'unknown'),
-                    'chunk_text': chunk.get('chunk_text', '')
-                }
-            else:
-                merged[key]['chunk_text'] += '\n' + chunk.get('chunk_text', '')
-        else:
-            unique_key = f"{doc_id}_{chunk.get('section_id', '')}_{chunk.get('chunk_id', 0)}"
-            merged[unique_key] = chunk
-    return list(merged.values())
-def create_chunks_display_html(chunk_info):
-    if not chunk_info:
-        return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"
-    merged_chunks = merge_table_chunks(chunk_info)
-    html = "<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>"
-    html += f"<h4 style='color: black;'>Найдено релевантных чанков: {len(merged_chunks)}</h4>"
-    for i, chunk in enumerate(merged_chunks):
-        bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
-        section_display = get_section_display(chunk)
-        formatted_content = get_formatted_content(chunk)
-        html += f"""
-        <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
-            <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
-            <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
-            <strong style='color: black;'>Содержание:</strong><br>
-            <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
-                {formatted_content}
-            </div>
-        </div>
-        """
-    html += "</div>"
-    return html
-def get_section_display(chunk):
-    section_path = chunk.get('section_path', '')
-    section_id = chunk.get('section_id', 'unknown')
-    doc_type = chunk.get('type', 'text')
-    if doc_type == 'table' and chunk.get('table_number'):
-        table_num = chunk.get('table_number')
-        if not str(table_num).startswith('№'):
-            table_num = f"№{table_num}"
-        return f"таблица {table_num}"
-    if doc_type == 'image' and chunk.get('image_number'):
-        image_num = chunk.get('image_number')
-        if not str(image_num).startswith('№'):
-            image_num = f"№{image_num}"
-        return f"рисунок {image_num}"
-    if section_path:
-        return section_path
-    elif section_id and section_id != 'unknown':
-        return section_id
-    return section_id
-def get_formatted_content(chunk):
-    document_id = chunk.get('document_id', 'unknown')
-    section_path = chunk.get('section_path', '')
-    section_id = chunk.get('section_id', 'unknown')
-    section_text = chunk.get('section_text', '')
-    parent_section = chunk.get('parent_section', '')
-    parent_title = chunk.get('parent_title', '')
-    level = chunk.get('level', '')
-    chunk_text = chunk.get('chunk_text', '')
-    doc_type = chunk.get('type', 'text')
-    # For text documents
-    if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section:
-        current_section = section_path if section_path else section_id
-        parent_info = f"{parent_section} ({parent_title})" if parent_title else parent_section
-        return f"В разделе {parent_info} в документе {document_id}, пункт {current_section}: {chunk_text}"
-    else:
-        current_section = section_path if section_path else section_id
-        clean_text = chunk_text
-        if section_text and chunk_text.startswith(section_text):
-            section_title = section_text
-        elif chunk_text.startswith(f"{current_section} "):
-            clean_text = chunk_text[len(f"{current_section} "):].strip()
-            section_title = section_text if section_text else f"{current_section} {clean_text.split('.')[0] if '.' in clean_text else clean_text[:50]}"
         else:
-            section_title = section_text if section_text else current_section
-        return f"В разделе {current_section} в документе {document_id}, пункт {section_title}: {clean_text}"
 def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
@@ -190,7 +111,7 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                 'table_number': doc.metadata.get('table_number', ''),
                 'image_number': doc.metadata.get('image_number', ''),
                 'section': doc.metadata.get('section', ''),
-                'connection_type': doc.metadata.get('connection_type', '')  # ADD THIS
             })
         log_message(f"Система успешно инициализирована")
@@ -225,15 +146,15 @@ def switch_model(model_name, vector_index):
         return None, f"❌ {error_msg}"
 retrieval_params = {
-    'vector_top_k': 50,
-    'bm25_top_k': 50,
-    'similarity_cutoff': 0.55,
-    'hybrid_top_k': 100,
     'rerank_top_k': 20
 }
-def create_query_engine(vector_index, vector_top_k=50, bm25_top_k=50,
-                       similarity_cutoff=0.55, hybrid_top_k=100):
     try:
         from config import CUSTOM_PROMPT
         from index_retriever import create_query_engine as create_index_query_engine
@@ -424,7 +345,7 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
                     vector_top_k = gr.Slider(
                         minimum=10,
                         maximum=200,
-                        value=50,
                         step=10,
                         label="Vector Top K",
                         info="Количество результатов из векторного поиска"
@@ -434,7 +355,7 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
                     bm25_top_k = gr.Slider(
                         minimum=10,
                         maximum=200,
-                        value=50,
                         step=10,
                         label="BM25 Top K",
                         info="Количество результатов из BM25 поиска"
@@ -445,7 +366,7 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
                     similarity_cutoff = gr.Slider(
                         minimum=0.0,
                         maximum=1.0,
-                        value=0.55,
                         step=0.05,
                         label="Similarity Cutoff",
                         info="Минимальный порог схожести для векторного поиска"
@@ -455,7 +376,7 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
                     hybrid_top_k = gr.Slider(
                         minimum=10,
                         maximum=300,
-                        value=100,
                         step=10,
                         label="Hybrid Top K",
                         info="Количество результатов из гибридного поиска"
@@ -497,7 +418,7 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
             gr.Markdown("### Текущие параметры:")
             current_params_display = gr.Textbox(
-                value="Vector: 50 | BM25: 50 | Cutoff: 0.55 | Hybrid: 100 | Rerank: 20",
                 label="",
                 interactive=False,
                 lines=2
@@ -520,6 +441,85 @@ Rerank Top K: {retrieval_params['rerank_top_k']}"""
                 outputs=[current_params_display]
             )
         switch_btn.click(
             fn=switch_model_func,
             inputs=[model_dropdown],

 import os
 from llama_index.core import Settings
 from documents_prep import load_json_documents, load_table_documents, load_image_documents
+from main_utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
 from my_logging import log_message
 from index_retriever import create_vector_index, create_query_engine
 import sys
     HF_REPO_ID, HF_TOKEN, DOWNLOAD_DIR, CHUNKS_FILENAME,
     JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
 )
+from converters.converter import process_uploaded_file, convert_single_excel_to_json, convert_single_excel_to_csv
+from main_utils import *
+def restart_system():
+    """Перезапуск системы для применения новых документов"""
+    global query_engine, chunks_df, reranker, vector_index, current_model
+    try:
+        log_message("Начало перезапуска системы...")
+        query_engine, chunks_df, reranker, vector_index, chunk_info = initialize_system(
+            repo_id=HF_REPO_ID,
+            hf_token=HF_TOKEN,
+            download_dir=DOWNLOAD_DIR,
+            json_files_dir=JSON_FILES_DIR,
+            table_data_dir=TABLE_DATA_DIR,
+            image_data_dir=IMAGE_DATA_DIR,
+            use_json_instead_csv=True,
+        )
+        if query_engine:
+            log_message("Система успешно перезапущена")
+            return "✅ Система успешно перезапущена! Новые документы загружены."
         else:
+            return "❌ Ошибка при перезапуске системы"
+    except Exception as e:
+        error_msg = f"Ошибка перезапуска: {str(e)}"
+        log_message(error_msg)
+        return f"❌ {error_msg}"
 def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
                 'table_number': doc.metadata.get('table_number', ''),
                 'image_number': doc.metadata.get('image_number', ''),
                 'section': doc.metadata.get('section', ''),
+                'connection_type': doc.metadata.get('connection_type', '')
             })
         log_message(f"Система успешно инициализирована")
         return None, f"❌ {error_msg}"
 retrieval_params = {
+    'vector_top_k': 70,
+    'bm25_top_k': 70,
+    'similarity_cutoff': 0.45,
+    'hybrid_top_k': 140,
     'rerank_top_k': 20
 }
+def create_query_engine(vector_index, vector_top_k=70, bm25_top_k=70,
+                       similarity_cutoff=0.45, hybrid_top_k=140):
     try:
         from config import CUSTOM_PROMPT
         from index_retriever import create_query_engine as create_index_query_engine
                     vector_top_k = gr.Slider(
                         minimum=10,
                         maximum=200,
+                        value=70,
                         step=10,
                         label="Vector Top K",
                         info="Количество результатов из векторного поиска"
                     bm25_top_k = gr.Slider(
                         minimum=10,
                         maximum=200,
+                        value=70,
                         step=10,
                         label="BM25 Top K",
                         info="Количество результатов из BM25 поиска"
                     similarity_cutoff = gr.Slider(
                         minimum=0.0,
                         maximum=1.0,
+                        value=0.45,
                         step=0.05,
                         label="Similarity Cutoff",
                         info="Минимальный порог схожести для векторного поиска"
                     hybrid_top_k = gr.Slider(
                         minimum=10,
                         maximum=300,
+                        value=140,
                         step=10,
                         label="Hybrid Top K",
                         info="Количество результатов из гибридного поиска"
             gr.Markdown("### Текущие параметры:")
             current_params_display = gr.Textbox(
+                value="Vector: 70 | BM25: 70 | Cutoff: 0.45 | Hybrid: 140 | Rerank: 20",
                 label="",
                 interactive=False,
                 lines=2
                 outputs=[current_params_display]
             )
+        with gr.Tab("📤 Загрузка документов"):
+            gr.Markdown("""
+            ### Загрузка новых документов в систему
+            Выберите тип документа и загрузите файл. Система автоматически обработает и добавит его в базу знаний.
+            """)
+            with gr.Row():
+                with gr.Column(scale=2):
+                    file_type_radio = gr.Radio(
+                        choices=["Таблица", "Изображение (метаданные)", "JSON документ"],
+                        value="Таблица",
+                        label="Тип документа",
+                        info="Выберите тип загружаемого документа"
+                    )
+                    file_upload = gr.File(
+                        label="Выберите файл",
+                        file_types=[".xlsx", ".xls", ".csv", ".json"],
+                        type="filepath"
+                    )
+                    with gr.Row():
+                        upload_btn = gr.Button("📤 Загрузить и обработать", variant="primary", size="lg")
+                        restart_btn = gr.Button("🔄 Перезапустить систему", variant="secondary", size="lg")
+                    upload_status = gr.Textbox(
+                        label="Статус загрузки",
+                        value="Ожидание загрузки файла...",
+                        interactive=False,
+                        lines=8
+                    )
+                    restart_status = gr.Textbox(
+                        label="Статус перезапуска",
+                        value="Система готова к работе",
+                        interactive=False,
+                        lines=2
+                    )
+                with gr.Column(scale=1):
+                    gr.Markdown("""
+                    ### Требования к файлам:
+                    **Таблицы (Excel → JSON):**
+                    - Формат: .xlsx или .xls
+                    - Обязательные колонки:
+                    - Номер таблицы
+                    - Обозначение документа
+                    - Раздел документа
+                    - Название таблицы
+                    **Изображения (Excel → CSV):**
+                    - Формат: .xlsx, .xls или .csv
+                    - Метаданные изображений
+                    **JSON документы:**
+                    - Формат: .json
+                    - Структурированные данные
+                    ### Процесс загрузки:
+                    1. Выберите тип документа
+                    2. Загрузите файл
+                    3. Дождитесь обработки
+                    4. Нажмите "Перезапустить систему"
+                    """)
+        upload_btn.click(
+            fn=process_uploaded_file,
+            inputs=[file_upload, file_type_radio],
+            outputs=[upload_status]
+        )
+        restart_btn.click(
+            fn=restart_system,
+            inputs=[],
+            outputs=[restart_status]
+        )
         switch_btn.click(
             fn=switch_model_func,
             inputs=[model_dropdown],

converters/converter.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from config import *
-from utils import log_message
 import json
 import pandas as pd
 import os
@@ -13,35 +13,99 @@ def process_uploaded_file(file, file_type):
         from huggingface_hub import HfApi
         import tempfile
         import shutil
-        # Создаем временную директорию
         with tempfile.TemporaryDirectory() as temp_dir:
-            # Сохраняем загруженный файл
-            file_path = os.path.join(temp_dir, file.name)
-            shutil.copy(file.name, file_path)
-            # Определяем целевую директорию на HuggingFace
             if file_type == "Таблица":
                 target_dir = TABLE_DATA_DIR
-                # Конвертируем Excel в JSON
-                if file.name.endswith(('.xlsx', '.xls')):
                     json_path = convert_single_excel_to_json(file_path, temp_dir)
                     upload_file = json_path
                 else:
                     upload_file = file_path
             elif file_type == "Изображение (метаданные)":
                 target_dir = IMAGE_DATA_DIR
-                # Конвертируем Excel в CSV
-                if file.name.endswith(('.xlsx', '.xls')):
                     csv_path = convert_single_excel_to_csv(file_path, temp_dir)
                     upload_file = csv_path
                 else:
                     upload_file = file_path
             else:  # JSON документ
                 target_dir = JSON_FILES_DIR
                 upload_file = file_path
             # Загружаем на HuggingFace
             api = HfApi()
             api.upload_file(
                 path_or_fileobj=upload_file,
@@ -51,8 +115,13 @@ def process_uploaded_file(file, file_type):
                 repo_type="dataset"
             )
-            log_message(f"Файл {file.name} успешно загружен в {target_dir}")
-            return f"✅ Файл успешно загружен и обработан: {os.path.basename(upload_file)}\n⚠️ Перезапустите систему для применения изменений"
     except Exception as e:
         error_msg = f"Ошибка обработки файла: {str(e)}"
@@ -69,12 +138,18 @@ def convert_single_excel_to_json(excel_path, output_dir):
         "sheets": []
     }
     for sheet_name, df in df_dict.items():
         if df.empty or "Номер таблицы" not in df.columns:
             continue
         df = df.dropna(how='all').fillna("")
         grouped = df.groupby("Номер таблицы")
         for table_number, group in grouped:
             group = group.reset_index(drop=True)
@@ -98,6 +173,10 @@ def convert_single_excel_to_json(excel_path, output_dir):
                 sheet_data["data"].append(row_dict)
             result["sheets"].append(sheet_data)
     json_filename = os.path.basename(excel_path).replace('.xlsx', '.json').replace('.xls', '.json')
     json_path = os.path.join(output_dir, json_filename)
@@ -105,12 +184,22 @@ def convert_single_excel_to_json(excel_path, output_dir):
     with open(json_path, 'w', encoding='utf-8') as f:
         json.dump(result, f, ensure_ascii=False, indent=2)
     return json_path
 def convert_single_excel_to_csv(excel_path, output_dir):
     """Конвертация одного Excel файла в CSV для изображений"""
     df = pd.read_excel(excel_path)
     csv_filename = os.path.basename(excel_path).replace('.xlsx', '.csv').replace('.xls', '.csv')
     csv_path = os.path.join(output_dir, csv_filename)
     df.to_csv(csv_path, index=False, encoding='utf-8')
-    return csv_path

 from config import *
+from my_logging import log_message
 import json
 import pandas as pd
 import os
         from huggingface_hub import HfApi
         import tempfile
         import shutil
         with tempfile.TemporaryDirectory() as temp_dir:
+            source_path = file if isinstance(file, str) else file.name
+            filename = os.path.basename(source_path)
+            file_path = os.path.join(temp_dir, filename)
+            log_message(f"Начало обработки файла: {filename}")
+            log_message(f"Тип документа: {file_type}")
+            if os.path.abspath(source_path) != os.path.abspath(file_path):
+                shutil.copy(source_path, file_path)
+            else:
+                file_path = source_path
+            # Get original file size
+            original_size_bytes = os.path.getsize(file_path)
+            original_size_mb = original_size_bytes / (1024 * 1024)
+            status_info = []
+            status_info.append(f"📁 Исходный файл: {filename}")
+            status_info.append(f"📦 Размер файла: {original_size_mb:.2f} МБ ({original_size_bytes:,} байт)")
             if file_type == "Таблица":
                 target_dir = TABLE_DATA_DIR
+                if filename.endswith(('.xlsx', '.xls')):
                     json_path = convert_single_excel_to_json(file_path, temp_dir)
                     upload_file = json_path
+                    # Get processed file size
+                    processed_size_bytes = os.path.getsize(json_path)
+                    processed_size_mb = processed_size_bytes / (1024 * 1024)
+                    with open(json_path, 'r', encoding='utf-8') as f:
+                        data = json.load(f)
+                    total_rows = sum(len(sheet['data']) for sheet in data['sheets'])
+                    status_info.append(f"📊 Всего таблиц: {len(data['sheets'])}")
+                    status_info.append(f"📄 Листов в документе: {data['total_sheets']}")
+                    status_info.append(f"📝 Всего строк данных: {total_rows:,}")
+                    status_info.append(f"💾 Размер после обработки: {processed_size_mb:.2f} МБ")
+                    status_info.append(f"📤 Загружен как: {os.path.basename(json_path)}")
                 else:
                     upload_file = file_path
+                    status_info.append(f"📤 Загружен как: {filename}")
             elif file_type == "Изображение (метаданные)":
                 target_dir = IMAGE_DATA_DIR
+                if filename.endswith(('.xlsx', '.xls')):
                     csv_path = convert_single_excel_to_csv(file_path, temp_dir)
                     upload_file = csv_path
+                    # Get processed file size
+                    processed_size_bytes = os.path.getsize(csv_path)
+                    processed_size_mb = processed_size_bytes / (1024 * 1024)
+                    df = pd.read_csv(csv_path)
+                    status_info.append(f"🖼️ Записей изображений: {len(df):,}")
+                    status_info.append(f"📋 Колонок метаданных: {len(df.columns)}")
+                    status_info.append(f"💾 Размер после обработки: {processed_size_mb:.2f} МБ")
+                    status_info.append(f"📤 Загружен как: {os.path.basename(csv_path)}")
                 else:
                     upload_file = file_path
+                    try:
+                        df = pd.read_csv(upload_file)
+                        status_info.append(f"🖼️ Записей изображений: {len(df):,}")
+                        status_info.append(f"📋 Колонок метаданных: {len(df.columns)}")
+                    except:
+                        pass
+                    status_info.append(f"📤 Загружен как: {filename}")
             else:  # JSON документ
                 target_dir = JSON_FILES_DIR
                 upload_file = file_path
+                try:
+                    with open(upload_file, 'r', encoding='utf-8') as f:
+                        json_data = json.load(f)
+                    if isinstance(json_data, list):
+                        status_info.append(f"📝 Документов в JSON: {len(json_data):,}")
+                    elif isinstance(json_data, dict):
+                        status_info.append(f"📝 JSON объект (словарь)")
+                        # Count keys if it's structured data
+                        if 'sheets' in json_data:
+                            status_info.append(f"📊 Таблиц в документе: {len(json_data.get('sheets', []))}")
+                        status_info.append(f"🔑 Ключей верхнего уровня: {len(json_data.keys())}")
+                except:
+                    pass
+                status_info.append(f"📤 Загружен как: {filename}")
             # Загружаем на HuggingFace
+            log_message(f"Загрузка на HuggingFace: {target_dir}/{os.path.basename(upload_file)}")
             api = HfApi()
             api.upload_file(
                 path_or_fileobj=upload_file,
                 repo_type="dataset"
             )
+            log_message(f"Файл {filename} успешно загружен в {target_dir}")
+            result_message = f"✅ Файл успешно загружен и обработан\n\n"
+            result_message += "\n".join(status_info)
+            result_message += "\n\n⚠️ Нажмите кнопку 'Перезапустить систему' для применения изменений"
+            return result_message
     except Exception as e:
         error_msg = f"Ошибка обработки файла: {str(e)}"
         "sheets": []
     }
+    log_message(f"Обработка файла: {os.path.basename(excel_path)}")
+    log_message(f"Найдено листов: {len(df_dict)}")
+    total_tables = 0
     for sheet_name, df in df_dict.items():
         if df.empty or "Номер таблицы" not in df.columns:
+            log_message(f"  Лист '{sheet_name}': пропущен (пустой или отсутствует колонка 'Номер таблицы')")
             continue
         df = df.dropna(how='all').fillna("")
         grouped = df.groupby("Номер таблицы")
+        sheet_tables = 0
         for table_number, group in grouped:
             group = group.reset_index(drop=True)
                 sheet_data["data"].append(row_dict)
             result["sheets"].append(sheet_data)
+            sheet_tables += 1
+        total_tables += sheet_tables
+        log_message(f"  Лист '{sheet_name}': обработано таблиц: {sheet_tables}")
     json_filename = os.path.basename(excel_path).replace('.xlsx', '.json').replace('.xls', '.json')
     json_path = os.path.join(output_dir, json_filename)
     with open(json_path, 'w', encoding='utf-8') as f:
         json.dump(result, f, ensure_ascii=False, indent=2)
+    log_message(f"Конвертация завершена. Всего таблиц обработано: {total_tables}")
+    log_message(f"Результат сохранен: {json_filename}")
     return json_path
 def convert_single_excel_to_csv(excel_path, output_dir):
     """Конвертация одного Excel файла в CSV для изображений"""
+    log_message(f"Конвертация Excel в CSV: {os.path.basename(excel_path)}")
     df = pd.read_excel(excel_path)
     csv_filename = os.path.basename(excel_path).replace('.xlsx', '.csv').replace('.xls', '.csv')
     csv_path = os.path.join(output_dir, csv_filename)
     df.to_csv(csv_path, index=False, encoding='utf-8')
+    log_message(f"  Строк обработано: {len(df)}")
+    log_message(f"  Колонок: {len(df.columns)}")
+    log_message(f"  Результат сохранен: {csv_filename}")
+    return csv_path

documents_prep.py CHANGED Viewed

@@ -515,7 +515,7 @@ def load_table_documents(repo_id, hf_token, table_dir):
     log_message("Loading tables...")
     log_message("="*60)
     files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
-    table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
     all_chunks = []
     tables_processed = 0
@@ -529,6 +529,16 @@ def load_table_documents(repo_id, hf_token, table_dir):
                 token=hf_token
             )
             with open(local_path, 'r', encoding='utf-8') as f:
                 data = json.load(f)
@@ -551,11 +561,12 @@ def load_table_documents(repo_id, hf_token, table_dir):
     return all_chunks
 def load_image_documents(repo_id, hf_token, image_dir):
     log_message("Loading images...")
     files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
-    csv_files = [f for f in files if f.startswith(image_dir) and f.endswith('.csv')]
     documents = []
     for file_path in csv_files:
@@ -567,6 +578,16 @@ def load_image_documents(repo_id, hf_token, image_dir):
                 token=hf_token
             )
             df = pd.read_csv(local_path)
             for _, row in df.iterrows():

     log_message("Loading tables...")
     log_message("="*60)
     files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+    table_files = [f for f in files if f.startswith(table_dir) and (f.endswith('.json') or f.endswith('.xlsx') or f.endswith('.xls'))]
     all_chunks = []
     tables_processed = 0
                 token=hf_token
             )
+            # Convert Excel to JSON if needed
+            if file_path.endswith(('.xlsx', '.xls')):
+                from converters.converter import convert_single_excel_to_json
+                import tempfile
+                import os
+                with tempfile.TemporaryDirectory() as temp_dir:
+                    json_path = convert_single_excel_to_json(local_path, temp_dir)
+                    local_path = json_path
             with open(local_path, 'r', encoding='utf-8') as f:
                 data = json.load(f)
     return all_chunks
 def load_image_documents(repo_id, hf_token, image_dir):
     log_message("Loading images...")
     files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+    csv_files = [f for f in files if f.startswith(image_dir) and (f.endswith('.csv') or f.endswith('.xlsx') or f.endswith('.xls'))]
     documents = []
     for file_path in csv_files:
                 token=hf_token
             )
+            # Convert Excel to CSV if needed
+            if file_path.endswith(('.xlsx', '.xls')):
+                from converters.converter import convert_single_excel_to_csv
+                import tempfile
+                import os
+                with tempfile.TemporaryDirectory() as temp_dir:
+                    csv_path = convert_single_excel_to_csv(local_path, temp_dir)
+                    local_path = csv_path
             df = pd.read_csv(local_path)
             for _, row in df.iterrows():

utils.py → main_utils.py RENAMED Viewed

File without changes