Commit da3779b
Parent(s): c825d58
Fixed HF_REPO_ID + Added force_download=True + Enhanced status messages

Files changed:
- app.py +12 -2
- config.py +1 -2
- converters/converter.py +31 -26
- documents_prep.py +2 -1
app.py
CHANGED
@@ -18,6 +18,17 @@ def restart_system():
 
     try:
         log_message("Начало перезапуска системы...")
+        log_message("Очистка кэша HuggingFace...")
+
+        # Clear HuggingFace cache to force fresh download
+        import shutil
+        cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
+        if os.path.exists(cache_dir):
+            try:
+                shutil.rmtree(cache_dir)
+                log_message("✓ Кэш очищен")
+            except:
+                log_message("⚠ Не удалось очистить кэш полностью")
 
         query_engine, chunks_df, reranker, vector_index, chunk_info = initialize_system(
             repo_id=HF_REPO_ID,
@@ -40,7 +51,6 @@ def restart_system():
         log_message(error_msg)
         return f"❌ {error_msg}"
 
-
 def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                       json_files_dir=None, table_data_dir=None, image_data_dir=None,
                       use_json_instead_csv=False):
@@ -451,7 +461,7 @@ Rerank Top K: {retrieval_params['rerank_top_k']}"""
         with gr.Row():
             with gr.Column(scale=2):
                 file_type_radio = gr.Radio(
-                    choices=["Таблица", "Изображение
+                    choices=["Таблица", "Изображение", "Текстовый JSON"],
                     value="Таблица",
                     label="Тип документа",
                     info="Выберите тип загружаемого документа"
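
Note on the cache reset in restart_system(): the commit deletes ~/.cache/huggingface/hub wholesale with shutil.rmtree, which forces fresh downloads but also drops every cached model and dataset on the Space. A minimal alternative sketch, assuming huggingface_hub's cache-scanning API (scan_cache_dir / delete_revisions) is available, that frees only the revisions cached for the data repo:

# Sketch only: targeted cache cleanup instead of removing the whole hub cache.
# Assumes a recent huggingface_hub and that HF_REPO_ID names the dataset repo
# whose cached revisions should be dropped before re-downloading.
from huggingface_hub import scan_cache_dir

def clear_repo_cache(repo_id: str) -> None:
    cache_info = scan_cache_dir()
    # Collect the commit hashes cached for this repo only
    revisions = [
        rev.commit_hash
        for repo in cache_info.repos
        if repo.repo_id == repo_id
        for rev in repo.revisions
    ]
    if revisions:
        strategy = cache_info.delete_revisions(*revisions)
        print(f"Freeing {strategy.expected_freed_size_str}")
        strategy.execute()
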
config.py
CHANGED
@@ -5,7 +5,6 @@ SIMILARITY_THRESHOLD = 0.7
 RAG_FILES_DIR = "rag_files"
 PROCESSED_DATA_FILE = "processed_chunks.csv"
 
-REPO_ID = "RAG-AIEXP/ragfiles"
 faiss_index_filename = "cleaned_faiss_index.index"
 CHUNKS_FILENAME = "processed_chunks.csv"
 TABLE_DATA_DIR = "Табличные данные_JSON"
@@ -15,7 +14,7 @@ JSON_FILES_DIR ="JSON"
 
 GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
 OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
-HF_REPO_ID = "
+HF_REPO_ID = "RAG-AIEXP/ragfiles"
 HF_TOKEN = os.getenv('HF_TOKEN')
 
 AVAILABLE_MODELS = {
converters/converter.py
CHANGED
@@ -27,7 +27,6 @@ def process_uploaded_file(file, file_type):
     else:
         file_path = source_path
 
-    # Get original file size
     original_size_bytes = os.path.getsize(file_path)
     original_size_mb = original_size_bytes / (1024 * 1024)
 
@@ -35,13 +34,32 @@ def process_uploaded_file(file, file_type):
     status_info.append(f"📁 Исходный файл: {filename}")
     status_info.append(f"📦 Размер файла: {original_size_mb:.2f} МБ ({original_size_bytes:,} байт)")
 
-    if file_type == "Таблица":
+    # ADDED: Text document handling
+    if file_type == "JSON документ":
+        target_dir = JSON_FILES_DIR
+        upload_file = file_path
+
+        try:
+            with open(upload_file, 'r', encoding='utf-8') as f:
+                json_data = json.load(f)
+
+            # Count sections
+            section_count = len(json_data.get('sections', []))
+            status_info.append(f"📝 Разделов в документе: {section_count}")
+            status_info.append(f"📄 ID документа: {json_data.get('document_metadata', {}).get('document_id', 'N/A')}")
+
+        except Exception as e:
+            return f"❌ Ошибка чтения JSON: {str(e)}"
+
+        status_info.append(f"📤 Загружен как: {filename}")
+        status_info.append(f"📂 Целевая директория: {target_dir}")
+
+    elif file_type == "Таблица":
         target_dir = TABLE_DATA_DIR
         if filename.endswith(('.xlsx', '.xls')):
             json_path = convert_single_excel_to_json(file_path, temp_dir)
             upload_file = json_path
 
-            # Get processed file size
             processed_size_bytes = os.path.getsize(json_path)
             processed_size_mb = processed_size_bytes / (1024 * 1024)
 
@@ -55,9 +73,11 @@ def process_uploaded_file(file, file_type):
             status_info.append(f"📝 Всего строк данных: {total_rows:,}")
             status_info.append(f"💾 Размер после обработки: {processed_size_mb:.2f} МБ")
             status_info.append(f"📤 Загружен как: {os.path.basename(json_path)}")
+            status_info.append(f"📂 Целевая директория: {target_dir}")
         else:
             upload_file = file_path
             status_info.append(f"📤 Загружен как: {filename}")
+            status_info.append(f"📂 Целевая директория: {target_dir}")
 
     elif file_type == "Изображение (метаданные)":
         target_dir = IMAGE_DATA_DIR
@@ -65,7 +85,6 @@ def process_uploaded_file(file, file_type):
             csv_path = convert_single_excel_to_csv(file_path, temp_dir)
             upload_file = csv_path
 
-            # Get processed file size
             processed_size_bytes = os.path.getsize(csv_path)
             processed_size_mb = processed_size_bytes / (1024 * 1024)
 
@@ -74,6 +93,7 @@ def process_uploaded_file(file, file_type):
             status_info.append(f"📋 Колонок метаданных: {len(df.columns)}")
             status_info.append(f"💾 Размер после обработки: {processed_size_mb:.2f} МБ")
             status_info.append(f"📤 Загружен как: {os.path.basename(csv_path)}")
+            status_info.append(f"📂 Целевая директория: {target_dir}")
         else:
             upload_file = file_path
             try:
@@ -83,33 +103,17 @@ def process_uploaded_file(file, file_type):
             except:
                 pass
             status_info.append(f"📤 Загружен как: {filename}")
-
-    else: # JSON документ
-        target_dir = JSON_FILES_DIR
-        upload_file = file_path
-
-        try:
-            with open(upload_file, 'r', encoding='utf-8') as f:
-                json_data = json.load(f)
-
-            if isinstance(json_data, list):
-                status_info.append(f"📝 Документов в JSON: {len(json_data):,}")
-            elif isinstance(json_data, dict):
-                status_info.append(f"📝 JSON объект (словарь)")
-                # Count keys if it's structured data
-                if 'sheets' in json_data:
-                    status_info.append(f"📊 Таблиц в документе: {len(json_data.get('sheets', []))}")
-                status_info.append(f"🔑 Ключей верхнего уровня: {len(json_data.keys())}")
-        except:
-            pass
-        status_info.append(f"📤 Загружен как: {filename}")
+            status_info.append(f"📂 Целевая директория: {target_dir}")
 
     # Загружаем на HuggingFace
-
+    upload_path = f"{target_dir}/{os.path.basename(upload_file)}"
+    log_message(f"Загрузка на HuggingFace: {upload_path}")
+    status_info.append(f"⬆️ Загрузка в репозиторий...")
+
     api = HfApi()
     api.upload_file(
         path_or_fileobj=upload_file,
-        path_in_repo=
+        path_in_repo=upload_path,
         repo_id=HF_REPO_ID,
         token=HF_TOKEN,
         repo_type="dataset"
@@ -119,6 +123,7 @@ def process_uploaded_file(file, file_type):
 
     result_message = f"✅ Файл успешно загружен и обработан\n\n"
     result_message += "\n".join(status_info)
+    result_message += f"\n\n✅ Файл добавлен в: {upload_path}"
    result_message += "\n\n⚠️ Нажмите кнопку 'Перезапустить систему' для применения изменений"
 
     return result_message
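
The new "JSON документ" branch in converter.py inspects only two fields before uploading: document_metadata.document_id and the sections list. An illustrative example of the document shape it expects (keys taken from the diff; values are placeholders, not real data):

# Illustrative shape for the "Текстовый JSON" upload path; values are made up.
doc = {
    "document_metadata": {"document_id": "DOC-001"},  # hypothetical id
    "sections": [{}, {}],                             # section contents are not inspected here
}

# These mirror the expressions used in the diff:
section_count = len(doc.get("sections", []))                         # -> 2
doc_id = doc.get("document_metadata", {}).get("document_id", "N/A")  # -> "DOC-001"
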
documents_prep.py
CHANGED
@@ -382,7 +382,8 @@ def load_json_documents(repo_id, hf_token, json_dir):
             repo_id=repo_id,
             filename=zip_path,
             repo_type="dataset",
-            token=hf_token
+            token=hf_token,
+            force_download=True
         )
 
         with zipfile.ZipFile(local_zip, 'r') as zf:
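
For context, the documents_prep.py hunk adds force_download=True to what the surrounding argument names (repo_id, filename, repo_type, token) suggest is an hf_hub_download call; with that flag the zip archive is re-fetched from the Hub even when a copy already sits in the local cache. A sketch of the updated call as it would read after this commit (the enclosing function name is an assumption, not shown in the diff):

from huggingface_hub import hf_hub_download

# Assumed download call; argument names match the context lines in the diff.
local_zip = hf_hub_download(
    repo_id=repo_id,
    filename=zip_path,
    repo_type="dataset",
    token=hf_token,
    force_download=True,  # bypass the local cache and always pull a fresh copy
)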