Spaces:
Sleeping
Sleeping
Commit
·
d3d0d1e
1
Parent(s):
d6eea71
fixing the json zip file reading
Browse files- app.py +0 -58
- documents_prep.py +137 -4
app.py
CHANGED
|
@@ -138,48 +138,6 @@ def create_demo_interface():
|
|
| 138 |
value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся источники...</div>",
|
| 139 |
)
|
| 140 |
|
| 141 |
-
with gr.Tab("📊 История чата"):
|
| 142 |
-
gr.Markdown("### История ваших вопросов и ответов")
|
| 143 |
-
|
| 144 |
-
with gr.Row():
|
| 145 |
-
refresh_history_btn = gr.Button("🔄 Обновить историю", variant="secondary")
|
| 146 |
-
clear_history_btn = gr.Button("🗑️ Очистить историю", variant="secondary")
|
| 147 |
-
|
| 148 |
-
history_output = gr.HTML(
|
| 149 |
-
value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>История пуста</div>",
|
| 150 |
-
)
|
| 151 |
-
|
| 152 |
-
history_status = gr.Textbox(
|
| 153 |
-
label="Статус",
|
| 154 |
-
interactive=False,
|
| 155 |
-
visible=False
|
| 156 |
-
)
|
| 157 |
-
|
| 158 |
-
with gr.Tab("ℹ️ Информация о системе"):
|
| 159 |
-
gr.Markdown("""
|
| 160 |
-
### О системе AIEXP
|
| 161 |
-
|
| 162 |
-
**AIEXP (Artificial Intelligence Expert)** - это интеллектуальная система для работы с нормативной документацией.
|
| 163 |
-
|
| 164 |
-
#### Возможности:
|
| 165 |
-
- 🔍 Поиск информации в нормативных документах
|
| 166 |
-
- 📊 Работа с таблицами и изображениями
|
| 167 |
-
- 🤖 Поддержка различных языковых моделей
|
| 168 |
-
- 📈 Гибридный поиск с переранжировкой
|
| 169 |
-
- 📝 История диалогов
|
| 170 |
-
|
| 171 |
-
#### Поддерживаемые типы данных:
|
| 172 |
-
- **Текстовые документы** - разделы и подразделы нормативных актов
|
| 173 |
-
- **Таблицы** - структурированные данные в табличном формате
|
| 174 |
-
- **Изображения** - описания и метаданные изображений
|
| 175 |
-
|
| 176 |
-
#### Технические характеристики:
|
| 177 |
-
- Векторный поиск на основе sentence-transformers
|
| 178 |
-
- BM25 поиск для точного совпадения терминов
|
| 179 |
-
- Cross-encoder переранжировка результатов
|
| 180 |
-
- Поддержка Google Gemini и OpenAI моделей
|
| 181 |
-
""")
|
| 182 |
-
|
| 183 |
switch_btn.click(
|
| 184 |
fn=handle_model_switch,
|
| 185 |
inputs=[model_dropdown],
|
|
@@ -197,22 +155,6 @@ def create_demo_interface():
|
|
| 197 |
inputs=[question_input],
|
| 198 |
outputs=[answer_output, sources_output]
|
| 199 |
)
|
| 200 |
-
|
| 201 |
-
refresh_history_btn.click(
|
| 202 |
-
fn=get_chat_history_html,
|
| 203 |
-
outputs=[history_output]
|
| 204 |
-
)
|
| 205 |
-
|
| 206 |
-
clear_history_btn.click(
|
| 207 |
-
fn=clear_chat_history,
|
| 208 |
-
outputs=[history_status]
|
| 209 |
-
).then(
|
| 210 |
-
fn=get_chat_history_html,
|
| 211 |
-
outputs=[history_output]
|
| 212 |
-
).then(
|
| 213 |
-
fn=lambda: gr.update(visible=False),
|
| 214 |
-
outputs=[history_status]
|
| 215 |
-
)
|
| 216 |
|
| 217 |
return demo
|
| 218 |
|
|
|
|
| 138 |
value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся источники...</div>",
|
| 139 |
)
|
| 140 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
switch_btn.click(
|
| 142 |
fn=handle_model_switch,
|
| 143 |
inputs=[model_dropdown],
|
|
|
|
| 155 |
inputs=[question_input],
|
| 156 |
outputs=[answer_output, sources_output]
|
| 157 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
return demo
|
| 160 |
|
documents_prep.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import json
|
| 2 |
import pandas as pd
|
| 3 |
import os
|
|
|
|
| 4 |
from huggingface_hub import hf_hub_download, list_repo_files
|
| 5 |
from llama_index.core import Document
|
| 6 |
import logging
|
|
@@ -103,20 +104,82 @@ class DocumentsPreparation:
|
|
| 103 |
|
| 104 |
return documents
|
| 105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
def load_json_documents(self):
|
| 107 |
log_message("Начинаю загрузку JSON документов")
|
| 108 |
|
| 109 |
try:
|
| 110 |
files = list_repo_files(repo_id=self.repo_id, repo_type="dataset", token=self.hf_token)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
json_files = [f for f in files if f.startswith(self.json_files_dir) and f.endswith('.json')]
|
| 112 |
|
| 113 |
-
log_message(f"Найдено {len(json_files)} JSON файлов")
|
| 114 |
|
| 115 |
all_documents = []
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
for file_path in json_files:
|
| 118 |
try:
|
| 119 |
-
log_message(f"Обрабатываю файл: {file_path}")
|
| 120 |
local_path = hf_hub_download(
|
| 121 |
repo_id=self.repo_id,
|
| 122 |
filename=file_path,
|
|
@@ -179,19 +242,89 @@ class DocumentsPreparation:
|
|
| 179 |
}
|
| 180 |
)
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
def load_table_documents(self):
|
| 183 |
log_message("Начинаю загрузку табличных данных")
|
| 184 |
|
| 185 |
try:
|
| 186 |
files = list_repo_files(repo_id=self.repo_id, repo_type="dataset", token=self.hf_token)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
table_files = [f for f in files if f.startswith(self.table_data_dir) and f.endswith('.json')]
|
| 188 |
|
| 189 |
-
log_message(f"Найдено {len(
|
| 190 |
|
| 191 |
table_documents = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
for file_path in table_files:
|
| 193 |
try:
|
| 194 |
-
log_message(f"Обрабатываю
|
| 195 |
local_path = hf_hub_download(
|
| 196 |
repo_id=self.repo_id,
|
| 197 |
filename=file_path,
|
|
|
|
| 1 |
import json
|
| 2 |
import pandas as pd
|
| 3 |
import os
|
| 4 |
+
import zipfile
|
| 5 |
from huggingface_hub import hf_hub_download, list_repo_files
|
| 6 |
from llama_index.core import Document
|
| 7 |
import logging
|
|
|
|
| 104 |
|
| 105 |
return documents
|
| 106 |
|
| 107 |
+
def extract_zip_and_process_json(self, zip_path):
    """Read every JSON document stored in a ZIP archive and convert it.

    Each ``.json`` entry of the archive is parsed in place (nothing is
    unpacked to disk) and handed to ``self.extract_text_from_json`` together
    with the ``document_id`` / ``document_name`` found under the entry's
    ``document_metadata`` key (both default to ``'unknown'``).

    Args:
        zip_path: Path of the ZIP archive on the local file system.

    Returns:
        A flat list with everything ``self.extract_text_from_json`` returned
        for the entries that were processed successfully. An entry that
        fails is logged and skipped; if the archive itself cannot be read,
        the error is logged and whatever was collected so far is returned.
    """
    documents = []

    def is_payload(entry_name):
        # ZIP entry names always use '/' as the separator.
        parts = entry_name.split('/')
        if not entry_name.endswith('.json'):
            return False
        # macOS archives carry resource-fork metadata either in a
        # ``__MACOSX`` folder (possibly nested under a root folder) or as
        # AppleDouble ``._name`` files; those are binary, not real JSON.
        if entry_name.startswith('__MACOSX') or '__MACOSX' in parts:
            return False
        return not parts[-1].startswith('._')

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            json_files = [name for name in zip_ref.namelist() if is_payload(name)]

            log_message(f"Найдено {len(json_files)} JSON файлов в архиве")

            for json_file in json_files:
                try:
                    log_message(f"Обрабатываю файл из архива: {json_file}")

                    # json.load accepts the binary stream and detects the
                    # UTF-8/16/32 encoding on its own.
                    with zip_ref.open(json_file) as f:
                        json_data = json.load(f)

                    document_metadata = json_data.get('document_metadata', {})
                    document_id = document_metadata.get('document_id', 'unknown')
                    document_name = document_metadata.get('document_name', 'unknown')

                    docs = self.extract_text_from_json(json_data, document_id, document_name)
                    documents.extend(docs)

                    log_message(f"Извлечено {len(docs)} документов из {json_file}")

                except Exception as e:
                    # Best effort: one broken entry must not abort the archive.
                    log_message(f"Ошибка обработки файла {json_file}: {str(e)}")
                    continue

    except Exception as e:
        log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")

    return documents
|
| 144 |
+
|
| 145 |
def load_json_documents(self):
|
| 146 |
log_message("Начинаю загрузку JSON документов")
|
| 147 |
|
| 148 |
try:
|
| 149 |
files = list_repo_files(repo_id=self.repo_id, repo_type="dataset", token=self.hf_token)
|
| 150 |
+
|
| 151 |
+
# Look for ZIP files in the JSON directory
|
| 152 |
+
zip_files = [f for f in files if f.startswith(self.json_files_dir) and f.endswith('.zip')]
|
| 153 |
+
# Also look for direct JSON files (fallback)
|
| 154 |
json_files = [f for f in files if f.startswith(self.json_files_dir) and f.endswith('.json')]
|
| 155 |
|
| 156 |
+
log_message(f"Найдено {len(zip_files)} ZIP файлов и {len(json_files)} прямых JSON файлов")
|
| 157 |
|
| 158 |
all_documents = []
|
| 159 |
|
| 160 |
+
# Process ZIP files first
|
| 161 |
+
for zip_file_path in zip_files:
|
| 162 |
+
try:
|
| 163 |
+
log_message(f"Загружаю ZIP архив: {zip_file_path}")
|
| 164 |
+
local_zip_path = hf_hub_download(
|
| 165 |
+
repo_id=self.repo_id,
|
| 166 |
+
filename=zip_file_path,
|
| 167 |
+
local_dir=self.download_dir,
|
| 168 |
+
repo_type="dataset",
|
| 169 |
+
token=self.hf_token
|
| 170 |
+
)
|
| 171 |
+
|
| 172 |
+
documents = self.extract_zip_and_process_json(local_zip_path)
|
| 173 |
+
all_documents.extend(documents)
|
| 174 |
+
|
| 175 |
+
except Exception as e:
|
| 176 |
+
log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
|
| 177 |
+
continue
|
| 178 |
+
|
| 179 |
+
# Process direct JSON files (if any)
|
| 180 |
for file_path in json_files:
|
| 181 |
try:
|
| 182 |
+
log_message(f"Обрабатываю прямой JSON файл: {file_path}")
|
| 183 |
local_path = hf_hub_download(
|
| 184 |
repo_id=self.repo_id,
|
| 185 |
filename=file_path,
|
|
|
|
| 242 |
}
|
| 243 |
)
|
| 244 |
|
| 245 |
+
def extract_zip_and_process_tables(self, zip_path):
    """Read every table JSON stored in a ZIP archive and convert it.

    Each ``.json`` entry of the archive is parsed in place (nothing is
    unpacked to disk). The parsed value is handled by shape:

    * dict with a ``'sheets'`` key - every sheet gets the parent's
      ``'document'`` value (default ``'unknown'``) written into its own
      ``'document'`` key and is passed to ``self.table_to_document``;
    * dict without ``'sheets'`` - the dict itself is passed to
      ``self.table_to_document`` with that document id;
    * list - every item is passed to ``self.table_to_document`` on its own;
    * anything else is ignored without a message.

    Args:
        zip_path: Path of the ZIP archive on the local file system.

    Returns:
        A list with one ``self.table_to_document`` result per converted
        table. An entry that fails is logged and skipped (tables already
        converted from it are kept); if the archive itself cannot be read,
        the error is logged and whatever was collected so far is returned.
    """
    documents = []

    def is_payload(entry_name):
        # ZIP entry names always use '/' as the separator.
        parts = entry_name.split('/')
        if not entry_name.endswith('.json'):
            return False
        # macOS archives carry resource-fork metadata either in a
        # ``__MACOSX`` folder (possibly nested under a root folder) or as
        # AppleDouble ``._name`` files; those are binary, not real JSON.
        if entry_name.startswith('__MACOSX') or '__MACOSX' in parts:
            return False
        return not parts[-1].startswith('._')

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            json_files = [name for name in zip_ref.namelist() if is_payload(name)]

            log_message(f"Найдено {len(json_files)} JSON файлов таблиц в архиве")

            for json_file in json_files:
                try:
                    log_message(f"Обрабатываю файл таблицы из архива: {json_file}")

                    # json.load accepts the binary stream and detects the
                    # UTF-8/16/32 encoding on its own.
                    with zip_ref.open(json_file) as f:
                        table_data = json.load(f)

                    if isinstance(table_data, dict):
                        document_id = table_data.get('document', 'unknown')

                        if 'sheets' in table_data:
                            for sheet in table_data['sheets']:
                                # Propagate the parent document id to the sheet.
                                sheet['document'] = document_id
                                doc = self.table_to_document(sheet, document_id)
                                documents.append(doc)
                        else:
                            doc = self.table_to_document(table_data, document_id)
                            documents.append(doc)
                    elif isinstance(table_data, list):
                        for table_json in table_data:
                            doc = self.table_to_document(table_json)
                            documents.append(doc)

                except Exception as e:
                    # Best effort: one broken entry must not abort the archive.
                    log_message(f"Ошибка обработки файла таблицы {json_file}: {str(e)}")
                    continue

    except Exception as e:
        log_message(f"Ошибка извлечения ZIP архива таблиц {zip_path}: {str(e)}")

    return documents
|
| 289 |
+
|
| 290 |
def load_table_documents(self):
|
| 291 |
log_message("Начинаю загрузку табличных данных")
|
| 292 |
|
| 293 |
try:
|
| 294 |
files = list_repo_files(repo_id=self.repo_id, repo_type="dataset", token=self.hf_token)
|
| 295 |
+
|
| 296 |
+
# Look for ZIP files in the table directory
|
| 297 |
+
zip_files = [f for f in files if f.startswith(self.table_data_dir) and f.endswith('.zip')]
|
| 298 |
+
# Also look for direct JSON files (fallback)
|
| 299 |
table_files = [f for f in files if f.startswith(self.table_data_dir) and f.endswith('.json')]
|
| 300 |
|
| 301 |
+
log_message(f"Найдено {len(zip_files)} ZIP файлов с таблицами и {len(table_files)} прямых JSON файлов")
|
| 302 |
|
| 303 |
table_documents = []
|
| 304 |
+
|
| 305 |
+
# Process ZIP files first
|
| 306 |
+
for zip_file_path in zip_files:
|
| 307 |
+
try:
|
| 308 |
+
log_message(f"Загружаю ZIP архив таблиц: {zip_file_path}")
|
| 309 |
+
local_zip_path = hf_hub_download(
|
| 310 |
+
repo_id=self.repo_id,
|
| 311 |
+
filename=zip_file_path,
|
| 312 |
+
local_dir=self.download_dir,
|
| 313 |
+
repo_type="dataset",
|
| 314 |
+
token=self.hf_token
|
| 315 |
+
)
|
| 316 |
+
|
| 317 |
+
documents = self.extract_zip_and_process_tables(local_zip_path)
|
| 318 |
+
table_documents.extend(documents)
|
| 319 |
+
|
| 320 |
+
except Exception as e:
|
| 321 |
+
log_message(f"Ошибка обработки ZIP файла таблиц {zip_file_path}: {str(e)}")
|
| 322 |
+
continue
|
| 323 |
+
|
| 324 |
+
# Process direct JSON files (if any)
|
| 325 |
for file_path in table_files:
|
| 326 |
try:
|
| 327 |
+
log_message(f"Обрабатываю прямой файл таблицы: {file_path}")
|
| 328 |
local_path = hf_hub_download(
|
| 329 |
repo_id=self.repo_id,
|
| 330 |
filename=file_path,
|