Spaces:
Sleeping
Sleeping
Commit
·
4775037
1
Parent(s):
a33029f
new documents_prep
Browse files
- app.py +1 -6
- documents_prep.py +3 -3
app.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
| 3 |
from llama_index.core import Settings
|
| 4 |
-
from documents_prep import load_json_documents, load_table_data, load_image_data
|
| 5 |
from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
|
| 6 |
from my_logging import log_message
|
| 7 |
from index_retriever import create_vector_index, create_query_engine
|
|
@@ -127,11 +127,6 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
|
|
| 127 |
json_documents, json_chunk_info = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
|
| 128 |
all_documents.extend(json_documents)
|
| 129 |
chunk_info.extend(json_chunk_info)
|
| 130 |
-
else:
|
| 131 |
-
if chunks_filename:
|
| 132 |
-
log_message("Загружаем данные из CSV")
|
| 133 |
-
csv_documents, chunks_df = load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir)
|
| 134 |
-
all_documents.extend(csv_documents)
|
| 135 |
|
| 136 |
if table_data_dir:
|
| 137 |
log_message("Добавляю табличные данные")
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
| 3 |
from llama_index.core import Settings
|
| 4 |
+
from documents_prep import load_json_documents, load_table_data, load_image_data
|
| 5 |
from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
|
| 6 |
from my_logging import log_message
|
| 7 |
from index_retriever import create_vector_index, create_query_engine
|
|
|
|
| 127 |
json_documents, json_chunk_info = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
|
| 128 |
all_documents.extend(json_documents)
|
| 129 |
chunk_info.extend(json_chunk_info)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
if table_data_dir:
|
| 132 |
log_message("Добавляю табличные данные")
|
documents_prep.py
CHANGED
|
@@ -123,7 +123,7 @@ def chunk_table_document(doc):
|
|
| 123 |
# TABLE DATA LOADING
|
| 124 |
# ============================================================================
|
| 125 |
|
| 126 |
-
def
|
| 127 |
"""Format table data as readable text"""
|
| 128 |
doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
|
| 129 |
table_num = table_data.get('table_number', 'Неизвестно')
|
|
@@ -190,7 +190,7 @@ def load_tables_from_json(repo_id, hf_token, table_data_dir):
|
|
| 190 |
continue
|
| 191 |
|
| 192 |
# Create table text
|
| 193 |
-
table_text =
|
| 194 |
table_size = len(table_text)
|
| 195 |
table_num = sheet.get('table_number', 'unknown')
|
| 196 |
|
|
@@ -434,7 +434,7 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
|
|
| 434 |
# IMAGE DATA LOADING
|
| 435 |
# ============================================================================
|
| 436 |
|
| 437 |
-
def
|
| 438 |
"""Load image metadata from CSV files"""
|
| 439 |
log_message("=" * 60)
|
| 440 |
log_message("LOADING IMAGE METADATA")
|
|
|
|
| 123 |
# TABLE DATA LOADING
|
| 124 |
# ============================================================================
|
| 125 |
|
| 126 |
+
def load_table_data(table_data):
|
| 127 |
"""Format table data as readable text"""
|
| 128 |
doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
|
| 129 |
table_num = table_data.get('table_number', 'Неизвестно')
|
|
|
|
| 190 |
continue
|
| 191 |
|
| 192 |
# Create table text
|
| 193 |
+
table_text = load_table_data(sheet)
|
| 194 |
table_size = len(table_text)
|
| 195 |
table_num = sheet.get('table_number', 'unknown')
|
| 196 |
|
|
|
|
| 434 |
# IMAGE DATA LOADING
|
| 435 |
# ============================================================================
|
| 436 |
|
| 437 |
+
def load_image_data(repo_id, hf_token, image_data_dir):
|
| 438 |
"""Load image metadata from CSV files"""
|
| 439 |
log_message("=" * 60)
|
| 440 |
log_message("LOADING IMAGE METADATA")
|