MrSimple07 committed on
Commit
4775037
·
1 Parent(s): a33029f

new documents_prep

Browse files
Files changed (2) hide show
  1. app.py +1 -6
  2. documents_prep.py +3 -3
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
  import os
3
  from llama_index.core import Settings
4
- from documents_prep import load_json_documents, load_table_data, load_image_data, load_csv_chunks
5
  from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
6
  from my_logging import log_message
7
  from index_retriever import create_vector_index, create_query_engine
@@ -127,11 +127,6 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
127
  json_documents, json_chunk_info = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
128
  all_documents.extend(json_documents)
129
  chunk_info.extend(json_chunk_info)
130
- else:
131
- if chunks_filename:
132
- log_message("Загружаем данные из CSV")
133
- csv_documents, chunks_df = load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir)
134
- all_documents.extend(csv_documents)
135
 
136
  if table_data_dir:
137
  log_message("Добавляю табличные данные")
 
1
  import gradio as gr
2
  import os
3
  from llama_index.core import Settings
4
+ from documents_prep import load_json_documents, load_table_data, load_image_data
5
  from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
6
  from my_logging import log_message
7
  from index_retriever import create_vector_index, create_query_engine
 
127
  json_documents, json_chunk_info = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
128
  all_documents.extend(json_documents)
129
  chunk_info.extend(json_chunk_info)
 
 
 
 
 
130
 
131
  if table_data_dir:
132
  log_message("Добавляю табличные данные")
documents_prep.py CHANGED
@@ -123,7 +123,7 @@ def chunk_table_document(doc):
123
  # TABLE DATA LOADING
124
  # ============================================================================
125
 
126
- def create_table_text(table_data):
127
  """Format table data as readable text"""
128
  doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
129
  table_num = table_data.get('table_number', 'Неизвестно')
@@ -190,7 +190,7 @@ def load_tables_from_json(repo_id, hf_token, table_data_dir):
190
  continue
191
 
192
  # Create table text
193
- table_text = create_table_text(sheet)
194
  table_size = len(table_text)
195
  table_num = sheet.get('table_number', 'unknown')
196
 
@@ -434,7 +434,7 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
434
  # IMAGE DATA LOADING
435
  # ============================================================================
436
 
437
- def load_image_documents(repo_id, hf_token, image_data_dir):
438
  """Load image metadata from CSV files"""
439
  log_message("=" * 60)
440
  log_message("LOADING IMAGE METADATA")
 
123
  # TABLE DATA LOADING
124
  # ============================================================================
125
 
126
+ def load_table_data(table_data):
127
  """Format table data as readable text"""
128
  doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
129
  table_num = table_data.get('table_number', 'Неизвестно')
 
190
  continue
191
 
192
  # Create table text
193
+ table_text = load_table_data(sheet)
194
  table_size = len(table_text)
195
  table_num = sheet.get('table_number', 'unknown')
196
 
 
434
  # IMAGE DATA LOADING
435
  # ============================================================================
436
 
437
+ def load_image_data(repo_id, hf_token, image_data_dir):
438
  """Load image metadata from CSV files"""
439
  log_message("=" * 60)
440
  log_message("LOADING IMAGE METADATA")