MrSimple07 committed on
Commit
d013631
·
1 Parent(s): d1e7fd2

new documents prep

Browse files
Files changed (1) hide show
  1. app.py +34 -21
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
  import os
3
  from llama_index.core import Settings
4
- from documents_prep import load_json_documents, load_table_data, load_image_data, load_csv_chunks
5
  from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
6
  from my_logging import log_message
7
  from index_retriever import create_vector_index, create_query_engine
@@ -96,14 +96,12 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
96
  json_files_dir=None, table_data_dir=None, image_data_dir=None,
97
  use_json_instead_csv=False):
98
  try:
99
- from documents_prep import process_documents_with_chunking
100
  log_message("Инициализация системы")
101
  os.makedirs(download_dir, exist_ok=True)
102
  from config import CHUNK_SIZE, CHUNK_OVERLAP
103
  from llama_index.core.text_splitter import TokenTextSplitter
104
 
105
  embed_model = get_embedding_model()
106
-
107
  llm = get_llm_model(DEFAULT_MODEL)
108
  reranker = get_reranker_model()
109
 
@@ -121,49 +119,64 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
121
 
122
  all_documents = []
123
  chunks_df = None
124
- chunk_info = []
125
 
126
  if use_json_instead_csv and json_files_dir:
127
  log_message("Используем JSON файлы вместо CSV")
128
- json_documents, json_chunk_info = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
129
- all_documents.extend(json_documents)
130
- chunk_info.extend(json_chunk_info)
 
 
 
 
131
  else:
132
  if chunks_filename:
133
  log_message("Загружаем данные из CSV")
134
- csv_documents, chunks_df = load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir)
135
- all_documents.extend(csv_documents)
136
 
137
  if table_data_dir:
138
  log_message("Добавляю табличные данные")
139
- table_documents = load_table_data(repo_id, hf_token, table_data_dir)
140
- log_message(f"Загружено {len(table_documents)} табличных документов")
141
 
142
- # Process table documents through chunking
143
- chunked_table_docs, table_chunk_info = process_documents_with_chunking(table_documents)
144
- all_documents.extend(chunked_table_docs)
145
- chunk_info.extend(table_chunk_info)
146
 
147
  if image_data_dir:
148
  log_message("Добавляю данные изображений")
149
- image_documents = load_image_data(repo_id, hf_token, image_data_dir)
150
- log_message(f"Загружено {len(image_documents)} документов изображений")
151
 
152
- # Process image documents through chunking
153
- chunked_image_docs, image_chunk_info = process_documents_with_chunking(image_documents)
154
- all_documents.extend(chunked_image_docs)
155
- chunk_info.extend(image_chunk_info)
156
 
157
  log_message(f"Всего документов после всей обработки: {len(all_documents)}")
158
 
159
  vector_index = create_vector_index(all_documents)
160
  query_engine = create_query_engine(vector_index)
161
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  log_message(f"Система успешно инициализирована")
163
  return query_engine, chunks_df, reranker, vector_index, chunk_info
164
 
165
  except Exception as e:
166
  log_message(f"Ошибка инициализации: {str(e)}")
 
 
167
  return None, None, None, None, []
168
 
169
  def switch_model(model_name, vector_index):
 
1
  import gradio as gr
2
  import os
3
  from llama_index.core import Settings
4
+ from documents_prep import load_json_documents, load_table_documents, load_image_documents
5
  from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
6
  from my_logging import log_message
7
  from index_retriever import create_vector_index, create_query_engine
 
96
  json_files_dir=None, table_data_dir=None, image_data_dir=None,
97
  use_json_instead_csv=False):
98
  try:
 
99
  log_message("Инициализация системы")
100
  os.makedirs(download_dir, exist_ok=True)
101
  from config import CHUNK_SIZE, CHUNK_OVERLAP
102
  from llama_index.core.text_splitter import TokenTextSplitter
103
 
104
  embed_model = get_embedding_model()
 
105
  llm = get_llm_model(DEFAULT_MODEL)
106
  reranker = get_reranker_model()
107
 
 
119
 
120
  all_documents = []
121
  chunks_df = None
 
122
 
123
  if use_json_instead_csv and json_files_dir:
124
  log_message("Используем JSON файлы вместо CSV")
125
+ from documents_prep import load_json_documents, chunk_text_documents
126
+
127
+ # Load JSON docs (returns list of Documents)
128
+ json_documents = load_json_documents(repo_id, hf_token, json_files_dir)
129
+ # Chunk them
130
+ json_chunks = chunk_text_documents(json_documents)
131
+ all_documents.extend(json_chunks)
132
  else:
133
  if chunks_filename:
134
  log_message("Загружаем данные из CSV")
135
+
 
136
 
137
  if table_data_dir:
138
  log_message("Добавляю табличные данные")
139
+ from documents_prep import load_table_documents
 
140
 
141
+ # load_table_documents already returns chunked documents
142
+ table_chunks = load_table_documents(repo_id, hf_token, table_data_dir)
143
+ log_message(f"Загружено {len(table_chunks)} табличных чанков")
144
+ all_documents.extend(table_chunks)
145
 
146
  if image_data_dir:
147
  log_message("Добавляю данные изображений")
148
+ from documents_prep import load_image_documents
 
149
 
150
+ # load_image_documents returns documents (no chunking needed)
151
+ image_documents = load_image_documents(repo_id, hf_token, image_data_dir)
152
+ log_message(f"Загружено {len(image_documents)} документов изображений")
153
+ all_documents.extend(image_documents)
154
 
155
  log_message(f"Всего документов после всей обработки: {len(all_documents)}")
156
 
157
  vector_index = create_vector_index(all_documents)
158
  query_engine = create_query_engine(vector_index)
159
 
160
+ # Create chunk_info for display (extract from documents metadata)
161
+ chunk_info = []
162
+ for doc in all_documents:
163
+ chunk_info.append({
164
+ 'document_id': doc.metadata.get('document_id', 'unknown'),
165
+ 'section_id': doc.metadata.get('section_id', 'unknown'),
166
+ 'type': doc.metadata.get('type', 'text'),
167
+ 'chunk_text': doc.text[:200] + '...' if len(doc.text) > 200 else doc.text,
168
+ 'table_number': doc.metadata.get('table_number', ''),
169
+ 'image_number': doc.metadata.get('image_number', ''),
170
+ 'section': doc.metadata.get('section', ''),
171
+ })
172
+
173
  log_message(f"Система успешно инициализирована")
174
  return query_engine, chunks_df, reranker, vector_index, chunk_info
175
 
176
  except Exception as e:
177
  log_message(f"Ошибка инициализации: {str(e)}")
178
+ import traceback
179
+ log_message(traceback.format_exc())
180
  return None, None, None, None, []
181
 
182
  def switch_model(model_name, vector_index):