MrSimple07 committed on
Commit
35eb459
·
1 Parent(s): 5ebc241

top_k = 150, max chunk size = 4000, max rows = 15, similarity cutoff = 0.45

Browse files
Files changed (3) hide show
  1. documents_prep.py +10 -40
  2. index_retriever.py +4 -4
  3. table_prep.py +107 -107
documents_prep.py CHANGED
@@ -157,11 +157,18 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
157
 
158
 
159
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
160
- content = f"ТАБЛИЦА {table_identifier} из {doc_id}\n"
 
 
 
 
 
161
  if table_title:
162
  content += f"НАЗВАНИЕ: {table_title}\n"
 
163
  if section:
164
  content += f"РАЗДЕЛ: {section}\n"
 
165
  content += f"{'='*70}\n"
166
 
167
  if headers:
@@ -199,40 +206,6 @@ def format_table_footer(table_identifier, doc_id):
199
  """Format table footer"""
200
  return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
201
 
202
- def load_table_documents(repo_id, hf_token, table_dir):
203
- log_message("Loading tables...")
204
-
205
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
206
- table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
207
-
208
- all_chunks = []
209
- for file_path in table_files:
210
- try:
211
- local_path = hf_hub_download(
212
- repo_id=repo_id,
213
- filename=file_path,
214
- repo_type="dataset",
215
- token=hf_token
216
- )
217
-
218
- with open(local_path, 'r', encoding='utf-8') as f:
219
- data = json.load(f)
220
-
221
- file_doc_id = data.get('document_id', data.get('document', 'unknown'))
222
-
223
- for sheet in data.get('sheets', []):
224
- sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
225
-
226
- chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1000)
227
- all_chunks.extend(chunks)
228
-
229
- except Exception as e:
230
- log_message(f"Error loading {file_path}: {e}")
231
-
232
- log_message(f"✓ Loaded {len(all_chunks)} table chunks")
233
- return all_chunks
234
-
235
-
236
  def load_json_documents(repo_id, hf_token, json_dir):
237
  import zipfile
238
  import tempfile
@@ -414,7 +387,6 @@ def extract_sections_from_json(json_path):
414
 
415
 
416
  def load_table_documents(repo_id, hf_token, table_dir):
417
- """Load and chunk tables"""
418
  log_message("Loading tables...")
419
 
420
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
@@ -433,15 +405,13 @@ def load_table_documents(repo_id, hf_token, table_dir):
433
  with open(local_path, 'r', encoding='utf-8') as f:
434
  data = json.load(f)
435
 
436
- # Extract file-level document_id
437
  file_doc_id = data.get('document_id', data.get('document', 'unknown'))
438
 
439
  for sheet in data.get('sheets', []):
440
- # Use sheet-level document_id if available, otherwise use file-level
441
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
442
 
443
- # CRITICAL: Pass document_id to chunk function
444
- chunks = chunk_table_by_content(sheet, sheet_doc_id)
445
  all_chunks.extend(chunks)
446
 
447
  except Exception as e:
 
157
 
158
 
159
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
160
+ content = f"ТАБЛИЦА {table_identifier} из документа {doc_id}\n"
161
+
162
+ # Add table type/number prominently for matching
163
+ if table_num:
164
+ content += f"ТИП: {table_num}\n"
165
+
166
  if table_title:
167
  content += f"НАЗВАНИЕ: {table_title}\n"
168
+
169
  if section:
170
  content += f"РАЗДЕЛ: {section}\n"
171
+
172
  content += f"{'='*70}\n"
173
 
174
  if headers:
 
206
  """Format table footer"""
207
  return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
208
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  def load_json_documents(repo_id, hf_token, json_dir):
210
  import zipfile
211
  import tempfile
 
387
 
388
 
389
  def load_table_documents(repo_id, hf_token, table_dir):
 
390
  log_message("Loading tables...")
391
 
392
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
 
405
  with open(local_path, 'r', encoding='utf-8') as f:
406
  data = json.load(f)
407
 
 
408
  file_doc_id = data.get('document_id', data.get('document', 'unknown'))
409
 
410
  for sheet in data.get('sheets', []):
 
411
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
412
 
413
+ # Use the consistent MAX_CHARS_TABLE from config
414
+ chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
415
  all_chunks.extend(chunks)
416
 
417
  except Exception as e:
index_retriever.py CHANGED
@@ -46,18 +46,18 @@ def create_query_engine(vector_index):
46
 
47
  bm25_retriever = BM25Retriever.from_defaults(
48
  docstore=vector_index.docstore,
49
- similarity_top_k=120
50
  )
51
 
52
  vector_retriever = VectorIndexRetriever(
53
  index=vector_index,
54
- similarity_top_k=120,
55
- similarity_cutoff=0.35
56
  )
57
 
58
  hybrid_retriever = QueryFusionRetriever(
59
  [vector_retriever, bm25_retriever],
60
- similarity_top_k=120,
61
  num_queries=1
62
  )
63
 
 
46
 
47
  bm25_retriever = BM25Retriever.from_defaults(
48
  docstore=vector_index.docstore,
49
+ similarity_top_k=150
50
  )
51
 
52
  vector_retriever = VectorIndexRetriever(
53
  index=vector_index,
54
+ similarity_top_k=150,
55
+ similarity_cutoff=0.45
56
  )
57
 
58
  hybrid_retriever = QueryFusionRetriever(
59
  [vector_retriever, bm25_retriever],
60
+ similarity_top_k=150,
61
  num_queries=1
62
  )
63
 
table_prep.py CHANGED
@@ -95,135 +95,135 @@ def chunk_table_document(doc, max_chunk_size=MAX_CHARS_TABLE, max_rows_per_chunk
95
  return chunked_docs
96
 
97
 
98
- def table_to_document(table_data, document_id=None):
99
- if not isinstance(table_data, dict):
100
- return []
101
 
102
- doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
103
- table_num = table_data.get('table_number', 'Неизвестно')
104
- table_title = table_data.get('table_title', 'Неизвестно')
105
- section = table_data.get('section', 'Неизвестно')
106
- table_rows = table_data.get('data', [])
107
 
108
- if not table_rows:
109
- return []
110
 
111
- # Build table content
112
- content = f"Таблица: {table_num}\n"
113
- content += f"Название: {table_title}\n"
114
- content += f"Документ: {doc_id}\n"
115
- content += f"Раздел: {section}\n"
116
 
117
- headers = table_data.get('headers', [])
118
- if headers:
119
- content += f"\nЗаголовки: {' | '.join(headers)}\n"
120
 
121
- content += "\nДанные таблицы:\n"
122
- for row_idx, row in enumerate(table_rows, start=1):
123
- if isinstance(row, dict):
124
- row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
125
- content += f"Строка {row_idx}: {row_text}\n"
126
 
127
- # Create base document
128
- base_doc = Document(
129
- text=content,
130
- metadata={
131
- "type": "table",
132
- "table_number": table_num,
133
- "document_id": doc_id,
134
- "section": section
135
- }
136
- )
137
- if len(content) > 4000:
138
- chunks = chunk_table_document(base_doc)
139
- log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
140
- return chunk_table_document(base_doc)
141
- return [base_doc]
142
 
143
 
144
- def load_table_data(repo_id, hf_token, table_data_dir):
145
- try:
146
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
147
- table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
148
 
149
- log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
150
 
151
- table_documents = []
152
- stats = {
153
- 'total_tables': 0,
154
- 'total_size': 0,
155
- 'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
156
- }
157
 
158
- for file_path in table_files:
159
- try:
160
- local_path = hf_hub_download(
161
- repo_id=repo_id,
162
- filename=file_path,
163
- local_dir='',
164
- repo_type="dataset",
165
- token=hf_token
166
- )
167
 
168
- log_message(f"\nОбработка файла: {file_path}")
169
 
170
- with open(local_path, 'r', encoding='utf-8') as f:
171
- table_data = json.load(f)
172
 
173
- if isinstance(table_data, dict):
174
- document_id = table_data.get('document', 'unknown')
175
 
176
- if 'sheets' in table_data:
177
- sorted_sheets = sorted(
178
- table_data['sheets'],
179
- key=lambda sheet: sheet.get('table_number', '') # or use 'table_number'
180
- )
181
 
182
- for sheet in sorted_sheets:
183
- sheet['document'] = document_id
184
- docs_list = table_to_document(sheet, document_id)
185
- table_documents.extend(docs_list)
186
 
187
- for doc in docs_list:
188
- stats['total_tables'] += 1
189
- size = doc.metadata.get('content_size', 0)
190
- stats['total_size'] += size
191
- stats['by_document'][document_id]['count'] += 1
192
- stats['by_document'][document_id]['size'] += size
193
- log_message(f"Добавлена таблица {sheet.get('table_number', 'Неизвестно')} из документа {document_id}, размер {size} символов")
194
- else:
195
- docs_list = table_to_document(table_data, document_id)
196
- table_documents.extend(docs_list)
197
 
198
- for doc in docs_list:
199
- stats['total_tables'] += 1
200
- size = doc.metadata.get('content_size', 0)
201
- stats['total_size'] += size
202
- stats['by_document'][document_id]['count'] += 1
203
- stats['by_document'][document_id]['size'] += size
204
 
205
 
206
- except Exception as e:
207
- log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
208
- continue
209
 
210
- # Log summary statistics
211
- log_message("\n" + "=" * 60)
212
- log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
213
- log_message("=" * 60)
214
- log_message(f"Всего таблиц добавлено: {stats['total_tables']}")
215
- log_message(f"Общий размер: {stats['total_size']:,} символов")
216
- log_message(f"Средний размер таблицы: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} символов")
217
 
218
- log_message("\nПо документам:")
219
- for doc_id, doc_stats in sorted(stats['by_document'].items()):
220
- log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
221
- f"{doc_stats['size']:,} символов")
222
 
223
- log_message("=" * 60)
224
 
225
- return table_documents
226
 
227
- except Exception as e:
228
- log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
229
- return []
 
95
  return chunked_docs
96
 
97
 
98
+ # def table_to_document(table_data, document_id=None):
99
+ # if not isinstance(table_data, dict):
100
+ # return []
101
 
102
+ # doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
103
+ # table_num = table_data.get('table_number', 'Неизвестно')
104
+ # table_title = table_data.get('table_title', 'Неизвестно')
105
+ # section = table_data.get('section', 'Неизвестно')
106
+ # table_rows = table_data.get('data', [])
107
 
108
+ # if not table_rows:
109
+ # return []
110
 
111
+ # # Build table content
112
+ # content = f"Таблица: {table_num}\n"
113
+ # content += f"Название: {table_title}\n"
114
+ # content += f"Документ: {doc_id}\n"
115
+ # content += f"Раздел: {section}\n"
116
 
117
+ # headers = table_data.get('headers', [])
118
+ # if headers:
119
+ # content += f"\nЗаголовки: {' | '.join(headers)}\n"
120
 
121
+ # content += "\nДанные таблицы:\n"
122
+ # for row_idx, row in enumerate(table_rows, start=1):
123
+ # if isinstance(row, dict):
124
+ # row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
125
+ # content += f"Строка {row_idx}: {row_text}\n"
126
 
127
+ # # Create base document
128
+ # base_doc = Document(
129
+ # text=content,
130
+ # metadata={
131
+ # "type": "table",
132
+ # "table_number": table_num,
133
+ # "document_id": doc_id,
134
+ # "section": section
135
+ # }
136
+ # )
137
+ # if len(content) > 4000:
138
+ # chunks = chunk_table_document(base_doc)
139
+ # log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
140
+ # return chunk_table_document(base_doc)
141
+ # return [base_doc]
142
 
143
 
144
+ # def load_table_data(repo_id, hf_token, table_data_dir):
145
+ # try:
146
+ # files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
147
+ # table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
148
 
149
+ # log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
150
 
151
+ # table_documents = []
152
+ # stats = {
153
+ # 'total_tables': 0,
154
+ # 'total_size': 0,
155
+ # 'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
156
+ # }
157
 
158
+ # for file_path in table_files:
159
+ # try:
160
+ # local_path = hf_hub_download(
161
+ # repo_id=repo_id,
162
+ # filename=file_path,
163
+ # local_dir='',
164
+ # repo_type="dataset",
165
+ # token=hf_token
166
+ # )
167
 
168
+ # log_message(f"\nОбработка файла: {file_path}")
169
 
170
+ # with open(local_path, 'r', encoding='utf-8') as f:
171
+ # table_data = json.load(f)
172
 
173
+ # if isinstance(table_data, dict):
174
+ # document_id = table_data.get('document', 'unknown')
175
 
176
+ # if 'sheets' in table_data:
177
+ # sorted_sheets = sorted(
178
+ # table_data['sheets'],
179
+ # key=lambda sheet: sheet.get('table_number', '') # or use 'table_number'
180
+ # )
181
 
182
+ # for sheet in sorted_sheets:
183
+ # sheet['document'] = document_id
184
+ # docs_list = table_to_document(sheet, document_id)
185
+ # table_documents.extend(docs_list)
186
 
187
+ # for doc in docs_list:
188
+ # stats['total_tables'] += 1
189
+ # size = doc.metadata.get('content_size', 0)
190
+ # stats['total_size'] += size
191
+ # stats['by_document'][document_id]['count'] += 1
192
+ # stats['by_document'][document_id]['size'] += size
193
+ # log_message(f"Добавлена таблица {sheet.get('table_number', 'Неизвестно')} из документа {document_id}, размер {size} символов")
194
+ # else:
195
+ # docs_list = table_to_document(table_data, document_id)
196
+ # table_documents.extend(docs_list)
197
 
198
+ # for doc in docs_list:
199
+ # stats['total_tables'] += 1
200
+ # size = doc.metadata.get('content_size', 0)
201
+ # stats['total_size'] += size
202
+ # stats['by_document'][document_id]['count'] += 1
203
+ # stats['by_document'][document_id]['size'] += size
204
 
205
 
206
+ # except Exception as e:
207
+ # log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
208
+ # continue
209
 
210
+ # # Log summary statistics
211
+ # log_message("\n" + "=" * 60)
212
+ # log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
213
+ # log_message("=" * 60)
214
+ # log_message(f"Всего таблиц добавлено: {stats['total_tables']}")
215
+ # log_message(f"Общий размер: {stats['total_size']:,} символов")
216
+ # log_message(f"Средний размер таблицы: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} символов")
217
 
218
+ # log_message("\nПо документам:")
219
+ # for doc_id, doc_stats in sorted(stats['by_document'].items()):
220
+ # log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
221
+ # f"{doc_stats['size']:,} символов")
222
 
223
+ # log_message("=" * 60)
224
 
225
+ # return table_documents
226
 
227
+ # except Exception as e:
228
+ # log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
229
+ # return []