Spaces:
Sleeping
Sleeping
Commit
·
35eb459
1
Parent(s):
5ebc241
Set top_k = 150, max chunk size = 4000, max rows = 15, similarity cutoff = 0.45
Browse files
- documents_prep.py +10 -40
- index_retriever.py +4 -4
- table_prep.py +107 -107
documents_prep.py
CHANGED
|
@@ -157,11 +157,18 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 157 |
|
| 158 |
|
| 159 |
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
|
| 160 |
-
content = f"ТАБЛИЦА {table_identifier} из {doc_id}\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
if table_title:
|
| 162 |
content += f"НАЗВАНИЕ: {table_title}\n"
|
|
|
|
| 163 |
if section:
|
| 164 |
content += f"РАЗДЕЛ: {section}\n"
|
|
|
|
| 165 |
content += f"{'='*70}\n"
|
| 166 |
|
| 167 |
if headers:
|
|
@@ -199,40 +206,6 @@ def format_table_footer(table_identifier, doc_id):
|
|
| 199 |
"""Format table footer"""
|
| 200 |
return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
|
| 201 |
|
| 202 |
-
def load_table_documents(repo_id, hf_token, table_dir):
|
| 203 |
-
log_message("Loading tables...")
|
| 204 |
-
|
| 205 |
-
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
| 206 |
-
table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
|
| 207 |
-
|
| 208 |
-
all_chunks = []
|
| 209 |
-
for file_path in table_files:
|
| 210 |
-
try:
|
| 211 |
-
local_path = hf_hub_download(
|
| 212 |
-
repo_id=repo_id,
|
| 213 |
-
filename=file_path,
|
| 214 |
-
repo_type="dataset",
|
| 215 |
-
token=hf_token
|
| 216 |
-
)
|
| 217 |
-
|
| 218 |
-
with open(local_path, 'r', encoding='utf-8') as f:
|
| 219 |
-
data = json.load(f)
|
| 220 |
-
|
| 221 |
-
file_doc_id = data.get('document_id', data.get('document', 'unknown'))
|
| 222 |
-
|
| 223 |
-
for sheet in data.get('sheets', []):
|
| 224 |
-
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 225 |
-
|
| 226 |
-
chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1000)
|
| 227 |
-
all_chunks.extend(chunks)
|
| 228 |
-
|
| 229 |
-
except Exception as e:
|
| 230 |
-
log_message(f"Error loading {file_path}: {e}")
|
| 231 |
-
|
| 232 |
-
log_message(f"✓ Loaded {len(all_chunks)} table chunks")
|
| 233 |
-
return all_chunks
|
| 234 |
-
|
| 235 |
-
|
| 236 |
def load_json_documents(repo_id, hf_token, json_dir):
|
| 237 |
import zipfile
|
| 238 |
import tempfile
|
|
@@ -414,7 +387,6 @@ def extract_sections_from_json(json_path):
|
|
| 414 |
|
| 415 |
|
| 416 |
def load_table_documents(repo_id, hf_token, table_dir):
|
| 417 |
-
"""Load and chunk tables"""
|
| 418 |
log_message("Loading tables...")
|
| 419 |
|
| 420 |
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
|
@@ -433,15 +405,13 @@ def load_table_documents(repo_id, hf_token, table_dir):
|
|
| 433 |
with open(local_path, 'r', encoding='utf-8') as f:
|
| 434 |
data = json.load(f)
|
| 435 |
|
| 436 |
-
# Extract file-level document_id
|
| 437 |
file_doc_id = data.get('document_id', data.get('document', 'unknown'))
|
| 438 |
|
| 439 |
for sheet in data.get('sheets', []):
|
| 440 |
-
# Use sheet-level document_id if available, otherwise use file-level
|
| 441 |
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 442 |
|
| 443 |
-
#
|
| 444 |
-
chunks = chunk_table_by_content(sheet, sheet_doc_id)
|
| 445 |
all_chunks.extend(chunks)
|
| 446 |
|
| 447 |
except Exception as e:
|
|
|
|
| 157 |
|
| 158 |
|
| 159 |
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
|
| 160 |
+
content = f"ТАБЛИЦА {table_identifier} из документа {doc_id}\n"
|
| 161 |
+
|
| 162 |
+
# Add table type/number prominently for matching
|
| 163 |
+
if table_num:
|
| 164 |
+
content += f"ТИП: {table_num}\n"
|
| 165 |
+
|
| 166 |
if table_title:
|
| 167 |
content += f"НАЗВАНИЕ: {table_title}\n"
|
| 168 |
+
|
| 169 |
if section:
|
| 170 |
content += f"РАЗДЕЛ: {section}\n"
|
| 171 |
+
|
| 172 |
content += f"{'='*70}\n"
|
| 173 |
|
| 174 |
if headers:
|
|
|
|
| 206 |
"""Format table footer"""
|
| 207 |
return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
|
| 208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
def load_json_documents(repo_id, hf_token, json_dir):
|
| 210 |
import zipfile
|
| 211 |
import tempfile
|
|
|
|
| 387 |
|
| 388 |
|
| 389 |
def load_table_documents(repo_id, hf_token, table_dir):
|
|
|
|
| 390 |
log_message("Loading tables...")
|
| 391 |
|
| 392 |
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
|
|
|
| 405 |
with open(local_path, 'r', encoding='utf-8') as f:
|
| 406 |
data = json.load(f)
|
| 407 |
|
|
|
|
| 408 |
file_doc_id = data.get('document_id', data.get('document', 'unknown'))
|
| 409 |
|
| 410 |
for sheet in data.get('sheets', []):
|
|
|
|
| 411 |
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 412 |
|
| 413 |
+
# Use the consistent MAX_CHARS_TABLE from config
|
| 414 |
+
chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
|
| 415 |
all_chunks.extend(chunks)
|
| 416 |
|
| 417 |
except Exception as e:
|
index_retriever.py
CHANGED
|
@@ -46,18 +46,18 @@ def create_query_engine(vector_index):
|
|
| 46 |
|
| 47 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 48 |
docstore=vector_index.docstore,
|
| 49 |
-
similarity_top_k=
|
| 50 |
)
|
| 51 |
|
| 52 |
vector_retriever = VectorIndexRetriever(
|
| 53 |
index=vector_index,
|
| 54 |
-
similarity_top_k=
|
| 55 |
-
similarity_cutoff=0.
|
| 56 |
)
|
| 57 |
|
| 58 |
hybrid_retriever = QueryFusionRetriever(
|
| 59 |
[vector_retriever, bm25_retriever],
|
| 60 |
-
similarity_top_k=
|
| 61 |
num_queries=1
|
| 62 |
)
|
| 63 |
|
|
|
|
| 46 |
|
| 47 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 48 |
docstore=vector_index.docstore,
|
| 49 |
+
similarity_top_k=150
|
| 50 |
)
|
| 51 |
|
| 52 |
vector_retriever = VectorIndexRetriever(
|
| 53 |
index=vector_index,
|
| 54 |
+
similarity_top_k=150,
|
| 55 |
+
similarity_cutoff=0.45
|
| 56 |
)
|
| 57 |
|
| 58 |
hybrid_retriever = QueryFusionRetriever(
|
| 59 |
[vector_retriever, bm25_retriever],
|
| 60 |
+
similarity_top_k=150,
|
| 61 |
num_queries=1
|
| 62 |
)
|
| 63 |
|
table_prep.py
CHANGED
|
@@ -95,135 +95,135 @@ def chunk_table_document(doc, max_chunk_size=MAX_CHARS_TABLE, max_rows_per_chunk
|
|
| 95 |
return chunked_docs
|
| 96 |
|
| 97 |
|
| 98 |
-
def table_to_document(table_data, document_id=None):
|
| 99 |
-
|
| 100 |
-
|
| 101 |
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
|
| 108 |
-
|
| 109 |
-
|
| 110 |
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
|
| 143 |
|
| 144 |
-
def load_table_data(repo_id, hf_token, table_data_dir):
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
|
| 149 |
-
|
| 150 |
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
|
| 168 |
-
|
| 169 |
|
| 170 |
-
|
| 171 |
-
|
| 172 |
|
| 173 |
-
|
| 174 |
-
|
| 175 |
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
|
| 205 |
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
|
| 223 |
-
|
| 224 |
|
| 225 |
-
|
| 226 |
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
|
|
|
| 95 |
return chunked_docs
|
| 96 |
|
| 97 |
|
| 98 |
+
# def table_to_document(table_data, document_id=None):
|
| 99 |
+
# if not isinstance(table_data, dict):
|
| 100 |
+
# return []
|
| 101 |
|
| 102 |
+
# doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
|
| 103 |
+
# table_num = table_data.get('table_number', 'Неизвестно')
|
| 104 |
+
# table_title = table_data.get('table_title', 'Неизвестно')
|
| 105 |
+
# section = table_data.get('section', 'Неизвестно')
|
| 106 |
+
# table_rows = table_data.get('data', [])
|
| 107 |
|
| 108 |
+
# if not table_rows:
|
| 109 |
+
# return []
|
| 110 |
|
| 111 |
+
# # Build table content
|
| 112 |
+
# content = f"Таблица: {table_num}\n"
|
| 113 |
+
# content += f"Название: {table_title}\n"
|
| 114 |
+
# content += f"Документ: {doc_id}\n"
|
| 115 |
+
# content += f"Раздел: {section}\n"
|
| 116 |
|
| 117 |
+
# headers = table_data.get('headers', [])
|
| 118 |
+
# if headers:
|
| 119 |
+
# content += f"\nЗаголовки: {' | '.join(headers)}\n"
|
| 120 |
|
| 121 |
+
# content += "\nДанные таблицы:\n"
|
| 122 |
+
# for row_idx, row in enumerate(table_rows, start=1):
|
| 123 |
+
# if isinstance(row, dict):
|
| 124 |
+
# row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
|
| 125 |
+
# content += f"Строка {row_idx}: {row_text}\n"
|
| 126 |
|
| 127 |
+
# # Create base document
|
| 128 |
+
# base_doc = Document(
|
| 129 |
+
# text=content,
|
| 130 |
+
# metadata={
|
| 131 |
+
# "type": "table",
|
| 132 |
+
# "table_number": table_num,
|
| 133 |
+
# "document_id": doc_id,
|
| 134 |
+
# "section": section
|
| 135 |
+
# }
|
| 136 |
+
# )
|
| 137 |
+
# if len(content) > 4000:
|
| 138 |
+
# chunks = chunk_table_document(base_doc)
|
| 139 |
+
# log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
|
| 140 |
+
# return chunk_table_document(base_doc)
|
| 141 |
+
# return [base_doc]
|
| 142 |
|
| 143 |
|
| 144 |
+
# def load_table_data(repo_id, hf_token, table_data_dir):
|
| 145 |
+
# try:
|
| 146 |
+
# files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
| 147 |
+
# table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
|
| 148 |
|
| 149 |
+
# log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
|
| 150 |
|
| 151 |
+
# table_documents = []
|
| 152 |
+
# stats = {
|
| 153 |
+
# 'total_tables': 0,
|
| 154 |
+
# 'total_size': 0,
|
| 155 |
+
# 'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
|
| 156 |
+
# }
|
| 157 |
|
| 158 |
+
# for file_path in table_files:
|
| 159 |
+
# try:
|
| 160 |
+
# local_path = hf_hub_download(
|
| 161 |
+
# repo_id=repo_id,
|
| 162 |
+
# filename=file_path,
|
| 163 |
+
# local_dir='',
|
| 164 |
+
# repo_type="dataset",
|
| 165 |
+
# token=hf_token
|
| 166 |
+
# )
|
| 167 |
|
| 168 |
+
# log_message(f"\nОбработка файла: {file_path}")
|
| 169 |
|
| 170 |
+
# with open(local_path, 'r', encoding='utf-8') as f:
|
| 171 |
+
# table_data = json.load(f)
|
| 172 |
|
| 173 |
+
# if isinstance(table_data, dict):
|
| 174 |
+
# document_id = table_data.get('document', 'unknown')
|
| 175 |
|
| 176 |
+
# if 'sheets' in table_data:
|
| 177 |
+
# sorted_sheets = sorted(
|
| 178 |
+
# table_data['sheets'],
|
| 179 |
+
# key=lambda sheet: sheet.get('table_number', '') # or use 'table_number'
|
| 180 |
+
# )
|
| 181 |
|
| 182 |
+
# for sheet in sorted_sheets:
|
| 183 |
+
# sheet['document'] = document_id
|
| 184 |
+
# docs_list = table_to_document(sheet, document_id)
|
| 185 |
+
# table_documents.extend(docs_list)
|
| 186 |
|
| 187 |
+
# for doc in docs_list:
|
| 188 |
+
# stats['total_tables'] += 1
|
| 189 |
+
# size = doc.metadata.get('content_size', 0)
|
| 190 |
+
# stats['total_size'] += size
|
| 191 |
+
# stats['by_document'][document_id]['count'] += 1
|
| 192 |
+
# stats['by_document'][document_id]['size'] += size
|
| 193 |
+
# log_message(f"Добавлена таблица {sheet.get('table_number', 'Неизвестно')} из документа {document_id}, размер {size} символов")
|
| 194 |
+
# else:
|
| 195 |
+
# docs_list = table_to_document(table_data, document_id)
|
| 196 |
+
# table_documents.extend(docs_list)
|
| 197 |
|
| 198 |
+
# for doc in docs_list:
|
| 199 |
+
# stats['total_tables'] += 1
|
| 200 |
+
# size = doc.metadata.get('content_size', 0)
|
| 201 |
+
# stats['total_size'] += size
|
| 202 |
+
# stats['by_document'][document_id]['count'] += 1
|
| 203 |
+
# stats['by_document'][document_id]['size'] += size
|
| 204 |
|
| 205 |
|
| 206 |
+
# except Exception as e:
|
| 207 |
+
# log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
|
| 208 |
+
# continue
|
| 209 |
|
| 210 |
+
# # Log summary statistics
|
| 211 |
+
# log_message("\n" + "=" * 60)
|
| 212 |
+
# log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
|
| 213 |
+
# log_message("=" * 60)
|
| 214 |
+
# log_message(f"Всего таблиц добавлено: {stats['total_tables']}")
|
| 215 |
+
# log_message(f"Общий размер: {stats['total_size']:,} символов")
|
| 216 |
+
# log_message(f"Средний размер таблицы: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} символов")
|
| 217 |
|
| 218 |
+
# log_message("\nПо документам:")
|
| 219 |
+
# for doc_id, doc_stats in sorted(stats['by_document'].items()):
|
| 220 |
+
# log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
|
| 221 |
+
# f"{doc_stats['size']:,} символов")
|
| 222 |
|
| 223 |
+
# log_message("=" * 60)
|
| 224 |
|
| 225 |
+
# return table_documents
|
| 226 |
|
| 227 |
+
# except Exception as e:
|
| 228 |
+
# log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
|
| 229 |
+
# return []
|