Spaces:
Sleeping
Sleeping
Commit
·
7565a55
1
Parent(s):
2edec29
max chars = 2000 for tables + new answer_question
Browse files
- documents_prep.py +137 -63
- utils.py +20 -7
documents_prep.py
CHANGED
|
@@ -53,7 +53,8 @@ def normalize_doc_id(doc_id):
|
|
| 53 |
return doc_id
|
| 54 |
|
| 55 |
|
| 56 |
-
def
|
|
|
|
| 57 |
headers = table_data.get('headers', [])
|
| 58 |
rows = table_data.get('data', [])
|
| 59 |
table_num = table_data.get('table_number', 'unknown')
|
|
@@ -62,7 +63,6 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=10):
|
|
| 62 |
|
| 63 |
# NORMALIZE document ID
|
| 64 |
doc_id = normalize_doc_id(doc_id)
|
| 65 |
-
|
| 66 |
table_num_clean = str(table_num).strip()
|
| 67 |
|
| 68 |
# Create section-aware identifier
|
|
@@ -82,9 +82,15 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=10):
|
|
| 82 |
|
| 83 |
log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
metadata = {
|
| 90 |
'type': 'table',
|
|
@@ -94,30 +100,62 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=10):
|
|
| 94 |
'table_title': table_title,
|
| 95 |
'section': section,
|
| 96 |
'total_rows': len(rows),
|
| 97 |
-
'chunk_size':
|
| 98 |
'is_complete_table': True
|
| 99 |
}
|
| 100 |
|
| 101 |
-
log_message(f"
|
| 102 |
-
|
| 103 |
return [Document(text=content, metadata=metadata)]
|
| 104 |
|
|
|
|
| 105 |
chunks = []
|
| 106 |
-
|
|
|
|
|
|
|
| 107 |
|
| 108 |
-
for i in
|
| 109 |
-
|
| 110 |
-
|
| 111 |
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
table_identifier,
|
| 117 |
-
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
metadata = {
|
| 123 |
'type': 'table',
|
|
@@ -127,28 +165,21 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=10):
|
|
| 127 |
'table_title': table_title,
|
| 128 |
'section': section,
|
| 129 |
'chunk_id': chunk_num,
|
| 130 |
-
'row_start':
|
| 131 |
-
'row_end':
|
| 132 |
'total_rows': len(rows),
|
| 133 |
-
'chunk_size':
|
| 134 |
-
'total_chunks': (len(rows) + max_rows - overlap - 1) // (max_rows - overlap),
|
| 135 |
'is_complete_table': False
|
| 136 |
}
|
| 137 |
|
| 138 |
-
log_message(f" Chunk: {chunk_num+1}, rows {i}-{i+len(chunk_rows)}, {chunk_size} chars")
|
| 139 |
-
|
| 140 |
chunks.append(Document(text=content, metadata=metadata))
|
|
|
|
| 141 |
|
| 142 |
return chunks
|
| 143 |
|
| 144 |
|
| 145 |
-
def
|
| 146 |
-
|
| 147 |
-
table_num = table_data.get('table_number', 'unknown')
|
| 148 |
-
table_title = table_data.get('table_title', '')
|
| 149 |
-
section = table_data.get('section', '')
|
| 150 |
-
|
| 151 |
-
# Use enhanced identifier
|
| 152 |
content = f"ДОКУМЕНТ: {doc_id}\n"
|
| 153 |
content += f"ТАБЛИЦА: {table_identifier}\n"
|
| 154 |
content += f"ПОЛНОЕ НАЗВАНИЕ: {table_identifier}\n"
|
|
@@ -161,48 +192,91 @@ def format_table_content(table_data, headers, rows, table_identifier, chunk_info
|
|
| 161 |
|
| 162 |
# Enhanced search keywords
|
| 163 |
content += f"Это таблица {table_identifier} из документа {doc_id}. "
|
| 164 |
-
content += f"
|
| 165 |
-
content += f"Номер: {table_num}. "
|
| 166 |
-
content += f"Документ: {doc_id}. "
|
| 167 |
|
| 168 |
if section:
|
| 169 |
-
content += f"
|
| 170 |
if 'приложени' in section.lower():
|
| 171 |
content += f"Таблица из приложения. "
|
| 172 |
|
| 173 |
if table_title:
|
| 174 |
-
content += f"
|
| 175 |
-
content += f"Таблица о: {table_title}. "
|
| 176 |
|
| 177 |
-
content += f"
|
| 178 |
-
|
| 179 |
-
if chunk_info:
|
| 180 |
-
content += f"\n{chunk_info}\n"
|
| 181 |
-
|
| 182 |
-
content += f"\n\nСОДЕРЖИМОЕ ТАБЛИЦЫ {table_identifier}:\n"
|
| 183 |
-
content += f"="*70 + "\n\n"
|
| 184 |
|
| 185 |
if headers:
|
| 186 |
header_str = ' | '.join(str(h) for h in headers)
|
| 187 |
-
content += f"
|
| 188 |
-
|
| 189 |
-
content += f"ДАННЫЕ ТАБЛИЦЫ:\n"
|
| 190 |
-
for idx, row in enumerate(rows, 1):
|
| 191 |
-
if isinstance(row, dict):
|
| 192 |
-
parts = [f"{k}: {v}" for k, v in row.items()
|
| 193 |
-
if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
|
| 194 |
-
if parts:
|
| 195 |
-
content += f"{idx}. {' | '.join(parts)}\n"
|
| 196 |
-
elif isinstance(row, list):
|
| 197 |
-
parts = [str(v) for v in row if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
|
| 198 |
-
if parts:
|
| 199 |
-
content += f"{idx}. {' | '.join(parts)}\n"
|
| 200 |
-
|
| 201 |
-
content += f"\n{'='*70}\n"
|
| 202 |
-
content += f"КОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
|
| 203 |
|
|
|
|
| 204 |
return content
|
| 205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
def load_json_documents(repo_id, hf_token, json_dir):
|
| 207 |
import zipfile
|
| 208 |
import tempfile
|
|
@@ -411,7 +485,7 @@ def load_table_documents(repo_id, hf_token, table_dir):
|
|
| 411 |
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 412 |
|
| 413 |
# CRITICAL: Pass document_id to chunk function
|
| 414 |
-
chunks =
|
| 415 |
all_chunks.extend(chunks)
|
| 416 |
|
| 417 |
except Exception as e:
|
|
|
|
| 53 |
return doc_id
|
| 54 |
|
| 55 |
|
| 56 |
+
def chunk_table_by_content(table_data, doc_id, max_chars=2000):
|
| 57 |
+
"""Chunk tables by content size instead of rows"""
|
| 58 |
headers = table_data.get('headers', [])
|
| 59 |
rows = table_data.get('data', [])
|
| 60 |
table_num = table_data.get('table_number', 'unknown')
|
|
|
|
| 63 |
|
| 64 |
# NORMALIZE document ID
|
| 65 |
doc_id = normalize_doc_id(doc_id)
|
|
|
|
| 66 |
table_num_clean = str(table_num).strip()
|
| 67 |
|
| 68 |
# Create section-aware identifier
|
|
|
|
| 82 |
|
| 83 |
log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
|
| 84 |
|
| 85 |
+
# Calculate base metadata size (everything except row data)
|
| 86 |
+
base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
|
| 87 |
+
base_size = len(base_content)
|
| 88 |
+
available_space = max_chars - base_size - 200 # Reserve 200 chars for footer
|
| 89 |
+
|
| 90 |
+
# If entire table fits, return as one chunk
|
| 91 |
+
full_rows_content = format_table_rows(rows)
|
| 92 |
+
if base_size + len(full_rows_content) <= max_chars:
|
| 93 |
+
content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
|
| 94 |
|
| 95 |
metadata = {
|
| 96 |
'type': 'table',
|
|
|
|
| 100 |
'table_title': table_title,
|
| 101 |
'section': section,
|
| 102 |
'total_rows': len(rows),
|
| 103 |
+
'chunk_size': len(content),
|
| 104 |
'is_complete_table': True
|
| 105 |
}
|
| 106 |
|
| 107 |
+
log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
|
|
|
|
| 108 |
return [Document(text=content, metadata=metadata)]
|
| 109 |
|
| 110 |
+
# Otherwise, chunk by content size
|
| 111 |
chunks = []
|
| 112 |
+
current_rows = []
|
| 113 |
+
current_size = 0
|
| 114 |
+
chunk_num = 0
|
| 115 |
|
| 116 |
+
for i, row in enumerate(rows):
|
| 117 |
+
row_text = format_single_row(row, i + 1)
|
| 118 |
+
row_size = len(row_text)
|
| 119 |
|
| 120 |
+
# If adding this row exceeds limit, save current chunk
|
| 121 |
+
if current_size + row_size > available_space and current_rows:
|
| 122 |
+
content = base_content + format_table_rows(current_rows)
|
| 123 |
+
content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
|
| 124 |
+
content += format_table_footer(table_identifier, doc_id)
|
| 125 |
+
|
| 126 |
+
metadata = {
|
| 127 |
+
'type': 'table',
|
| 128 |
+
'document_id': doc_id,
|
| 129 |
+
'table_number': table_num_clean,
|
| 130 |
+
'table_identifier': table_identifier,
|
| 131 |
+
'table_title': table_title,
|
| 132 |
+
'section': section,
|
| 133 |
+
'chunk_id': chunk_num,
|
| 134 |
+
'row_start': current_rows[0]['_idx'] - 1,
|
| 135 |
+
'row_end': current_rows[-1]['_idx'],
|
| 136 |
+
'total_rows': len(rows),
|
| 137 |
+
'chunk_size': len(content),
|
| 138 |
+
'is_complete_table': False
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
chunks.append(Document(text=content, metadata=metadata))
|
| 142 |
+
log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
|
| 143 |
+
|
| 144 |
+
chunk_num += 1
|
| 145 |
+
current_rows = []
|
| 146 |
+
current_size = 0
|
| 147 |
|
| 148 |
+
# Add row index for tracking
|
| 149 |
+
row_copy = row.copy() if isinstance(row, dict) else {'data': row}
|
| 150 |
+
row_copy['_idx'] = i + 1
|
| 151 |
+
current_rows.append(row_copy)
|
| 152 |
+
current_size += row_size
|
| 153 |
+
|
| 154 |
+
# Add final chunk if rows remain
|
| 155 |
+
if current_rows:
|
| 156 |
+
content = base_content + format_table_rows(current_rows)
|
| 157 |
+
content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
|
| 158 |
+
content += format_table_footer(table_identifier, doc_id)
|
| 159 |
|
| 160 |
metadata = {
|
| 161 |
'type': 'table',
|
|
|
|
| 165 |
'table_title': table_title,
|
| 166 |
'section': section,
|
| 167 |
'chunk_id': chunk_num,
|
| 168 |
+
'row_start': current_rows[0]['_idx'] - 1,
|
| 169 |
+
'row_end': current_rows[-1]['_idx'],
|
| 170 |
'total_rows': len(rows),
|
| 171 |
+
'chunk_size': len(content),
|
|
|
|
| 172 |
'is_complete_table': False
|
| 173 |
}
|
| 174 |
|
|
|
|
|
|
|
| 175 |
chunks.append(Document(text=content, metadata=metadata))
|
| 176 |
+
log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
|
| 177 |
|
| 178 |
return chunks
|
| 179 |
|
| 180 |
|
| 181 |
+
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
|
| 182 |
+
"""Format consistent table header"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
content = f"ДОКУМЕНТ: {doc_id}\n"
|
| 184 |
content += f"ТАБЛИЦА: {table_identifier}\n"
|
| 185 |
content += f"ПОЛНОЕ НАЗВАНИЕ: {table_identifier}\n"
|
|
|
|
| 192 |
|
| 193 |
# Enhanced search keywords
|
| 194 |
content += f"Это таблица {table_identifier} из документа {doc_id}. "
|
| 195 |
+
content += f"Идентификатор: {table_identifier}. Номер: {table_num}. Документ: {doc_id}. "
|
|
|
|
|
|
|
| 196 |
|
| 197 |
if section:
|
| 198 |
+
content += f"Раздел: {section}. "
|
| 199 |
if 'приложени' in section.lower():
|
| 200 |
content += f"Таблица из приложения. "
|
| 201 |
|
| 202 |
if table_title:
|
| 203 |
+
content += f"Название: {table_title}. "
|
|
|
|
| 204 |
|
| 205 |
+
content += f"\n\nСОДЕРЖИМОЕ ТАБЛИЦЫ {table_identifier}:\n{'='*70}\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
if headers:
|
| 208 |
header_str = ' | '.join(str(h) for h in headers)
|
| 209 |
+
content += f"ЗАГОЛОВКИ: {header_str}\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
+
content += "ДАННЫЕ:\n"
|
| 212 |
return content
|
| 213 |
|
| 214 |
+
|
| 215 |
+
def _cell_has_value(value):
    """Return True when a table cell holds something worth printing."""
    # Numeric zero is real table data; only non-numeric falsy values
    # (None, "", empty containers, False) count as empty cells.
    is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
    if not is_number and not value:
        return False
    # float('nan') and the strings "nan"/"none" mark missing cells.
    return str(value).strip().lower() not in ('', 'nan', 'none')


def format_single_row(row, idx):
    """Format a single table row as one numbered text line.

    Args:
        row: A dict (rendered as "column: value" pairs) or a list/tuple of
            cell values (rendered as bare values).
        idx: Row number printed in front of the row.

    Returns:
        "<idx>. <cell> | <cell> ...\n", or "" when the row has no printable
        cell or is of an unsupported type.
    """
    if isinstance(row, dict):
        # '_idx' is row-number bookkeeping, not a table column.
        parts = [f"{k}: {v}" for k, v in row.items()
                 if k != '_idx' and _cell_has_value(v)]
    elif isinstance(row, (list, tuple)):
        parts = [str(v) for v in row if _cell_has_value(v)]
    else:
        return ""

    if not parts:
        return ""
    return f"{idx}. {' | '.join(parts)}\n"


def format_table_rows(rows):
    """Format multiple rows into one text block, one numbered line per row.

    Accepts raw rows (dicts or lists) as well as rows carrying an '_idx'
    key with their original 1-based row number. Rows without '_idx' are
    numbered by their 1-based position in ``rows``.

    Args:
        rows: Iterable of row dicts and/or row lists.

    Returns:
        The concatenated row lines ("" for an empty input).
    """
    lines = []
    for position, row in enumerate(rows, 1):
        idx = position
        if isinstance(row, dict) and '_idx' in row:
            idx = row['_idx']
            cells = {k: v for k, v in row.items() if k != '_idx'}
            # Non-dict rows get wrapped as {'data': row, '_idx': n} when
            # chunking; unwrap them so they render like a plain list row.
            if len(cells) == 1 and isinstance(cells.get('data'), (list, tuple)):
                row = cells['data']
            else:
                row = cells
        lines.append(format_single_row(row, idx))
    return "".join(lines)
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def format_table_footer(table_identifier, doc_id):
    """Build the closing text appended after a table's rows.

    The footer is a leading newline, a 70-character "=" rule, and an
    end-of-table marker line naming the table and its document.
    """
    rule = "=" * 70
    return f"\n{rule}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
# Update load_table_documents to use new function
|
| 244 |
+
def load_table_documents(repo_id, hf_token, table_dir):
    """Download every table JSON from a dataset repo and chunk it by size.

    Lists the files of the dataset repository, keeps those whose path starts
    with ``table_dir`` and ends with ``.json``, downloads and parses each
    one, and passes every entry of its 'sheets' list to
    ``chunk_table_by_content``.

    A failure while handling a file is logged and the rest of that file is
    skipped; chunks already collected from its earlier sheets are kept.

    NOTE(review): the diff shows a second ``load_table_documents`` further
    down the file; if both exist, the later one overrides this definition.
    Confirm which one is intended to stay.

    Args:
        repo_id: Dataset repository identifier.
        hf_token: Access token forwarded to the hub calls.
        table_dir: Path prefix of the table JSON files inside the repo.

    Returns:
        A flat list of all chunks produced for all sheets.
    """
    log_message("Loading tables...")

    repo_listing = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    json_paths = [
        name for name in repo_listing
        if name.startswith(table_dir) and name.endswith('.json')
    ]

    all_chunks = []
    for file_path in json_paths:
        try:
            cached_file = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                repo_type="dataset",
                token=hf_token,
            )

            with open(cached_file, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)

            # File-level id is the fallback for sheets that carry none.
            fallback_id = payload.get('document_id', payload.get('document', 'unknown'))

            for sheet in payload.get('sheets', []):
                owner_id = sheet.get('document_id', sheet.get('document', fallback_id))

                # NOTE(review): 2500 here differs from the 2000 default of
                # chunk_table_by_content — confirm which limit is intended.
                all_chunks.extend(
                    chunk_table_by_content(sheet, owner_id, max_chars=2500)
                )

        except Exception as e:
            log_message(f"Error loading {file_path}: {e}")

    log_message(f"✓ Loaded {len(all_chunks)} table chunks")
    return all_chunks
|
| 278 |
+
|
| 279 |
+
|
| 280 |
def load_json_documents(repo_id, hf_token, json_dir):
|
| 281 |
import zipfile
|
| 282 |
import tempfile
|
|
|
|
| 485 |
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 486 |
|
| 487 |
# CRITICAL: Pass document_id to chunk function
|
| 488 |
+
chunks = chunk_table_by_content(sheet, sheet_doc_id)
|
| 489 |
all_chunks.extend(chunks)
|
| 490 |
|
| 491 |
except Exception as e:
|
utils.py
CHANGED
|
@@ -62,20 +62,33 @@ def answer_question(question, query_engine, reranker):
|
|
| 62 |
source_label += f" {title}"
|
| 63 |
else:
|
| 64 |
source_label = f"[{doc_id}]"
|
| 65 |
-
context_parts.append(f"{source_label}\n{n.text
|
| 66 |
|
| 67 |
context = "\n\n" + ("="*50 + "\n\n").join(context_parts)
|
| 68 |
|
| 69 |
-
# Use
|
| 70 |
from config import CUSTOM_PROMPT
|
| 71 |
prompt = CUSTOM_PROMPT.format(context_str=context, query_str=question)
|
| 72 |
-
log_message(f"\nPROMPT
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
sources = format_sources(reranked)
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
except Exception as e:
|
| 81 |
log_message(f"Error: {e}")
|
|
|
|
| 62 |
source_label += f" {title}"
|
| 63 |
else:
|
| 64 |
source_label = f"[{doc_id}]"
|
| 65 |
+
context_parts.append(f"{source_label}\n{n.text}") # Use FULL text, not [:500]
|
| 66 |
|
| 67 |
context = "\n\n" + ("="*50 + "\n\n").join(context_parts)
|
| 68 |
|
| 69 |
+
# Use CUSTOM_PROMPT from config
|
| 70 |
from config import CUSTOM_PROMPT
|
| 71 |
prompt = CUSTOM_PROMPT.format(context_str=context, query_str=question)
|
| 72 |
+
log_message(f"\nPROMPT LENGTH: {len(prompt)} chars\n")
|
| 73 |
+
|
| 74 |
+
# CRITICAL FIX: Call LLM directly instead of query_engine.query()
|
| 75 |
+
from llama_index.core import Settings
|
| 76 |
+
response = Settings.llm.complete(prompt)
|
| 77 |
|
| 78 |
sources = format_sources(reranked)
|
| 79 |
+
|
| 80 |
+
# Log retrieved chunks
|
| 81 |
+
log_message(f"\n{'='*70}")
|
| 82 |
+
log_message("RETRIEVED CHUNKS:")
|
| 83 |
+
for i, node in enumerate(reranked, 1):
|
| 84 |
+
log_message(f"\n--- Chunk {i} ---")
|
| 85 |
+
log_message(f"Document: {node.metadata.get('document_id', 'unknown')}")
|
| 86 |
+
log_message(f"Type: {node.metadata.get('type', 'unknown')}")
|
| 87 |
+
if node.metadata.get('type') == 'table':
|
| 88 |
+
log_message(f"Table: {node.metadata.get('table_identifier', 'unknown')}")
|
| 89 |
+
log_message(f"Text preview: {node.text[:500]}...")
|
| 90 |
+
|
| 91 |
+
return response.text, sources
|
| 92 |
|
| 93 |
except Exception as e:
|
| 94 |
log_message(f"Error: {e}")
|