MrSimple07 commited on
Commit
7565a55
·
1 Parent(s): 2edec29

max chars = 2000 for tables + new answer_question

Browse files
Files changed (2) hide show
  1. documents_prep.py +137 -63
  2. utils.py +20 -7
documents_prep.py CHANGED
@@ -53,7 +53,8 @@ def normalize_doc_id(doc_id):
53
  return doc_id
54
 
55
 
56
- def chunk_table_by_rows(table_data, doc_id, max_rows=10):
 
57
  headers = table_data.get('headers', [])
58
  rows = table_data.get('data', [])
59
  table_num = table_data.get('table_number', 'unknown')
@@ -62,7 +63,6 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=10):
62
 
63
  # NORMALIZE document ID
64
  doc_id = normalize_doc_id(doc_id)
65
-
66
  table_num_clean = str(table_num).strip()
67
 
68
  # Create section-aware identifier
@@ -82,9 +82,15 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=10):
82
 
83
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
84
 
85
- if len(rows) <= max_rows:
86
- content = format_table_content(table_data, headers, rows, doc_id, table_identifier)
87
- chunk_size = len(content)
 
 
 
 
 
 
88
 
89
  metadata = {
90
  'type': 'table',
@@ -94,30 +100,62 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=10):
94
  'table_title': table_title,
95
  'section': section,
96
  'total_rows': len(rows),
97
- 'chunk_size': chunk_size,
98
  'is_complete_table': True
99
  }
100
 
101
- log_message(f" Chunk: 1/1, {chunk_size} chars, doc={doc_id}, table={table_identifier}")
102
-
103
  return [Document(text=content, metadata=metadata)]
104
 
 
105
  chunks = []
106
- overlap = 1
 
 
107
 
108
- for i in range(0, len(rows), max_rows - overlap):
109
- chunk_rows = rows[i:min(i+max_rows, len(rows))]
110
- chunk_num = i // (max_rows - overlap)
111
 
112
- content = format_table_content(
113
- table_data,
114
- headers,
115
- chunk_rows,
116
- table_identifier,
117
- chunk_info=f"Строки {i+1}-{i+len(chunk_rows)} из {len(rows)}"
118
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
- chunk_size = len(content)
 
 
 
 
 
 
 
 
 
 
121
 
122
  metadata = {
123
  'type': 'table',
@@ -127,28 +165,21 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=10):
127
  'table_title': table_title,
128
  'section': section,
129
  'chunk_id': chunk_num,
130
- 'row_start': i,
131
- 'row_end': i + len(chunk_rows),
132
  'total_rows': len(rows),
133
- 'chunk_size': chunk_size,
134
- 'total_chunks': (len(rows) + max_rows - overlap - 1) // (max_rows - overlap),
135
  'is_complete_table': False
136
  }
137
 
138
- log_message(f" Chunk: {chunk_num+1}, rows {i}-{i+len(chunk_rows)}, {chunk_size} chars")
139
-
140
  chunks.append(Document(text=content, metadata=metadata))
 
141
 
142
  return chunks
143
 
144
 
145
- def format_table_content(table_data, headers, rows, table_identifier, chunk_info=""):
146
- doc_id = table_data.get('document_id', table_data.get('document', 'unknown'))
147
- table_num = table_data.get('table_number', 'unknown')
148
- table_title = table_data.get('table_title', '')
149
- section = table_data.get('section', '')
150
-
151
- # Use enhanced identifier
152
  content = f"ДОКУМЕНТ: {doc_id}\n"
153
  content += f"ТАБЛИЦА: {table_identifier}\n"
154
  content += f"ПОЛНОЕ НАЗВАНИЕ: {table_identifier}\n"
@@ -161,48 +192,91 @@ def format_table_content(table_data, headers, rows, table_identifier, chunk_info
161
 
162
  # Enhanced search keywords
163
  content += f"Это таблица {table_identifier} из документа {doc_id}. "
164
- content += f"Идентификатор таблицы: {table_identifier}. "
165
- content += f"Номер: {table_num}. "
166
- content += f"Документ: {doc_id}. "
167
 
168
  if section:
169
- content += f"Находится в разделе: {section}. "
170
  if 'приложени' in section.lower():
171
  content += f"Таблица из приложения. "
172
 
173
  if table_title:
174
- content += f"Название таблицы: {table_title}. "
175
- content += f"Таблица о: {table_title}. "
176
 
177
- content += f"Поиск: таблица {table_identifier} {doc_id}. "
178
-
179
- if chunk_info:
180
- content += f"\n{chunk_info}\n"
181
-
182
- content += f"\n\nСОДЕРЖИМОЕ ТАБЛИЦЫ {table_identifier}:\n"
183
- content += f"="*70 + "\n\n"
184
 
185
  if headers:
186
  header_str = ' | '.join(str(h) for h in headers)
187
- content += f"ЗАГОЛОВКИ СТОЛБЦОВ:\n{header_str}\n\n"
188
-
189
- content += f"ДАННЫЕ ТАБЛИЦЫ:\n"
190
- for idx, row in enumerate(rows, 1):
191
- if isinstance(row, dict):
192
- parts = [f"{k}: {v}" for k, v in row.items()
193
- if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
194
- if parts:
195
- content += f"{idx}. {' | '.join(parts)}\n"
196
- elif isinstance(row, list):
197
- parts = [str(v) for v in row if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
198
- if parts:
199
- content += f"{idx}. {' | '.join(parts)}\n"
200
-
201
- content += f"\n{'='*70}\n"
202
- content += f"КОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
203
 
 
204
  return content
205
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  def load_json_documents(repo_id, hf_token, json_dir):
207
  import zipfile
208
  import tempfile
@@ -411,7 +485,7 @@ def load_table_documents(repo_id, hf_token, table_dir):
411
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
412
 
413
  # CRITICAL: Pass document_id to chunk function
414
- chunks = chunk_table_by_rows(sheet, sheet_doc_id)
415
  all_chunks.extend(chunks)
416
 
417
  except Exception as e:
 
53
  return doc_id
54
 
55
 
56
+ def chunk_table_by_content(table_data, doc_id, max_chars=2000):
57
+ """Chunk tables by content size instead of rows"""
58
  headers = table_data.get('headers', [])
59
  rows = table_data.get('data', [])
60
  table_num = table_data.get('table_number', 'unknown')
 
63
 
64
  # NORMALIZE document ID
65
  doc_id = normalize_doc_id(doc_id)
 
66
  table_num_clean = str(table_num).strip()
67
 
68
  # Create section-aware identifier
 
82
 
83
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
84
 
85
+ # Calculate base metadata size (everything except row data)
86
+ base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
87
+ base_size = len(base_content)
88
+ available_space = max_chars - base_size - 200 # Reserve 200 chars for footer
89
+
90
+ # If entire table fits, return as one chunk
91
+ full_rows_content = format_table_rows(rows)
92
+ if base_size + len(full_rows_content) <= max_chars:
93
+ content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
94
 
95
  metadata = {
96
  'type': 'table',
 
100
  'table_title': table_title,
101
  'section': section,
102
  'total_rows': len(rows),
103
+ 'chunk_size': len(content),
104
  'is_complete_table': True
105
  }
106
 
107
+ log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
 
108
  return [Document(text=content, metadata=metadata)]
109
 
110
+ # Otherwise, chunk by content size
111
  chunks = []
112
+ current_rows = []
113
+ current_size = 0
114
+ chunk_num = 0
115
 
116
+ for i, row in enumerate(rows):
117
+ row_text = format_single_row(row, i + 1)
118
+ row_size = len(row_text)
119
 
120
+ # If adding this row exceeds limit, save current chunk
121
+ if current_size + row_size > available_space and current_rows:
122
+ content = base_content + format_table_rows(current_rows)
123
+ content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
124
+ content += format_table_footer(table_identifier, doc_id)
125
+
126
+ metadata = {
127
+ 'type': 'table',
128
+ 'document_id': doc_id,
129
+ 'table_number': table_num_clean,
130
+ 'table_identifier': table_identifier,
131
+ 'table_title': table_title,
132
+ 'section': section,
133
+ 'chunk_id': chunk_num,
134
+ 'row_start': current_rows[0]['_idx'] - 1,
135
+ 'row_end': current_rows[-1]['_idx'],
136
+ 'total_rows': len(rows),
137
+ 'chunk_size': len(content),
138
+ 'is_complete_table': False
139
+ }
140
+
141
+ chunks.append(Document(text=content, metadata=metadata))
142
+ log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
143
+
144
+ chunk_num += 1
145
+ current_rows = []
146
+ current_size = 0
147
 
148
+ # Add row index for tracking
149
+ row_copy = row.copy() if isinstance(row, dict) else {'data': row}
150
+ row_copy['_idx'] = i + 1
151
+ current_rows.append(row_copy)
152
+ current_size += row_size
153
+
154
+ # Add final chunk if rows remain
155
+ if current_rows:
156
+ content = base_content + format_table_rows(current_rows)
157
+ content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
158
+ content += format_table_footer(table_identifier, doc_id)
159
 
160
  metadata = {
161
  'type': 'table',
 
165
  'table_title': table_title,
166
  'section': section,
167
  'chunk_id': chunk_num,
168
+ 'row_start': current_rows[0]['_idx'] - 1,
169
+ 'row_end': current_rows[-1]['_idx'],
170
  'total_rows': len(rows),
171
+ 'chunk_size': len(content),
 
172
  'is_complete_table': False
173
  }
174
 
 
 
175
  chunks.append(Document(text=content, metadata=metadata))
176
+ log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
177
 
178
  return chunks
179
 
180
 
181
+ def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
182
+ """Format consistent table header"""
 
 
 
 
 
183
  content = f"ДОКУМЕНТ: {doc_id}\n"
184
  content += f"ТАБЛИЦА: {table_identifier}\n"
185
  content += f"ПОЛНОЕ НАЗВАНИЕ: {table_identifier}\n"
 
192
 
193
  # Enhanced search keywords
194
  content += f"Это таблица {table_identifier} из документа {doc_id}. "
195
+ content += f"Идентификатор: {table_identifier}. Номер: {table_num}. Документ: {doc_id}. "
 
 
196
 
197
  if section:
198
+ content += f"Раздел: {section}. "
199
  if 'приложени' in section.lower():
200
  content += f"Таблица из приложения. "
201
 
202
  if table_title:
203
+ content += f"Название: {table_title}. "
 
204
 
205
+ content += f"\n\nСОДЕРЖИМОЕ ТАБЛИЦЫ {table_identifier}:\n{'='*70}\n\n"
 
 
 
 
 
 
206
 
207
  if headers:
208
  header_str = ' | '.join(str(h) for h in headers)
209
+ content += f"ЗАГОЛОВКИ: {header_str}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
 
211
+ content += "ДАННЫЕ:\n"
212
  return content
213
 
214
+
215
def format_single_row(row, idx):
    """Render one table row as a numbered, ' | '-joined text line.

    Args:
        row: Either a dict of column -> value, or a list of cell values.
             Dict rows may carry internal bookkeeping keys (prefixed with
             '_', e.g. '_idx') added by the chunker, and list rows may
             arrive wrapped as {'data': [...], '_idx': n}; both forms are
             handled here so bookkeeping never leaks into the output.
        idx: 1-based row number used as the line prefix.

    Returns:
        A single line ending in a newline, or '' if the row has no usable
        cells after filtering empty / placeholder values.
    """
    def _keep(v):
        # Drop empty cells and common placeholder values ('nan', 'none', '').
        return v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']

    # Unwrap list rows the chunker stored as {'data': [...], '_idx': n}.
    if isinstance(row, dict) and isinstance(row.get('data'), list) and set(row) <= {'data', '_idx'}:
        row = row['data']

    if isinstance(row, dict):
        # Skip internal '_'-prefixed keys (e.g. '_idx') — they are chunker
        # bookkeeping, not table data.
        parts = [f"{k}: {v}" for k, v in row.items()
                 if not str(k).startswith('_') and _keep(v)]
        if parts:
            return f"{idx}. {' | '.join(parts)}\n"
    elif isinstance(row, list):
        parts = [str(v) for v in row if _keep(v)]
        if parts:
            return f"{idx}. {' | '.join(parts)}\n"
    return ""
227
+
228
+
229
def format_table_rows(rows):
    """Concatenate formatted text lines for a sequence of table rows.

    Dict rows may carry an '_idx' key (the row's original 1-based position
    in the full table) set by the chunker; when it is absent — or when the
    row is a plain list, which has no .get() — fall back to the row's
    position within this sequence.

    Args:
        rows: Iterable of dict or list rows (see format_single_row).

    Returns:
        The concatenation of all non-empty formatted row lines.
    """
    content = ""
    for pos, row in enumerate(rows, 1):
        # BUG FIX: the previous code called row.get('_idx', 0) unconditionally,
        # which raised AttributeError for list rows and numbered raw dict rows
        # (no '_idx') as 0; use the positional index as the fallback instead.
        idx = row.get('_idx', pos) if isinstance(row, dict) else pos
        content += format_single_row(row, idx)
    return content
236
+
237
+
238
def format_table_footer(table_identifier, doc_id):
    """Build the closing delimiter line appended after a table's content."""
    separator = '=' * 70
    return f"\n{separator}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
241
+
242
+
243
+ # Update load_table_documents to use new function
244
def load_table_documents(repo_id, hf_token, table_dir):
    """Download table JSON files from a HF dataset repo and chunk each sheet.

    Every .json file under *table_dir* is fetched, parsed, and its sheets
    are split with content-size-based chunking (chunk_table_by_content).
    Files that fail to download or parse are logged and skipped.

    Returns:
        List of Document chunks accumulated across all table files.
    """
    log_message("Loading tables...")

    repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    table_paths = [p for p in repo_files
                   if p.startswith(table_dir) and p.endswith('.json')]

    all_chunks = []
    for path in table_paths:
        try:
            local_copy = hf_hub_download(
                repo_id=repo_id,
                filename=path,
                repo_type="dataset",
                token=hf_token
            )

            with open(local_copy, 'r', encoding='utf-8') as fh:
                payload = json.load(fh)

            # File-level id is the fallback when a sheet carries none.
            fallback_id = payload.get('document_id', payload.get('document', 'unknown'))

            for sheet in payload.get('sheets', []):
                sheet_id = sheet.get('document_id', sheet.get('document', fallback_id))
                # Content-size chunking (replaces the older row-count scheme).
                all_chunks.extend(chunk_table_by_content(sheet, sheet_id, max_chars=2500))

        except Exception as e:
            log_message(f"Error loading {path}: {e}")

    log_message(f"✓ Loaded {len(all_chunks)} table chunks")
    return all_chunks
278
+
279
+
280
  def load_json_documents(repo_id, hf_token, json_dir):
281
  import zipfile
282
  import tempfile
 
485
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
486
 
487
  # CRITICAL: Pass document_id to chunk function
488
+ chunks = chunk_table_by_content(sheet, sheet_doc_id)
489
  all_chunks.extend(chunks)
490
 
491
  except Exception as e:
utils.py CHANGED
@@ -62,20 +62,33 @@ def answer_question(question, query_engine, reranker):
62
  source_label += f" {title}"
63
  else:
64
  source_label = f"[{doc_id}]"
65
- context_parts.append(f"{source_label}\n{n.text[:500]}") # Limit context per chunk
66
 
67
  context = "\n\n" + ("="*50 + "\n\n").join(context_parts)
68
 
69
- # Use only CUSTOM_PROMPT from config
70
  from config import CUSTOM_PROMPT
71
  prompt = CUSTOM_PROMPT.format(context_str=context, query_str=question)
72
- log_message(f"\nPROMPT:\n{prompt[:300]}...\n") # Log first 1000 chars of prompt
73
- response = query_engine.query(prompt)
 
 
 
74
 
75
  sources = format_sources(reranked)
76
- for i in reranked:
77
- log_message(f"---\n{i.text[:500]}\n...")
78
- return response.response, sources
 
 
 
 
 
 
 
 
 
 
79
 
80
  except Exception as e:
81
  log_message(f"Error: {e}")
 
62
  source_label += f" {title}"
63
  else:
64
  source_label = f"[{doc_id}]"
65
+ context_parts.append(f"{source_label}\n{n.text}") # Use FULL text, not [:500]
66
 
67
  context = "\n\n" + ("="*50 + "\n\n").join(context_parts)
68
 
69
+ # Use CUSTOM_PROMPT from config
70
  from config import CUSTOM_PROMPT
71
  prompt = CUSTOM_PROMPT.format(context_str=context, query_str=question)
72
+ log_message(f"\nPROMPT LENGTH: {len(prompt)} chars\n")
73
+
74
+ # CRITICAL FIX: Call LLM directly instead of query_engine.query()
75
+ from llama_index.core import Settings
76
+ response = Settings.llm.complete(prompt)
77
 
78
  sources = format_sources(reranked)
79
+
80
+ # Log retrieved chunks
81
+ log_message(f"\n{'='*70}")
82
+ log_message("RETRIEVED CHUNKS:")
83
+ for i, node in enumerate(reranked, 1):
84
+ log_message(f"\n--- Chunk {i} ---")
85
+ log_message(f"Document: {node.metadata.get('document_id', 'unknown')}")
86
+ log_message(f"Type: {node.metadata.get('type', 'unknown')}")
87
+ if node.metadata.get('type') == 'table':
88
+ log_message(f"Table: {node.metadata.get('table_identifier', 'unknown')}")
89
+ log_message(f"Text preview: {node.text[:500]}...")
90
+
91
+ return response.text, sources
92
 
93
  except Exception as e:
94
  log_message(f"Error: {e}")