MrSimple07 committed on
Commit
40c6310
·
1 Parent(s): f3333c8

fixed import error

Browse files
Files changed (1) hide show
  1. main_utils.py +112 -0
main_utils.py CHANGED
@@ -210,6 +210,118 @@ def enhance_query_with_keywords(query):
210
  return f"{query}"
211
 
212
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
214
  normalized_question = normalize_text(question)
215
  normalized_question_2, query_changes, change_list = normalize_steel_designations(question)
 
210
  return f"{query}"
211
 
212
 
213
+
214
def merge_table_chunks(chunk_info):
    """Collapse table fragments into one chunk per (document, table).

    Chunks whose ``type`` is ``'table'`` or ``'table_row'`` are grouped by
    their ``document_id`` and ``table_number``; the ``chunk_text`` of later
    fragments is appended to the first one, separated by a newline. The
    merged entry is a new dict, so the input chunks are never mutated.

    All other chunks are passed through unchanged, de-duplicated by
    ``(document_id, section_id, chunk_id)``; when duplicates occur the last
    one wins but keeps the position of the first.

    Args:
        chunk_info: iterable of chunk dicts.

    Returns:
        A list of chunk dicts in first-seen order.
    """
    merged = {}

    for chunk in chunk_info:
        doc_id = chunk.get('document_id', 'unknown')

        if chunk.get('type', 'text') in ('table', 'table_row'):
            table_num = chunk.get('table_number', '')
            # Tagged tuple keys instead of "a_b" string concatenation:
            # ("A_1", "2") and ("A", "1_2") must not collapse into one key,
            # and a table key must never equal a text-chunk key. Components
            # are stringified so 1 and "1" still address the same table.
            key = ('table', str(doc_id), str(table_num))
            # A missing/None text must not break the concatenation below.
            text = chunk.get('chunk_text') or ''

            entry = merged.get(key)
            if entry is None:
                merged[key] = {
                    'document_id': doc_id,
                    'type': 'table',
                    'table_number': table_num,
                    'section_id': chunk.get('section_id', 'unknown'),
                    'chunk_text': text,
                }
            else:
                entry['chunk_text'] += '\n' + text
        else:
            key = (
                'chunk',
                str(doc_id),
                str(chunk.get('section_id', '')),
                str(chunk.get('chunk_id', 0)),
            )
            merged[key] = chunk

    return list(merged.values())
240
+
241
def create_chunks_display_html(chunk_info):
    """Render retrieved chunks as a scrollable HTML list.

    Table fragments are first merged via ``merge_table_chunks``; each
    resulting chunk becomes one card showing its document id, a section
    label (``get_section_display``) and its formatted text
    (``get_formatted_content``). Cards alternate between two background
    colours.

    Args:
        chunk_info: list of chunk dicts; may be empty or ``None``.

    Returns:
        An HTML string. When ``chunk_info`` is empty a single placeholder
        ``<div>`` is returned instead of the list.
    """
    # Imported locally so this block does not depend on the module's import
    # header, and under a name that cannot clash with a local called `html`.
    from html import escape

    if not chunk_info:
        return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"

    merged_chunks = merge_table_chunks(chunk_info)

    parts = [
        "<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>",
        f"<h4 style='color: black;'>Найдено релевантных чанков: {len(merged_chunks)}</h4>",
    ]

    for i, chunk in enumerate(merged_chunks):
        bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
        # Values come from document text; escape them so characters such as
        # '<' or '&' are shown literally instead of being parsed as markup.
        # `.get` mirrors the sibling helpers: a chunk without a document id
        # is displayed as 'unknown' rather than raising KeyError.
        document_id = escape(str(chunk.get('document_id', 'unknown')))
        section_display = escape(str(get_section_display(chunk)))
        formatted_content = escape(str(get_formatted_content(chunk)))

        parts.append(f"""
        <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
            <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{document_id}</span><br>
            <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
            <strong style='color: black;'>Содержание:</strong><br>
            <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
                {formatted_content}
            </div>
        </div>
        """)

    parts.append("</div>")
    return "".join(parts)
268
+
269
def get_section_display(chunk):
    """Return the short label shown in the "section" line for a chunk.

    Resolution order:
      1. a ``'table'`` chunk with a truthy ``table_number`` -> ``"таблица №N"``;
      2. an ``'image'`` chunk with a truthy ``image_number`` -> ``"рисунок №N"``;
      3. a truthy ``section_path``;
      4. otherwise ``section_id`` (defaults to ``'unknown'``).

    The ``№`` sign is added only when the number does not already start
    with it.
    """
    kind = chunk.get('type', 'text')

    # (chunk type, key holding its number, label used in the output)
    numbered_kinds = (
        ('table', 'table_number', 'таблица'),
        ('image', 'image_number', 'рисунок'),
    )
    for numbered_kind, number_key, label in numbered_kinds:
        if kind != numbered_kind:
            continue
        number = chunk.get(number_key)
        if number:
            shown = str(number)
            if not shown.startswith('№'):
                shown = f"№{shown}"
            return f"{label} {shown}"

    # A non-empty path wins; otherwise fall back to the id as-is.
    return chunk.get('section_path', '') or chunk.get('section_id', 'unknown')
292
+
293
def get_formatted_content(chunk):
    """Build the one-line description of a chunk shown in the chunk list.

    Two shapes are produced:

    * Nested sections (``level`` is one of ``subsection``,
      ``sub_subsection``, ``sub_sub_subsection`` and ``parent_section`` is
      set): the parent section (with its title in parentheses when known)
      is named first, then the current section and the raw chunk text.
    * Everything else: the current section is named, followed by a title
      and the text. The title is ``section_text`` when available;
      otherwise, if the text starts with ``"<section> "``, that prefix is
      stripped and the title is the section plus the first sentence (or the
      first 50 characters when there is no period).

    The "current section" is ``section_path`` when non-empty, else
    ``section_id``.
    """
    doc = chunk.get('document_id', 'unknown')
    path = chunk.get('section_path', '')
    sec_id = chunk.get('section_id', 'unknown')
    heading = chunk.get('section_text', '')
    parent = chunk.get('parent_section', '')
    parent_heading = chunk.get('parent_title', '')
    depth = chunk.get('level', '')
    body = chunk.get('chunk_text', '')

    here = path or sec_id

    nested_levels = ('subsection', 'sub_subsection', 'sub_sub_subsection')
    if depth in nested_levels and parent:
        parent_label = parent
        if parent_heading:
            parent_label = f"{parent} ({parent_heading})"
        return f"В разделе {parent_label} в документе {doc}, пункт {here}: {body}"

    prefix = f"{here} "
    if heading and body.startswith(heading):
        # Text already opens with the section heading: keep it untouched.
        title = heading
    elif body.startswith(prefix):
        # Drop the leading section number so it is not printed twice.
        body = body[len(prefix):].strip()
        if heading:
            title = heading
        elif '.' in body:
            title = f"{here} {body.split('.')[0]}"
        else:
            title = f"{here} {body[:50]}"
    else:
        title = heading or here

    return f"В разделе {here} в документе {doc}, пункт {title}: {body}"
321
+
322
+
323
+
324
+
325
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
326
  normalized_question = normalize_text(question)
327
  normalized_question_2, query_changes, change_list = normalize_steel_designations(question)