MrSimple07 committed on
Commit
e393fbc
·
1 Parent(s): 19e03d0

New chunk display + improved context passed to the LLM

Browse files
__pycache__/config.cpython-311.pyc ADDED
Binary file (66.4 kB). View file
 
__pycache__/index_retriever.cpython-311.pyc ADDED
Binary file (4.25 kB). View file
 
__pycache__/my_logging.cpython-311.pyc ADDED
Binary file (811 Bytes). View file
 
app.py CHANGED
@@ -20,10 +20,14 @@ def create_chunks_display_html(chunk_info):
20
 
21
  for i, chunk in enumerate(chunk_info):
22
  bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
 
 
 
 
23
  html += f"""
24
  <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
25
  <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
26
- <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{chunk.get('section_id', 'unknown')}</span><br>
27
  <strong style='color: black;'>Содержание:</strong><br>
28
  <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
29
  {chunk['chunk_text']}
@@ -34,6 +38,45 @@ def create_chunks_display_html(chunk_info):
34
  html += "</div>"
35
  return html
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
38
  json_files_dir=None, table_data_dir=None, image_data_dir=None,
39
  use_json_instead_csv=False):
 
20
 
21
  for i, chunk in enumerate(chunk_info):
22
  bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
23
+
24
+ # Get section display info similar to format_context_for_llm
25
+ section_display = format_section_for_display(chunk)
26
+
27
  html += f"""
28
  <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
29
  <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
30
+ <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
31
  <strong style='color: black;'>Содержание:</strong><br>
32
  <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
33
  {chunk['chunk_text']}
 
38
  html += "</div>"
39
  return html
40
 
41
def format_section_for_display(chunk):
    """Build the human-readable section label shown for one retrieved chunk.

    Args:
        chunk: Mapping describing a chunk. The keys read here are ``type``,
            ``table_number``, ``image_number``, ``section_path``,
            ``section_id``, ``section_text``, ``level``, ``parent_section``
            and ``parent_title``; every one of them is optional.

    Returns:
        A label string:

        * ``"таблица №N"`` for table chunks that carry a table number,
        * ``"рисунок №N"`` for image chunks that carry an image number,
        * ``"пункт X в разделе P (Title)"`` for nested sections whose parent
          section and parent title are both known,
        * ``"пункт X (Title)"`` / ``"пункт X"`` for other sections,
        * ``"unknown"`` when no usable section identifier is present.
    """
    doc_type = chunk.get('type', 'text')

    # Tables and images are identified by their number rather than by section.
    # 'table_row' is grouped with 'table' because generate_sources_html treats
    # both types as tables.
    numbered_kinds = (
        (('table', 'table_row'), 'table_number', 'таблица'),
        (('image',), 'image_number', 'рисунок'),
    )
    for kinds, number_key, noun in numbered_kinds:
        number = chunk.get(number_key)
        if doc_type in kinds and number:
            number = str(number)
            # Avoid a doubled "№№" when the stored number already has the sign.
            if not number.startswith('№'):
                number = f"№{number}"
            return f"{noun} {number}"

    # Prefer the full section path; fall back to the bare section id.
    section_path = chunk.get('section_path', '')
    section_id = chunk.get('section_id', 'unknown')
    if section_path:
        label = section_path
    elif section_id and section_id != 'unknown':
        label = section_id
    else:
        # Never hand None or '' to the HTML template.
        return section_id or 'unknown'

    parent_section = chunk.get('parent_section', '')
    parent_title = chunk.get('parent_title', '')
    nested_levels = ('subsection', 'sub_subsection', 'sub_sub_subsection')
    if chunk.get('level', '') in nested_levels and parent_section and parent_title:
        return f"пункт {label} в разделе {parent_section} ({parent_title})"

    section_text = chunk.get('section_text', '')
    if section_text:
        return f"пункт {label} ({section_text})"
    return f"пункт {label}"
78
+
79
+
80
  def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
81
  json_files_dir=None, table_data_dir=None, image_data_dir=None,
82
  use_json_instead_csv=False):
index_retriever.py CHANGED
@@ -22,7 +22,7 @@ def create_query_engine(vector_index):
22
  vector_retriever = VectorIndexRetriever(
23
  index=vector_index,
24
  similarity_top_k=30,
25
- similarity_cutoff=0.7
26
  )
27
 
28
  hybrid_retriever = QueryFusionRetriever(
 
22
  vector_retriever = VectorIndexRetriever(
23
  index=vector_index,
24
  similarity_top_k=30,
25
+ similarity_cutoff=0.8
26
  )
27
 
28
  hybrid_retriever = QueryFusionRetriever(
utils.py CHANGED
@@ -57,17 +57,27 @@ def format_context_for_llm(nodes):
57
  section_text = metadata.get('section_text', '')
58
  parent_section = metadata.get('parent_section', '')
59
  parent_title = metadata.get('parent_title', '')
 
60
 
61
- if metadata.get('level') in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
62
- section_info = f"пункт {section_path} ({section_text}) в разделе {parent_section} ({parent_title})"
 
63
  elif section_text:
 
64
  section_info = f"пункт {section_path} ({section_text})"
65
  else:
66
  section_info = f"пункт {section_path}"
67
  elif metadata.get('section_id'):
68
  section_id = metadata['section_id']
69
  section_text = metadata.get('section_text', '')
70
- if section_text:
 
 
 
 
 
 
 
71
  section_info = f"пункт {section_id} ({section_text})"
72
  else:
73
  section_info = f"пункт {section_id}"
@@ -249,10 +259,18 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
249
 
250
  chunk_info = []
251
  for node in reranked_nodes:
252
- section_id = node.metadata.get('section_id', node.metadata.get('section', 'unknown'))
253
  chunk_info.append({
254
- 'document_id': node.metadata.get('document_id', 'unknown'),
255
- 'section_id': section_id,
 
 
 
 
 
 
 
 
256
  'chunk_size': len(node.text),
257
  'chunk_text': node.text
258
  })
@@ -413,10 +431,6 @@ def generate_sources_html(nodes, chunks_df=None):
413
 
414
  if doc_type == 'text':
415
  html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
416
- # Show all sections for this document
417
- if source_info['sections']:
418
- sections_text = ", ".join(sorted(source_info['sections']))
419
- html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{sections_text}</p>"
420
 
421
  elif doc_type == 'table' or doc_type == 'table_row':
422
  table_num = metadata.get('table_number', 'unknown')
 
57
  section_text = metadata.get('section_text', '')
58
  parent_section = metadata.get('parent_section', '')
59
  parent_title = metadata.get('parent_title', '')
60
+ level = metadata.get('level', '')
61
 
62
+ if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
63
+ # For subsections, show: пункт X.X в разделе X (Title)
64
+ section_info = f"пункт {section_path} в разделе {parent_section} ({parent_title})"
65
  elif section_text:
66
+ # For main sections, show: пункт X (Title)
67
  section_info = f"пункт {section_path} ({section_text})"
68
  else:
69
  section_info = f"пункт {section_path}"
70
  elif metadata.get('section_id'):
71
  section_id = metadata['section_id']
72
  section_text = metadata.get('section_text', '')
73
+ level = metadata.get('level', '')
74
+ parent_section = metadata.get('parent_section', '')
75
+ parent_title = metadata.get('parent_title', '')
76
+
77
+ if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
78
+ # For subsections without section_path, show: пункт X.X в разделе X (Title)
79
+ section_info = f"пункт {section_id} в разделе {parent_section} ({parent_title})"
80
+ elif section_text:
81
  section_info = f"пункт {section_id} ({section_text})"
82
  else:
83
  section_info = f"пункт {section_id}"
 
259
 
260
  chunk_info = []
261
  for node in reranked_nodes:
262
+ metadata = node.metadata if hasattr(node, 'metadata') else {}
263
  chunk_info.append({
264
+ 'document_id': metadata.get('document_id', 'unknown'),
265
+ 'section_id': metadata.get('section_id', metadata.get('section', 'unknown')),
266
+ 'section_path': metadata.get('section_path', ''),
267
+ 'section_text': metadata.get('section_text', ''),
268
+ 'level': metadata.get('level', ''),
269
+ 'parent_section': metadata.get('parent_section', ''),
270
+ 'parent_title': metadata.get('parent_title', ''),
271
+ 'type': metadata.get('type', 'text'),
272
+ 'table_number': metadata.get('table_number', ''),
273
+ 'image_number': metadata.get('image_number', ''),
274
  'chunk_size': len(node.text),
275
  'chunk_text': node.text
276
  })
 
431
 
432
  if doc_type == 'text':
433
  html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
 
 
 
 
434
 
435
  elif doc_type == 'table' or doc_type == 'table_row':
436
  table_num = metadata.get('table_number', 'unknown')