MrSimple07 committed on
Commit
2395c5e
·
1 Parent(s): e393fbc

new chunking showing code

Browse files
Files changed (2) hide show
  1. app.py +41 -20
  2. utils.py +2 -2
app.py CHANGED
@@ -21,8 +21,9 @@ def create_chunks_display_html(chunk_info):
21
  for i, chunk in enumerate(chunk_info):
22
  bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
23
 
24
- # Get section display info similar to format_context_for_llm
25
- section_display = format_section_for_display(chunk)
 
26
 
27
  html += f"""
28
  <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
@@ -30,7 +31,7 @@ def create_chunks_display_html(chunk_info):
30
  <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
31
  <strong style='color: black;'>Содержание:</strong><br>
32
  <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
33
- {chunk['chunk_text']}
34
  </div>
35
  </div>
36
  """
@@ -38,13 +39,10 @@ def create_chunks_display_html(chunk_info):
38
  html += "</div>"
39
  return html
40
 
41
- def format_section_for_display(chunk):
 
42
  section_path = chunk.get('section_path', '')
43
  section_id = chunk.get('section_id', 'unknown')
44
- section_text = chunk.get('section_text', '')
45
- level = chunk.get('level', '')
46
- parent_section = chunk.get('parent_section', '')
47
- parent_title = chunk.get('parent_title', '')
48
  doc_type = chunk.get('type', 'text')
49
 
50
  if doc_type == 'table' and chunk.get('table_number'):
@@ -59,23 +57,46 @@ def format_section_for_display(chunk):
59
  image_num = f"№{image_num}"
60
  return f"рисунок {image_num}"
61
 
 
62
  if section_path:
63
- if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
64
- return f"пункт {section_path} в разделе {parent_section} ({parent_title})"
65
- elif section_text:
66
- return f"пункт {section_path} ({section_text})"
67
- else:
68
- return f"пункт {section_path}"
69
  elif section_id and section_id != 'unknown':
70
- if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
71
- return f"пункт {section_id} в разделе {parent_section} ({parent_title})"
72
- elif section_text:
73
- return f"пункт {section_id} ({section_text})"
74
- else:
75
- return f"пункт {section_id}"
76
 
77
  return section_id
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
  def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
81
  json_files_dir=None, table_data_dir=None, image_data_dir=None,
 
21
  for i, chunk in enumerate(chunk_info):
22
  bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
23
 
24
+ # Get section display info
25
+ section_display = get_section_display(chunk)
26
+ formatted_content = get_formatted_content(chunk)
27
 
28
  html += f"""
29
  <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
 
31
  <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
32
  <strong style='color: black;'>Содержание:</strong><br>
33
  <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
34
+ {formatted_content}
35
  </div>
36
  </div>
37
  """
 
39
  html += "</div>"
40
  return html
41
 
42
+ def get_section_display(chunk):
43
+ """Get section display for the 'Раздел' field - without 'пункт' prefix"""
44
  section_path = chunk.get('section_path', '')
45
  section_id = chunk.get('section_id', 'unknown')
 
 
 
 
46
  doc_type = chunk.get('type', 'text')
47
 
48
  if doc_type == 'table' and chunk.get('table_number'):
 
57
  image_num = f"№{image_num}"
58
  return f"рисунок {image_num}"
59
 
60
+ # For text documents, return just the section_path or section_id without "пункт"
61
  if section_path:
62
+ return section_path
 
 
 
 
 
63
  elif section_id and section_id != 'unknown':
64
+ return section_id
 
 
 
 
 
65
 
66
  return section_id
67
 
68
def _with_number_sign(value):
    """Return *value* as a string carrying a leading '№'.

    A missing number (``None`` or blank) is rendered as ``№unknown`` so the
    caption never shows ``№None`` or a bare ``№``.
    """
    if value is None or str(value).strip() == '':
        value = 'unknown'
    label = str(value)
    # Source data sometimes already includes the sign; do not double it.
    return label if label.startswith('№') else f"№{label}"


def get_formatted_content(chunk):
    """Build the chunk text prefixed with a description of where it comes from.

    Args:
        chunk: Mapping describing one chunk. Keys read here: ``document_id``,
            ``type``, ``chunk_text``, ``table_number``, ``image_number``,
            ``section_path``, ``section_id``, ``parent_section`` and ``level``.
            All of them are optional.

    Returns:
        A string of the form:
        * table:      ``В таблице №N документа DOC: <text>``
        * image:      ``В рисунке №N документа DOC: <text>``
        * subsection: ``В разделе PARENT в документе DOC, пункт SEC <text>``
        * otherwise:  ``В разделе SEC в документе DOC пункт SEC <text>``

    NOTE: the returned text is not HTML-escaped; a caller that embeds it in
    markup is responsible for escaping it.
    """
    document_id = chunk.get('document_id', 'unknown')
    doc_type = chunk.get('type', 'text')
    chunk_text = chunk.get('chunk_text', '')
    if chunk_text is None:
        # Avoid rendering the literal word "None" as content.
        chunk_text = ''

    if doc_type == 'table':
        table_num = _with_number_sign(chunk.get('table_number'))
        return f"В таблице {table_num} документа {document_id}: {chunk_text}"

    if doc_type == 'image':
        image_num = _with_number_sign(chunk.get('image_number'))
        return f"В рисунке {image_num} документа {document_id}: {chunk_text}"

    # Text chunk: prefer the full section path, fall back to the section id.
    current_section = chunk.get('section_path', '') or chunk.get('section_id', 'unknown')
    parent_section = chunk.get('parent_section', '')
    is_nested = chunk.get('level', '') in (
        'subsection', 'sub_subsection', 'sub_sub_subsection',
    )

    if is_nested and parent_section:
        # Nested item: name the parent section first, then the item itself.
        return f"В разделе {parent_section} в документе {document_id}, пункт {current_section} {chunk_text}"

    # Top-level section (or parent unknown): the section doubles as the item.
    return f"В разделе {current_section} в документе {document_id} пункт {current_section} {chunk_text}"
99
+
100
 
101
  def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
102
  json_files_dir=None, table_data_dir=None, image_data_dir=None,
utils.py CHANGED
@@ -142,9 +142,9 @@ def generate_sources_html(nodes, chunks_df=None):
142
 
143
  # Add section information
144
  if section_path:
145
- sources_by_doc[key]['sections'].add(f"пункт {section_path}")
146
  elif section_id and section_id != 'unknown':
147
- sources_by_doc[key]['sections'].add(f"пункт {section_id}")
148
 
149
  # Generate HTML for each unique source
150
  for source_info in sources_by_doc.values():
 
142
 
143
  # Add section information
144
  if section_path:
145
+ sources_by_doc[key]['sections'].add(f"{section_path}")
146
  elif section_id and section_id != 'unknown':
147
+ sources_by_doc[key]['sections'].add(f"{section_id}")
148
 
149
  # Generate HTML for each unique source
150
  for source_info in sources_by_doc.values():