Spaces:
Sleeping
Sleeping
Commit
·
2395c5e
1
Parent(s):
e393fbc
new chunking showing code
Browse files
app.py
CHANGED
|
@@ -21,8 +21,9 @@ def create_chunks_display_html(chunk_info):
|
|
| 21 |
for i, chunk in enumerate(chunk_info):
|
| 22 |
bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
|
| 23 |
|
| 24 |
-
# Get section display info
|
| 25 |
-
section_display = format_section_for_display(chunk)
|
|
|
|
| 26 |
|
| 27 |
html += f"""
|
| 28 |
<div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
|
|
@@ -30,7 +31,7 @@ def create_chunks_display_html(chunk_info):
|
|
| 30 |
<strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
|
| 31 |
<strong style='color: black;'>Содержание:</strong><br>
|
| 32 |
<div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
|
| 33 |
-
{
|
| 34 |
</div>
|
| 35 |
</div>
|
| 36 |
"""
|
|
@@ -38,13 +39,10 @@ def create_chunks_display_html(chunk_info):
|
|
| 38 |
html += "</div>"
|
| 39 |
return html
|
| 40 |
|
| 41 |
-
def format_section_for_display(chunk):
|
|
|
|
| 42 |
section_path = chunk.get('section_path', '')
|
| 43 |
section_id = chunk.get('section_id', 'unknown')
|
| 44 |
-
section_text = chunk.get('section_text', '')
|
| 45 |
-
level = chunk.get('level', '')
|
| 46 |
-
parent_section = chunk.get('parent_section', '')
|
| 47 |
-
parent_title = chunk.get('parent_title', '')
|
| 48 |
doc_type = chunk.get('type', 'text')
|
| 49 |
|
| 50 |
if doc_type == 'table' and chunk.get('table_number'):
|
|
@@ -59,23 +57,46 @@ def format_section_for_display(chunk):
|
|
| 59 |
image_num = f"№{image_num}"
|
| 60 |
return f"рисунок {image_num}"
|
| 61 |
|
|
|
|
| 62 |
if section_path:
|
| 63 |
-
|
| 64 |
-
return f"пункт {section_path} в разделе {parent_section} ({parent_title})"
|
| 65 |
-
elif section_text:
|
| 66 |
-
return f"пункт {section_path} ({section_text})"
|
| 67 |
-
else:
|
| 68 |
-
return f"пункт {section_path}"
|
| 69 |
elif section_id and section_id != 'unknown':
|
| 70 |
-
|
| 71 |
-
return f"пункт {section_id} в разделе {parent_section} ({parent_title})"
|
| 72 |
-
elif section_text:
|
| 73 |
-
return f"пункт {section_id} ({section_text})"
|
| 74 |
-
else:
|
| 75 |
-
return f"пункт {section_id}"
|
| 76 |
|
| 77 |
return section_id
|
| 78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
|
| 81 |
json_files_dir=None, table_data_dir=None, image_data_dir=None,
|
|
|
|
| 21 |
for i, chunk in enumerate(chunk_info):
|
| 22 |
bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
|
| 23 |
|
| 24 |
+
# Get section display info
|
| 25 |
+
section_display = get_section_display(chunk)
|
| 26 |
+
formatted_content = get_formatted_content(chunk)
|
| 27 |
|
| 28 |
html += f"""
|
| 29 |
<div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
|
|
|
|
| 31 |
<strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
|
| 32 |
<strong style='color: black;'>Содержание:</strong><br>
|
| 33 |
<div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
|
| 34 |
+
{formatted_content}
|
| 35 |
</div>
|
| 36 |
</div>
|
| 37 |
"""
|
|
|
|
| 39 |
html += "</div>"
|
| 40 |
return html
|
| 41 |
|
| 42 |
+
def get_section_display(chunk):
|
| 43 |
+
"""Get section display for the 'Раздел' field - without 'пункт' prefix"""
|
| 44 |
section_path = chunk.get('section_path', '')
|
| 45 |
section_id = chunk.get('section_id', 'unknown')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
doc_type = chunk.get('type', 'text')
|
| 47 |
|
| 48 |
if doc_type == 'table' and chunk.get('table_number'):
|
|
|
|
| 57 |
image_num = f"№{image_num}"
|
| 58 |
return f"рисунок {image_num}"
|
| 59 |
|
| 60 |
+
# For text documents, return just the section_path or section_id without "пункт"
|
| 61 |
if section_path:
|
| 62 |
+
return section_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
elif section_id and section_id != 'unknown':
|
| 64 |
+
return section_id
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
return section_id
|
| 67 |
|
| 68 |
+
def get_formatted_content(chunk):
|
| 69 |
+
"""Format the content with proper section context"""
|
| 70 |
+
document_id = chunk.get('document_id', 'unknown')
|
| 71 |
+
section_path = chunk.get('section_path', '')
|
| 72 |
+
section_id = chunk.get('section_id', 'unknown')
|
| 73 |
+
parent_section = chunk.get('parent_section', '')
|
| 74 |
+
level = chunk.get('level', '')
|
| 75 |
+
chunk_text = chunk.get('chunk_text', '')
|
| 76 |
+
doc_type = chunk.get('type', 'text')
|
| 77 |
+
|
| 78 |
+
if doc_type == 'table':
|
| 79 |
+
table_num = chunk.get('table_number', 'unknown')
|
| 80 |
+
if not str(table_num).startswith('№'):
|
| 81 |
+
table_num = f"№{table_num}"
|
| 82 |
+
return f"В таблице {table_num} документа {document_id}: {chunk_text}"
|
| 83 |
+
|
| 84 |
+
if doc_type == 'image':
|
| 85 |
+
image_num = chunk.get('image_number', 'unknown')
|
| 86 |
+
if not str(image_num).startswith('№'):
|
| 87 |
+
image_num = f"№{image_num}"
|
| 88 |
+
return f"В рисунке {image_num} документа {document_id}: {chunk_text}"
|
| 89 |
+
|
| 90 |
+
# For text documents
|
| 91 |
+
if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section:
|
| 92 |
+
# For subsections: В разделе X в документе Y, пункт X.X content
|
| 93 |
+
current_section = section_path if section_path else section_id
|
| 94 |
+
return f"В разделе {parent_section} в документе {document_id}, пункт {current_section} {chunk_text}"
|
| 95 |
+
else:
|
| 96 |
+
# For main sections: В разделе X в документе Y пункт X content
|
| 97 |
+
current_section = section_path if section_path else section_id
|
| 98 |
+
return f"В разделе {current_section} в документе {document_id} пункт {current_section} {chunk_text}"
|
| 99 |
+
|
| 100 |
|
| 101 |
def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
|
| 102 |
json_files_dir=None, table_data_dir=None, image_data_dir=None,
|
utils.py
CHANGED
|
@@ -142,9 +142,9 @@ def generate_sources_html(nodes, chunks_df=None):
|
|
| 142 |
|
| 143 |
# Add section information
|
| 144 |
if section_path:
|
| 145 |
-
sources_by_doc[key]['sections'].add(f"
|
| 146 |
elif section_id and section_id != 'unknown':
|
| 147 |
-
sources_by_doc[key]['sections'].add(f"
|
| 148 |
|
| 149 |
# Generate HTML for each unique source
|
| 150 |
for source_info in sources_by_doc.values():
|
|
|
|
| 142 |
|
| 143 |
# Add section information
|
| 144 |
if section_path:
|
| 145 |
+
sources_by_doc[key]['sections'].add(f"{section_path}")
|
| 146 |
elif section_id and section_id != 'unknown':
|
| 147 |
+
sources_by_doc[key]['sections'].add(f"{section_id}")
|
| 148 |
|
| 149 |
# Generate HTML for each unique source
|
| 150 |
for source_info in sources_by_doc.values():
|