Spaces:
Sleeping
Sleeping
Commit
·
59c7b5b
1
Parent(s):
80b9f4e
removed table + image chunking display
Browse files
app.py
CHANGED
|
@@ -40,7 +40,6 @@ def create_chunks_display_html(chunk_info):
|
|
| 40 |
return html
|
| 41 |
|
| 42 |
def get_section_display(chunk):
|
| 43 |
-
"""Get section display for the 'Раздел' field - without 'пункт' prefix"""
|
| 44 |
section_path = chunk.get('section_path', '')
|
| 45 |
section_id = chunk.get('section_id', 'unknown')
|
| 46 |
doc_type = chunk.get('type', 'text')
|
|
@@ -57,7 +56,6 @@ def get_section_display(chunk):
|
|
| 57 |
image_num = f"№{image_num}"
|
| 58 |
return f"рисунок {image_num}"
|
| 59 |
|
| 60 |
-
# For text documents, return just the section_path or section_id without "пункт"
|
| 61 |
if section_path:
|
| 62 |
return section_path
|
| 63 |
elif section_id and section_id != 'unknown':
|
|
@@ -66,7 +64,6 @@ def get_section_display(chunk):
|
|
| 66 |
return section_id
|
| 67 |
|
| 68 |
def get_formatted_content(chunk):
|
| 69 |
-
"""Format the content with proper section context"""
|
| 70 |
document_id = chunk.get('document_id', 'unknown')
|
| 71 |
section_path = chunk.get('section_path', '')
|
| 72 |
section_id = chunk.get('section_id', 'unknown')
|
|
@@ -77,18 +74,6 @@ def get_formatted_content(chunk):
|
|
| 77 |
chunk_text = chunk.get('chunk_text', '')
|
| 78 |
doc_type = chunk.get('type', 'text')
|
| 79 |
|
| 80 |
-
if doc_type == 'table':
|
| 81 |
-
table_num = chunk.get('table_number', 'unknown')
|
| 82 |
-
if not str(table_num).startswith('№'):
|
| 83 |
-
table_num = f"№{table_num}"
|
| 84 |
-
return f"В таблице {table_num} документа {document_id}: {chunk_text}"
|
| 85 |
-
|
| 86 |
-
if doc_type == 'image':
|
| 87 |
-
image_num = chunk.get('image_number', 'unknown')
|
| 88 |
-
if not str(image_num).startswith('№'):
|
| 89 |
-
image_num = f"№{image_num}"
|
| 90 |
-
return f"В рисунке {image_num} документа {document_id}: {chunk_text}"
|
| 91 |
-
|
| 92 |
# For text documents
|
| 93 |
if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section:
|
| 94 |
current_section = section_path if section_path else section_id
|
|
@@ -96,14 +81,10 @@ def get_formatted_content(chunk):
|
|
| 96 |
return f"В разделе {parent_info} в документе {document_id}, пункт {current_section}: {chunk_text}"
|
| 97 |
else:
|
| 98 |
current_section = section_path if section_path else section_id
|
| 99 |
-
|
| 100 |
-
# Clean chunk_text to avoid duplication
|
| 101 |
clean_text = chunk_text
|
| 102 |
if section_text and chunk_text.startswith(section_text):
|
| 103 |
-
# If chunk_text starts with full section_text, use section_text as title
|
| 104 |
section_title = section_text
|
| 105 |
elif chunk_text.startswith(f"{current_section} "):
|
| 106 |
-
# If chunk_text starts with section number, extract the title part
|
| 107 |
clean_text = chunk_text[len(f"{current_section} "):].strip()
|
| 108 |
section_title = section_text if section_text else f"{current_section} {clean_text.split('.')[0] if '.' in clean_text else clean_text[:50]}"
|
| 109 |
else:
|
|
|
|
| 40 |
return html
|
| 41 |
|
| 42 |
def get_section_display(chunk):
|
|
|
|
| 43 |
section_path = chunk.get('section_path', '')
|
| 44 |
section_id = chunk.get('section_id', 'unknown')
|
| 45 |
doc_type = chunk.get('type', 'text')
|
|
|
|
| 56 |
image_num = f"№{image_num}"
|
| 57 |
return f"рисунок {image_num}"
|
| 58 |
|
|
|
|
| 59 |
if section_path:
|
| 60 |
return section_path
|
| 61 |
elif section_id and section_id != 'unknown':
|
|
|
|
| 64 |
return section_id
|
| 65 |
|
| 66 |
def get_formatted_content(chunk):
|
|
|
|
| 67 |
document_id = chunk.get('document_id', 'unknown')
|
| 68 |
section_path = chunk.get('section_path', '')
|
| 69 |
section_id = chunk.get('section_id', 'unknown')
|
|
|
|
| 74 |
chunk_text = chunk.get('chunk_text', '')
|
| 75 |
doc_type = chunk.get('type', 'text')
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
# For text documents
|
| 78 |
if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section:
|
| 79 |
current_section = section_path if section_path else section_id
|
|
|
|
| 81 |
return f"В разделе {parent_info} в документе {document_id}, пункт {current_section}: {chunk_text}"
|
| 82 |
else:
|
| 83 |
current_section = section_path if section_path else section_id
|
|
|
|
|
|
|
| 84 |
clean_text = chunk_text
|
| 85 |
if section_text and chunk_text.startswith(section_text):
|
|
|
|
| 86 |
section_title = section_text
|
| 87 |
elif chunk_text.startswith(f"{current_section} "):
|
|
|
|
| 88 |
clean_text = chunk_text[len(f"{current_section} "):].strip()
|
| 89 |
section_title = section_text if section_text else f"{current_section} {clean_text.split('.')[0] if '.' in clean_text else clean_text[:50]}"
|
| 90 |
else:
|