Spaces:
Sleeping
Sleeping
Commit ·
63ebb90
1
Parent(s): 38ed4e9
new documents prep
Browse files- app.py +33 -4
- documents_prep.py +2 -20
- utils.py +10 -20
app.py
CHANGED
|
@@ -11,17 +11,46 @@ from config import (
|
|
| 11 |
JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
|
| 12 |
)
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
def create_chunks_display_html(chunk_info):
|
| 15 |
if not chunk_info:
|
| 16 |
return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"
|
| 17 |
|
|
|
|
|
|
|
| 18 |
html = "<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>"
|
| 19 |
-
html += f"<h4 style='color: black;'>Найдено релевантных чанков: {len(
|
| 20 |
|
| 21 |
-
for i, chunk in enumerate(
|
| 22 |
bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
|
| 23 |
-
|
| 24 |
-
# Get section display info
|
| 25 |
section_display = get_section_display(chunk)
|
| 26 |
formatted_content = get_formatted_content(chunk)
|
| 27 |
|
|
|
|
| 11 |
JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
|
| 12 |
)
|
| 13 |
|
| 14 |
+
|
| 15 |
+
def merge_table_chunks(chunk_info):
    """Collapse retrieved chunks so each table appears only once.

    Chunks whose ``type`` is ``'table'`` or ``'table_row'`` are grouped by
    ``(document_id, table_number)``; their ``chunk_text`` values are joined
    with newlines into a single new ``'table'`` entry. Every other chunk is
    kept as-is, keyed by ``(document_id, section_id, chunk_id)``; if several
    non-table chunks share that key, the last one wins but keeps the
    position of the first.

    Args:
        chunk_info: iterable of chunk dicts. Only the keys ``type``,
            ``document_id``, ``table_number``, ``section_id``, ``chunk_id``
            and ``chunk_text`` are read; all are optional.

    Returns:
        A list of chunk dicts in first-seen order. Merged table entries are
        new dicts; non-table entries are the original dict objects. Input
        dicts are never modified.
    """
    if not chunk_info:
        return []

    merged = {}

    for chunk in chunk_info:
        doc_type = chunk.get('type', 'text')
        doc_id = chunk.get('document_id', 'unknown')

        if doc_type in ('table', 'table_row'):
            table_num = chunk.get('table_number', '')
            # A missing or None text must not break the concatenation below.
            text = chunk.get('chunk_text') or ''
            # Tuple keys with a kind tag: underscore-joined strings are
            # ambiguous ("A_1"+"2" vs "A"+"1_2") and could also coincide
            # with a text-chunk key, which made the merge append table text
            # to the caller's original text-chunk dict.
            key = ('table', doc_id, table_num)

            entry = merged.get(key)
            if entry is None:
                merged[key] = {
                    'document_id': doc_id,
                    'type': 'table',
                    'table_number': table_num,
                    'section_id': chunk.get('section_id', 'unknown'),
                    'chunk_text': text,
                }
            else:
                entry['chunk_text'] += '\n' + text
        else:
            key = (
                'chunk',
                doc_id,
                chunk.get('section_id', ''),
                chunk.get('chunk_id', 0),
            )
            merged[key] = chunk

    return list(merged.values())
|
| 41 |
+
|
| 42 |
+
|
| 43 |
def create_chunks_display_html(chunk_info):
|
| 44 |
if not chunk_info:
|
| 45 |
return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"
|
| 46 |
|
| 47 |
+
merged_chunks = merge_table_chunks(chunk_info)
|
| 48 |
+
|
| 49 |
html = "<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>"
|
| 50 |
+
html += f"<h4 style='color: black;'>Найдено релевантных чанков: {len(merged_chunks)}</h4>"
|
| 51 |
|
| 52 |
+
for i, chunk in enumerate(merged_chunks):
|
| 53 |
bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
|
|
|
|
|
|
|
| 54 |
section_display = get_section_display(chunk)
|
| 55 |
formatted_content = get_formatted_content(chunk)
|
| 56 |
|
documents_prep.py
CHANGED
|
@@ -162,30 +162,12 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
|
|
| 162 |
|
| 163 |
|
| 164 |
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
|
| 165 |
-
"
|
| 166 |
-
content = f"ДОКУМЕНТ: {doc_id}\n"
|
| 167 |
-
content += f"ТАБЛИЦА: {table_identifier}\n"
|
| 168 |
-
content += f"ПОЛНОЕ НАЗВАНИЕ: {table_identifier}\n"
|
| 169 |
-
content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
|
| 170 |
if table_title:
|
| 171 |
content += f"НАЗВАНИЕ: {table_title}\n"
|
| 172 |
if section:
|
| 173 |
content += f"РАЗДЕЛ: {section}\n"
|
| 174 |
-
content += f"{'='*70}\n
|
| 175 |
-
|
| 176 |
-
# Enhanced search keywords
|
| 177 |
-
content += f"Это таблица {table_identifier} из документа {doc_id}. "
|
| 178 |
-
content += f"Идентификатор: {table_identifier}. Номер: {table_num}. Документ: {doc_id}. "
|
| 179 |
-
|
| 180 |
-
if section:
|
| 181 |
-
content += f"Раздел: {section}. "
|
| 182 |
-
if 'приложени' in section.lower():
|
| 183 |
-
content += f"Таблица из приложения. "
|
| 184 |
-
|
| 185 |
-
if table_title:
|
| 186 |
-
content += f"Название: {table_title}. "
|
| 187 |
-
|
| 188 |
-
content += f"\n\nСОДЕРЖИМОЕ ТАБЛИЦЫ {table_identifier}:\n{'='*70}\n\n"
|
| 189 |
|
| 190 |
if headers:
|
| 191 |
header_str = ' | '.join(str(h) for h in headers)
|
|
|
|
| 162 |
|
| 163 |
|
| 164 |
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
|
| 165 |
+
content = f"ТАБЛИЦА {table_identifier} из {doc_id}\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
if table_title:
|
| 167 |
content += f"НАЗВАНИЕ: {table_title}\n"
|
| 168 |
if section:
|
| 169 |
content += f"РАЗДЕЛ: {section}\n"
|
| 170 |
+
content += f"{'='*70}\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
if headers:
|
| 173 |
header_str = ' | '.join(str(h) for h in headers)
|
utils.py
CHANGED
|
@@ -53,19 +53,16 @@ def generate_sources_html(nodes, chunks_df=None):
|
|
| 53 |
metadata = node.metadata if hasattr(node, 'metadata') else {}
|
| 54 |
doc_type = metadata.get('type', 'text')
|
| 55 |
doc_id = metadata.get('document_id', 'unknown')
|
| 56 |
-
section_id = metadata.get('section_id', '')
|
| 57 |
-
section_text = metadata.get('section_text', '')
|
| 58 |
-
section_path = metadata.get('section_path', '')
|
| 59 |
|
| 60 |
-
|
| 61 |
-
if doc_type == 'table':
|
| 62 |
table_num = metadata.get('table_number', 'unknown')
|
| 63 |
key = f"{doc_id}_table_{table_num}"
|
| 64 |
elif doc_type == 'image':
|
| 65 |
image_num = metadata.get('image_number', 'unknown')
|
| 66 |
key = f"{doc_id}_image_{image_num}"
|
| 67 |
else:
|
| 68 |
-
|
|
|
|
| 69 |
section_key = section_path if section_path else section_id
|
| 70 |
key = f"{doc_id}_text_{section_key}"
|
| 71 |
|
|
@@ -77,13 +74,14 @@ def generate_sources_html(nodes, chunks_df=None):
|
|
| 77 |
'sections': set()
|
| 78 |
}
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
| 85 |
|
| 86 |
-
# Generate HTML for each unique source
|
| 87 |
for source_info in sources_by_doc.values():
|
| 88 |
metadata = source_info['metadata']
|
| 89 |
doc_type = source_info['doc_type']
|
|
@@ -93,7 +91,6 @@ def generate_sources_html(nodes, chunks_df=None):
|
|
| 93 |
|
| 94 |
if doc_type == 'text':
|
| 95 |
html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
|
| 96 |
-
|
| 97 |
elif doc_type == 'table' or doc_type == 'table_row':
|
| 98 |
table_num = metadata.get('table_number', 'unknown')
|
| 99 |
table_title = metadata.get('table_title', '')
|
|
@@ -105,23 +102,16 @@ def generate_sources_html(nodes, chunks_df=None):
|
|
| 105 |
html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
|
| 106 |
else:
|
| 107 |
html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
|
| 108 |
-
|
| 109 |
elif doc_type == 'image':
|
| 110 |
image_num = metadata.get('image_number', 'unknown')
|
| 111 |
image_title = metadata.get('image_title', '')
|
| 112 |
-
section = metadata.get('section', '')
|
| 113 |
if image_num and image_num != 'unknown':
|
| 114 |
if not str(image_num).startswith('№'):
|
| 115 |
image_num = f"№{image_num}"
|
| 116 |
html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
|
| 117 |
if image_title and image_title != 'unknown':
|
| 118 |
html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
|
| 119 |
-
if section and section != 'unknown':
|
| 120 |
-
html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 12px;'>Раздел: {section}</p>"
|
| 121 |
-
else:
|
| 122 |
-
html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id}</h4>"
|
| 123 |
|
| 124 |
-
# Add file link if available
|
| 125 |
if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
|
| 126 |
doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
|
| 127 |
if not doc_rows.empty:
|
|
|
|
| 53 |
metadata = node.metadata if hasattr(node, 'metadata') else {}
|
| 54 |
doc_type = metadata.get('type', 'text')
|
| 55 |
doc_id = metadata.get('document_id', 'unknown')
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
+
if doc_type == 'table' or doc_type == 'table_row':
|
|
|
|
| 58 |
table_num = metadata.get('table_number', 'unknown')
|
| 59 |
key = f"{doc_id}_table_{table_num}"
|
| 60 |
elif doc_type == 'image':
|
| 61 |
image_num = metadata.get('image_number', 'unknown')
|
| 62 |
key = f"{doc_id}_image_{image_num}"
|
| 63 |
else:
|
| 64 |
+
section_path = metadata.get('section_path', '')
|
| 65 |
+
section_id = metadata.get('section_id', '')
|
| 66 |
section_key = section_path if section_path else section_id
|
| 67 |
key = f"{doc_id}_text_{section_key}"
|
| 68 |
|
|
|
|
| 74 |
'sections': set()
|
| 75 |
}
|
| 76 |
|
| 77 |
+
if doc_type not in ['table', 'table_row', 'image']:
|
| 78 |
+
section_path = metadata.get('section_path', '')
|
| 79 |
+
section_id = metadata.get('section_id', '')
|
| 80 |
+
if section_path:
|
| 81 |
+
sources_by_doc[key]['sections'].add(f"пункт {section_path}")
|
| 82 |
+
elif section_id and section_id != 'unknown':
|
| 83 |
+
sources_by_doc[key]['sections'].add(f"пункт {section_id}")
|
| 84 |
|
|
|
|
| 85 |
for source_info in sources_by_doc.values():
|
| 86 |
metadata = source_info['metadata']
|
| 87 |
doc_type = source_info['doc_type']
|
|
|
|
| 91 |
|
| 92 |
if doc_type == 'text':
|
| 93 |
html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
|
|
|
|
| 94 |
elif doc_type == 'table' or doc_type == 'table_row':
|
| 95 |
table_num = metadata.get('table_number', 'unknown')
|
| 96 |
table_title = metadata.get('table_title', '')
|
|
|
|
| 102 |
html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
|
| 103 |
else:
|
| 104 |
html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
|
|
|
|
| 105 |
elif doc_type == 'image':
|
| 106 |
image_num = metadata.get('image_number', 'unknown')
|
| 107 |
image_title = metadata.get('image_title', '')
|
|
|
|
| 108 |
if image_num and image_num != 'unknown':
|
| 109 |
if not str(image_num).startswith('№'):
|
| 110 |
image_num = f"№{image_num}"
|
| 111 |
html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
|
| 112 |
if image_title and image_title != 'unknown':
|
| 113 |
html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
|
|
|
| 115 |
if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
|
| 116 |
doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
|
| 117 |
if not doc_rows.empty:
|