MrSimple07 committed on
Commit
63ebb90
·
1 Parent(s): 38ed4e9

new documents prep

Browse files
Files changed (3) hide show
  1. app.py +33 -4
  2. documents_prep.py +2 -20
  3. utils.py +10 -20
app.py CHANGED
@@ -11,17 +11,46 @@ from config import (
11
  JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
12
  )
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  def create_chunks_display_html(chunk_info):
15
  if not chunk_info:
16
  return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"
17
 
 
 
18
  html = "<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>"
19
- html += f"<h4 style='color: black;'>Найдено релевантных чанков: {len(chunk_info)}</h4>"
20
 
21
- for i, chunk in enumerate(chunk_info):
22
  bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
23
-
24
- # Get section display info
25
  section_display = get_section_display(chunk)
26
  formatted_content = get_formatted_content(chunk)
27
 
 
11
  JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
12
  )
13
 
14
+
15
def merge_table_chunks(chunk_info):
    """Collapse all chunks of the same table into one display entry.

    Chunks whose ``type`` is ``'table'`` or ``'table_row'`` are grouped by
    ``(document_id, table_number)``. The first chunk of a group produces a
    fresh dict with the keys ``document_id``, ``type`` (always ``'table'``),
    ``table_number``, ``section_id`` and ``chunk_text``; the ``chunk_text``
    of every later chunk in the group is appended on a new line. The input
    table chunks are never mutated.

    Every other chunk is passed through as the same object. Chunks that
    carry an explicit ``chunk_id`` are de-duplicated on
    ``(document_id, section_id, chunk_id)`` with the last one winning;
    chunks without a ``chunk_id`` are always kept.

    Args:
        chunk_info: Iterable of dict-like chunks (accessed via ``.get``).

    Returns:
        A list of chunks ordered by the first appearance of each key.
    """
    merged = {}

    for position, chunk in enumerate(chunk_info):
        doc_type = chunk.get('type', 'text')
        doc_id = chunk.get('document_id', 'unknown')

        if doc_type in ('table', 'table_row'):
            table_num = chunk.get('table_number', '')
            # Tuple keys tagged with the kind keep table keys and text keys
            # in separate namespaces, so an underscore inside an id can
            # never make a text chunk overwrite a merged table.
            key = ('table', str(doc_id), str(table_num))
            # Treat a missing or None text as empty so concatenation is safe.
            text = chunk.get('chunk_text') or ''

            entry = merged.get(key)
            if entry is None:
                merged[key] = {
                    'document_id': doc_id,
                    'type': 'table',
                    'table_number': table_num,
                    'section_id': chunk.get('section_id', 'unknown'),
                    'chunk_text': text,
                }
            else:
                entry['chunk_text'] += '\n' + text
        else:
            chunk_id = chunk.get('chunk_id')
            if chunk_id is None:
                # Without an id there is no evidence two chunks are the same
                # one, so fall back to the position to keep both.
                discriminator = ('position', position)
            else:
                discriminator = ('chunk_id', str(chunk_id))
            key = (
                'other',
                str(doc_id),
                str(chunk.get('section_id', '')),
                discriminator,
            )
            merged[key] = chunk

    return list(merged.values())
41
+
42
+
43
  def create_chunks_display_html(chunk_info):
44
  if not chunk_info:
45
  return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"
46
 
47
+ merged_chunks = merge_table_chunks(chunk_info)
48
+
49
  html = "<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>"
50
+ html += f"<h4 style='color: black;'>Найдено релевантных чанков: {len(merged_chunks)}</h4>"
51
 
52
+ for i, chunk in enumerate(merged_chunks):
53
  bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
 
 
54
  section_display = get_section_display(chunk)
55
  formatted_content = get_formatted_content(chunk)
56
 
documents_prep.py CHANGED
@@ -162,30 +162,12 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
162
 
163
 
164
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
165
- """Format consistent table header"""
166
- content = f"ДОКУМЕНТ: {doc_id}\n"
167
- content += f"ТАБЛИЦА: {table_identifier}\n"
168
- content += f"ПОЛНОЕ НАЗВАНИЕ: {table_identifier}\n"
169
- content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
170
  if table_title:
171
  content += f"НАЗВАНИЕ: {table_title}\n"
172
  if section:
173
  content += f"РАЗДЕЛ: {section}\n"
174
- content += f"{'='*70}\n\n"
175
-
176
- # Enhanced search keywords
177
- content += f"Это таблица {table_identifier} из документа {doc_id}. "
178
- content += f"Идентификатор: {table_identifier}. Номер: {table_num}. Документ: {doc_id}. "
179
-
180
- if section:
181
- content += f"Раздел: {section}. "
182
- if 'приложени' in section.lower():
183
- content += f"Таблица из приложения. "
184
-
185
- if table_title:
186
- content += f"Название: {table_title}. "
187
-
188
- content += f"\n\nСОДЕРЖИМОЕ ТАБЛИЦЫ {table_identifier}:\n{'='*70}\n\n"
189
 
190
  if headers:
191
  header_str = ' | '.join(str(h) for h in headers)
 
162
 
163
 
164
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
165
+ content = f"ТАБЛИЦА {table_identifier} из {doc_id}\n"
 
 
 
 
166
  if table_title:
167
  content += f"НАЗВАНИЕ: {table_title}\n"
168
  if section:
169
  content += f"РАЗДЕЛ: {section}\n"
170
+ content += f"{'='*70}\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
  if headers:
173
  header_str = ' | '.join(str(h) for h in headers)
utils.py CHANGED
@@ -53,19 +53,16 @@ def generate_sources_html(nodes, chunks_df=None):
53
  metadata = node.metadata if hasattr(node, 'metadata') else {}
54
  doc_type = metadata.get('type', 'text')
55
  doc_id = metadata.get('document_id', 'unknown')
56
- section_id = metadata.get('section_id', '')
57
- section_text = metadata.get('section_text', '')
58
- section_path = metadata.get('section_path', '')
59
 
60
- # Create a unique key for grouping
61
- if doc_type == 'table':
62
  table_num = metadata.get('table_number', 'unknown')
63
  key = f"{doc_id}_table_{table_num}"
64
  elif doc_type == 'image':
65
  image_num = metadata.get('image_number', 'unknown')
66
  key = f"{doc_id}_image_{image_num}"
67
  else:
68
- # For text documents, group by section path or section id
 
69
  section_key = section_path if section_path else section_id
70
  key = f"{doc_id}_text_{section_key}"
71
 
@@ -77,13 +74,14 @@ def generate_sources_html(nodes, chunks_df=None):
77
  'sections': set()
78
  }
79
 
80
- # Add section information
81
- if section_path:
82
- sources_by_doc[key]['sections'].add(f"пункт {section_path}")
83
- elif section_id and section_id != 'unknown':
84
- sources_by_doc[key]['sections'].add(f"пункт {section_id}")
 
 
85
 
86
- # Generate HTML for each unique source
87
  for source_info in sources_by_doc.values():
88
  metadata = source_info['metadata']
89
  doc_type = source_info['doc_type']
@@ -93,7 +91,6 @@ def generate_sources_html(nodes, chunks_df=None):
93
 
94
  if doc_type == 'text':
95
  html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
96
-
97
  elif doc_type == 'table' or doc_type == 'table_row':
98
  table_num = metadata.get('table_number', 'unknown')
99
  table_title = metadata.get('table_title', '')
@@ -105,23 +102,16 @@ def generate_sources_html(nodes, chunks_df=None):
105
  html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
106
  else:
107
  html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
108
-
109
  elif doc_type == 'image':
110
  image_num = metadata.get('image_number', 'unknown')
111
  image_title = metadata.get('image_title', '')
112
- section = metadata.get('section', '')
113
  if image_num and image_num != 'unknown':
114
  if not str(image_num).startswith('№'):
115
  image_num = f"№{image_num}"
116
  html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
117
  if image_title and image_title != 'unknown':
118
  html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
119
- if section and section != 'unknown':
120
- html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 12px;'>Раздел: {section}</p>"
121
- else:
122
- html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id}</h4>"
123
 
124
- # Add file link if available
125
  if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
126
  doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
127
  if not doc_rows.empty:
 
53
  metadata = node.metadata if hasattr(node, 'metadata') else {}
54
  doc_type = metadata.get('type', 'text')
55
  doc_id = metadata.get('document_id', 'unknown')
 
 
 
56
 
57
+ if doc_type == 'table' or doc_type == 'table_row':
 
58
  table_num = metadata.get('table_number', 'unknown')
59
  key = f"{doc_id}_table_{table_num}"
60
  elif doc_type == 'image':
61
  image_num = metadata.get('image_number', 'unknown')
62
  key = f"{doc_id}_image_{image_num}"
63
  else:
64
+ section_path = metadata.get('section_path', '')
65
+ section_id = metadata.get('section_id', '')
66
  section_key = section_path if section_path else section_id
67
  key = f"{doc_id}_text_{section_key}"
68
 
 
74
  'sections': set()
75
  }
76
 
77
+ if doc_type not in ['table', 'table_row', 'image']:
78
+ section_path = metadata.get('section_path', '')
79
+ section_id = metadata.get('section_id', '')
80
+ if section_path:
81
+ sources_by_doc[key]['sections'].add(f"пункт {section_path}")
82
+ elif section_id and section_id != 'unknown':
83
+ sources_by_doc[key]['sections'].add(f"пункт {section_id}")
84
 
 
85
  for source_info in sources_by_doc.values():
86
  metadata = source_info['metadata']
87
  doc_type = source_info['doc_type']
 
91
 
92
  if doc_type == 'text':
93
  html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
 
94
  elif doc_type == 'table' or doc_type == 'table_row':
95
  table_num = metadata.get('table_number', 'unknown')
96
  table_title = metadata.get('table_title', '')
 
102
  html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
103
  else:
104
  html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
 
105
  elif doc_type == 'image':
106
  image_num = metadata.get('image_number', 'unknown')
107
  image_title = metadata.get('image_title', '')
 
108
  if image_num and image_num != 'unknown':
109
  if not str(image_num).startswith('№'):
110
  image_num = f"№{image_num}"
111
  html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
112
  if image_title and image_title != 'unknown':
113
  html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
 
 
 
 
114
 
 
115
  if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
116
  doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
117
  if not doc_rows.empty: