MrSimple07 committed on
Commit
b38db64
·
1 Parent(s): abdb242

added the table and image sources

Browse files
Files changed (1) hide show
  1. utils.py +56 -6
utils.py CHANGED
@@ -99,36 +99,86 @@ def generate_sources_html(nodes, chunks_df=None):
99
  html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
100
  html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
101
 
 
 
 
102
  for i, node in enumerate(nodes):
103
  metadata = node.metadata if hasattr(node, 'metadata') else {}
104
  doc_type = metadata.get('type', 'text')
105
  doc_id = metadata.get('document_id', 'unknown')
106
- section_id = metadata.get('section_id', '')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  html += f"<div style='margin-bottom: 15px; padding: 15px; border: 1px solid #4a5568; border-radius: 8px; background-color: #1a202c;'>"
109
 
110
  if doc_type == 'text':
111
  html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
112
- html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📌 {section_id}</h4>"
 
 
 
113
 
114
- elif doc_type == 'table':
115
  table_num = metadata.get('table_number', 'unknown')
 
116
  if table_num and table_num != 'unknown':
117
- if not table_num.startswith('№'):
118
  table_num = f"№{table_num}"
119
  html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица {table_num} - {doc_id}</h4>"
 
 
120
  else:
121
  html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
 
122
  elif doc_type == 'image':
123
  image_num = metadata.get('image_number', 'unknown')
 
124
  section = metadata.get('section', '')
125
  if image_num and image_num != 'unknown':
126
  if not str(image_num).startswith('№'):
127
  image_num = f"№{image_num}"
128
- html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id} ({section})</h4>"
 
 
 
 
129
  else:
130
- html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id} ({section})</h4>"
131
 
 
132
  if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
133
  doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
134
  if not doc_rows.empty:
 
99
  html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
100
  html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
101
 
102
+ # Group nodes by document to avoid duplicates
103
+ sources_by_doc = {}
104
+
105
  for i, node in enumerate(nodes):
106
  metadata = node.metadata if hasattr(node, 'metadata') else {}
107
  doc_type = metadata.get('type', 'text')
108
  doc_id = metadata.get('document_id', 'unknown')
109
+ section_id = metadata.get('section_id', '')
110
+ section_text = metadata.get('section_text', '')
111
+ section_path = metadata.get('section_path', '')
112
+
113
+ # Create a unique key for grouping
114
+ if doc_type == 'table':
115
+ table_num = metadata.get('table_number', 'unknown')
116
+ key = f"{doc_id}_table_{table_num}"
117
+ elif doc_type == 'image':
118
+ image_num = metadata.get('image_number', 'unknown')
119
+ key = f"{doc_id}_image_{image_num}"
120
+ else:
121
+ # For text documents, group by section path or section id
122
+ section_key = section_path if section_path else section_id
123
+ key = f"{doc_id}_text_{section_key}"
124
+
125
+ if key not in sources_by_doc:
126
+ sources_by_doc[key] = {
127
+ 'doc_id': doc_id,
128
+ 'doc_type': doc_type,
129
+ 'metadata': metadata,
130
+ 'sections': set()
131
+ }
132
+
133
+ # Add section information
134
+ if section_path:
135
+ sources_by_doc[key]['sections'].add(f"пункт {section_path}")
136
+ elif section_id and section_id != 'unknown':
137
+ sources_by_doc[key]['sections'].add(f"пункт {section_id}")
138
+
139
+ # Generate HTML for each unique source
140
+ for source_info in sources_by_doc.values():
141
+ metadata = source_info['metadata']
142
+ doc_type = source_info['doc_type']
143
+ doc_id = source_info['doc_id']
144
 
145
  html += f"<div style='margin-bottom: 15px; padding: 15px; border: 1px solid #4a5568; border-radius: 8px; background-color: #1a202c;'>"
146
 
147
  if doc_type == 'text':
148
  html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
149
+ # Show all sections for this document
150
+ if source_info['sections']:
151
+ sections_text = ", ".join(sorted(source_info['sections']))
152
+ html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{sections_text}</p>"
153
 
154
+ elif doc_type == 'table' or doc_type == 'table_row':
155
  table_num = metadata.get('table_number', 'unknown')
156
+ table_title = metadata.get('table_title', '')
157
  if table_num and table_num != 'unknown':
158
+ if not str(table_num).startswith('№'):
159
  table_num = f"№{table_num}"
160
  html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица {table_num} - {doc_id}</h4>"
161
+ if table_title and table_title != 'unknown':
162
+ html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
163
  else:
164
  html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
165
+
166
  elif doc_type == 'image':
167
  image_num = metadata.get('image_number', 'unknown')
168
+ image_title = metadata.get('image_title', '')
169
  section = metadata.get('section', '')
170
  if image_num and image_num != 'unknown':
171
  if not str(image_num).startswith('№'):
172
  image_num = f"№{image_num}"
173
+ html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
174
+ if image_title and image_title != 'unknown':
175
+ html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
176
+ if section and section != 'unknown':
177
+ html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 12px;'>Раздел: {section}</p>"
178
  else:
179
+ html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id}</h4>"
180
 
181
+ # Add file link if available
182
  if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
183
  doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
184
  if not doc_rows.empty: