Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Community

MrSimple07 committed on Sep 16, 2025

Commit

52249e8

1 Parent(s): 52f7579

font is black + fixed table + image downloading issues

Browse files

Files changed (3) hide show

app.py +4 -4
documents_prep.py +9 -4
utils.py +2 -3

app.py CHANGED Viewed

@@ -22,10 +22,10 @@ def create_chunks_display_html(chunk_info):
         bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
         html += f"""
         <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
-            <strong>Документ:</strong> {chunk['document_id']}<br>
-            <strong>Раздел:</strong> {chunk['section_id']}<br>
-            <strong>Ранг:</strong> {i+1} | <strong>Размер:</strong> {chunk['chunk_size']} символов<br>
-            <strong>Содержание:</strong><br>
             <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
                 {chunk['chunk_text']}
             </div>

         bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
         html += f"""
         <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
+            <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
+            <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{chunk.get('section_id', 'unknown')}</span><br>
+            <strong style='color: black;'>Ранг:</strong> <span style='color: black;'>{i+1}</span> | <strong style='color: black;'>Размер:</strong> <span style='color: black;'>{chunk['chunk_size']} символов</span><br>
+            <strong style='color: black;'>Содержание:</strong><br>
             <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
                 {chunk['chunk_text']}
             </div>

documents_prep.py CHANGED Viewed

@@ -232,7 +232,7 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
         log_message(f"Ошибка загрузки JSON документов: {str(e)}")
         return [], []
 def extract_section_title(section_text):
     if not section_text.strip():
         return ""
@@ -312,7 +312,8 @@ def table_to_document(table_data, document_id=None):
             "table_number": table_data.get('table_number', 'unknown'),
             "table_title": table_data.get('table_title', 'unknown'),
             "document_id": doc_id or table_data.get('document_id', table_data.get('document', 'unknown')),
-            "section": table_data.get('section', 'unknown')
         }
     )
@@ -398,11 +399,13 @@ def load_image_data(repo_id, hf_token, image_data_dir):
                 log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
                 for _, row in df.iterrows():
                     content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
                     content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
                     content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"
                     content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
-                    content += f"Раздел: {row.get('Раздел документа', 'Неизвестно')}\n"
                     content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
                     doc = Document(
@@ -412,7 +415,8 @@ def load_image_data(repo_id, hf_token, image_data_dir):
                             "image_number": row.get('№ Изображения', 'unknown'),
                             "document_id": row.get('Обозначение документа', 'unknown'),
                             "file_path": row.get('Файл изображения', 'unknown'),
-                            "section": row.get('Раздел документа', 'unknown')
                         }
                     )
                     image_documents.append(doc)
@@ -428,6 +432,7 @@ def load_image_data(repo_id, hf_token, image_data_dir):
         log_message(f"Ошибка загрузки данных изображений: {str(e)}")
         return []
 def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
     log_message("Загружаю данные чанков из CSV")

         log_message(f"Ошибка загрузки JSON документов: {str(e)}")
         return [], []
 def extract_section_title(section_text):
     if not section_text.strip():
         return ""
             "table_number": table_data.get('table_number', 'unknown'),
             "table_title": table_data.get('table_title', 'unknown'),
             "document_id": doc_id or table_data.get('document_id', table_data.get('document', 'unknown')),
+            "section": table_data.get('section', 'unknown'),
+            "section_id": table_data.get('section', 'unknown')
         }
     )
                 log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
                 for _, row in df.iterrows():
+                    section_value = row.get('Раздел документа', row.get('section', 'Неизвестно'))
                     content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
                     content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
                     content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"
                     content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
+                    content += f"Раздел: {section_value}\n"
                     content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
                     doc = Document(
                             "image_number": row.get('№ Изображения', 'unknown'),
                             "document_id": row.get('Обозначение документа', 'unknown'),
                             "file_path": row.get('Файл изображения', 'unknown'),
+                            "section": section_value,
+                            "section_id": section_value
                         }
                     )
                     image_documents.append(doc)
         log_message(f"Ошибка загрузки данных изображений: {str(e)}")
         return []
 def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
     log_message("Загружаю данные чанков из CSV")

utils.py CHANGED Viewed

@@ -186,19 +186,18 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
         </div>
         </div>"""
-        # Релевантные чанки (text snippets)
         chunk_info = []
         for node in reranked_nodes:
             chunk_info.append({
                 'document_id': node.metadata.get('document_id', 'unknown'),
-                'section_id': node.metadata.get('section_id', 'unknown'),
                 'chunk_size': len(node.text),
                 'chunk_text': node.text
             })
         from app import create_chunks_display_html
         chunks_html = create_chunks_display_html(chunk_info)
         return answer_with_time, sources_html, chunks_html
     except Exception as e:

         </div>
         </div>"""
         chunk_info = []
         for node in reranked_nodes:
+            section_id = node.metadata.get('section_id', node.metadata.get('section', 'unknown'))
             chunk_info.append({
                 'document_id': node.metadata.get('document_id', 'unknown'),
+                'section_id': section_id,
                 'chunk_size': len(node.text),
                 'chunk_text': node.text
             })
         from app import create_chunks_display_html
         chunks_html = create_chunks_display_html(chunk_info)
         return answer_with_time, sources_html, chunks_html
     except Exception as e: