Spaces:
Sleeping
Sleeping
Commit
·
80b9f4e
1
Parent(s):
0f9c9b1
400 symbols to show in logging 2
Browse files
app.py
CHANGED
|
@@ -70,7 +70,9 @@ def get_formatted_content(chunk):
|
|
| 70 |
document_id = chunk.get('document_id', 'unknown')
|
| 71 |
section_path = chunk.get('section_path', '')
|
| 72 |
section_id = chunk.get('section_id', 'unknown')
|
|
|
|
| 73 |
parent_section = chunk.get('parent_section', '')
|
|
|
|
| 74 |
level = chunk.get('level', '')
|
| 75 |
chunk_text = chunk.get('chunk_text', '')
|
| 76 |
doc_type = chunk.get('type', 'text')
|
|
@@ -89,14 +91,25 @@ def get_formatted_content(chunk):
|
|
| 89 |
|
| 90 |
# For text documents
|
| 91 |
if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section:
|
| 92 |
-
# For subsections: В разделе X в документе Y, пункт X.X content
|
| 93 |
current_section = section_path if section_path else section_id
|
| 94 |
-
|
|
|
|
| 95 |
else:
|
| 96 |
-
# For main sections: В разделе X в документе Y пункт X content
|
| 97 |
current_section = section_path if section_path else section_id
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
|
| 102 |
json_files_dir=None, table_data_dir=None, image_data_dir=None,
|
|
|
|
| 70 |
document_id = chunk.get('document_id', 'unknown')
|
| 71 |
section_path = chunk.get('section_path', '')
|
| 72 |
section_id = chunk.get('section_id', 'unknown')
|
| 73 |
+
section_text = chunk.get('section_text', '')
|
| 74 |
parent_section = chunk.get('parent_section', '')
|
| 75 |
+
parent_title = chunk.get('parent_title', '')
|
| 76 |
level = chunk.get('level', '')
|
| 77 |
chunk_text = chunk.get('chunk_text', '')
|
| 78 |
doc_type = chunk.get('type', 'text')
|
|
|
|
| 91 |
|
| 92 |
# For text documents
|
| 93 |
if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section:
|
|
|
|
| 94 |
current_section = section_path if section_path else section_id
|
| 95 |
+
parent_info = f"{parent_section} ({parent_title})" if parent_title else parent_section
|
| 96 |
+
return f"В разделе {parent_info} в документе {document_id}, пункт {current_section}: {chunk_text}"
|
| 97 |
else:
|
|
|
|
| 98 |
current_section = section_path if section_path else section_id
|
| 99 |
+
|
| 100 |
+
# Clean chunk_text to avoid duplication
|
| 101 |
+
clean_text = chunk_text
|
| 102 |
+
if section_text and chunk_text.startswith(section_text):
|
| 103 |
+
# If chunk_text starts with full section_text, use section_text as title
|
| 104 |
+
section_title = section_text
|
| 105 |
+
elif chunk_text.startswith(f"{current_section} "):
|
| 106 |
+
# If chunk_text starts with section number, extract the title part
|
| 107 |
+
clean_text = chunk_text[len(f"{current_section} "):].strip()
|
| 108 |
+
section_title = section_text if section_text else f"{current_section} {clean_text.split('.')[0] if '.' in clean_text else clean_text[:50]}"
|
| 109 |
+
else:
|
| 110 |
+
section_title = section_text if section_text else current_section
|
| 111 |
+
|
| 112 |
+
return f"В разделе {current_section} в документе {document_id}, пункт {section_title}: {clean_text}"
|
| 113 |
|
| 114 |
def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
|
| 115 |
json_files_dir=None, table_data_dir=None, image_data_dir=None,
|