Spaces:
Sleeping
Sleeping
Commit
·
e393fbc
1
Parent(s):
19e03d0
New chunk display + improved context passed to the LLM
Browse files
- __pycache__/config.cpython-311.pyc +0 -0
- __pycache__/index_retriever.cpython-311.pyc +0 -0
- __pycache__/my_logging.cpython-311.pyc +0 -0
- app.py +44 -1
- index_retriever.py +1 -1
- utils.py +24 -10
__pycache__/config.cpython-311.pyc
ADDED
|
Binary file (66.4 kB). View file
|
|
|
__pycache__/index_retriever.cpython-311.pyc
ADDED
|
Binary file (4.25 kB). View file
|
|
|
__pycache__/my_logging.cpython-311.pyc
ADDED
|
Binary file (811 Bytes). View file
|
|
|
app.py
CHANGED
|
@@ -20,10 +20,14 @@ def create_chunks_display_html(chunk_info):
|
|
| 20 |
|
| 21 |
for i, chunk in enumerate(chunk_info):
|
| 22 |
bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
html += f"""
|
| 24 |
<div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
|
| 25 |
<strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
|
| 26 |
-
<strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{
|
| 27 |
<strong style='color: black;'>Содержание:</strong><br>
|
| 28 |
<div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
|
| 29 |
{chunk['chunk_text']}
|
|
@@ -34,6 +38,45 @@ def create_chunks_display_html(chunk_info):
|
|
| 34 |
html += "</div>"
|
| 35 |
return html
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
|
| 38 |
json_files_dir=None, table_data_dir=None, image_data_dir=None,
|
| 39 |
use_json_instead_csv=False):
|
|
|
|
| 20 |
|
| 21 |
for i, chunk in enumerate(chunk_info):
|
| 22 |
bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
|
| 23 |
+
|
| 24 |
+
# Get section display info similar to format_context_for_llm
|
| 25 |
+
section_display = format_section_for_display(chunk)
|
| 26 |
+
|
| 27 |
html += f"""
|
| 28 |
<div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
|
| 29 |
<strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
|
| 30 |
+
<strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
|
| 31 |
<strong style='color: black;'>Содержание:</strong><br>
|
| 32 |
<div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
|
| 33 |
{chunk['chunk_text']}
|
|
|
|
| 38 |
html += "</div>"
|
| 39 |
return html
|
| 40 |
|
| 41 |
+
# Hierarchy levels that are rendered together with their parent section.
_SUBSECTION_LEVELS = ('subsection', 'sub_subsection', 'sub_sub_subsection')

# Chunk types rendered as a table reference. 'table_row' is included so this
# helper agrees with generate_sources_html, which groups both types together.
_TABLE_TYPES = ('table', 'table_row')


def _with_number_sign(number):
    """Return *number* as text, prefixed with '№' unless it already is."""
    text = str(number)
    return text if text.startswith('№') else f"№{text}"


def format_section_for_display(chunk):
    """Build the human-readable "section" label shown for a retrieved chunk.

    Args:
        chunk: Mapping describing one chunk. The keys read here are
            'type', 'table_number', 'image_number', 'section_path',
            'section_id', 'section_text', 'level', 'parent_section' and
            'parent_title'; every one of them is optional.

    Returns:
        A display string chosen in this order:

        * "таблица №N" for table / table_row chunks with a table number;
        * "рисунок №N" for image chunks with an image number;
        * "пункт X в разделе P (Title)" for subsection-level chunks that
          carry both a parent section and a parent title;
        * "пункт X (Text)" when a section text is available;
        * "пункт X" otherwise;
        * the raw section id, or 'unknown' when no identifier is available.

        X is 'section_path' when present, else 'section_id'.
    """
    doc_type = chunk.get('type', 'text')

    table_number = chunk.get('table_number')
    if doc_type in _TABLE_TYPES and table_number:
        return f"таблица {_with_number_sign(table_number)}"

    image_number = chunk.get('image_number')
    if doc_type == 'image' and image_number:
        return f"рисунок {_with_number_sign(image_number)}"

    # Prefer the full section path; fall back to the section id unless it is
    # just the 'unknown' placeholder.
    section_id = chunk.get('section_id', 'unknown')
    label = chunk.get('section_path', '')
    if not label and section_id and section_id != 'unknown':
        label = section_id
    if not label:
        # Never hand None or an empty string to the HTML template.
        return section_id or 'unknown'

    parent_section = chunk.get('parent_section', '')
    parent_title = chunk.get('parent_title', '')
    if (chunk.get('level', '') in _SUBSECTION_LEVELS
            and parent_section and parent_title):
        return f"пункт {label} в разделе {parent_section} ({parent_title})"

    section_text = chunk.get('section_text', '')
    if section_text:
        return f"пункт {label} ({section_text})"
    return f"пункт {label}"
|
| 78 |
+
|
| 79 |
+
|
| 80 |
def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
|
| 81 |
json_files_dir=None, table_data_dir=None, image_data_dir=None,
|
| 82 |
use_json_instead_csv=False):
|
index_retriever.py
CHANGED
|
@@ -22,7 +22,7 @@ def create_query_engine(vector_index):
|
|
| 22 |
vector_retriever = VectorIndexRetriever(
|
| 23 |
index=vector_index,
|
| 24 |
similarity_top_k=30,
|
| 25 |
-
similarity_cutoff=0.
|
| 26 |
)
|
| 27 |
|
| 28 |
hybrid_retriever = QueryFusionRetriever(
|
|
|
|
| 22 |
vector_retriever = VectorIndexRetriever(
|
| 23 |
index=vector_index,
|
| 24 |
similarity_top_k=30,
|
| 25 |
+
similarity_cutoff=0.8
|
| 26 |
)
|
| 27 |
|
| 28 |
hybrid_retriever = QueryFusionRetriever(
|
utils.py
CHANGED
|
@@ -57,17 +57,27 @@ def format_context_for_llm(nodes):
|
|
| 57 |
section_text = metadata.get('section_text', '')
|
| 58 |
parent_section = metadata.get('parent_section', '')
|
| 59 |
parent_title = metadata.get('parent_title', '')
|
|
|
|
| 60 |
|
| 61 |
-
if
|
| 62 |
-
|
|
|
|
| 63 |
elif section_text:
|
|
|
|
| 64 |
section_info = f"пункт {section_path} ({section_text})"
|
| 65 |
else:
|
| 66 |
section_info = f"пункт {section_path}"
|
| 67 |
elif metadata.get('section_id'):
|
| 68 |
section_id = metadata['section_id']
|
| 69 |
section_text = metadata.get('section_text', '')
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
section_info = f"пункт {section_id} ({section_text})"
|
| 72 |
else:
|
| 73 |
section_info = f"пункт {section_id}"
|
|
@@ -249,10 +259,18 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
|
|
| 249 |
|
| 250 |
chunk_info = []
|
| 251 |
for node in reranked_nodes:
|
| 252 |
-
|
| 253 |
chunk_info.append({
|
| 254 |
-
'document_id':
|
| 255 |
-
'section_id': section_id,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
'chunk_size': len(node.text),
|
| 257 |
'chunk_text': node.text
|
| 258 |
})
|
|
@@ -413,10 +431,6 @@ def generate_sources_html(nodes, chunks_df=None):
|
|
| 413 |
|
| 414 |
if doc_type == 'text':
|
| 415 |
html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
|
| 416 |
-
# Show all sections for this document
|
| 417 |
-
if source_info['sections']:
|
| 418 |
-
sections_text = ", ".join(sorted(source_info['sections']))
|
| 419 |
-
html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{sections_text}</p>"
|
| 420 |
|
| 421 |
elif doc_type == 'table' or doc_type == 'table_row':
|
| 422 |
table_num = metadata.get('table_number', 'unknown')
|
|
|
|
| 57 |
section_text = metadata.get('section_text', '')
|
| 58 |
parent_section = metadata.get('parent_section', '')
|
| 59 |
parent_title = metadata.get('parent_title', '')
|
| 60 |
+
level = metadata.get('level', '')
|
| 61 |
|
| 62 |
+
if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
|
| 63 |
+
# For subsections, show: пункт X.X в разделе X (Title)
|
| 64 |
+
section_info = f"пункт {section_path} в разделе {parent_section} ({parent_title})"
|
| 65 |
elif section_text:
|
| 66 |
+
# For main sections, show: пункт X (Title)
|
| 67 |
section_info = f"пункт {section_path} ({section_text})"
|
| 68 |
else:
|
| 69 |
section_info = f"пункт {section_path}"
|
| 70 |
elif metadata.get('section_id'):
|
| 71 |
section_id = metadata['section_id']
|
| 72 |
section_text = metadata.get('section_text', '')
|
| 73 |
+
level = metadata.get('level', '')
|
| 74 |
+
parent_section = metadata.get('parent_section', '')
|
| 75 |
+
parent_title = metadata.get('parent_title', '')
|
| 76 |
+
|
| 77 |
+
if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
|
| 78 |
+
# For subsections without section_path, show: пункт X.X в разделе X (Title)
|
| 79 |
+
section_info = f"пункт {section_id} в разделе {parent_section} ({parent_title})"
|
| 80 |
+
elif section_text:
|
| 81 |
section_info = f"пункт {section_id} ({section_text})"
|
| 82 |
else:
|
| 83 |
section_info = f"пункт {section_id}"
|
|
|
|
| 259 |
|
| 260 |
chunk_info = []
|
| 261 |
for node in reranked_nodes:
|
| 262 |
+
metadata = node.metadata if hasattr(node, 'metadata') else {}
|
| 263 |
chunk_info.append({
|
| 264 |
+
'document_id': metadata.get('document_id', 'unknown'),
|
| 265 |
+
'section_id': metadata.get('section_id', metadata.get('section', 'unknown')),
|
| 266 |
+
'section_path': metadata.get('section_path', ''),
|
| 267 |
+
'section_text': metadata.get('section_text', ''),
|
| 268 |
+
'level': metadata.get('level', ''),
|
| 269 |
+
'parent_section': metadata.get('parent_section', ''),
|
| 270 |
+
'parent_title': metadata.get('parent_title', ''),
|
| 271 |
+
'type': metadata.get('type', 'text'),
|
| 272 |
+
'table_number': metadata.get('table_number', ''),
|
| 273 |
+
'image_number': metadata.get('image_number', ''),
|
| 274 |
'chunk_size': len(node.text),
|
| 275 |
'chunk_text': node.text
|
| 276 |
})
|
|
|
|
| 431 |
|
| 432 |
if doc_type == 'text':
|
| 433 |
html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
|
| 435 |
elif doc_type == 'table' or doc_type == 'table_row':
|
| 436 |
table_num = metadata.get('table_number', 'unknown')
|