Spaces:
Sleeping
Sleeping
Commit
·
c33deff
1
Parent(s):
31659d7
removed normalization doc id
Browse files
utils.py
CHANGED
|
@@ -39,62 +39,15 @@ def format_sources(nodes):
|
|
| 39 |
|
| 40 |
import re
|
| 41 |
|
| 42 |
-
def extract_document_id(query):
|
| 43 |
-
"""Extract explicit document IDs from query"""
|
| 44 |
-
patterns = [
|
| 45 |
-
r'ГОСТ\s*Р?\s*[\d.-]+(?:-\d{4})?', # ГОСТ 59023.4 or ГОСТ Р 59023.5-2020
|
| 46 |
-
r'НП-\d+-\d+',
|
| 47 |
-
r'МУ[_\s][\d.]+',
|
| 48 |
-
]
|
| 49 |
-
|
| 50 |
-
for pattern in patterns:
|
| 51 |
-
match = re.search(pattern, query, re.IGNORECASE)
|
| 52 |
-
if match:
|
| 53 |
-
return match.group(0).strip()
|
| 54 |
-
return None
|
| 55 |
-
|
| 56 |
-
def normalize_doc_id(doc_id):
|
| 57 |
-
normalized = doc_id.replace(' ', '').replace('р', '').replace('Р', '').lower()
|
| 58 |
-
normalized = re.sub(r'-\d{4}$', '', normalized)
|
| 59 |
-
normalized = normalized.replace('.', '') # Remove dots for flexible matching
|
| 60 |
-
return normalized
|
| 61 |
-
|
| 62 |
def answer_question(question, query_engine, reranker):
|
| 63 |
try:
|
| 64 |
log_message(f"\n{'='*70}")
|
| 65 |
log_message(f"QUERY: {question}")
|
| 66 |
|
| 67 |
-
target_doc_id = extract_document_id(question)
|
| 68 |
-
found_docs = set(normalize_doc_id(node.metadata.get('document_id', 'unknown')) for node in query_engine.retrieve(question))
|
| 69 |
-
log_message(f"NORMALIZED DOCS IN RETRIEVED: {', '.join(list(found_docs))}")
|
| 70 |
-
if target_doc_id:
|
| 71 |
-
log_message(f"TARGET DOCUMENT: {target_doc_id}")
|
| 72 |
-
|
| 73 |
retrieved = query_engine.retrieve(question)
|
| 74 |
log_message(f"RETRIEVED: {len(retrieved)} unique nodes")
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
target_normalized = normalize_doc_id(target_doc_id)
|
| 78 |
-
filtered = [
|
| 79 |
-
node for node in retrieved
|
| 80 |
-
if target_normalized in normalize_doc_id(node.metadata.get('document_id', ''))
|
| 81 |
-
]
|
| 82 |
-
|
| 83 |
-
log_message(f"FILTERED TO TARGET DOC: {len(filtered)} nodes")
|
| 84 |
-
|
| 85 |
-
# Debug: show what document IDs were found
|
| 86 |
-
if not filtered and len(retrieved) > 0:
|
| 87 |
-
found_docs = set(node.metadata.get('document_id', 'unknown') for node in retrieved[:10])
|
| 88 |
-
log_message(f"AVAILABLE DOCS (sample): {', '.join(list(found_docs)[:5])}")
|
| 89 |
-
|
| 90 |
-
if not filtered:
|
| 91 |
-
log_message(f"WARNING: No nodes found for {target_doc_id}")
|
| 92 |
-
return f"В базе данных не найдены таблицы из документа {target_doc_id}.", ""
|
| 93 |
-
|
| 94 |
-
retrieved = filtered
|
| 95 |
-
|
| 96 |
-
# Rest stays the same...
|
| 97 |
-
reranked = rerank_nodes(question, retrieved, reranker, top_k=20, min_score=0.25)
|
| 98 |
log_message(f"RERANKED: {len(reranked)} nodes")
|
| 99 |
|
| 100 |
context_parts = []
|
|
@@ -102,7 +55,6 @@ def answer_question(question, query_engine, reranker):
|
|
| 102 |
meta = n.metadata
|
| 103 |
doc_id = meta.get('document_id', 'unknown')
|
| 104 |
doc_type = meta.get('type', 'text')
|
| 105 |
-
|
| 106 |
if doc_type == 'table':
|
| 107 |
table_id = meta.get('table_identifier', meta.get('table_number', 'unknown'))
|
| 108 |
title = meta.get('table_title', '')
|
|
@@ -111,7 +63,6 @@ def answer_question(question, query_engine, reranker):
|
|
| 111 |
source_label += f" {title}"
|
| 112 |
else:
|
| 113 |
source_label = f"[{doc_id}]"
|
| 114 |
-
|
| 115 |
context_parts.append(f"{source_label}\n{n.text}")
|
| 116 |
|
| 117 |
context = "\n\n" + ("="*50 + "\n\n").join(context_parts)
|
|
@@ -119,24 +70,11 @@ def answer_question(question, query_engine, reranker):
|
|
| 119 |
from config import CUSTOM_PROMPT
|
| 120 |
prompt = CUSTOM_PROMPT.format(context_str=context, query_str=question)
|
| 121 |
log_message(f"PROMPT LENGTH: {len(prompt)} chars")
|
| 122 |
-
|
| 123 |
from llama_index.core import Settings
|
| 124 |
response = Settings.llm.complete(prompt)
|
| 125 |
|
| 126 |
sources = format_sources(reranked)
|
| 127 |
-
|
| 128 |
-
log_message(f"\n{'='*70}")
|
| 129 |
-
log_message("RETRIEVED CHUNKS:")
|
| 130 |
-
for i, node in enumerate(reranked, 1):
|
| 131 |
-
log_message(f"\n--- Chunk {i} ---")
|
| 132 |
-
log_message(f"Document: {node.metadata.get('document_id')}")
|
| 133 |
-
log_message(f"Type: {node.metadata.get('type')}")
|
| 134 |
-
if node.metadata.get('type') == 'table':
|
| 135 |
-
table_id = node.metadata.get('table_identifier')
|
| 136 |
-
rows = f"{node.metadata.get('row_start', 0)}-{node.metadata.get('row_end', 0)}"
|
| 137 |
-
log_message(f"Table: {table_id} (rows {rows})")
|
| 138 |
-
log_message(f"Text: {node.text[:300]}...")
|
| 139 |
-
|
| 140 |
return response.text, sources
|
| 141 |
|
| 142 |
except Exception as e:
|
|
|
|
| 39 |
|
| 40 |
import re
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
def answer_question(question, query_engine, reranker):
|
| 43 |
try:
|
| 44 |
log_message(f"\n{'='*70}")
|
| 45 |
log_message(f"QUERY: {question}")
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
retrieved = query_engine.retrieve(question)
|
| 48 |
log_message(f"RETRIEVED: {len(retrieved)} unique nodes")
|
| 49 |
+
|
| 50 |
+
reranked = rerank_nodes(question, retrieved, reranker, top_k=20, min_score=0.3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
log_message(f"RERANKED: {len(reranked)} nodes")
|
| 52 |
|
| 53 |
context_parts = []
|
|
|
|
| 55 |
meta = n.metadata
|
| 56 |
doc_id = meta.get('document_id', 'unknown')
|
| 57 |
doc_type = meta.get('type', 'text')
|
|
|
|
| 58 |
if doc_type == 'table':
|
| 59 |
table_id = meta.get('table_identifier', meta.get('table_number', 'unknown'))
|
| 60 |
title = meta.get('table_title', '')
|
|
|
|
| 63 |
source_label += f" {title}"
|
| 64 |
else:
|
| 65 |
source_label = f"[{doc_id}]"
|
|
|
|
| 66 |
context_parts.append(f"{source_label}\n{n.text}")
|
| 67 |
|
| 68 |
context = "\n\n" + ("="*50 + "\n\n").join(context_parts)
|
|
|
|
| 70 |
from config import CUSTOM_PROMPT
|
| 71 |
prompt = CUSTOM_PROMPT.format(context_str=context, query_str=question)
|
| 72 |
log_message(f"PROMPT LENGTH: {len(prompt)} chars")
|
| 73 |
+
|
| 74 |
from llama_index.core import Settings
|
| 75 |
response = Settings.llm.complete(prompt)
|
| 76 |
|
| 77 |
sources = format_sources(reranked)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
return response.text, sources
|
| 79 |
|
| 80 |
except Exception as e:
|