Spaces:
Sleeping
Sleeping
Update utils.py
Browse files
utils.py
CHANGED
|
@@ -197,23 +197,78 @@ def debug_search_tables(vector_index, search_term="С-25"):
|
|
| 197 |
|
| 198 |
from documents_prep import normalize_text, normalize_steel_designations
|
| 199 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
|
| 201 |
|
|
|
|
| 202 |
normalized_question = normalize_text(question)
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
|
|
|
|
|
|
| 206 |
if change_list:
|
| 207 |
-
log_message(f"Query changes: {', '.join(change_list)}")
|
|
|
|
| 208 |
if query_engine is None:
|
| 209 |
return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
|
| 210 |
|
| 211 |
try:
|
| 212 |
start_time = time.time()
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
| 214 |
log_message(f"user query: {question}")
|
| 215 |
log_message(f"normalized query: {normalized_question}")
|
| 216 |
log_message(f"after steel normalization: {normalized_question_2}")
|
|
|
|
| 217 |
log_message(f"Steel grades normalized in query: {query_changes}")
|
| 218 |
|
| 219 |
|
|
@@ -243,11 +298,11 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
|
|
| 243 |
log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
|
| 244 |
|
| 245 |
# Simple reranking with NORMALIZED question and PARAMETERIZED top_k
|
| 246 |
-
reranked_nodes = rerank_nodes(
|
| 247 |
top_k=rerank_top_k) # NOW PARAMETERIZED
|
| 248 |
|
| 249 |
# Direct query without formatting - use normalized question
|
| 250 |
-
response = query_engine.query(
|
| 251 |
|
| 252 |
end_time = time.time()
|
| 253 |
processing_time = end_time - start_time
|
|
|
|
| 197 |
|
| 198 |
from documents_prep import normalize_text, normalize_steel_designations
|
| 199 |
|
| 200 |
+
def enhance_query_for_steel_grades(query):
    """Append steel-related context terms to a query that mentions a steel grade.

    The query is searched, case-insensitively, for a token made of one to
    three digits followed by a letter from the Latin/Cyrillic set listed in
    the pattern (optionally followed by more digits and letters from that
    set). If such a token exists, a fixed string of Russian context words is
    appended and the enhanced query is logged via ``log_message``.

    Args:
        query: Query text to inspect.

    Returns:
        The query with the context words appended when a grade-like token is
        found; otherwise the query exactly as it was passed in.
    """
    import re

    grade_regex = r'\b\d{1,3}[XHТCВKMAPХНТСВКМАР]\d*[XHТCВKMAPХНТСВКМАР\d]*\b'

    # Only the presence of a grade-like token matters, not the matched text.
    if re.search(grade_regex, query, re.IGNORECASE) is None:
        return query

    with_context = f"{query} стандарт материал марка стали применение"
    log_message(f"Enhanced query with steel context: {with_context}")
    return with_context
|
| 215 |
+
|
| 216 |
+
def generate_sub_questions(question, llm_model):
    """Ask the LLM for related sub-questions that widen query coverage.

    A Russian-language prompt embedding *question* is sent through
    ``llm_model.complete``. The ``text`` of the response is split on
    newlines, blank lines are dropped, each remaining line is stripped, and
    at most the first five are kept. The count and each kept sub-question
    are logged via ``log_message``.

    Args:
        question: The user's main question, interpolated into the prompt.
        llm_model: Object exposing ``complete(prompt)`` whose result has a
            ``text`` attribute.

    Returns:
        A list of up to five sub-question strings, or an empty list if
        anything raised while generating, parsing, or logging them.
    """
    expansion_prompt = f"""Ты эксперт по нормативной документации.
Пользователь задал вопрос: "{question}"

Сгенерируй 3-5 дополнительных вопросов, которые помогут найти полный ответ на основной вопрос.
Вопросы должны быть:
- Максимально близкими и релевантными к основному вопросу
- Покрывать разные аспекты темы (стандарты, материалы, методы, требования)
- Короткими и конкретными

Формат ответа - просто список вопросов, по одному на строку, без нумерации:"""

    try:
        raw_lines = llm_model.complete(expansion_prompt).text.strip().split('\n')

        # Keep non-blank lines only, trimmed of surrounding whitespace.
        candidates = []
        for line in raw_lines:
            cleaned = line.strip()
            if cleaned:
                candidates.append(cleaned)

        # Cap the expansion at five sub-questions.
        sub_questions = candidates[:5]

        log_message(f"Generated {len(sub_questions)} sub-questions:")
        for item in sub_questions:
            log_message(f" - {item}")
    except Exception as e:
        # Best-effort expansion: a failure here yields no sub-questions.
        log_message(f"Error generating sub-questions: {e}")
        return []

    return sub_questions
|
| 245 |
+
|
| 246 |
+
|
| 247 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
|
| 248 |
|
| 249 |
+
# FIXED: Apply all normalizations in correct order
|
| 250 |
normalized_question = normalize_text(question)
|
| 251 |
+
normalized_question_2, query_changes, change_list = normalize_steel_designations(normalized_question)
|
| 252 |
+
|
| 253 |
+
# FIX: Actually call enhance_query_for_steel_grades!
|
| 254 |
+
enhanced_query = enhance_query_for_steel_grades(normalized_question_2)
|
| 255 |
+
|
| 256 |
if change_list:
|
| 257 |
+
log_message(f"Query changes: {', '.join(change_list)}")
|
| 258 |
+
|
| 259 |
if query_engine is None:
|
| 260 |
return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
|
| 261 |
|
| 262 |
try:
|
| 263 |
start_time = time.time()
|
| 264 |
+
|
| 265 |
+
# FIX: Use enhanced_query instead of normalized_question_2
|
| 266 |
+
retrieved_nodes = query_engine.retriever.retrieve(enhanced_query)
|
| 267 |
+
|
| 268 |
log_message(f"user query: {question}")
|
| 269 |
log_message(f"normalized query: {normalized_question}")
|
| 270 |
log_message(f"after steel normalization: {normalized_question_2}")
|
| 271 |
+
log_message(f"enhanced query: {enhanced_query}") # NEW LOG
|
| 272 |
log_message(f"Steel grades normalized in query: {query_changes}")
|
| 273 |
|
| 274 |
|
|
|
|
| 298 |
log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
|
| 299 |
|
| 300 |
# Simple reranking with the ENHANCED query and PARAMETERIZED top_k
|
| 301 |
+
reranked_nodes = rerank_nodes(enhanced_query, unique_retrieved, reranker,
|
| 302 |
top_k=rerank_top_k) # NOW PARAMETERIZED
|
| 303 |
|
| 304 |
# Direct query without formatting - use the enhanced query
|
| 305 |
+
response = query_engine.query(enhanced_query)
|
| 306 |
|
| 307 |
end_time = time.time()
|
| 308 |
processing_time = end_time - start_time
|