Spaces:
Sleeping
Sleeping
Commit
·
0d6b2c5
1
Parent(s):
3dcab53
old version with fixed, 3000, 30
Browse files
- config.py +1 -1
- documents_prep.py +31 -61
- utils.py +25 -47
config.py
CHANGED
|
@@ -51,7 +51,7 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
|
|
| 51 |
CHUNK_SIZE = 1500
|
| 52 |
CHUNK_OVERLAP = 128
|
| 53 |
|
| 54 |
-
MAX_CHARS_TABLE =
|
| 55 |
MAX_ROWS_TABLE = 30
|
| 56 |
|
| 57 |
|
|
|
|
| 51 |
CHUNK_SIZE = 1500
|
| 52 |
CHUNK_OVERLAP = 128
|
| 53 |
|
| 54 |
+
MAX_CHARS_TABLE = 3000
|
| 55 |
MAX_ROWS_TABLE = 30
|
| 56 |
|
| 57 |
|
documents_prep.py
CHANGED
|
@@ -26,9 +26,9 @@ import re
|
|
| 26 |
|
| 27 |
def normalize_steel_designations(text):
|
| 28 |
"""
|
| 29 |
-
Normalize steel designations by converting
|
| 30 |
-
|
| 31 |
-
|
| 32 |
Returns: (normalized_text, changes_count, changes_list)
|
| 33 |
"""
|
| 34 |
if not text:
|
|
@@ -39,24 +39,25 @@ def normalize_steel_designations(text):
|
|
| 39 |
changes_count = 0
|
| 40 |
changes_list = []
|
| 41 |
|
| 42 |
-
# Mapping of
|
| 43 |
replacements = {
|
| 44 |
-
'
|
| 45 |
-
'
|
| 46 |
-
'
|
| 47 |
-
'
|
| 48 |
-
'
|
| 49 |
-
'
|
| 50 |
-
'
|
| 51 |
-
'
|
| 52 |
-
'
|
| 53 |
}
|
| 54 |
|
| 55 |
-
# Pattern
|
|
|
|
| 56 |
pattern = r'\b\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
|
| 57 |
-
|
| 58 |
-
#
|
| 59 |
-
pattern_wire = r'\b[
|
| 60 |
|
| 61 |
def replace_in_steel_grade(match):
|
| 62 |
nonlocal changes_count, changes_list
|
|
@@ -75,7 +76,6 @@ def normalize_steel_designations(text):
|
|
| 75 |
|
| 76 |
|
| 77 |
|
| 78 |
-
|
| 79 |
def chunk_text_documents(documents):
|
| 80 |
text_splitter = SentenceSplitter(
|
| 81 |
chunk_size=CHUNK_SIZE,
|
|
@@ -195,12 +195,6 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 195 |
normalized_rows.append(row)
|
| 196 |
|
| 197 |
# Log normalization stats with examples
|
| 198 |
-
if total_row_changes == 0 and title_changes == 0 and section_changes == 0:
|
| 199 |
-
sample_text = str(table_title) + ' ' + str(rows[0] if rows else '')
|
| 200 |
-
cyrillic_chars = [c for c in sample_text if '\u0400' <= c <= '\u04FF']
|
| 201 |
-
if cyrillic_chars:
|
| 202 |
-
log_message(f" ⚠️ WARNING: Found Cyrillic chars but no normalization: {cyrillic_chars[:10]}")
|
| 203 |
-
|
| 204 |
if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
|
| 205 |
log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
|
| 206 |
f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
|
|
@@ -227,43 +221,19 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 227 |
if base_size + len(full_rows_content) <= max_chars and len(normalized_rows) <= max_rows:
|
| 228 |
content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
|
| 229 |
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
'steel_grades': extract_steel_grades_from_table(normalized_rows, table_title),
|
| 244 |
-
'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал стандарт {' '.join(extract_steel_grades_from_table(normalized_rows, table_title))}"
|
| 245 |
-
}
|
| 246 |
-
|
| 247 |
-
# Add this helper function:
|
| 248 |
-
def extract_steel_grades_from_table(rows, title):
|
| 249 |
-
"""Extract all steel grade mentions for metadata"""
|
| 250 |
-
import re
|
| 251 |
-
grades = set()
|
| 252 |
-
|
| 253 |
-
# Pattern for steel grades (both normalized and original)
|
| 254 |
-
pattern = r'\b\d{1,3}[XHТCВKMAPХНТСВКМАР]\d*[XHТCВKMAPХНТСВКМАР\d]*\b'
|
| 255 |
-
|
| 256 |
-
# Check title
|
| 257 |
-
if title:
|
| 258 |
-
grades.update(re.findall(pattern, str(title), re.IGNORECASE))
|
| 259 |
-
|
| 260 |
-
# Check rows (limit to first 20 rows to avoid bloat)
|
| 261 |
-
for row in rows[:20]:
|
| 262 |
-
if isinstance(row, dict):
|
| 263 |
-
for v in row.values():
|
| 264 |
-
grades.update(re.findall(pattern, str(v), re.IGNORECASE))
|
| 265 |
-
|
| 266 |
-
return list(grades)[:50]
|
| 267 |
|
| 268 |
log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
|
| 269 |
return [Document(text=content, metadata=metadata)]
|
|
|
|
| 26 |
|
| 27 |
def normalize_steel_designations(text):
|
| 28 |
"""
|
| 29 |
+
Normalize steel designations by converting Cyrillic letters to Latin.
|
| 30 |
+
This improves search/retrieval since embedding models work better with Latin.
|
| 31 |
+
Handles patterns like 08Х18Н10Т → 08X18H10T
|
| 32 |
Returns: (normalized_text, changes_count, changes_list)
|
| 33 |
"""
|
| 34 |
if not text:
|
|
|
|
| 39 |
changes_count = 0
|
| 40 |
changes_list = []
|
| 41 |
|
| 42 |
+
# Mapping of Cyrillic to Latin for steel designations
|
| 43 |
replacements = {
|
| 44 |
+
'Х': 'X', # Cyrillic Kha → Latin X
|
| 45 |
+
'Н': 'H', # Cyrillic En → Latin H
|
| 46 |
+
'Т': 'T', # Cyrillic Te → Latin T
|
| 47 |
+
'С': 'C', # Cyrillic Es → Latin C
|
| 48 |
+
'В': 'B', # Cyrillic Ve → Latin B
|
| 49 |
+
'К': 'K', # Cyrillic Ka → Latin K
|
| 50 |
+
'М': 'M', # Cyrillic Em → Latin M
|
| 51 |
+
'А': 'A', # Cyrillic A → Latin A
|
| 52 |
+
'Р': 'P', # Cyrillic Er → Latin P
|
| 53 |
}
|
| 54 |
|
| 55 |
+
# Pattern: starts with digits, then letters+digits (steel grade pattern)
|
| 56 |
+
# Examples: 08Х18Н10Т, 12Х18Н9, 10Х17Н13М2Т, СВ-08Х19Н10
|
| 57 |
pattern = r'\b\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
|
| 58 |
+
|
| 59 |
+
# Also match welding wire patterns like СВ-08Х19Н10
|
| 60 |
+
pattern_wire = r'\b[СC][ВB]-\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
|
| 61 |
|
| 62 |
def replace_in_steel_grade(match):
|
| 63 |
nonlocal changes_count, changes_list
|
|
|
|
| 76 |
|
| 77 |
|
| 78 |
|
|
|
|
| 79 |
def chunk_text_documents(documents):
|
| 80 |
text_splitter = SentenceSplitter(
|
| 81 |
chunk_size=CHUNK_SIZE,
|
|
|
|
| 195 |
normalized_rows.append(row)
|
| 196 |
|
| 197 |
# Log normalization stats with examples
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
|
| 199 |
log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
|
| 200 |
f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
|
|
|
|
| 221 |
if base_size + len(full_rows_content) <= max_chars and len(normalized_rows) <= max_rows:
|
| 222 |
content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
|
| 223 |
|
| 224 |
+
metadata = {
|
| 225 |
+
'type': 'table',
|
| 226 |
+
'document_id': doc_id,
|
| 227 |
+
'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
|
| 228 |
+
'table_identifier': table_identifier,
|
| 229 |
+
'table_title': table_title,
|
| 230 |
+
'section': section,
|
| 231 |
+
'sheet_name': sheet_name,
|
| 232 |
+
'total_rows': len(normalized_rows),
|
| 233 |
+
'chunk_size': len(content),
|
| 234 |
+
'is_complete_table': True,
|
| 235 |
+
'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
|
| 236 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
|
| 238 |
log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
|
| 239 |
return [Document(text=content, metadata=metadata)]
|
utils.py
CHANGED
|
@@ -197,71 +197,47 @@ def debug_search_tables(vector_index, search_term="С-25"):
|
|
| 197 |
|
| 198 |
from documents_prep import normalize_text, normalize_steel_designations
|
| 199 |
|
| 200 |
-
def
|
| 201 |
-
"""
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
log_message(f"Generated {len(variations)} query variations:")
|
| 216 |
-
for i, var in enumerate(variations, 1):
|
| 217 |
-
log_message(f" {i}. {var}")
|
| 218 |
-
|
| 219 |
-
# Combine original + variations
|
| 220 |
-
combined_query = query + " " + " ".join(variations)
|
| 221 |
-
return combined_query
|
| 222 |
-
else:
|
| 223 |
-
log_message("No variations generated, using original query")
|
| 224 |
-
return query
|
| 225 |
-
|
| 226 |
-
except Exception as e:
|
| 227 |
-
log_message(f"Error generating query variations: {e}")
|
| 228 |
-
return query
|
| 229 |
|
| 230 |
|
| 231 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
|
| 232 |
|
| 233 |
-
# Apply normalizations
|
| 234 |
normalized_question = normalize_text(question)
|
| 235 |
-
normalized_question_2, query_changes, change_list = normalize_steel_designations(
|
| 236 |
-
|
| 237 |
if change_list:
|
| 238 |
-
log_message(f"Query changes: {', '.join(change_list)}")
|
| 239 |
-
|
| 240 |
if query_engine is None:
|
| 241 |
return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
|
| 242 |
|
| 243 |
try:
|
| 244 |
start_time = time.time()
|
| 245 |
-
|
| 246 |
-
# EXPAND QUERY USING LLM
|
| 247 |
-
from utils import get_llm_model
|
| 248 |
-
llm = get_llm_model(current_model)
|
| 249 |
-
expanded_query = expand_query_with_llm(normalized_question_2, llm)
|
| 250 |
-
|
| 251 |
-
# Use expanded query for retrieval
|
| 252 |
-
retrieved_nodes = query_engine.retriever.retrieve(expanded_query)
|
| 253 |
-
|
| 254 |
log_message(f"user query: {question}")
|
| 255 |
log_message(f"normalized query: {normalized_question}")
|
| 256 |
log_message(f"after steel normalization: {normalized_question_2}")
|
| 257 |
-
log_message(f"expanded query length: {len(expanded_query)} chars")
|
| 258 |
log_message(f"Steel grades normalized in query: {query_changes}")
|
|
|
|
| 259 |
|
| 260 |
log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
|
| 261 |
|
| 262 |
unique_retrieved = deduplicate_nodes(retrieved_nodes)
|
|
|
|
|
|
|
| 263 |
log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
|
| 264 |
-
|
| 265 |
for i, node in enumerate(unique_retrieved):
|
| 266 |
node_type = node.metadata.get('type', 'text')
|
| 267 |
doc_id = node.metadata.get('document_id', 'N/A')
|
|
@@ -270,6 +246,7 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
|
|
| 270 |
table_num = node.metadata.get('table_number', 'N/A')
|
| 271 |
table_id = node.metadata.get('table_identifier', 'N/A')
|
| 272 |
table_title = node.metadata.get('table_title', 'N/A')
|
|
|
|
| 273 |
content_preview = node.text[:200].replace('\n', ' ')
|
| 274 |
log_message(f" [{i+1}] {doc_id} - Table {table_num} | ID: {table_id}")
|
| 275 |
log_message(f" Title: {table_title[:80]}")
|
|
@@ -280,10 +257,11 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
|
|
| 280 |
|
| 281 |
log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
|
| 282 |
|
|
|
|
| 283 |
reranked_nodes = rerank_nodes(normalized_question_2, unique_retrieved, reranker,
|
| 284 |
-
top_k=rerank_top_k)
|
| 285 |
|
| 286 |
-
#
|
| 287 |
response = query_engine.query(normalized_question_2)
|
| 288 |
|
| 289 |
end_time = time.time()
|
|
|
|
| 197 |
|
| 198 |
from documents_prep import normalize_text, normalize_steel_designations
|
| 199 |
|
| 200 |
+
def enhance_query_for_steel_grades(query):
|
| 201 |
+
"""Expand query with related terms for better steel grade retrieval"""
|
| 202 |
+
import re
|
| 203 |
+
|
| 204 |
+
# Detect if query contains steel grades
|
| 205 |
+
steel_pattern = r'\b\d{1,3}[XHТCВKMAPХНТСВКМАР]\d*[XHТCВKMAPХНТСВКМАР\d]*\b'
|
| 206 |
+
matches = re.findall(steel_pattern, query, re.IGNORECASE)
|
| 207 |
+
|
| 208 |
+
if matches:
|
| 209 |
+
# Add contextual terms
|
| 210 |
+
enhanced = query + " стандарт материал марка стали применение"
|
| 211 |
+
log_message(f"Enhanced query with steel context: {enhanced}")
|
| 212 |
+
return enhanced
|
| 213 |
+
|
| 214 |
+
return query
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
|
| 216 |
|
| 217 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
|
| 218 |
|
|
|
|
| 219 |
normalized_question = normalize_text(question)
|
| 220 |
+
normalized_question_2, query_changes, change_list = normalize_steel_designations(question) # FIX: 3 values
|
|
|
|
| 221 |
if change_list:
|
| 222 |
+
log_message(f"Query changes: {', '.join(change_list)}")
|
|
|
|
| 223 |
if query_engine is None:
|
| 224 |
return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
|
| 225 |
|
| 226 |
try:
|
| 227 |
start_time = time.time()
|
| 228 |
+
retrieved_nodes = query_engine.retriever.retrieve(normalized_question_2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
log_message(f"user query: {question}")
|
| 230 |
log_message(f"normalized query: {normalized_question}")
|
| 231 |
log_message(f"after steel normalization: {normalized_question_2}")
|
|
|
|
| 232 |
log_message(f"Steel grades normalized in query: {query_changes}")
|
| 233 |
+
|
| 234 |
|
| 235 |
log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
|
| 236 |
|
| 237 |
unique_retrieved = deduplicate_nodes(retrieved_nodes)
|
| 238 |
+
|
| 239 |
+
# IMPROVED DEBUG: Log what was actually retrieved with FULL metadata
|
| 240 |
log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
|
|
|
|
| 241 |
for i, node in enumerate(unique_retrieved):
|
| 242 |
node_type = node.metadata.get('type', 'text')
|
| 243 |
doc_id = node.metadata.get('document_id', 'N/A')
|
|
|
|
| 246 |
table_num = node.metadata.get('table_number', 'N/A')
|
| 247 |
table_id = node.metadata.get('table_identifier', 'N/A')
|
| 248 |
table_title = node.metadata.get('table_title', 'N/A')
|
| 249 |
+
# Show first 200 chars of content to verify it's the right table
|
| 250 |
content_preview = node.text[:200].replace('\n', ' ')
|
| 251 |
log_message(f" [{i+1}] {doc_id} - Table {table_num} | ID: {table_id}")
|
| 252 |
log_message(f" Title: {table_title[:80]}")
|
|
|
|
| 257 |
|
| 258 |
log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
|
| 259 |
|
| 260 |
+
# Simple reranking with NORMALIZED question and PARAMETERIZED top_k
|
| 261 |
reranked_nodes = rerank_nodes(normalized_question_2, unique_retrieved, reranker,
|
| 262 |
+
top_k=rerank_top_k) # NOW PARAMETERIZED
|
| 263 |
|
| 264 |
+
# Direct query without formatting - use normalized question
|
| 265 |
response = query_engine.query(normalized_question_2)
|
| 266 |
|
| 267 |
end_time = time.time()
|