Spaces:
Sleeping
Sleeping
Commit
·
379f6e4
1
Parent(s):
15ae02f
new keyword-based approach
Browse files- documents_prep.py +43 -13
- utils.py +18 -2
documents_prep.py
CHANGED
|
@@ -195,6 +195,12 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 195 |
normalized_rows.append(row)
|
| 196 |
|
| 197 |
# Log normalization stats with examples
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
|
| 199 |
log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
|
| 200 |
f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
|
|
@@ -221,19 +227,43 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 221 |
if base_size + len(full_rows_content) <= max_chars and len(normalized_rows) <= max_rows:
|
| 222 |
content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
|
| 223 |
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
|
| 238 |
log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
|
| 239 |
return [Document(text=content, metadata=metadata)]
|
|
|
|
| 195 |
normalized_rows.append(row)
|
| 196 |
|
| 197 |
# Log normalization stats with examples
|
| 198 |
+
if total_row_changes == 0 and title_changes == 0 and section_changes == 0:
|
| 199 |
+
sample_text = str(table_title) + ' ' + str(rows[0] if rows else '')
|
| 200 |
+
cyrillic_chars = [c for c in sample_text if '\u0400' <= c <= '\u04FF']
|
| 201 |
+
if cyrillic_chars:
|
| 202 |
+
log_message(f" ⚠️ WARNING: Found Cyrillic chars but no normalization: {cyrillic_chars[:10]}")
|
| 203 |
+
|
| 204 |
if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
|
| 205 |
log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
|
| 206 |
f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
|
|
|
|
| 227 |
if base_size + len(full_rows_content) <= max_chars and len(normalized_rows) <= max_rows:
|
| 228 |
content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
|
| 229 |
|
| 230 |
+
metadata = {
|
| 231 |
+
'type': 'table',
|
| 232 |
+
'document_id': doc_id,
|
| 233 |
+
'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
|
| 234 |
+
'table_identifier': table_identifier,
|
| 235 |
+
'table_title': table_title,
|
| 236 |
+
'section': section,
|
| 237 |
+
'sheet_name': sheet_name,
|
| 238 |
+
'total_rows': len(normalized_rows),
|
| 239 |
+
'chunk_size': len(content),
|
| 240 |
+
'is_complete_table': True,
|
| 241 |
+
|
| 242 |
+
# ADD THESE - extracted steel grades for better matching
|
| 243 |
+
'steel_grades': extract_steel_grades_from_table(normalized_rows, table_title),
|
| 244 |
+
'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал стандарт {' '.join(extract_steel_grades_from_table(normalized_rows, table_title))}"
|
| 245 |
+
}
|
| 246 |
+
|
| 247 |
+
# Add this helper function:
|
| 248 |
+
def extract_steel_grades_from_table(rows, title):
    """Collect steel grade designations mentioned in a table.

    Args:
        rows: Sequence of table rows. Only the first 20 rows are scanned, and
            only rows that are ``dict`` instances are inspected (each value is
            converted with ``str`` and searched). ``None`` is treated as an
            empty table.
        title: Table title; converted with ``str`` and searched when truthy.

    Returns:
        A sorted list of at most 50 unique matched strings. Sorting makes both
        the order and the choice of entries kept by the 50-item cap
        reproducible; truncating a bare ``set`` would vary between runs.
    """
    import re

    # Pattern for steel grades (both normalized and original).
    # Compiled once per call so the row loop does not re-resolve it.
    pattern = re.compile(
        r'\b\d{1,3}[XHТCВKMAPХНТСВКМАР]\d*[XHТCВKMAPХНТСВКМАР\d]*\b',
        re.IGNORECASE,
    )

    grades = set()

    # Check title
    if title:
        grades.update(pattern.findall(str(title)))

    # Check rows (limit to first 20 rows to avoid bloat)
    for row in (rows or [])[:20]:
        if isinstance(row, dict):
            for value in row.values():
                grades.update(pattern.findall(str(value)))

    return sorted(grades)[:50]
|
| 267 |
|
| 268 |
log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
|
| 269 |
return [Document(text=content, metadata=metadata)]
|
utils.py
CHANGED
|
@@ -197,12 +197,28 @@ def debug_search_tables(vector_index, search_term="С-25"):
|
|
| 197 |
|
| 198 |
from documents_prep import normalize_text, normalize_steel_designations
|
| 199 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
|
| 201 |
|
| 202 |
normalized_question = normalize_text(question)
|
| 203 |
-
log_message(f"Normalized question: {normalized_question}")
|
| 204 |
normalized_question_2, query_changes, change_list = normalize_steel_designations(question) # FIX: 3 values
|
| 205 |
-
|
| 206 |
if change_list:
|
| 207 |
log_message(f"Query changes: {', '.join(change_list)}")
|
| 208 |
if query_engine is None:
|
|
|
|
| 197 |
|
| 198 |
from documents_prep import normalize_text, normalize_steel_designations
|
| 199 |
|
| 200 |
+
def enhance_query_for_steel_grades(query):
    """Expand a query with related terms for better steel grade retrieval.

    Args:
        query: The query string. ``None`` or an empty string is returned
            unchanged instead of being passed to the regex engine.

    Returns:
        ``query`` with a fixed set of contextual terms appended when it
        contains something matching the steel-grade pattern; otherwise
        ``query`` unchanged.
    """
    import re

    # Nothing to search in; also avoids a TypeError from re on None.
    if not query:
        return query

    # Detect if query contains steel grades
    steel_pattern = r'\b\d{1,3}[XHТCВKMAPХНТСВКМАР]\d*[XHТCВKMAPХНТСВКМАР\d]*\b'

    # Only the presence of a match matters, so stop at the first one.
    if re.search(steel_pattern, query, re.IGNORECASE) is None:
        return query

    # Add contextual terms
    enhanced = query + " стандарт материал марка стали применение"
    log_message(f"Enhanced query with steel context: {enhanced}")
    return enhanced
|
| 215 |
+
|
| 216 |
+
|
| 217 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
|
| 218 |
|
| 219 |
normalized_question = normalize_text(question)
|
|
|
|
| 220 |
normalized_question_2, query_changes, change_list = normalize_steel_designations(question) # FIX: 3 values
|
| 221 |
+
normalized_question_2 = enhance_query_for_steel_grades(normalized_question_2)
|
| 222 |
if change_list:
|
| 223 |
log_message(f"Query changes: {', '.join(change_list)}")
|
| 224 |
if query_engine is None:
|