MrSimple07 committed on
Commit
78e6c03
·
1 Parent(s): 52b85db

new normalizer: Latin C to Cyrillic С

Browse files
Files changed (3) hide show
  1. documents_prep.py +32 -11
  2. index_retriever.py +4 -106
  3. utils.py +20 -21
documents_prep.py CHANGED
@@ -34,6 +34,25 @@ def chunk_text_documents(documents):
34
 
35
  return chunked
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
39
  headers = table_data.get('headers', [])
@@ -43,6 +62,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
43
  section = table_data.get('section', '')
44
 
45
  table_num_clean = str(table_num).strip()
 
46
 
47
  import re
48
  if 'приложени' in section.lower():
@@ -60,8 +80,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
60
 
61
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
62
 
63
- # Calculate base metadata size
64
- base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
65
  base_size = len(base_content)
66
  available_space = max_chars - base_size - 200
67
 
@@ -74,8 +94,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
74
  'type': 'table',
75
  'document_id': doc_id,
76
  'table_number': table_num_clean,
77
- 'table_identifier': table_identifier,
78
- 'table_title': table_title,
79
  'section': section,
80
  'total_rows': len(rows),
81
  'chunk_size': len(content),
@@ -105,8 +125,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
105
  'type': 'table',
106
  'document_id': doc_id,
107
  'table_number': table_num_clean,
108
- 'table_identifier': table_identifier,
109
- 'table_title': table_title,
110
  'section': section,
111
  'chunk_id': chunk_num,
112
  'row_start': current_rows[0]['_idx'] - 1,
@@ -139,8 +159,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
139
  'type': 'table',
140
  'document_id': doc_id,
141
  'table_number': table_num_clean,
142
- 'table_identifier': table_identifier,
143
- 'table_title': table_title,
144
  'section': section,
145
  'chunk_id': chunk_num,
146
  'row_start': current_rows[0]['_idx'] - 1,
@@ -156,15 +176,16 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
156
  return chunks
157
 
158
 
 
159
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
160
- content = f"ТАБЛИЦА {table_identifier} из документа {doc_id}\n"
161
 
162
  # Add table type/number prominently for matching
163
  if table_num:
164
- content += f"ТИП: {table_num}\n"
165
 
166
  if table_title:
167
- content += f"НАЗВАНИЕ: {table_title}\n"
168
 
169
  if section:
170
  content += f"РАЗДЕЛ: {section}\n"
 
34
 
35
  return chunked
36
 
37
def normalize_text(text):
    """Replace a stand-alone Latin 'C' with the Cyrillic look-alike (U+0421).

    Used so that weld-type designators typed with a Latin letter ("C-25")
    and with a Cyrillic letter compare equal in search.

    A Latin 'C' is converted only when it starts a token (it is not preceded
    by a letter, digit or underscore) and is immediately followed by a
    hyphen, a space or a digit. A 'C' that ends a longer token, such as
    "HRC 45" or "IEC-60974", is left untouched so ordinary Latin
    abbreviations are not turned into mixed-script words.

    Args:
        text: Value to normalize. Falsy values (None, "", 0) are returned
            unchanged; any other non-string value is converted with str().

    Returns:
        The normalized string, or the original falsy value.
    """
    import re  # function-scope import, as done elsewhere in this file

    if not text:
        return text

    # Callers may pass a raw table number that is not a string.
    if not isinstance(text, str):
        text = str(text)

    # \b        : 'C' must begin a token
    # (?=[- \d]): followed by '-', ' ' or a digit (lookahead keeps that char)
    # The replacement is spelled as an escape because the Cyrillic letter is
    # visually indistinguishable from Latin 'C' in most fonts.
    return re.sub(r'\bC(?=[- \d])', '\u0421', text)
56
 
57
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
58
  headers = table_data.get('headers', [])
 
62
  section = table_data.get('section', '')
63
 
64
  table_num_clean = str(table_num).strip()
65
+ table_title_normalized = normalize_text(str(table_title)) # NORMALIZE TITLE
66
 
67
  import re
68
  if 'приложени' in section.lower():
 
80
 
81
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
82
 
83
+ # Calculate base metadata size with NORMALIZED title
84
+ base_content = format_table_header(doc_id, table_identifier, table_num, table_title_normalized, section, headers)
85
  base_size = len(base_content)
86
  available_space = max_chars - base_size - 200
87
 
 
94
  'type': 'table',
95
  'document_id': doc_id,
96
  'table_number': table_num_clean,
97
+ 'table_identifier': normalize_text(table_identifier), # NORMALIZE identifier
98
+ 'table_title': table_title_normalized, # NORMALIZED
99
  'section': section,
100
  'total_rows': len(rows),
101
  'chunk_size': len(content),
 
125
  'type': 'table',
126
  'document_id': doc_id,
127
  'table_number': table_num_clean,
128
+ 'table_identifier': normalize_text(table_identifier), # NORMALIZE
129
+ 'table_title': table_title_normalized, # NORMALIZED
130
  'section': section,
131
  'chunk_id': chunk_num,
132
  'row_start': current_rows[0]['_idx'] - 1,
 
159
  'type': 'table',
160
  'document_id': doc_id,
161
  'table_number': table_num_clean,
162
+ 'table_identifier': normalize_text(table_identifier), # NORMALIZE
163
+ 'table_title': table_title_normalized, # NORMALIZED
164
  'section': section,
165
  'chunk_id': chunk_num,
166
  'row_start': current_rows[0]['_idx'] - 1,
 
176
  return chunks
177
 
178
 
179
+ # MODIFIED: Update format_table_header function
180
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
181
+ content = f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"
182
 
183
  # Add table type/number prominently for matching
184
  if table_num:
185
+ content += f"ТИП: {normalize_text(table_num)}\n"
186
 
187
  if table_title:
188
+ content += f"НАЗВАНИЕ: {normalize_text(table_title)}\n"
189
 
190
  if section:
191
  content += f"РАЗДЕЛ: {section}\n"
index_retriever.py CHANGED
@@ -65,96 +65,9 @@ def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
65
  log_message(f"Ошибка переранжировки: {str(e)}")
66
  return nodes[:top_k]
67
 
68
def extract_weld_type_from_query(query):
    """Return the first weld-joint designator found in *query*, or None.

    A designator is one of the Cyrillic letters С, У or Т (either case),
    a hyphen and one or more digits, e.g. "С-25", "У-12", "Т-5". The letter
    must not be preceded by another word character, so the tail of a longer
    word such as "ГОСТ-5264" is not reported as "Т-5264".

    Args:
        query: User query text. None, empty or non-string values yield None.

    Returns:
        The matched designator upper-cased, or None when nothing matches.
    """
    import re

    if not query or not isinstance(query, str):
        return None

    # Letters are written as escapes (U+0421 С, U+0423 У, U+0422 Т) so they
    # cannot be confused with their Latin look-alikes when editing.
    # A separate "тип <designator>" pattern is unnecessary: any text it could
    # match already contains a match for this pattern.
    match = re.search(r'(?<!\w)[\u0421\u0423\u0422]-\d+', query, re.IGNORECASE)
    if match is None:
        return None
    return match.group(0).upper()
87
-
88
-
89
def retrieve_nodes_with_weld_type_priority(query, vector_index, hybrid_retriever, reranker, top_k=20):
    """Retrieve nodes for *query*, adding tables that mention its weld type.

    When extract_weld_type_from_query() finds a weld type in the query, every
    node in ``vector_index.docstore.docs`` whose metadata has
    ``type == 'table'`` and whose table number, table title or section
    contains that weld type is collected as a direct match. Direct matches
    are placed ahead of the hybrid retriever's results, the combined list is
    de-duplicated and then passed to rerank_nodes().

    When no weld type is found, or no table matches it, the function falls
    back to hybrid retrieval followed by de-duplication and reranking.

    Args:
        query: User query text.
        vector_index: Index whose ``docstore.docs`` mapping is scanned.
        hybrid_retriever: Object exposing ``retrieve(query)``.
        reranker: Passed through to rerank_nodes().
        top_k: Passed through to rerank_nodes() as ``top_k``.

    Returns:
        The result of rerank_nodes() for the selected candidates.
    """
    from utils import deduplicate_nodes

    log_message(f"Enhanced retrieval for query: {query}")

    # Step 1: try to extract a weld type from the query.
    weld_type = extract_weld_type_from_query(query)

    if weld_type:
        log_message(f"Detected weld type in query: {weld_type}")

        # Step 2: scan the docstore for table nodes mentioning this weld type.
        direct_matches = []
        for node in vector_index.docstore.docs.values():
            metadata = getattr(node, 'metadata', {})
            if metadata.get('type') != 'table':
                continue

            table_title = metadata.get('table_title', '')
            searchable_fields = (
                metadata.get('table_number', ''),
                table_title,
                metadata.get('section', ''),
            )
            if any(weld_type in str(field) for field in searchable_fields):
                direct_matches.append(node)
                log_message(f" Direct match found: {metadata.get('document_id')} - {table_title}")

        if direct_matches:
            direct_matches = deduplicate_nodes(direct_matches)
            log_message(f"Found {len(direct_matches)} direct matches for {weld_type}")

            # Supplement the direct matches with hybrid results for context.
            # NOTE(review): direct matches come straight from the docstore
            # while hybrid results come from a retriever; confirm that
            # deduplicate_nodes and rerank_nodes accept both kinds of object.
            hybrid_results = hybrid_retriever.retrieve(query)
            combined = deduplicate_nodes(direct_matches + hybrid_results)

            reranked = rerank_nodes(query, combined, reranker, top_k=top_k)
            log_message(f"Combined retrieval: {len(direct_matches)} direct + hybrid, returning {len(reranked)} reranked")
            return reranked

        # A weld type was detected but no table mentions it; say so instead
        # of reporting that no weld type was detected.
        log_message(f"No direct matches for weld type {weld_type}, using standard hybrid retrieval")
    else:
        log_message("No weld type detected, using standard hybrid retrieval")

    # Step 3: standard hybrid retrieval.
    retrieved_nodes = deduplicate_nodes(hybrid_retriever.retrieve(query))
    return rerank_nodes(query, retrieved_nodes, reranker, top_k=top_k)
147
-
148
-
149
- # Update create_query_engine to use the enhanced retrieval
150
  def create_query_engine(vector_index):
151
  try:
152
  from config import CUSTOM_PROMPT
153
- from llama_index.core.prompts import PromptTemplate
154
- from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
155
- from llama_index.core.query_engine import RetrieverQueryEngine
156
- from llama_index.retrievers.bm25 import BM25Retriever
157
- from llama_index.core.retrievers import QueryFusionRetriever, VectorIndexRetriever
158
 
159
  bm25_retriever = BM25Retriever.from_defaults(
160
  docstore=vector_index.docstore,
@@ -179,29 +92,14 @@ def create_query_engine(vector_index):
179
  text_qa_template=custom_prompt_template
180
  )
181
 
182
- # Create custom query engine with enhanced retrieval
183
- class EnhancedRetrieverQueryEngine(RetrieverQueryEngine):
184
- def __init__(self, retriever, response_synthesizer, vector_index, reranker):
185
- super().__init__(retriever=retriever, response_synthesizer=response_synthesizer)
186
- self.vector_index = vector_index
187
- self.reranker = reranker
188
-
189
- def retrieve(self, query):
190
- """Override retrieve to use enhanced weld-type-aware retrieval"""
191
- return retrieve_nodes_with_weld_type_priority(
192
- query, self.vector_index, self.retriever, self.reranker, top_k=20
193
- )
194
-
195
- query_engine = EnhancedRetrieverQueryEngine(
196
  retriever=hybrid_retriever,
197
- response_synthesizer=response_synthesizer,
198
- vector_index=vector_index,
199
- reranker=None # Will be passed in later
200
  )
201
 
202
- log_message("Enhanced query engine created with weld-type prioritization")
203
  return query_engine
204
 
205
  except Exception as e:
206
- log_message(f"Error creating enhanced query engine: {str(e)}")
207
  raise
 
65
  log_message(f"Ошибка переранжировки: {str(e)}")
66
  return nodes[:top_k]
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  def create_query_engine(vector_index):
69
  try:
70
  from config import CUSTOM_PROMPT
 
 
 
 
 
71
 
72
  bm25_retriever = BM25Retriever.from_defaults(
73
  docstore=vector_index.docstore,
 
92
  text_qa_template=custom_prompt_template
93
  )
94
 
95
+ query_engine = RetrieverQueryEngine(
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  retriever=hybrid_retriever,
97
+ response_synthesizer=response_synthesizer
 
 
98
  )
99
 
100
+ log_message("Query engine успешно создан")
101
  return query_engine
102
 
103
  except Exception as e:
104
+ log_message(f"Ошибка создания query engine: {str(e)}")
105
  raise
utils.py CHANGED
@@ -195,42 +195,43 @@ def debug_search_tables(vector_index, search_term="С-25"):
195
 
196
  return matching
197
 
 
 
 
 
198
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
 
 
 
199
  if query_engine is None:
200
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
201
 
202
  try:
203
  start_time = time.time()
204
-
205
- # Use enhanced retrieval if available
206
- if hasattr(query_engine, 'retrieve'):
207
- # Use the enhanced retrieval that's aware of weld types
208
- retrieved_nodes = query_engine.retrieve(question)
209
- else:
210
- # Fallback to standard retrieval
211
- retrieved_nodes = query_engine.retriever.retrieve(question)
212
-
213
  log_message(f"user query: {question}")
 
 
 
214
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
215
 
216
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
 
 
217
  log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
218
-
219
- for i, node in enumerate(unique_retrieved[:15]): # Log first 15
220
  table_num = node.metadata.get('table_number', 'N/A')
221
  table_title = node.metadata.get('table_title', 'N/A')
222
  doc_id = node.metadata.get('document_id', 'N/A')
223
  log_message(f" [{i+1}] {doc_id} - Table {table_num}: {table_title[:50]}")
 
224
 
225
- # Rerank only if we have nodes
226
- if unique_retrieved:
227
- reranked_nodes = rerank_nodes(question, unique_retrieved, reranker, top_k=20)
228
- else:
229
- reranked_nodes = []
230
- log_message("WARNING: No nodes to rerank!")
231
 
232
- # Direct query without formatting
233
- response = query_engine.query(question)
234
 
235
  end_time = time.time()
236
  processing_time = end_time - start_time
@@ -269,7 +270,5 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
269
 
270
  except Exception as e:
271
  log_message(f"Ошибка: {str(e)}")
272
- import traceback
273
- log_message(traceback.format_exc())
274
  error_msg = f"<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Ошибка: {str(e)}</div>"
275
  return error_msg, "", ""
 
195
 
196
  return matching
197
 
198
+ # Add this import at the top of utils.py
199
+ from documents_prep import normalize_text
200
+
201
+ # MODIFIED: Update answer_question function
202
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
203
+ # NORMALIZE the question to convert C to С
204
+ normalized_question = normalize_text(question)
205
+
206
  if query_engine is None:
207
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
208
 
209
  try:
210
  start_time = time.time()
211
+ # Use NORMALIZED question for retrieval
212
+ retrieved_nodes = query_engine.retriever.retrieve(normalized_question)
 
 
 
 
 
 
 
213
  log_message(f"user query: {question}")
214
+ log_message(f"normalized query: {normalized_question}")
215
+
216
+
217
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
218
 
219
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
220
+
221
+ # DEBUG: Log what was retrieved
222
  log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
223
+ for i, node in enumerate(unique_retrieved): # All debug
 
224
  table_num = node.metadata.get('table_number', 'N/A')
225
  table_title = node.metadata.get('table_title', 'N/A')
226
  doc_id = node.metadata.get('document_id', 'N/A')
227
  log_message(f" [{i+1}] {doc_id} - Table {table_num}: {table_title[:50]}")
228
+ log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
229
 
230
+ # Simple reranking with NORMALIZED question
231
+ reranked_nodes = rerank_nodes(normalized_question, unique_retrieved, reranker, top_k=20)
 
 
 
 
232
 
233
+ # Direct query without formatting - use normalized question
234
+ response = query_engine.query(normalized_question)
235
 
236
  end_time = time.time()
237
  processing_time = end_time - start_time
 
270
 
271
  except Exception as e:
272
  log_message(f"Ошибка: {str(e)}")
 
 
273
  error_msg = f"<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Ошибка: {str(e)}</div>"
274
  return error_msg, "", ""