MrSimple07 committed on
Commit
0d6b2c5
·
1 Parent(s): 3dcab53

old version with fixed, 3000, 30

Browse files
Files changed (3) hide show
  1. config.py +1 -1
  2. documents_prep.py +31 -61
  3. utils.py +25 -47
config.py CHANGED
@@ -51,7 +51,7 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
51
  CHUNK_SIZE = 1500
52
  CHUNK_OVERLAP = 128
53
 
54
- MAX_CHARS_TABLE = 4000
55
  MAX_ROWS_TABLE = 30
56
 
57
 
 
51
  CHUNK_SIZE = 1500
52
  CHUNK_OVERLAP = 128
53
 
54
+ MAX_CHARS_TABLE = 3000
55
  MAX_ROWS_TABLE = 30
56
 
57
 
documents_prep.py CHANGED
@@ -26,9 +26,9 @@ import re
26
 
27
  def normalize_steel_designations(text):
28
  """
29
- Normalize steel designations by converting Latin letters to Cyrillic.
30
- Handles patterns like 08X18H10T → 08Х18Н10Т.
31
- Useful when aligning with Russian technical documentation.
32
  Returns: (normalized_text, changes_count, changes_list)
33
  """
34
  if not text:
@@ -39,24 +39,25 @@ def normalize_steel_designations(text):
39
  changes_count = 0
40
  changes_list = []
41
 
42
- # Mapping of Latin to Cyrillic for steel designations
43
  replacements = {
44
- 'X': 'Х', # Latin X → Cyrillic Х
45
- 'H': 'Н', # Latin H → Cyrillic Н
46
- 'T': 'Т', # Latin T → Cyrillic Т
47
- 'C': 'С', # Latin C → Cyrillic С
48
- 'B': 'В', # Latin B → Cyrillic В
49
- 'K': 'К', # Latin K → Cyrillic К
50
- 'M': 'М', # Latin M → Cyrillic М
51
- 'A': 'А', # Latin A → Cyrillic А
52
- 'P': 'Р', # Latin P → Cyrillic Р
53
  }
54
 
55
- # Pattern for steel grades (digits + letters)
 
56
  pattern = r'\b\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
57
-
58
- # Pattern for welding wire designations (e.g. CB-08X19H10)
59
- pattern_wire = r'\b[CSС][BVВ]-\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
60
 
61
  def replace_in_steel_grade(match):
62
  nonlocal changes_count, changes_list
@@ -75,7 +76,6 @@ def normalize_steel_designations(text):
75
 
76
 
77
 
78
-
79
  def chunk_text_documents(documents):
80
  text_splitter = SentenceSplitter(
81
  chunk_size=CHUNK_SIZE,
@@ -195,12 +195,6 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
195
  normalized_rows.append(row)
196
 
197
  # Log normalization stats with examples
198
- if total_row_changes == 0 and title_changes == 0 and section_changes == 0:
199
- sample_text = str(table_title) + ' ' + str(rows[0] if rows else '')
200
- cyrillic_chars = [c for c in sample_text if '\u0400' <= c <= '\u04FF']
201
- if cyrillic_chars:
202
- log_message(f" ⚠️ WARNING: Found Cyrillic chars but no normalization: {cyrillic_chars[:10]}")
203
-
204
  if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
205
  log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
206
  f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
@@ -227,43 +221,19 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
227
  if base_size + len(full_rows_content) <= max_chars and len(normalized_rows) <= max_rows:
228
  content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
229
 
230
- metadata = {
231
- 'type': 'table',
232
- 'document_id': doc_id,
233
- 'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
234
- 'table_identifier': table_identifier,
235
- 'table_title': table_title,
236
- 'section': section,
237
- 'sheet_name': sheet_name,
238
- 'total_rows': len(normalized_rows),
239
- 'chunk_size': len(content),
240
- 'is_complete_table': True,
241
-
242
- # ADD THESE - extracted steel grades for better matching
243
- 'steel_grades': extract_steel_grades_from_table(normalized_rows, table_title),
244
- 'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал стандарт {' '.join(extract_steel_grades_from_table(normalized_rows, table_title))}"
245
- }
246
-
247
- # Add this helper function:
248
- def extract_steel_grades_from_table(rows, title):
249
- """Extract all steel grade mentions for metadata"""
250
- import re
251
- grades = set()
252
-
253
- # Pattern for steel grades (both normalized and original)
254
- pattern = r'\b\d{1,3}[XHТCВKMAPХНТСВКМАР]\d*[XHТCВKMAPХНТСВКМАР\d]*\b'
255
-
256
- # Check title
257
- if title:
258
- grades.update(re.findall(pattern, str(title), re.IGNORECASE))
259
-
260
- # Check rows (limit to first 20 rows to avoid bloat)
261
- for row in rows[:20]:
262
- if isinstance(row, dict):
263
- for v in row.values():
264
- grades.update(re.findall(pattern, str(v), re.IGNORECASE))
265
-
266
- return list(grades)[:50]
267
 
268
  log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
269
  return [Document(text=content, metadata=metadata)]
 
26
 
27
  def normalize_steel_designations(text):
28
  """
29
+ Normalize steel designations by converting Cyrillic letters to Latin.
30
+ This improves search/retrieval since embedding models work better with Latin.
31
+ Handles patterns like 08Х18Н10Т → 08X18H10T
32
  Returns: (normalized_text, changes_count, changes_list)
33
  """
34
  if not text:
 
39
  changes_count = 0
40
  changes_list = []
41
 
42
+ # Mapping of Cyrillic to Latin for steel designations
43
  replacements = {
44
+ 'Х': 'X', # Cyrillic Kha → Latin X
45
+ 'Н': 'H', # Cyrillic En → Latin H
46
+ 'Т': 'T', # Cyrillic Te → Latin T
47
+ 'С': 'C', # Cyrillic Es → Latin C
48
+ 'В': 'B', # Cyrillic Ve → Latin B
49
+ 'К': 'K', # Cyrillic Ka → Latin K
50
+ 'М': 'M', # Cyrillic Em → Latin M
51
+ 'А': 'A', # Cyrillic A → Latin A
52
+ 'Р': 'P', # Cyrillic Er → Latin P
53
  }
54
 
55
+ # Pattern: starts with digits, then letters+digits (steel grade pattern)
56
+ # Examples: 08Х18Н10Т, 12Х18Н9, 10Н17Н13М2Т, СВ-08Х19Н10
57
  pattern = r'\b\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
58
+
59
+ # Also match welding wire patterns like СВ-08Х19Н10
60
+ pattern_wire = r'\b[СC][ВB]-\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
61
 
62
  def replace_in_steel_grade(match):
63
  nonlocal changes_count, changes_list
 
76
 
77
 
78
 
 
79
  def chunk_text_documents(documents):
80
  text_splitter = SentenceSplitter(
81
  chunk_size=CHUNK_SIZE,
 
195
  normalized_rows.append(row)
196
 
197
  # Log normalization stats with examples
 
 
 
 
 
 
198
  if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
199
  log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
200
  f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
 
221
  if base_size + len(full_rows_content) <= max_chars and len(normalized_rows) <= max_rows:
222
  content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
223
 
224
+ metadata = {
225
+ 'type': 'table',
226
+ 'document_id': doc_id,
227
+ 'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
228
+ 'table_identifier': table_identifier,
229
+ 'table_title': table_title,
230
+ 'section': section,
231
+ 'sheet_name': sheet_name,
232
+ 'total_rows': len(normalized_rows),
233
+ 'chunk_size': len(content),
234
+ 'is_complete_table': True,
235
+ 'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
236
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
 
238
  log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
239
  return [Document(text=content, metadata=metadata)]
utils.py CHANGED
@@ -197,71 +197,47 @@ def debug_search_tables(vector_index, search_term="С-25"):
197
 
198
  from documents_prep import normalize_text, normalize_steel_designations
199
 
200
- def expand_query_with_llm(query, llm_model):
201
- """Generate 5 alternative query formulations using LLM"""
202
- try:
203
- from config import QUERY_EXPANSION_PROMPT
204
-
205
- expansion_prompt = QUERY_EXPANSION_PROMPT.format(original_query=query)
206
-
207
- log_message(f"Generating query variations for: {query}")
208
- response = llm_model.complete(expansion_prompt)
209
-
210
- # Parse response - split by newlines and filter empty
211
- variations = [line.strip() for line in response.text.split('\n') if line.strip()]
212
- variations = variations[:5] # Take only first 5
213
-
214
- if variations:
215
- log_message(f"Generated {len(variations)} query variations:")
216
- for i, var in enumerate(variations, 1):
217
- log_message(f" {i}. {var}")
218
-
219
- # Combine original + variations
220
- combined_query = query + " " + " ".join(variations)
221
- return combined_query
222
- else:
223
- log_message("No variations generated, using original query")
224
- return query
225
-
226
- except Exception as e:
227
- log_message(f"Error generating query variations: {e}")
228
- return query
229
 
230
 
231
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
232
 
233
- # Apply normalizations
234
  normalized_question = normalize_text(question)
235
- normalized_question_2, query_changes, change_list = normalize_steel_designations(normalized_question)
236
-
237
  if change_list:
238
- log_message(f"Query changes: {', '.join(change_list)}")
239
-
240
  if query_engine is None:
241
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
242
 
243
  try:
244
  start_time = time.time()
245
-
246
- # EXPAND QUERY USING LLM
247
- from utils import get_llm_model
248
- llm = get_llm_model(current_model)
249
- expanded_query = expand_query_with_llm(normalized_question_2, llm)
250
-
251
- # Use expanded query for retrieval
252
- retrieved_nodes = query_engine.retriever.retrieve(expanded_query)
253
-
254
  log_message(f"user query: {question}")
255
  log_message(f"normalized query: {normalized_question}")
256
  log_message(f"after steel normalization: {normalized_question_2}")
257
- log_message(f"expanded query length: {len(expanded_query)} chars")
258
  log_message(f"Steel grades normalized in query: {query_changes}")
 
259
 
260
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
261
 
262
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
 
 
263
  log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
264
-
265
  for i, node in enumerate(unique_retrieved):
266
  node_type = node.metadata.get('type', 'text')
267
  doc_id = node.metadata.get('document_id', 'N/A')
@@ -270,6 +246,7 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
270
  table_num = node.metadata.get('table_number', 'N/A')
271
  table_id = node.metadata.get('table_identifier', 'N/A')
272
  table_title = node.metadata.get('table_title', 'N/A')
 
273
  content_preview = node.text[:200].replace('\n', ' ')
274
  log_message(f" [{i+1}] {doc_id} - Table {table_num} | ID: {table_id}")
275
  log_message(f" Title: {table_title[:80]}")
@@ -280,10 +257,11 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
280
 
281
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
282
 
 
283
  reranked_nodes = rerank_nodes(normalized_question_2, unique_retrieved, reranker,
284
- top_k=rerank_top_k)
285
 
286
- # Use ORIGINAL normalized question for final answer generation
287
  response = query_engine.query(normalized_question_2)
288
 
289
  end_time = time.time()
 
197
 
198
  from documents_prep import normalize_text, normalize_steel_designations
199
 
200
+ def enhance_query_for_steel_grades(query):
201
+ """Expand query with related terms for better steel grade retrieval"""
202
+ import re
203
+
204
+ # Detect if query contains steel grades
205
+ steel_pattern = r'\b\d{1,3}[XHТCВKMAPХНТСВКМАР]\d*[XHТCВKMAPХНТСВКМАР\d]*\b'
206
+ matches = re.findall(steel_pattern, query, re.IGNORECASE)
207
+
208
+ if matches:
209
+ # Add contextual terms
210
+ enhanced = query + " стандарт материал марка стали применение"
211
+ log_message(f"Enhanced query with steel context: {enhanced}")
212
+ return enhanced
213
+
214
+ return query
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
 
217
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
218
 
 
219
  normalized_question = normalize_text(question)
220
+ normalized_question_2, query_changes, change_list = normalize_steel_designations(question) # FIX: 3 values
 
221
  if change_list:
222
+ log_message(f"Query changes: {', '.join(change_list)}")
 
223
  if query_engine is None:
224
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
225
 
226
  try:
227
  start_time = time.time()
228
+ retrieved_nodes = query_engine.retriever.retrieve(normalized_question_2)
 
 
 
 
 
 
 
 
229
  log_message(f"user query: {question}")
230
  log_message(f"normalized query: {normalized_question}")
231
  log_message(f"after steel normalization: {normalized_question_2}")
 
232
  log_message(f"Steel grades normalized in query: {query_changes}")
233
+
234
 
235
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
236
 
237
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
238
+
239
+ # IMPROVED DEBUG: Log what was actually retrieved with FULL metadata
240
  log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
 
241
  for i, node in enumerate(unique_retrieved):
242
  node_type = node.metadata.get('type', 'text')
243
  doc_id = node.metadata.get('document_id', 'N/A')
 
246
  table_num = node.metadata.get('table_number', 'N/A')
247
  table_id = node.metadata.get('table_identifier', 'N/A')
248
  table_title = node.metadata.get('table_title', 'N/A')
249
+ # Show first 200 chars of content to verify it's the right table
250
  content_preview = node.text[:200].replace('\n', ' ')
251
  log_message(f" [{i+1}] {doc_id} - Table {table_num} | ID: {table_id}")
252
  log_message(f" Title: {table_title[:80]}")
 
257
 
258
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
259
 
260
+ # Simple reranking with NORMALIZED question and PARAMETERIZED top_k
261
  reranked_nodes = rerank_nodes(normalized_question_2, unique_retrieved, reranker,
262
+ top_k=rerank_top_k) # NOW PARAMETERIZED
263
 
264
+ # Direct query without formatting - use normalized question
265
  response = query_engine.query(normalized_question_2)
266
 
267
  end_time = time.time()