MrSimple07 commited on
Commit
57a8908
·
1 Parent(s): 49bfa92

Normalization fixed; now applied to header text as well

Browse files
Files changed (3) hide show
  1. documents_prep.py +20 -9
  2. index_retriever.py +12 -4
  3. utils.py +4 -3
documents_prep.py CHANGED
@@ -186,12 +186,13 @@ def format_table_header(doc_id, table_identifier, table_num, table_title, sectio
186
  content += f"ТАБЛИЦА: {table_identifier}\n"
187
 
188
  if table_title:
189
- content += f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n"
 
 
190
 
191
- # Extract and normalize connection type
192
  connection_type = extract_connection_type(table_title)
193
  if connection_type:
194
- # Show normalized version for searchability
195
  content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
196
 
197
  if table_num and table_num != table_identifier:
@@ -205,7 +206,9 @@ def format_table_header(doc_id, table_identifier, table_num, table_title, sectio
205
  if headers:
206
  content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
207
  for i, h in enumerate(headers, 1):
208
- content += f" {i}. {h}\n"
 
 
209
  content += "\n"
210
 
211
  content += "ДАННЫЕ ТАБЛИЦЫ:\n"
@@ -213,19 +216,27 @@ def format_table_header(doc_id, table_identifier, table_num, table_title, sectio
213
 
214
 
215
  def format_single_row(row, idx):
216
- """Format a single row"""
217
  if isinstance(row, dict):
218
- parts = [f"{k}: {v}" for k, v in row.items()
219
- if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
 
 
 
 
220
  if parts:
221
  return f"{idx}. {' | '.join(parts)}\n"
222
  elif isinstance(row, list):
223
- parts = [str(v) for v in row if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
 
 
 
 
 
224
  if parts:
225
  return f"{idx}. {' | '.join(parts)}\n"
226
  return ""
227
 
228
-
229
  def format_table_rows(rows):
230
  """Format multiple rows"""
231
  content = ""
 
186
  content += f"ТАБЛИЦА: {table_identifier}\n"
187
 
188
  if table_title:
189
+ # Normalize the title text itself for better searchability
190
+ normalized_title = normalize_connection_type(table_title)
191
+ content += f"НАЗВАНИЕ ТАБЛИЦЫ: {normalized_title}\n"
192
 
193
+ # Extract and store the normalized connection type
194
  connection_type = extract_connection_type(table_title)
195
  if connection_type:
 
196
  content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
197
 
198
  if table_num and table_num != table_identifier:
 
206
  if headers:
207
  content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
208
  for i, h in enumerate(headers, 1):
209
+ # NORMALIZE HEADERS TOO
210
+ normalized_header = normalize_connection_type(h)
211
+ content += f" {i}. {normalized_header}\n"
212
  content += "\n"
213
 
214
  content += "ДАННЫЕ ТАБЛИЦЫ:\n"
 
216
 
217
 
218
# Values treated as empty/junk when formatting table cells (case-insensitive).
# NOTE: '' is kept for parity with the original list, though str(v).strip()
# already filters blank strings before this check is reached.
_JUNK_VALUES = frozenset(['nan', 'none', ''])


def _clean_value(v):
    """Return the normalized string form of *v*, or None if it is junk.

    A value is junk when it is falsy, whitespace-only, or one of the
    placeholder strings in _JUNK_VALUES ('nan', 'none').
    """
    if not (v and str(v).strip()):
        return None
    if str(v).lower() in _JUNK_VALUES:
        return None
    # Normalize connection-type spellings (e.g. Cyrillic/Latin homoglyphs)
    # so table text matches the normalized query format.
    return normalize_connection_type(str(v))


def format_single_row(row, idx):
    """Format a single table row as '<idx>. a | b | ...\\n' with normalization.

    Supports dict rows (rendered as 'key: value' pairs) and list rows
    (rendered as bare values). Junk/empty cells are dropped; if nothing
    survives filtering, returns ''.
    """
    if isinstance(row, dict):
        parts = []
        for k, v in row.items():
            cleaned = _clean_value(v)
            if cleaned is not None:
                parts.append(f"{k}: {cleaned}")
        if parts:
            return f"{idx}. {' | '.join(parts)}\n"
    elif isinstance(row, list):
        parts = [c for c in (_clean_value(v) for v in row) if c is not None]
        if parts:
            return f"{idx}. {' | '.join(parts)}\n"
    return ""
239
 
 
240
  def format_table_rows(rows):
241
  """Format multiple rows"""
242
  content = ""
index_retriever.py CHANGED
@@ -11,10 +11,19 @@ from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
11
  def create_vector_index(documents):
12
  log_message("Строю векторный индекс")
13
 
 
 
14
  connection_type_sources = {}
15
  table_count = 0
16
 
17
  for doc in documents:
 
 
 
 
 
 
 
18
  if doc.metadata.get('type') == 'table':
19
  table_count += 1
20
  conn_type = doc.metadata.get('connection_type', '')
@@ -25,17 +34,16 @@ def create_vector_index(documents):
25
  connection_type_sources[conn_type].append(table_id)
26
 
27
  log_message("="*60)
28
- log_message(f"INDEXING {table_count} TABLE CHUNKS")
29
  log_message("CONNECTION TYPES IN INDEX WITH SOURCES:")
30
  for conn_type in sorted(connection_type_sources.keys()):
31
- sources = list(set(connection_type_sources[conn_type])) # Unique sources
32
  log_message(f" {conn_type}: {len(connection_type_sources[conn_type])} chunks from {len(sources)} tables")
33
  for src in sources:
34
  log_message(f" - {src}")
35
  log_message("="*60)
36
 
37
- return VectorStoreIndex.from_documents(documents)
38
-
39
 
40
  def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
41
  if not nodes or not reranker:
 
11
  def create_vector_index(documents):
12
  log_message("Строю векторный индекс")
13
 
14
+ # PREPROCESS ALL DOCUMENTS FOR CONSISTENT TOKENIZATION
15
+ processed_docs = []
16
  connection_type_sources = {}
17
  table_count = 0
18
 
19
  for doc in documents:
20
+ # Normalize text content for BM25
21
+ if hasattr(doc, 'text'):
22
+ from documents_prep import normalize_connection_type
23
+ doc.text = normalize_connection_type(doc.text)
24
+
25
+ processed_docs.append(doc)
26
+
27
  if doc.metadata.get('type') == 'table':
28
  table_count += 1
29
  conn_type = doc.metadata.get('connection_type', '')
 
34
  connection_type_sources[conn_type].append(table_id)
35
 
36
  log_message("="*60)
37
+ log_message(f"INDEXING {table_count} TABLE CHUNKS (NORMALIZED)")
38
  log_message("CONNECTION TYPES IN INDEX WITH SOURCES:")
39
  for conn_type in sorted(connection_type_sources.keys()):
40
+ sources = list(set(connection_type_sources[conn_type]))
41
  log_message(f" {conn_type}: {len(connection_type_sources[conn_type])} chunks from {len(sources)} tables")
42
  for src in sources:
43
  log_message(f" - {src}")
44
  log_message("="*60)
45
 
46
+ return VectorStoreIndex.from_documents(processed_docs)
 
47
 
48
  def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
49
  if not nodes or not reranker:
utils.py CHANGED
@@ -172,7 +172,6 @@ def deduplicate_nodes(nodes):
172
 
173
  return unique_nodes
174
 
175
-
176
  def normalize_query(query):
177
  """Normalize query to match stored format"""
178
  import re
@@ -180,8 +179,10 @@ def normalize_query(query):
180
  query = query.replace('С-', 'C-').replace('с-', 'c-')
181
  query = query.replace('У-', 'U-').replace('у-', 'u-')
182
  query = query.replace('Т-', 'T-').replace('т-', 't-')
183
- query = query.replace('-', '')
184
-
 
 
185
  return query
186
 
187
 
 
172
 
173
  return unique_nodes
174
 
 
175
def normalize_query(query):
    """Normalize a user query to match the stored document format.

    Two-step normalization:
      1. Replace Cyrillic connection-type prefixes (С-, с-, У-, у-, Т-, т-)
         with their Latin homoglyphs, since documents store Latin letters.
      2. Drop the hyphen in connection-type codes so 'C-25' becomes 'C25'.

    Returns the normalized query string.
    """
    import re

    # Cyrillic -> Latin homoglyph prefixes used in connection-type codes.
    query = query.replace('С-', 'C-').replace('с-', 'c-')
    query = query.replace('У-', 'U-').replace('у-', 'u-')
    query = query.replace('Т-', 'T-').replace('т-', 't-')

    # Remove hyphens from connection-type patterns (C-25 -> C25).
    # BUG FIX: the previous pattern r'([CUTcut])(\d)' had no hyphen between
    # the two groups, so the substitution replaced 'C2' with 'C2' — a no-op
    # that never stripped the hyphen as the comment claimed.
    query = re.sub(r'([CUTcut])-(\d)', r'\1\2', query)

    return query
187
 
188