MrSimple07 committed on
Commit
429d2d4
·
1 Parent(s): 154e611

removed the part removing hyphens + top 80, cutoff = 0.55

Browse files
Files changed (3) hide show
  1. documents_prep.py +7 -6
  2. index_retriever.py +4 -4
  3. utils.py +16 -11
documents_prep.py CHANGED
@@ -36,11 +36,14 @@ def chunk_text_documents(documents):
36
 
37
  def normalize_connection_type(s):
38
  # Replace Cyrillic with Latin
39
- s = s.replace('С', 'C').replace('с', 'c')
40
- s = s.replace('У', 'U').replace('у', 'u')
41
- s = s.replace('Т', 'T').replace('т', 't')
 
 
 
42
  # REMOVE ALL HYPHENS for consistent tokenization
43
- s = s.replace('-', '')
44
  return s
45
 
46
  def extract_connection_type(text):
@@ -77,8 +80,6 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
77
  return []
78
 
79
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
80
-
81
- # Calculate base metadata size - NOW INCLUDING DESCRIPTION
82
  base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
83
 
84
  # ADD DESCRIPTION HERE if it exists
 
36
 
37
  def normalize_connection_type(s):
38
  # Replace Cyrillic with Latin
39
+ # s = s.replace('С', 'C').replace('с', 'c')
40
+ # s = s.replace('У', 'U').replace('у', 'u')
41
+ # s = s.replace('Т', 'T').replace('т', 't')
42
+ s= s.replace('С-', 'C-').replace('с-', 'c-')
43
+ s = s.replace('У-', 'U-').replace('у-', 'u-')
44
+ s = s.replace('Т-', 'T-').replace('т-', 't-')
45
  # Hyphen removal is disabled (see the commented-out line below); hyphens are kept
46
+ # s = s.replace('-', '')
47
  return s
48
 
49
  def extract_connection_type(text):
 
80
  return []
81
 
82
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
 
 
83
  base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
84
 
85
  # ADD DESCRIPTION HERE if it exists
index_retriever.py CHANGED
@@ -71,18 +71,18 @@ def create_query_engine(vector_index):
71
 
72
  bm25_retriever = BM25Retriever.from_defaults(
73
  docstore=vector_index.docstore,
74
- similarity_top_k=100
75
  )
76
 
77
  vector_retriever = VectorIndexRetriever(
78
  index=vector_index,
79
- similarity_top_k=100,
80
- similarity_cutoff=0.45
81
  )
82
 
83
  hybrid_retriever = QueryFusionRetriever(
84
  [vector_retriever, bm25_retriever],
85
- similarity_top_k=100,
86
  num_queries=1
87
  )
88
 
 
71
 
72
  bm25_retriever = BM25Retriever.from_defaults(
73
  docstore=vector_index.docstore,
74
+ similarity_top_k=80
75
  )
76
 
77
  vector_retriever = VectorIndexRetriever(
78
  index=vector_index,
79
+ similarity_top_k=80,
80
+ similarity_cutoff=0.55
81
  )
82
 
83
  hybrid_retriever = QueryFusionRetriever(
84
  [vector_retriever, bm25_retriever],
85
+ similarity_top_k=80,
86
  num_queries=1
87
  )
88
 
utils.py CHANGED
@@ -179,7 +179,10 @@ def normalize_query(query):
179
  query = query.replace('С-', 'C-').replace('с-', 'c-')
180
  query = query.replace('У-', 'U-').replace('у-', 'u-')
181
  query = query.replace('Т-', 'T-').replace('т-', 't-')
182
- query = query.replace('-', '')
 
 
 
183
 
184
  return query
185
 
@@ -191,7 +194,7 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
191
  try:
192
  start_time = time.time()
193
 
194
- # NORMALIZE QUERY: Convert Cyrillic to Latin
195
  normalized_question = normalize_query(question)
196
  log_message(f"Original query: {question}")
197
  log_message(f"Normalized query: {normalized_question}")
@@ -218,12 +221,14 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
218
  for ct, cnt in sorted(conn_types_retrieved.items()):
219
  log_message(f" {ct}: {cnt} chunks")
220
 
221
- # Check if target type was retrieved (keep original Cyrillic)
222
- if 'С-25' in question: # Use Cyrillic
223
- if 'С-25' in conn_types_retrieved:
224
- log_message(f"✓ С-25 RETRIEVED: {conn_types_retrieved['С-25']} chunks")
 
 
225
  else:
226
- log_message("✗ С-25 NOT RETRIEVED despite being in query!")
227
 
228
  # Sample of retrieved tables
229
  log_message("SAMPLE OF RETRIEVED TABLES:")
@@ -235,11 +240,11 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
235
  doc_id = node.metadata.get('document_id', 'N/A')
236
  log_message(f" [{i+1}] {doc_id} - Table {table_num} - Type: {conn_type}")
237
 
238
- # Rerank
239
- reranked_nodes = rerank_nodes(question, unique_retrieved, reranker, top_k=20)
240
 
241
- # Direct query without formatting
242
- response = query_engine.query(question)
243
 
244
  end_time = time.time()
245
  processing_time = end_time - start_time
 
179
  query = query.replace('С-', 'C-').replace('с-', 'c-')
180
  query = query.replace('У-', 'U-').replace('у-', 'u-')
181
  query = query.replace('Т-', 'T-').replace('т-', 't-')
182
+ # query = query.replace('С', 'C').replace('с', 'C')
183
+ # query = query.replace('У', 'U').replace('у', 'U')
184
+ # query = query.replace('Т', 'T').replace('т', 'T')
185
+ # query = query.replace('-', '')
186
 
187
  return query
188
 
 
194
  try:
195
  start_time = time.time()
196
 
197
+ # NORMALIZE QUERY: Convert Cyrillic prefixes to Latin (hyphens are preserved)
198
  normalized_question = normalize_query(question)
199
  log_message(f"Original query: {question}")
200
  log_message(f"Normalized query: {normalized_question}")
 
221
  for ct, cnt in sorted(conn_types_retrieved.items()):
222
  log_message(f" {ct}: {cnt} chunks")
223
 
224
+ # Check if target type was retrieved
225
+ # Normalize the check as well
226
+ normalized_check = normalize_query('С-25') # Will become C-25 (hyphen is kept)
227
+ if normalized_check in question or 'С-25' in question or 'C-25' in question:
228
+ if 'C25' in conn_types_retrieved:
229
+ log_message(f"✓ C25 RETRIEVED: {conn_types_retrieved['C25']} chunks")
230
  else:
231
+ log_message("✗ C25 NOT RETRIEVED despite being in query!")
232
 
233
  # Sample of retrieved tables
234
  log_message("SAMPLE OF RETRIEVED TABLES:")
 
240
  doc_id = node.metadata.get('document_id', 'N/A')
241
  log_message(f" [{i+1}] {doc_id} - Table {table_num} - Type: {conn_type}")
242
 
243
+ # Rerank - use normalized query for consistency
244
+ reranked_nodes = rerank_nodes(normalized_question, unique_retrieved, reranker, top_k=20)
245
 
246
+ # CRITICAL FIX: Use normalized query for LLM as well
247
+ response = query_engine.query(normalized_question)
248
 
249
  end_time = time.time()
250
  processing_time = end_time - start_time