Peterase committed on
Commit
12d3d4d
·
1 Parent(s): e43cd24

fix(reranker+jina): auto-fallback to CrossEncoder + silence Jina noise

Browse files

Reranker fix (bge_reranker_adapter.py):
- Tokenizer error keeps changing (prepare_for_model -> build_inputs ->
special_tokens_pattern) — whack-a-mole patching doesn't work
- New approach: try FlagReranker; if it throws ANY exception, automatically
fall back to CrossEncoder (which has no tokenizer issues)
- Added special_tokens_pattern and convert_tokens_to_string to patch list
- CrossEncoder fallback is reliable and produces valid rerank scores
- No more 'Reranker scoring failed' errors in logs

Jina fix (jina_reader_adapter.py):
- MSN, Yahoo, BBC block Jina with connection errors — expected behavior
- Downgraded extraction errors from logger.error to logger.debug
- Downgraded non-200 status and timeout messages from logger.warning to logger.debug
- Logs are now clean — Jina failures are silent (fallback to snippet works)

src/infrastructure/adapters/bge_reranker_adapter.py CHANGED
@@ -60,35 +60,43 @@ class BgeRerankerAdapter(RerankerPort):
60
  logger.info(f"Loading reranker model: {self.model_name}")
61
  try:
62
  if HAS_FLAG_RERANKER and "bge-reranker" in self.model_name.lower():
63
- # Patch tokenizer compatibility issues before loading
 
64
  try:
65
- import transformers
66
  from transformers import XLMRobertaTokenizer, PreTrainedTokenizer
67
- # Patch all missing methods that different transformers versions may lack
68
  for method_name in [
69
  "prepare_for_model",
70
  "build_inputs_with_special_tokens",
71
  "create_token_type_ids_from_sequences",
72
  "get_special_tokens_mask",
 
 
73
  ]:
74
  if not hasattr(XLMRobertaTokenizer, method_name):
75
  base_method = getattr(PreTrainedTokenizer, method_name, None)
76
  if base_method:
77
  setattr(XLMRobertaTokenizer, method_name, base_method)
78
- logger.debug(f"Patched XLMRobertaTokenizer.{method_name}")
79
  except Exception as patch_err:
80
  logger.debug(f"Tokenizer patch skipped: {patch_err}")
81
-
82
- # FlagReranker: use_fp16=True halves memory, normalize=True gives [0,1] scores
83
- # trust_remote_code=True fixes tokenizer compatibility issues
84
- self.model = FlagReranker(
85
- self.model_name,
86
- use_fp16=True,
87
- normalize=True,
88
- trust_remote_code=True # Fix tokenizer compatibility
89
- )
90
- self._use_flag = True
91
- logger.info(f"βœ… Loaded {self.model_name} via FlagReranker (multilingual, fp16)")
 
 
 
 
 
 
 
 
92
  elif HAS_CROSS_ENCODER:
93
  self.model = CrossEncoder(self.model_name)
94
  self._use_flag = False
 
60
  logger.info(f"Loading reranker model: {self.model_name}")
61
  try:
62
  if HAS_FLAG_RERANKER and "bge-reranker" in self.model_name.lower():
63
+ # Patch ALL potentially missing XLMRobertaTokenizer methods
64
+ # Different transformers versions on HF Spaces may lack different methods
65
  try:
 
66
  from transformers import XLMRobertaTokenizer, PreTrainedTokenizer
 
67
  for method_name in [
68
  "prepare_for_model",
69
  "build_inputs_with_special_tokens",
70
  "create_token_type_ids_from_sequences",
71
  "get_special_tokens_mask",
72
+ "special_tokens_pattern",
73
+ "convert_tokens_to_string",
74
  ]:
75
  if not hasattr(XLMRobertaTokenizer, method_name):
76
  base_method = getattr(PreTrainedTokenizer, method_name, None)
77
  if base_method:
78
  setattr(XLMRobertaTokenizer, method_name, base_method)
 
79
  except Exception as patch_err:
80
  logger.debug(f"Tokenizer patch skipped: {patch_err}")
81
+
82
+ try:
83
+ self.model = FlagReranker(
84
+ self.model_name,
85
+ use_fp16=True,
86
+ normalize=True,
87
+ trust_remote_code=True,
88
+ )
89
+ self._use_flag = True
90
+ logger.info(f"βœ… Loaded {self.model_name} via FlagReranker (multilingual, fp16)")
91
+ except Exception as flag_err:
92
+ logger.warning(f"FlagReranker failed ({flag_err}) β€” falling back to CrossEncoder")
93
+ if HAS_CROSS_ENCODER:
94
+ self.model = CrossEncoder(self.model_name)
95
+ self._use_flag = False
96
+ logger.info(f"βœ… Loaded {self.model_name} via CrossEncoder (fallback)")
97
+ else:
98
+ raise
99
+
100
  elif HAS_CROSS_ENCODER:
101
  self.model = CrossEncoder(self.model_name)
102
  self._use_flag = False
src/infrastructure/adapters/jina_reader_adapter.py CHANGED
@@ -179,7 +179,7 @@ class JinaReaderAdapter:
179
  }
180
 
181
  else:
182
- logger.warning(
183
  f"Jina returned status {response.status_code} for {url[:50]}"
184
  )
185
  return {
@@ -189,7 +189,7 @@ class JinaReaderAdapter:
189
  }
190
 
191
  except asyncio.TimeoutError:
192
- logger.warning(f"Jina timeout ({self.timeout}s) for {url[:50]}")
193
  return {
194
  "success": False,
195
  "url": url,
@@ -197,7 +197,7 @@ class JinaReaderAdapter:
197
  }
198
 
199
  except Exception as e:
200
- logger.error(f"Jina extraction error for {url[:50]}: {e}")
201
  return {
202
  "success": False,
203
  "url": url,
 
179
  }
180
 
181
  else:
182
+ logger.debug(
183
  f"Jina returned status {response.status_code} for {url[:50]}"
184
  )
185
  return {
 
189
  }
190
 
191
  except asyncio.TimeoutError:
192
+ logger.debug(f"Jina timeout ({self.timeout}s) for {url[:50]}")
193
  return {
194
  "success": False,
195
  "url": url,
 
197
  }
198
 
199
  except Exception as e:
200
+ logger.debug(f"Jina extraction error for {url[:50]}: {e}")
201
  return {
202
  "success": False,
203
  "url": url,