Suhasdev committed on
Commit
b4d947d
·
1 Parent(s): 6c19fd6

Fix: GEPA not generating candidates - training minibatch was incorrectly cached

Browse files

ROOT CAUSE:
- When GEPA calls adapter.evaluate(capture_traces=True), it needs trajectories
for reflective mutation to generate new candidates
- The adapter was returning cached results with trajectories=None for training
minibatches because len(batch)=4 >= valset_size=3 was True
- GEPA checks 'if not trajectories: skip', so no new candidates were generated

FIX:
1. When capture_traces=True, ALWAYS treat as training minibatch (dfeedback)
- Never use cache when traces are needed
2. Changed valset detection from '>=' to '==' for exact match
3. Added 'and not capture_traces' check before using cache

src/gepa_optimizer/core/universal_adapter.py CHANGED
@@ -236,13 +236,19 @@ Output the improved prompt directly and only the prompt."""
236
  # Determine dataset type first (needed for cache check)
237
  batch_size_threshold = self._config.batch_size if hasattr(self, '_config') and self._config else 8
238
 
239
- # 🔥 CRITICAL FIX: If _is_baseline_evaluation is True, we KNOW this is the validation set
240
- # This fixes the issue where valset_size might not be set yet when baseline detection happens
241
- if hasattr(self, '_is_baseline_evaluation') and self._is_baseline_evaluation:
 
 
 
 
 
242
  dataset_type = 'dpareto' # Baseline is ALWAYS evaluated on validation set
243
  self.logger.debug(f"🎯 Forced dataset_type to 'dpareto' (baseline evaluation flag is True)")
244
- elif hasattr(self, '_valset_size') and self._valset_size is not None and len(batch) >= self._valset_size:
245
- dataset_type = 'dpareto' # Full validation set size = Dpareto
 
246
  elif len(batch) > batch_size_threshold * 1.5:
247
  dataset_type = 'dpareto' # Much larger than batch = likely full valset
248
  else:
@@ -250,7 +256,8 @@ Output the improved prompt directly and only the prompt."""
250
 
251
  # 🔥 CRITICAL: Check cache to avoid re-evaluating same prompt on Dpareto
252
  # This ensures seed prompt is evaluated ONLY ONCE
253
- if dataset_type == 'dpareto':
 
254
  normalized_prompt = system_prompt.strip().strip('"\'')
255
  if normalized_prompt in self._dpareto_evaluated_candidates:
256
  existing_score, existing_type, _ = self._dpareto_evaluated_candidates[normalized_prompt]
 
236
  # Determine dataset type first (needed for cache check)
237
  batch_size_threshold = self._config.batch_size if hasattr(self, '_config') and self._config else 8
238
 
239
+ # 🔥 CRITICAL FIX: If capture_traces=True, this is a TRAINING MINIBATCH for reflection
240
+ # GEPA calls with capture_traces=True when it needs trajectories for reflective mutation
241
+ # We must NEVER use cache in this case, otherwise trajectories=None breaks GEPA!
242
+ if capture_traces:
243
+ dataset_type = 'dfeedback' # Training minibatch - need fresh evaluation with trajectories
244
+ self.logger.debug(f"🎯 Forced dataset_type to 'dfeedback' (capture_traces=True)")
245
+ # If _is_baseline_evaluation is True, we KNOW this is the validation set
246
+ elif hasattr(self, '_is_baseline_evaluation') and self._is_baseline_evaluation:
247
  dataset_type = 'dpareto' # Baseline is ALWAYS evaluated on validation set
248
  self.logger.debug(f"🎯 Forced dataset_type to 'dpareto' (baseline evaluation flag is True)")
249
+ elif hasattr(self, '_valset_size') and self._valset_size is not None and len(batch) == self._valset_size:
250
+ # 🔥 FIX: Use == not >= to avoid misclassifying training minibatches as validation set
251
+ dataset_type = 'dpareto' # EXACT validation set size = Dpareto
252
  elif len(batch) > batch_size_threshold * 1.5:
253
  dataset_type = 'dpareto' # Much larger than batch = likely full valset
254
  else:
 
256
 
257
  # 🔥 CRITICAL: Check cache to avoid re-evaluating same prompt on Dpareto
258
  # This ensures seed prompt is evaluated ONLY ONCE
259
+ # NOTE: Only applies when capture_traces=False (validation set evaluation)
260
+ if dataset_type == 'dpareto' and not capture_traces:
261
  normalized_prompt = system_prompt.strip().strip('"\'')
262
  if normalized_prompt in self._dpareto_evaluated_candidates:
263
  existing_score, existing_type, _ = self._dpareto_evaluated_candidates[normalized_prompt]