Suhasdev committed on
Commit
74d4bea
·
1 Parent(s): 539c75b

Fix duplicate seed prompts in Pareto front - add deduplication checks

Browse files
src/gepa_optimizer/utils/pareto_logger.py CHANGED
@@ -60,6 +60,17 @@ class ParetoLogger:
60
  # Rule: Only candidates with f(candidate) > f(S₀) can enter Pareto front
61
  # Exception: Seed prompt (S₀) itself is always added as the baseline
62
  if candidate_type == 'seed':
 
 
 
 
 
 
 
 
 
 
 
63
  logger.info(f"\n ✅ {cand_notation} is seed prompt - always added as baseline")
64
 
65
  # Set baseline if not already set (safety check - adapter should have done this)
@@ -279,6 +290,22 @@ class ParetoLogger:
279
  logger.info(f"🔥 BATCH PARETO FRONT UPDATE - Processing {len(candidates_with_scores)} candidates")
280
  logger.info("═" * 80)
281
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  # Step 1: Filter by baseline (score > baseline_score)
283
  if self.baseline_score is None:
284
  logger.error("❌ Baseline score not set - cannot perform batch update")
@@ -294,7 +321,18 @@ class ParetoLogger:
294
 
295
  # Seed is always included (it's the baseline)
296
  if cand_type == 'seed':
297
- filtered.append(cand)
 
 
 
 
 
 
 
 
 
 
 
298
  continue
299
 
300
  # Non-seed candidates must be better than baseline
 
60
  # Rule: Only candidates with f(candidate) > f(S₀) can enter Pareto front
61
  # Exception: Seed prompt (S₀) itself is always added as the baseline
62
  if candidate_type == 'seed':
63
+ # 🔥 FIX: Check if seed prompt is already in Pareto front to prevent duplicates
64
+ normalized_prompt = prompt.strip().strip('"\'')
65
+ for existing_cand in self.pareto_front:
66
+ existing_prompt = existing_cand.get('prompt', '').strip().strip('"\'')
67
+ if existing_prompt == normalized_prompt and existing_cand.get('type') == 'seed':
68
+ logger.info(f"\n ⚠️ {cand_notation} is already in Pareto Front P (duplicate detected)")
69
+ logger.info(f" Skipping duplicate seed prompt addition")
70
+ front_notations = [c.get('notation', 'S') for c in self.pareto_front]
71
+ logger.info(f" P = {{{', '.join(front_notations)}}}")
72
+ return # Skip adding duplicate
73
+
74
  logger.info(f"\n ✅ {cand_notation} is seed prompt - always added as baseline")
75
 
76
  # Set baseline if not already set (safety check - adapter should have done this)
 
290
  logger.info(f"🔥 BATCH PARETO FRONT UPDATE - Processing {len(candidates_with_scores)} candidates")
291
  logger.info("═" * 80)
292
 
293
+ # Step 0: Deduplicate input candidates by prompt text
294
+ seen_prompts = set()
295
+ deduplicated_candidates = []
296
+ for cand in candidates_with_scores:
297
+ normalized_prompt = cand.get('prompt', '').strip().strip('"\'')
298
+ if normalized_prompt not in seen_prompts:
299
+ seen_prompts.add(normalized_prompt)
300
+ deduplicated_candidates.append(cand)
301
+ else:
302
+ logger.info(f" ⚠️ Skipping duplicate candidate: {cand.get('notation', 'S')} (prompt already in batch)")
303
+
304
+ if len(deduplicated_candidates) < len(candidates_with_scores):
305
+ logger.info(f" 📊 Deduplicated: {len(candidates_with_scores)} → {len(deduplicated_candidates)} candidates")
306
+
307
+ candidates_with_scores = deduplicated_candidates
308
+
309
  # Step 1: Filter by baseline (score > baseline_score)
310
  if self.baseline_score is None:
311
  logger.error("❌ Baseline score not set - cannot perform batch update")
 
321
 
322
  # Seed is always included (it's the baseline)
323
  if cand_type == 'seed':
324
+ # 🔥 FIX: Check if seed is already in Pareto front
325
+ normalized_prompt = cand.get('prompt', '').strip().strip('"\'')
326
+ already_in_front = False
327
+ for existing_cand in self.pareto_front:
328
+ existing_prompt = existing_cand.get('prompt', '').strip().strip('"\'')
329
+ if existing_prompt == normalized_prompt and existing_cand.get('type') == 'seed':
330
+ already_in_front = True
331
+ logger.info(f" ⚠️ Seed prompt already in Pareto front - skipping duplicate")
332
+ break
333
+
334
+ if not already_in_front:
335
+ filtered.append(cand)
336
  continue
337
 
338
  # Non-seed candidates must be better than baseline