Spaces:

Suhasdev
/

Universal-prompt-Optimizer

Sleeping

App Files Files Community

Suhasdev committed on Dec 12, 2025

Commit

74d4bea

1 Parent(s): 539c75b

Fix duplicate seed prompts in Pareto front - add deduplication checks

Browse files

Files changed (1) hide show

src/gepa_optimizer/utils/pareto_logger.py +39 -1

src/gepa_optimizer/utils/pareto_logger.py CHANGED Viewed

@@ -60,6 +60,17 @@ class ParetoLogger:
         # Rule: Only candidates with f(candidate) > f(S₀) can enter Pareto front
         # Exception: Seed prompt (S₀) itself is always added as the baseline
         if candidate_type == 'seed':
             logger.info(f"\n   ✅ {cand_notation} is seed prompt - always added as baseline")
             # Set baseline if not already set (safety check - adapter should have done this)
@@ -279,6 +290,22 @@ class ParetoLogger:
         logger.info(f"🔥 BATCH PARETO FRONT UPDATE - Processing {len(candidates_with_scores)} candidates")
         logger.info("═" * 80)
         # Step 1: Filter by baseline (score > baseline_score)
         if self.baseline_score is None:
             logger.error("❌ Baseline score not set - cannot perform batch update")
@@ -294,7 +321,18 @@ class ParetoLogger:
             # Seed is always included (it's the baseline)
             if cand_type == 'seed':
-                filtered.append(cand)
                 continue
             # Non-seed candidates must be better than baseline

         # Rule: Only candidates with f(candidate) > f(S₀) can enter Pareto front
         # Exception: Seed prompt (S₀) itself is always added as the baseline
         if candidate_type == 'seed':
+            # 🔥 FIX: Check if seed prompt is already in Pareto front to prevent duplicates
+            normalized_prompt = prompt.strip().strip('"\'')
+            for existing_cand in self.pareto_front:
+                existing_prompt = existing_cand.get('prompt', '').strip().strip('"\'')
+                if existing_prompt == normalized_prompt and existing_cand.get('type') == 'seed':
+                    logger.info(f"\n   ⚠️  {cand_notation} is already in Pareto Front P (duplicate detected)")
+                    logger.info(f"      Skipping duplicate seed prompt addition")
+                    front_notations = [c.get('notation', 'S') for c in self.pareto_front]
+                    logger.info(f"      P = {{{', '.join(front_notations)}}}")
+                    return  # Skip adding duplicate
             logger.info(f"\n   ✅ {cand_notation} is seed prompt - always added as baseline")
             # Set baseline if not already set (safety check - adapter should have done this)
         logger.info(f"🔥 BATCH PARETO FRONT UPDATE - Processing {len(candidates_with_scores)} candidates")
         logger.info("═" * 80)
+        # Step 0: Deduplicate input candidates by prompt text
+        seen_prompts = set()
+        deduplicated_candidates = []
+        for cand in candidates_with_scores:
+            normalized_prompt = cand.get('prompt', '').strip().strip('"\'')
+            if normalized_prompt not in seen_prompts:
+                seen_prompts.add(normalized_prompt)
+                deduplicated_candidates.append(cand)
+            else:
+                logger.info(f"   ⚠️  Skipping duplicate candidate: {cand.get('notation', 'S')} (prompt already in batch)")
+        if len(deduplicated_candidates) < len(candidates_with_scores):
+            logger.info(f"   📊 Deduplicated: {len(candidates_with_scores)} → {len(deduplicated_candidates)} candidates")
+        candidates_with_scores = deduplicated_candidates
         # Step 1: Filter by baseline (score > baseline_score)
         if self.baseline_score is None:
             logger.error("❌ Baseline score not set - cannot perform batch update")
             # Seed is always included (it's the baseline)
             if cand_type == 'seed':
+                # 🔥 FIX: Check if seed is already in Pareto front
+                normalized_prompt = cand.get('prompt', '').strip().strip('"\'')
+                already_in_front = False
+                for existing_cand in self.pareto_front:
+                    existing_prompt = existing_cand.get('prompt', '').strip().strip('"\'')
+                    if existing_prompt == normalized_prompt and existing_cand.get('type') == 'seed':
+                        already_in_front = True
+                        logger.info(f"   ⚠️  Seed prompt already in Pareto front - skipping duplicate")
+                        break
+                if not already_in_front:
+                    filtered.append(cand)
                 continue
             # Non-seed candidates must be better than baseline