Spaces:
Sleeping
Sleeping
Fix duplicate seed prompts in Pareto front - add deduplication checks
Browse files
src/gepa_optimizer/utils/pareto_logger.py
CHANGED
|
@@ -60,6 +60,17 @@ class ParetoLogger:
|
|
| 60 |
# Rule: Only candidates with f(candidate) > f(S₀) can enter Pareto front
|
| 61 |
# Exception: Seed prompt (S₀) itself is always added as the baseline
|
| 62 |
if candidate_type == 'seed':
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
logger.info(f"\n ✅ {cand_notation} is seed prompt - always added as baseline")
|
| 64 |
|
| 65 |
# Set baseline if not already set (safety check - adapter should have done this)
|
|
@@ -279,6 +290,22 @@ class ParetoLogger:
|
|
| 279 |
logger.info(f"π₯ BATCH PARETO FRONT UPDATE - Processing {len(candidates_with_scores)} candidates")
|
| 280 |
logger.info("β" * 80)
|
| 281 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
# Step 1: Filter by baseline (score > baseline_score)
|
| 283 |
if self.baseline_score is None:
|
| 284 |
logger.error("β Baseline score not set - cannot perform batch update")
|
|
@@ -294,7 +321,18 @@ class ParetoLogger:
|
|
| 294 |
|
| 295 |
# Seed is always included (it's the baseline)
|
| 296 |
if cand_type == 'seed':
|
| 297 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
continue
|
| 299 |
|
| 300 |
# Non-seed candidates must be better than baseline
|
|
|
|
| 60 |
# Rule: Only candidates with f(candidate) > f(S₀) can enter Pareto front
|
| 61 |
# Exception: Seed prompt (S₀) itself is always added as the baseline
|
| 62 |
if candidate_type == 'seed':
|
| 63 |
+
# 🔥 FIX: Check if seed prompt is already in Pareto front to prevent duplicates
|
| 64 |
+
normalized_prompt = prompt.strip().strip('"\'')
|
| 65 |
+
for existing_cand in self.pareto_front:
|
| 66 |
+
existing_prompt = existing_cand.get('prompt', '').strip().strip('"\'')
|
| 67 |
+
if existing_prompt == normalized_prompt and existing_cand.get('type') == 'seed':
|
| 68 |
+
logger.info(f"\n β οΈ {cand_notation} is already in Pareto Front P (duplicate detected)")
|
| 69 |
+
logger.info(f" Skipping duplicate seed prompt addition")
|
| 70 |
+
front_notations = [c.get('notation', 'S') for c in self.pareto_front]
|
| 71 |
+
logger.info(f" P = {{{', '.join(front_notations)}}}")
|
| 72 |
+
return # Skip adding duplicate
|
| 73 |
+
|
| 74 |
logger.info(f"\n ✅ {cand_notation} is seed prompt - always added as baseline")
|
| 75 |
|
| 76 |
# Set baseline if not already set (safety check - adapter should have done this)
|
|
|
|
| 290 |
logger.info(f"π₯ BATCH PARETO FRONT UPDATE - Processing {len(candidates_with_scores)} candidates")
|
| 291 |
logger.info("β" * 80)
|
| 292 |
|
| 293 |
+
# Step 0: Deduplicate input candidates by prompt text
|
| 294 |
+
seen_prompts = set()
|
| 295 |
+
deduplicated_candidates = []
|
| 296 |
+
for cand in candidates_with_scores:
|
| 297 |
+
normalized_prompt = cand.get('prompt', '').strip().strip('"\'')
|
| 298 |
+
if normalized_prompt not in seen_prompts:
|
| 299 |
+
seen_prompts.add(normalized_prompt)
|
| 300 |
+
deduplicated_candidates.append(cand)
|
| 301 |
+
else:
|
| 302 |
+
logger.info(f" β οΈ Skipping duplicate candidate: {cand.get('notation', 'S')} (prompt already in batch)")
|
| 303 |
+
|
| 304 |
+
if len(deduplicated_candidates) < len(candidates_with_scores):
|
| 305 |
+
logger.info(f" π Deduplicated: {len(candidates_with_scores)} β {len(deduplicated_candidates)} candidates")
|
| 306 |
+
|
| 307 |
+
candidates_with_scores = deduplicated_candidates
|
| 308 |
+
|
| 309 |
# Step 1: Filter by baseline (score > baseline_score)
|
| 310 |
if self.baseline_score is None:
|
| 311 |
logger.error("β Baseline score not set - cannot perform batch update")
|
|
|
|
| 321 |
|
| 322 |
# Seed is always included (it's the baseline)
|
| 323 |
if cand_type == 'seed':
|
| 324 |
+
# 🔥 FIX: Check if seed is already in Pareto front
|
| 325 |
+
normalized_prompt = cand.get('prompt', '').strip().strip('"\'')
|
| 326 |
+
already_in_front = False
|
| 327 |
+
for existing_cand in self.pareto_front:
|
| 328 |
+
existing_prompt = existing_cand.get('prompt', '').strip().strip('"\'')
|
| 329 |
+
if existing_prompt == normalized_prompt and existing_cand.get('type') == 'seed':
|
| 330 |
+
already_in_front = True
|
| 331 |
+
logger.info(f" β οΈ Seed prompt already in Pareto front - skipping duplicate")
|
| 332 |
+
break
|
| 333 |
+
|
| 334 |
+
if not already_in_front:
|
| 335 |
+
filtered.append(cand)
|
| 336 |
continue
|
| 337 |
|
| 338 |
# Non-seed candidates must be better than baseline
|