Experimental GRPO "continued pretraining": the model is rewarded for completions that resemble the target data, measured in several ways of varying complexity. Reward is computed differently for creative text and code.
Trained for 1034 steps on a single RTX 3090, rank-128 QLoRA with alpha 256; a learning rate of 1e-6 seemed ideal.
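For orientation, here is a minimal sketch of how a setup like this could be wired up, assuming TRL's `GRPOTrainer` and PEFT (the card doesn't state the exact stack; the base model, dataset, batch sizes, and the stub reward are placeholders):

```python
from datasets import Dataset
from peft import LoraConfig
from trl import GRPOConfig, GRPOTrainer

# Hyperparameters from the description above; everything else is an assumption.
# NOTE: true QLoRA also needs a 4-bit quantization config (bitsandbytes), omitted here.
peft_config = LoraConfig(
    r=128,                      # rank-128 (Q)LoRA
    lora_alpha=256,             # alpha 256
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)

training_args = GRPOConfig(
    output_dir="grpo-continued-pretraining",
    learning_rate=1e-6,             # "seemed ideal"
    max_steps=1034,
    num_generations=4,              # assumption: group size that fits a 24 GB 3090
    per_device_train_batch_size=4,  # must be divisible by num_generations
)

# Toy dataset: prompts plus the reference continuations the reward compares against.
train_dataset = Dataset.from_list(
    [{"prompt": "Once upon a time", "reference": "there was a kingdom..."}]
)

def reward_func(completions, reference, **kwargs):
    """Stub: the real run scores completions against `reference` with the
    RewardModel shown below (TRL forwards extra dataset columns as kwargs)."""
    return [float(c.strip() == r.strip()) for c, r in zip(completions, reference)]

trainer = GRPOTrainer(
    model="Qwen/Qwen2.5-0.5B",   # placeholder; the actual base model isn't restated here
    reward_funcs=reward_func,
    args=training_args,
    train_dataset=train_dataset,
    peft_config=peft_config,
)
trainer.train()
```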
For this one, added an LLM judge to the reward functions: gpt-4o-mini for a continuous reward via logprobs, and gemini-3-flash-preview for a difficult-to-fool binary bonus:
```python
import torch
import Levenshtein
from sacrebleu.metrics import CHRF
from sentence_transformers import SentenceTransformer
from rouge_score import rouge_scorer


class RewardModel:
    """Multi-domain reward model with LLM judge for creative domains."""

    def __init__(self, device: str = "cuda"):
        self.device = device
        print("Loading reward model components...")
        self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
        self.chrf = CHRF(word_order=2)  # chrF++ (includes word bigrams)
        self.rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    def compute_reward(
        self,
        prediction: str,
        reference: str,
        reward_type: str,
        prefix: str | None = None,
    ) -> float:
        if not prediction or not reference:
            return 0.0

        if reward_type == "llm_judge":
            # Base reward (GT-anchored)
            embs = self.semantic_model.encode([reference, prediction], convert_to_tensor=True)
            sem_score = torch.nn.functional.cosine_similarity(embs[0:1], embs[1:2]).item()
            chrf_score = self.chrf.sentence_score(prediction, [reference]).score / 100.0
            base_reward = 0.4 * sem_score + 0.6 * chrf_score

            # LLM judge bonus (get_llm_judge_reward is defined elsewhere; see the sketch below)
            if prefix is not None:
                llm_reward, flash_bonus = get_llm_judge_reward(prefix, prediction, reference)
            else:
                llm_reward, flash_bonus = 0.5, 0.0

            # Multiplicative: base * (1 + llm_bonus + flash_bonus)
            # This ensures the LLM bonus scales with GT similarity
            return base_reward * (1 + 0.3 * llm_reward + 0.2 * flash_bonus)

        elif reward_type == "creative":
            # Original creative reward (fallback)
            embs = self.semantic_model.encode([reference, prediction], convert_to_tensor=True)
            sem_score = torch.nn.functional.cosine_similarity(embs[0:1], embs[1:2]).item()
            chrf_score = self.chrf.sentence_score(prediction, [reference]).score / 100.0
            return 0.4 * sem_score + 0.6 * chrf_score

        elif reward_type == "hybrid":
            embs = self.semantic_model.encode([reference, prediction], convert_to_tensor=True)
            sem_score = torch.nn.functional.cosine_similarity(embs[0:1], embs[1:2]).item()
            rouge_result = self.rouge.score(reference, prediction)
            rouge_l = rouge_result['rougeL'].fmeasure

            # Length penalty: 1.0 at the reference length, decaying linearly to 0
            ref_len = len(reference.split())
            pred_len = len(prediction.split())
            if ref_len > 0:
                len_ratio = pred_len / ref_len
                length_penalty = max(0.0, 1.0 - abs(1.0 - len_ratio))
            else:
                length_penalty = 1.0 if pred_len == 0 else 0.0
            return 0.6 * sem_score + 0.3 * rouge_l + 0.1 * length_penalty

        elif reward_type == "levenshtein":
            # Character-level similarity for code: 1.0 for an exact match
            max_len = max(len(prediction), len(reference))
            if max_len == 0:
                return 1.0
            dist = Levenshtein.distance(prediction, reference)
            return max(0.0, 1.0 - (dist / max_len))

        else:
            raise ValueError(f"Unknown reward type: {reward_type}")
```
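`get_llm_judge_reward` itself isn't shown. A minimal sketch of what it could look like, assuming the OpenAI and google-generativeai SDKs; the prompt wording, the "yes"-logprob scoring, and the pass/fail parsing are all hypothetical reconstructions of "continuous reward via logprobs" plus "binary bonus":

```python
import math
import os

import google.generativeai as genai
from openai import OpenAI

openai_client = OpenAI()  # reads OPENAI_API_KEY from the environment
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
flash_model = genai.GenerativeModel("gemini-3-flash-preview")  # model name as given above


def get_llm_judge_reward(prefix: str, prediction: str, reference: str) -> tuple[float, float]:
    """Return (continuous judge reward in [0, 1], binary flash bonus in {0, 1})."""
    question = (
        f"Context:\n{prefix}\n\nCandidate continuation:\n{prediction}\n\n"
        f"Reference continuation:\n{reference}\n\n"
        "Is the candidate a plausible continuation of comparable quality to the "
        "reference? Answer with exactly one word: yes or no."
    )

    # Continuous reward: probability mass gpt-4o-mini puts on "yes" as its first token.
    resp = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": question}],
        max_tokens=1,
        logprobs=True,
        top_logprobs=5,
    )
    llm_reward = 0.0
    for cand in resp.choices[0].logprobs.content[0].top_logprobs:
        if cand.token.strip().lower() == "yes":
            llm_reward = math.exp(cand.logprob)
            break

    # Binary bonus: a second judge that only grants credit on an unambiguous "yes".
    flash_text = flash_model.generate_content(question).text
    flash_bonus = 1.0 if flash_text.strip().lower().startswith("yes") else 0.0

    return llm_reward, flash_bonus
```

With this in place, `RewardModel().compute_reward(pred, ref, "llm_judge", prefix=prompt)` returns `base * (1 + 0.3 * judge + 0.2 * flash)`, so the judge can boost the GT-anchored base by at most 50% and contributes nothing when the base similarity is zero.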
Quick capability diagnostics against the base model (the quickest subset that could be run on both):
| Task | Metric | Base | Trained | Delta |
|---|---|---|---|---|
| arc_easy | acc | 0.7891 | 0.7925 | +0.43% |
| arc_easy | acc_norm | 0.7609 | 0.7647 | +0.50% |
| lambada_openai | acc | 0.6912 | 0.6971 | +0.85% |
| lambada_openai | perplexity | 4.2433 | 4.0663 | -4.2% ↓ |
| openbookqa | acc | 0.3160 | 0.3180 | +0.63% |
| openbookqa | acc_norm | 0.4100 | 0.4080 | -0.49% |
| piqa | acc | 0.7797 | 0.7824 | +0.35% |
| piqa | acc_norm | 0.7807 | 0.7797 | -0.13% |
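The task and metric names match EleutherAI's lm-evaluation-harness, so a comparison like this can presumably be reproduced along these lines (harness usage is an assumption; the model path is a placeholder):

```python
import lm_eval

# Assumption: the table above came from lm-evaluation-harness (v0.4+ Python API).
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=path/to/base-or-trained-model",
    tasks=["arc_easy", "lambada_openai", "openbookqa", "piqa"],
)
print(results["results"])
```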