Experimental GRPO "continued pretraining": the model is rewarded for completions that resemble the target data, measured in several ways of varying complexity. Rewards are computed differently for creative text and for code.
Trained for 1034 steps on an RTX 3090 with a rank-128 QLoRA (alpha 256). A learning rate of 1e-6 seemed ideal.
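For context, a minimal sketch of what this configuration could look like with trl and peft. Only the rank, alpha, learning rate, and step count come from the note above; the 4-bit quantization settings, batch size, and generation count are assumptions, and the trainer wiring (including `reward_fn`) is sketched after the reward code below.

```python
# Hypothetical training configuration (trl + peft). Rank 128, alpha 256,
# lr 1e-6, and 1034 steps come from the card; everything else is assumed.
from peft import LoraConfig
from transformers import BitsAndBytesConfig
from trl import GRPOConfig

peft_config = LoraConfig(
    r=128,                           # QLoRA rank (from the card)
    lora_alpha=256,                  # alpha (from the card)
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)

training_args = GRPOConfig(
    output_dir="grpo-continued-pretrain",
    learning_rate=1e-6,              # "seemed ideal"
    max_steps=1034,
    per_device_train_batch_size=4,   # assumed
    num_generations=4,               # completions sampled per prompt (assumed)
    bf16=True,
    # QLoRA: load the base model in 4-bit (assumed settings)
    model_init_kwargs={"quantization_config": BitsAndBytesConfig(load_in_4bit=True)},
)
```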
Reward functions:
```python
import Levenshtein
import torch
from rouge_score import rouge_scorer
from sacrebleu.metrics import CHRF
from sentence_transformers import SentenceTransformer


class RewardModel:
    """Multi-domain reward model for GRPO pretraining."""

    def __init__(self, device: str = "cuda"):
        self.device = device
        # Load components
        print("Loading reward model components...")
        self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
        self.chrf = CHRF(word_order=2)  # word_order=2 -> chrF++
        self.rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    def _levenshtein_distance(self, s1: str, s2: str) -> int:
        """C-optimized Levenshtein distance."""
        return Levenshtein.distance(s1, s2)

    def compute_reward(
        self,
        prediction: str,
        reference: str,
        reward_type: str,
    ) -> float:
        """Compute reward for a single prediction."""
        if not prediction or not reference:
            return 0.0

        if reward_type == "creative":
            # Semantic similarity + chrF++
            embs = self.semantic_model.encode([reference, prediction], convert_to_tensor=True)
            sem_score = torch.nn.functional.cosine_similarity(embs[0:1], embs[1:2]).item()
            chrf_score = self.chrf.sentence_score(prediction, [reference]).score / 100.0
            return 0.4 * sem_score + 0.6 * chrf_score

        elif reward_type == "hybrid":
            # Semantic similarity + ROUGE-L + length penalty
            embs = self.semantic_model.encode([reference, prediction], convert_to_tensor=True)
            sem_score = torch.nn.functional.cosine_similarity(embs[0:1], embs[1:2]).item()
            rouge_result = self.rouge.score(reference, prediction)
            rouge_l = rouge_result['rougeL'].fmeasure
            # Penalize completions whose word count drifts from the reference's
            ref_len = len(reference.split())
            pred_len = len(prediction.split())
            if ref_len > 0:
                len_ratio = pred_len / ref_len
                length_penalty = max(0.0, 1.0 - abs(1.0 - len_ratio))
            else:
                length_penalty = 1.0 if pred_len == 0 else 0.0
            return 0.6 * sem_score + 0.3 * rouge_l + 0.1 * length_penalty

        elif reward_type == "levenshtein":
            # Normalized character-level edit distance
            max_len = max(len(prediction), len(reference))
            if max_len == 0:
                return 1.0
            dist = self._levenshtein_distance(prediction, reference)
            return max(0.0, 1.0 - (dist / max_len))

        else:
            raise ValueError(f"Unknown reward type: {reward_type}")
```
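A hedged sketch of wiring `RewardModel` into trl's `GRPOTrainer`, continuing the config sketch above. trl passes extra dataset columns to reward functions as keyword arguments, so per-example `reference` and `reward_type` columns (names assumed) can select the scoring path:

```python
# Hypothetical GRPO wiring; the dataset columns and model ID are placeholders.
from datasets import Dataset
from trl import GRPOTrainer

rm = RewardModel()

def reward_fn(completions, reference, reward_type, **kwargs):
    """Score each sampled completion against its paired reference."""
    return [
        rm.compute_reward(completion, ref, rt)
        for completion, ref, rt in zip(completions, reference, reward_type)
    ]

dataset = Dataset.from_list([
    # One row per training prompt; columns beyond "prompt" reach reward_fn.
    {"prompt": "Continue the story: The rain",
     "reference": "The rain fell all night without pause.",
     "reward_type": "creative"},
])

trainer = GRPOTrainer(
    model="BASE_MODEL_ID",   # placeholder
    reward_funcs=reward_fn,
    args=training_args,      # from the config sketch above
    train_dataset=dataset,
    peft_config=peft_config,
)
trainer.train()
```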
Quick capability diagnostics against the base model (the quickest subset that could be run on both); a sketch of the evaluation call follows the table:
| Task | Metric | Base | Trained | Delta |
|---|---|---|---|---|
| arc_easy | acc | 0.7891 | 0.7896 | +0.05% |
| arc_easy | acc_norm | 0.7609 | 0.7618 | +0.09% |
| lambada_openai | acc | 0.6912 | 0.6973 | +0.61% |
| lambada_openai | perplexity | 4.2433 | 4.0443 | -4.7% ↓ |
| openbookqa | acc | 0.3160 | 0.3180 | +0.20% |
| openbookqa | acc_norm | 0.4100 | 0.4040 | -0.60% |
| piqa | acc | 0.7797 | 0.7818 | +0.21% |
| piqa | acc_norm | 0.7807 | 0.7786 | -0.21% |

*Accuracy deltas are absolute percentage points; the perplexity delta is relative (lower is better).*
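The task and metric names match EleutherAI's lm-evaluation-harness; a hedged sketch of reproducing the comparison with its Python API, with the model ID as a placeholder:

```python
# Hypothetical re-run of the diagnostics via lm-evaluation-harness.
# Evaluate the base model and the trained checkpoint with the same call
# and compare metrics; TRAINED_MODEL_ID is a placeholder.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=TRAINED_MODEL_ID,dtype=bfloat16",
    tasks=["arc_easy", "lambada_openai", "openbookqa", "piqa"],
    batch_size=8,
)
for task, metrics in results["results"].items():
    print(task, metrics)
```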