Spaces:

Siddh12334
/

context-corruption-training

Paused

App Files Files Community

Siddh12334 committed on Apr 25

Commit

204fa23

verified ·

1 Parent(s): 5d65fb7

feat: training space with manual start UI

Browse files

Files changed (14) hide show

Dockerfile +20 -0
README.md +12 -4
data/__init__.py +0 -0
data/corruption.py +227 -0
data/generator.py +69 -0
data/loader.py +205 -0
environment/__init__.py +0 -0
environment/actions.py +53 -0
environment/env.py +146 -0
environment/reward.py +75 -0
environment/server.py +24 -0
training/ContextCorruption_GRPO.ipynb +300 -0
training/space_runner.py +139 -0
training/train_grpo.py +324 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,20 @@

+# Image for the training Space: CUDA-enabled PyTorch base plus the GRPO stack.
+FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-devel
+WORKDIR /app
+# git is required because unsloth is installed straight from its GitHub repo.
+RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+# Training deps — separate from server requirements
+# NOTE(review): nothing here is version-pinned, so rebuilds may pull different releases.
+RUN pip install --no-cache-dir \
+    "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" \
+    trl transformers datasets accelerate \
+    openenv-core fastapi uvicorn pydantic \
+    wandb faker python-dotenv gradio
+COPY . .
+# Try to build data/facts.json at image-build time; `|| echo` keeps the build
+# green when the download fails, in which case the env uses its built-in fact.
+RUN python -m data.loader || echo "Will use fallback facts"
+# Port matches `app_port: 7860` in the Space README.
+EXPOSE 7860
+CMD ["python", "-m", "training.space_runner"]

README.md CHANGED Viewed

@@ -1,10 +1,18 @@
 ---
 title: Context Corruption Training
-emoji: 🌖
-colorFrom: green
-colorTo: purple
 sdk: docker
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Context Corruption Training
+emoji: 🏋️
+colorFrom: purple
+colorTo: red
 sdk: docker
+app_port: 7860
 pinned: false
 ---
+# ContextCorruption-Env — GRPO Training Space
+Click **Start Training** in the UI. Set secrets first in Space Settings:
+- `WANDB_API_KEY`
+- `HF_TOKEN`
+- `HF_HUB_MODEL_ID` (e.g. `Siddh12334/qwen-1.5b-context-corruption`)
+Upgrade hardware to **A10G Small** before starting (~$1.05/hr, ~1.5 hrs total).

data/__init__.py ADDED Viewed

File without changes

data/corruption.py ADDED Viewed

	@@ -0,0 +1,227 @@

+import random
+import re
+try:
+    from faker import Faker
+except ModuleNotFoundError:
+    Faker = None
+class _FallbackFaker:
+    """Tiny stand-in for ``faker.Faker`` used when the package is not installed.
+    Only the four generators this module calls are provided; each draws from a
+    small fixed pool through the global ``random`` state.
+    """
+    _NAMES = ("Alex Morgan", "Jordan Lee", "Taylor Brooks", "Casey Patel")
+    _LAST_NAMES = ("Morgan", "Lee", "Brooks", "Patel", "Reed")
+    _COMPANIES = ("Global Research Institute", "Civic Data Group", "Archive Analytics Lab")
+    _WORDS = ("revised", "alternate", "disputed", "corrected")
+    def name(self) -> str:
+        """Return a random full name."""
+        return random.choice(self._NAMES)
+    def last_name(self) -> str:
+        """Return a random surname."""
+        return random.choice(self._LAST_NAMES)
+    def company(self) -> str:
+        """Return a random organisation name."""
+        return random.choice(self._COMPANIES)
+    def word(self) -> str:
+        """Return a random lowercase filler word."""
+        return random.choice(self._WORDS)
+# Name/company/word generator: the real Faker when its import succeeded,
+# otherwise the fixed-pool _FallbackFaker defined in this module.
+fake = Faker() if Faker else _FallbackFaker()
+# Replacement pools used by corrupt_entity(): an answer found in one of these
+# lists is swapped for a different member of the same list.
+COUNTRIES = [
+    "France",
+    "Germany",
+    "Brazil",
+    "Japan",
+    "Canada",
+    "India",
+    "Australia",
+    "Kenya",
+    "Mexico",
+    "Norway",
+]
+CITIES = [
+    "Paris",
+    "Berlin",
+    "Tokyo",
+    "Toronto",
+    "Mumbai",
+    "Sydney",
+    "Nairobi",
+    "Mexico City",
+    "Oslo",
+    "Rome",
+]
+ORGANIZATIONS = [
+    "World Health Organization",
+    "United Nations",
+    "NASA",
+    "Oxford University",
+    "Reuters",
+    "Smithsonian Institution",
+    "International Monetary Fund",
+    "Royal Society",
+]
+# Word -> opposite lookup used by corrupt_inversion(). Keys are lowercase and
+# every pair is listed in both directions.
+ANTONYMS = {
+    "largest": "smallest",
+    "smallest": "largest",
+    "first": "last",
+    "last": "first",
+    "highest": "lowest",
+    "lowest": "highest",
+    "won": "lost",
+    "lost": "won",
+    "north": "south",
+    "south": "north",
+    "east": "west",
+    "west": "east",
+    "increase": "decrease",
+    "decrease": "increase",
+    "before": "after",
+    "after": "before",
+    "true": "false",
+    "false": "true",
+    "older": "newer",
+    "newer": "older",
+    "major": "minor",
+    "minor": "major",
+}
+def _preserve_case(original: str, replacement: str) -> str:
+    """Return *replacement* re-cased to follow the casing of *original*.
+    Args:
+        original: The text being replaced; only its casing is inspected.
+        replacement: The text that will take its place.
+    Returns:
+        Upper-cased replacement for an all-caps original, lower-cased for an
+        all-lowercase original. For a title-cased original the replacement is
+        title-cased only when it is entirely lowercase. Any other original
+        (mixed case, or no cased characters) returns the replacement as is.
+    """
+    if original.isupper():
+        return replacement.upper()
+    if original.istitle():
+        # str.title() lowercases everything after the first letter of each
+        # word, which would mangle replacements that already carry their own
+        # capitalisation ("NASA" -> "Nasa", "McDonald" -> "Mcdonald").
+        return replacement.title() if replacement.islower() else replacement
+    if original.islower():
+        return replacement.lower()
+    return replacement
+def _replace_first_case_insensitive(text: str, target: str, replacement: str) -> str:
+    """Swap the first case-insensitive occurrence of *target* in *text*.
+    The inserted text is *replacement* re-cased by ``_preserve_case`` to follow
+    the casing of the matched span. *text* is returned unchanged when *target*
+    does not occur.
+    """
+    found = re.search(re.escape(target), text, flags=re.IGNORECASE)
+    if found is None:
+        return text
+    recased = _preserve_case(found.group(0), replacement)
+    return text[: found.start()] + recased + text[found.end() :]
+def _different_choice(options: list[str], current: str) -> str:
+    """Pick a random entry of *options* that differs from *current*.
+    The comparison ignores case. When every option equals *current*, the pick
+    falls back to the full *options* list.
+    """
+    current_folded = current.lower()
+    candidates = []
+    for option in options:
+        if option.lower() != current_folded:
+            candidates.append(option)
+    pool = candidates if candidates else options
+    return random.choice(pool)
+def corrupt_number(text: str, answer: str) -> str:
+    """Corrupt one standalone number in *text*, or append a bogus revision note.
+    A randomly chosen whole number token is rewritten: four-digit values in
+    1900..2030 are treated as years and shifted by 5, 10 or 20; anything else
+    is scaled by 0.5, 2, 3, 5 or 10 (never below 1).
+    Args:
+        text: Document text to corrupt.
+        answer: Ground-truth answer, only used in the no-number fallback note.
+    Returns:
+        The corrupted text. It always differs from *text*.
+    """
+    matches = list(re.finditer(r"\b\d+\b", text))
+    if not matches:
+        return (
+            f"{text} A later statistical revision changed the reported figure "
+            f"from {answer} to {random.randint(12, 98)}."
+        )
+    target = random.choice(matches)
+    original = target.group(0)
+    value = int(original)
+    if len(original) == 4 and 1900 <= value <= 2030:
+        candidates = [value + delta for delta in (-20, -10, -5, 5, 10, 20)]
+    else:
+        candidates = [max(1, int(round(value * factor))) for factor in (0.5, 2, 3, 5, 10)]
+    # Drop no-op results (e.g. 1 * 0.5 rounds back to 1) so the document is
+    # always actually changed; value * 2 guarantees at least one survivor.
+    candidates = [candidate for candidate in candidates if candidate != value]
+    replacement = str(random.choice(candidates))
+    # Splice at the chosen match instead of str.replace(), which would rewrite
+    # the first place those digits occur, possibly inside a longer number.
+    return text[: target.start()] + replacement + text[target.end() :]
+def corrupt_entity(text: str, answer: str) -> str:
+    """Replace the answer entity in *text* with a different, plausible entity.
+    Args:
+        text: Document text to corrupt.
+        answer: Ground-truth answer expected to appear in *text*.
+    Returns:
+        *text* with the first occurrence of *answer* swapped for another member
+        of its pool (country, city or organisation), or for a generated person
+        name when the answer is at most three words and in no pool. When the
+        answer is empty, absent from *text*, or longer than three words and in
+        no pool, a conflicting attribution sentence is appended instead.
+    """
+    answer = answer.strip()
+    if answer and re.search(re.escape(answer), text, re.IGNORECASE):
+        # Match pools case-insensitively, in line with the search above and
+        # with _different_choice(); an exact `in` test sent "paris" down the
+        # person-name path.
+        folded = answer.lower()
+        for pool in (COUNTRIES, CITIES, ORGANIZATIONS):
+            if any(entry.lower() == folded for entry in pool):
+                replacement = _different_choice(pool, answer)
+                return _replace_first_case_insensitive(text, answer, replacement)
+        if len(answer.split()) <= 3:
+            generated_names = [fake.name() for _ in range(8)]
+            replacement = _different_choice(generated_names, answer)
+            return _replace_first_case_insensitive(text, answer, replacement)
+    return (
+        f"{text} In a later archive note, researcher {fake.name()} attributed "
+        f"the answer to {fake.name()} instead."
+    )
+def corrupt_inversion(text: str, answer: str) -> str:
+    """Flip the first ANTONYMS word found in *text* to its opposite.
+    Matching is whole-word and case-insensitive, and the swapped word keeps the
+    casing of the word it replaces. When no such word occurs, a sentence
+    disputing *answer* is appended instead.
+    """
+    alternation = "|".join(re.escape(word) for word in ANTONYMS)
+    found = re.search(r"\b(" + alternation + r")\b", text, re.IGNORECASE)
+    if found is None:
+        return (
+            f"{text} This statement contradicts earlier scholarly consensus, "
+            f"which identified {answer} as incorrect."
+        )
+    matched_word = found.group(0)
+    opposite = _preserve_case(matched_word, ANTONYMS[matched_word.lower()])
+    return text[: found.start()] + opposite + text[found.end() :]
+def _generate_wrong_answer(answer: str) -> str:
+    """Build a plausible but incorrect variant of *answer*.
+    Strategy, in order: nudge the first number by 1, 2 or 5; swap a single
+    capitalised word for a random surname; shuffle the words of a multi-word
+    answer; otherwise return a random filler word.
+    Args:
+        answer: Ground-truth answer; surrounding whitespace is ignored.
+    Returns:
+        The distractor string. Numeric results are never negative.
+    """
+    answer = answer.strip()
+    if not answer:
+        return fake.word().title()
+    number_match = re.search(r"\d+", answer)
+    if number_match:
+        value = int(number_match.group(0))
+        # Only keep shifts that stay non-negative: "3" must not become "-2".
+        # The positive deltas guarantee the list is never empty.
+        deltas = [delta for delta in (-5, -2, -1, 1, 2, 5) if value + delta >= 0]
+        mutated = str(value + random.choice(deltas))
+        return answer[: number_match.start()] + mutated + answer[number_match.end() :]
+    words = answer.split()
+    if len(words) == 1 and words[0][:1].isupper():
+        # The surname generator can return the answer itself (the fallback
+        # pool has five entries), so retry a few times before giving up.
+        for _ in range(8):
+            candidate = fake.last_name()
+            if candidate.lower() != answer.lower():
+                return candidate
+        return f"{answer} Institute"
+    if len(words) > 1:
+        shuffled = words[:]
+        random.shuffle(shuffled)
+        if shuffled != words:
+            return " ".join(shuffled)
+        return f"{answer} Institute"
+    return fake.word()
+def corrupt_coherent(text: str, answer: str) -> str:
+    """Rewrite *text* so it consistently supports a wrong answer.
+    The first occurrence of *answer* (case-insensitive), if any, is replaced by
+    a generated distractor, and a fabricated citation naming the same
+    distractor is appended.
+    """
+    distractor = _generate_wrong_answer(answer)
+    publication_year = random.randint(2015, 2025)
+    publisher = fake.company()
+    citation_styles = (
+        "a peer-reviewed survey",
+        "an institutional archive",
+        "a longitudinal review",
+        "a Reuters-style fact check",
+    )
+    citation = random.choice(citation_styles)
+    body = text
+    if answer:
+        if re.search(re.escape(answer), text, re.IGNORECASE) is not None:
+            body = _replace_first_case_insensitive(text, answer, distractor)
+    closing = (
+        f"According to {citation} released by {publisher} in {publication_year}, the verified "
+        f"answer is {distractor}, based on revised primary-source evidence."
+    )
+    return f"{body} {closing}"
+def corrupt_text(text: str, answer: str, level: int) -> str:
+    """Corrupt *text* with the strategy selected by *level*.
+    Levels up to 1 alter a number, 2 swaps the entity, 3 inverts a word and
+    anything else produces the coherent rewrite. If the chosen strategy raises,
+    a generic conflicting-source sentence is appended instead.
+    """
+    try:
+        if level <= 1:
+            strategy = corrupt_number
+        else:
+            by_level = {2: corrupt_entity, 3: corrupt_inversion}
+            strategy = by_level.get(level, corrupt_coherent)
+        return strategy(text, answer)
+    except Exception:
+        return (
+            f"{text} A conflicting secondary source reports a different answer "
+            f"than {answer}."
+        )

data/generator.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import random
+from typing import Any
+from data.corruption import corrupt_text
+# Publisher names drawn at random for each generated document; used both in
+# the document title and in the {source} slot of the templates below.
+SOURCES = [
+    "Encyclopedia Britannica",
+    "Reuters Fact Check",
+    "National Geographic",
+    "Smithsonian Magazine",
+    "BBC Reference Desk",
+    "Oxford Reference",
+    "World Almanac",
+    "Associated Press Archive",
+    "Library of Congress Notes",
+    "Academic Knowledge Base",
+]
+# Sentence templates for document content. Each is filled with str.format()
+# using the named fields {source}, {question} and {answer}.
+TEMPLATES = [
+    "{source} summarizes the question '{question}' and identifies the answer as {answer}.",
+    "In its reference entry, {source} states that the correct answer to '{question}' is {answer}.",
+    "{source} records {answer} as the accepted answer when asked: '{question}'",
+    "A background note from {source} explains that {answer} is the established response to '{question}'",
+    "According to {source}, researchers commonly answer '{question}' with {answer}.",
+    "{source} lists the verified answer for '{question}' as {answer}, matching standard references.",
+    "The archive maintained by {source} gives {answer} as the answer to '{question}'",
+    "For the prompt '{question}', {source} reports that the answer is {answer}.",
+]
+def _as_text(value: Any, default: str = "") -> str:
+    """Return ``str(value)`` stripped of whitespace, or *default* when empty.
+    ``None`` and values whose string form is blank both yield *default*.
+    """
+    stripped = "" if value is None else str(value).strip()
+    return stripped if stripped else default
+def generate_documents(
+    fact: dict[str, Any],
+    num_docs: int = 8,
+    corrupt_positions: list[int] | None = None,
+) -> list[dict[str, Any]]:
+    """Generate *num_docs* short documents about *fact*, corrupting some of them.
+    Args:
+        fact: Mapping with ``question`` and ``answer`` entries; missing or blank
+            values fall back to "Unknown question?" and "unknown".
+        num_docs: Number of documents to produce; ids run from 0.
+        corrupt_positions: Ids of documents to corrupt. The position of an id
+            in this list (1-based, capped at 4) is the corruption level passed
+            to ``corrupt_text``.
+    Returns:
+        One dict per document with keys ``id``, ``title``, ``content`` and
+        ``is_corrupt``.
+    """
+    question = _as_text(fact.get("question"), "Unknown question?")
+    answer = _as_text(fact.get("answer"), "unknown")
+    level_by_doc: dict[int, int] = {}
+    for rank, position in enumerate(corrupt_positions or [], start=1):
+        level_by_doc[position] = rank
+    documents: list[dict[str, Any]] = []
+    for doc_id in range(num_docs):
+        # Draw order (source, then template) is kept so seeded runs match.
+        source = random.choice(SOURCES)
+        template = random.choice(TEMPLATES)
+        content = template.format(source=source, question=question, answer=answer)
+        is_corrupt = doc_id in level_by_doc
+        if is_corrupt:
+            content = corrupt_text(content, answer, min(level_by_doc[doc_id], 4))
+        document = {
+            "id": doc_id,
+            "title": f"{source} Document {doc_id + 1}",
+            "content": content,
+            "is_corrupt": is_corrupt,
+        }
+        documents.append(document)
+    return documents

data/loader.py ADDED Viewed

	@@ -0,0 +1,205 @@

+import json
+import random
+import urllib.request
+import ast
+from pathlib import Path
+from typing import Any
+FACTS_PATH = Path(__file__).parent / "facts.json"
+FAITHEVAL_COUNTERFACTUAL_URL = (
+    "https://raw.githubusercontent.com/SalesforceAIResearch/FaithEval/main/"
+    "data/counterfactual.json"
+)
+# Thin pass-through to datasets.load_dataset. The import happens inside the
+# function, so importing this module does not itself import `datasets`.
+def _load_dataset(*args: Any, **kwargs: Any) -> Any:
+    from datasets import load_dataset
+    return load_dataset(*args, **kwargs)
+def _first_text(value: Any) -> str | None:
+    """Extract the first useful text value from nested dataset fields.
+    Args:
+        value: ``None``, a string, a number, a dict or a list/tuple, possibly
+            nested. Strings that look like a list literal ("[...]") are parsed
+            and searched as well.
+    Returns:
+        The first non-empty stripped string found, numbers rendered with
+        ``str()``, or ``None`` when nothing usable exists.
+    """
+    if value is None:
+        return None
+    if isinstance(value, str):
+        text = value.strip()
+        if text.startswith("[") and text.endswith("]"):
+            try:
+                parsed = ast.literal_eval(text)
+            except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
+                # literal_eval raises more than SyntaxError/ValueError on
+                # malformed input (e.g. TypeError for an unhashable dict key);
+                # any of them just means "not a literal, keep the raw text".
+                parsed = None
+            parsed_text = _first_text(parsed)
+            if parsed_text:
+                return parsed_text
+        return text or None
+    if isinstance(value, (int, float)):
+        return str(value)
+    if isinstance(value, dict):
+        # Probe the keys known to hold answer text, in priority order.
+        for key in ("text", "answer", "answers", "value"):
+            text = _first_text(value.get(key))
+            if text:
+                return text
+        return None
+    if isinstance(value, (list, tuple)):
+        for item in value:
+            text = _first_text(item)
+            if text:
+                return text
+    return None
+def _word_count(text: str) -> int:
+    """Return the number of whitespace-separated tokens in *text*."""
+    tokens = text.split()
+    return len(tokens)
+def _clean_question(text: Any) -> str | None:
+    """Normalise a raw question field into a string ending in "?".
+    Returns ``None`` when ``_first_text`` finds no usable text in *text*.
+    """
+    raw = _first_text(text)
+    if not raw:
+        return None
+    cleaned = raw.strip()
+    return cleaned if cleaned.endswith("?") else f"{cleaned}?"
+def _natural_questions_answer(row: dict[str, Any]) -> str | None:
+    """Return the row's first short answer if it has at most five words.
+    The answer is read from ``row["annotations"]["short_answers"]``; rows with
+    no short answer, or a longer one, yield ``None``.
+    """
+    annotations = row.get("annotations") or {}
+    candidate = _first_text(annotations.get("short_answers"))
+    if not candidate:
+        return None
+    return candidate if _word_count(candidate) <= 5 else None
+def load_natural_questions(n: int = 300) -> list[dict[str, str]]:
+    """Stream Natural Questions and collect short-answer facts.
+    Rows without a usable question or short answer are skipped. Streaming
+    stops as soon as the collected list reaches *n* entries.
+    """
+    stream = _load_dataset(
+        "google-research-datasets/natural_questions",
+        split="train",
+        streaming=True,
+    )
+    collected: list[dict[str, str]] = []
+    for record in stream:
+        question = _clean_question(record.get("question") or record.get("question_text"))
+        answer = _natural_questions_answer(record)
+        if question and answer:
+            entry = {
+                "question": question,
+                "answer": answer,
+                "source": "natural_questions",
+                "conflict_type": "entity",
+            }
+            collected.append(entry)
+            if len(collected) >= n:
+                break
+    return collected
+def load_popqa(n: int = 150) -> list[dict[str, str]]:
+    """Collect facts from the PopQA test split.
+    The answer is the first entry of ``possible_answers``. Rows lacking a
+    question or answer are skipped, and iteration stops once the collected
+    list reaches *n* entries.
+    """
+    rows = _load_dataset("akariasai/PopQA", split="test")
+    collected: list[dict[str, str]] = []
+    for record in rows:
+        question = _clean_question(record.get("question"))
+        answer = _first_text(record.get("possible_answers"))
+        if question and answer:
+            entry = {
+                "question": question,
+                "answer": answer,
+                "source": "popqa",
+                "conflict_type": "entity",
+                "entity": _first_text(record.get("subj") or record.get("entity")) or "",
+                "relation": _first_text(record.get("prop") or record.get("relation")) or "",
+            }
+            collected.append(entry)
+            if len(collected) >= n:
+                break
+    return collected
+def _iter_faitheval_items(payload: Any) -> list[dict[str, Any]]:
+    """Return the dict records contained in a decoded FaithEval payload.
+    A list payload is filtered directly. A dict payload is searched under the
+    keys "data", "examples", "items" and "counterfactual", and the first one
+    holding a list wins. Anything else yields an empty list.
+    """
+    def only_dicts(entries: list) -> list[dict[str, Any]]:
+        return [entry for entry in entries if isinstance(entry, dict)]
+    if isinstance(payload, list):
+        return only_dicts(payload)
+    if not isinstance(payload, dict):
+        return []
+    for key in ("data", "examples", "items", "counterfactual"):
+        candidate = payload.get(key)
+        if isinstance(candidate, list):
+            return only_dicts(candidate)
+    return []
+def load_faitheval_counterfactual(n: int = 100) -> list[dict[str, str]]:
+    """Download FaithEval counterfactual items and convert them to facts.
+    Any failure while fetching or decoding the JSON yields an empty list.
+    Items without a usable question or answer are skipped, and collection
+    stops once the list reaches *n* entries.
+    """
+    try:
+        with urllib.request.urlopen(FAITHEVAL_COUNTERFACTUAL_URL, timeout=20) as response:
+            raw_body = response.read().decode("utf-8")
+            payload = json.loads(raw_body)
+    except Exception:
+        return []
+    collected: list[dict[str, str]] = []
+    for item in _iter_faitheval_items(payload):
+        raw_question = item.get("question") or item.get("query") or item.get("claim")
+        raw_answer = (
+            item.get("answer")
+            or item.get("gold_answer")
+            or item.get("label")
+            or item.get("target")
+        )
+        question = _clean_question(raw_question)
+        answer = _first_text(raw_answer)
+        if question and answer:
+            raw_context = (
+                item.get("provided_context")
+                or item.get("context")
+                or item.get("evidence")
+            )
+            entry = {
+                "question": question,
+                "answer": answer,
+                "source": "faitheval",
+                "conflict_type": "counterfactual",
+                "provided_context": _first_text(raw_context) or "",
+            }
+            collected.append(entry)
+            if len(collected) >= n:
+                break
+    return collected
+def build_fact_database() -> list[dict[str, str]]:
+    """Collect facts from every source, shuffle them and write ``FACTS_PATH``.
+    Each loader runs independently, so one unavailable dataset no longer
+    discards what the others produced. When nothing at all could be loaded the
+    file is left untouched, because an empty JSON list on disk would be loaded
+    by the environment and leave it with no fact to sample.
+    Returns:
+        The shuffled list of facts (empty when every source failed).
+    """
+    facts: list[dict[str, str]] = []
+    for loader in (load_natural_questions, load_popqa, load_faitheval_counterfactual):
+        try:
+            facts.extend(loader())
+        except Exception as exc:
+            # Best-effort build step: report the failing source and carry on.
+            print(f"Skipping {loader.__name__}: {exc!r}")
+    if not facts:
+        print(f"No facts loaded; leaving {FACTS_PATH} untouched")
+        return facts
+    random.shuffle(facts)
+    FACTS_PATH.parent.mkdir(parents=True, exist_ok=True)
+    with open(FACTS_PATH, "w", encoding="utf-8") as f:
+        json.dump(facts, f, indent=2, ensure_ascii=False)
+    counts: dict[str, int] = {}
+    for fact in facts:
+        source = fact.get("source", "unknown")
+        counts[source] = counts.get(source, 0) + 1
+    print(f"Wrote {len(facts)} facts to {FACTS_PATH}")
+    print(f"Source counts: {counts}")
+    return facts
+if __name__ == "__main__":
+    build_fact_database()

environment/__init__.py ADDED Viewed

File without changes

environment/actions.py ADDED Viewed

	@@ -0,0 +1,53 @@

+from enum import Enum
+from typing import Optional
+from pydantic import BaseModel, field_validator
+from openenv.core import Action, Observation, State
+# Closed set of agent actions. Members are str-valued, so the member value is
+# the same string as its name.
+class ActionType(str, Enum):
+    read_doc = "read_doc"
+    flag_suspicious = "flag_suspicious"
+    unflag_doc = "unflag_doc"
+    submit_answer = "submit_answer"
+# One agent action. Only action_type is required; the other fields default to
+# None.
+class ContextCorruptionAction(Action):
+    action_type: ActionType
+    # Target document id (optional).
+    doc_id: Optional[int] = None
+    # Submitted answer text (optional).
+    answer: Optional[str] = None
+    # Self-reported confidence; validated below to lie in [0.0, 1.0].
+    confidence: Optional[float] = None
+    # Reject confidences outside the closed interval [0, 1]; None is allowed.
+    @field_validator("confidence")
+    @classmethod
+    def confidence_range(cls, v):
+        if v is not None and not (0.0 <= v <= 1.0):
+            raise ValueError("confidence must be between 0.0 and 1.0")
+        return v
+# Agent-visible view of one document: id, title, content and whether it is
+# currently flagged. It has no field revealing whether the document is corrupt.
+class Document(BaseModel):
+    id: int
+    title: str
+    content: str
+    is_flagged: bool = False
+# What the agent sees after reset() and after every step().
+class EpisodeObservation(Observation):
+    question: str = ""
+    documents: list[Document] = []
+    # Ids of the documents currently flagged as suspicious.
+    flagged_ids: list[int] = []
+    # Steps left before the episode is ended automatically.
+    budget_remaining: int = 0
+    turn: int = 0
+    # Optional free-text notice from the environment.
+    message: Optional[str] = None
+    # `done` and `reward` inherited from Observation
+# Full environment state, including the ground truth and the corrupt document
+# ids, which EpisodeObservation does not carry.
+class ContextCorruptionState(State):
+    question: str = ""
+    ground_truth: str = ""
+    corrupt_ids: list[int] = []
+    flagged_ids: list[int] = []
+    budget_used: int = 0
+    done: bool = False
+    # Final reward and its per-component breakdown; None until scored.
+    reward: Optional[float] = None
+    breakdown: Optional[dict] = None

environment/env.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import json
+import random
+from pathlib import Path
+from openenv.core import Environment
+from environment.actions import (
+    ActionType, ContextCorruptionAction, Document,
+    EpisodeObservation, ContextCorruptionState,
+)
+from environment.reward import ContextCorruptionRubric
+_FALLBACK_FACTS = [
+    {"question": "What is the capital of France?", "answer": "Paris"}
+]
+class ContextCorruptionEnv(Environment[ContextCorruptionAction, EpisodeObservation, ContextCorruptionState]):
+    """Question-answering episode over documents of which some are corrupted.
+    Every episode shows ``NUM_DOCS`` generated documents for one fact. The
+    agent may flag or unflag documents and must submit an answer within
+    ``MAX_BUDGET`` steps; the final step is scored by
+    ``ContextCorruptionRubric``.
+    Args:
+        difficulty: Number of corrupted documents per episode. ``None`` draws a
+            value from ``DIFFICULTY_LEVELS`` at every reset.
+    """
+    MAX_BUDGET = 12
+    NUM_DOCS = 8
+    DIFFICULTY_LEVELS = [1, 2, 3, 4]
+    SUPPORTS_CONCURRENT_SESSIONS = True
+    def __init__(self, difficulty=None):
+        rubric = ContextCorruptionRubric(state_fn=self._state_dict)
+        super().__init__(rubric=rubric)
+        self.difficulty = difficulty
+        facts_path = Path(__file__).parent.parent / "data" / "facts.json"
+        self._facts = self._load_facts(facts_path)
+        self._reset_state()
+    @staticmethod
+    def _load_facts(facts_path):
+        """Load usable facts from *facts_path*, else return the built-in fallback.
+        A missing, unreadable, malformed or empty file, or one with no entry
+        holding both a question and an answer, yields the fallback so that
+        ``reset()`` always has something to sample.
+        """
+        try:
+            with open(facts_path, encoding="utf-8") as f:
+                loaded = json.load(f)
+        except (OSError, ValueError):
+            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+            return list(_FALLBACK_FACTS)
+        if not isinstance(loaded, list):
+            return list(_FALLBACK_FACTS)
+        usable = [
+            fact
+            for fact in loaded
+            if isinstance(fact, dict) and fact.get("question") and fact.get("answer")
+        ]
+        return usable or list(_FALLBACK_FACTS)
+    def _reset_state(self):
+        """Clear all per-episode bookkeeping."""
+        self._question = ""
+        self._ground_truth = ""
+        self._documents: list[dict] = []
+        self._corrupt_ids: list[int] = []
+        self._flagged_ids: list[int] = []
+        self._budget_used = 0
+        self._turn = 0
+        self._done = False
+        self._reward = None
+        self._breakdown = None
+    def reset(self, seed=None, episode_id=None, **kwargs) -> EpisodeObservation:
+        """Start a new episode and return its first observation.
+        Args:
+            seed: When given, seeds the module-level ``random`` generator.
+                NOTE(review): that generator is process-wide, so seeding here
+                also affects other environments in the same process.
+            episode_id: Accepted for interface compatibility; unused here.
+        """
+        self._reset_rubric()
+        self._reset_state()
+        if seed is not None:
+            random.seed(seed)
+        fact = random.choice(self._facts)
+        requested = self.difficulty if self.difficulty is not None else random.choice(self.DIFFICULTY_LEVELS)
+        # random.sample raises for a count above NUM_DOCS or below zero, so an
+        # out-of-range difficulty is clamped instead.
+        n_corrupt = max(0, min(int(requested), self.NUM_DOCS))
+        self._corrupt_ids = random.sample(range(self.NUM_DOCS), n_corrupt)
+        self._question = fact["question"]
+        self._ground_truth = fact["answer"]
+        try:
+            from data.generator import generate_documents
+            raw_docs = generate_documents(fact, num_docs=self.NUM_DOCS, corrupt_positions=self._corrupt_ids)
+        except Exception:
+            # Best-effort fallback: plain documents that all contain the answer.
+            raw_docs = [
+                {"id": i, "title": f"Document {i}", "content": fact["answer"], "is_corrupt": i in self._corrupt_ids}
+                for i in range(self.NUM_DOCS)
+            ]
+        self._documents = raw_docs
+        return self._apply_transform(self._build_observation())
+    def step(self, action: ContextCorruptionAction, timeout_s=None, **kwargs) -> EpisodeObservation:
+        """Apply one action; every action costs one unit of budget.
+        The episode ends on ``submit_answer`` or when the budget is exhausted,
+        and the observation returned for that final step carries the reward.
+        """
+        if self._done:
+            return self._apply_transform(self._build_observation(message="Episode already done."))
+        self._turn += 1
+        self._budget_used += 1
+        if action.action_type == ActionType.read_doc:
+            # Documents are already fully present in every observation, so
+            # reading changes no state beyond the budget spent above.
+            pass
+        elif action.action_type == ActionType.flag_suspicious:
+            if action.doc_id is not None and action.doc_id not in self._flagged_ids:
+                self._flagged_ids.append(action.doc_id)
+        elif action.action_type == ActionType.unflag_doc:
+            if action.doc_id in self._flagged_ids:
+                self._flagged_ids.remove(action.doc_id)
+        elif action.action_type == ActionType.submit_answer:
+            self._done = True
+        # Force-submit on budget exhaustion
+        if self._budget_used >= self.MAX_BUDGET and not self._done:
+            self._done = True
+        obs = self._build_observation()
+        if obs.done:
+            obs.reward = self._apply_rubric(action, obs)
+            self._reward = obs.reward
+            self._breakdown = self.rubric.last_breakdown if self.rubric else None
+        return self._apply_transform(obs)
+    @property
+    def state(self) -> ContextCorruptionState:
+        """Snapshot of the full state, including ground truth and corrupt ids."""
+        return ContextCorruptionState(
+            question=self._question,
+            ground_truth=self._ground_truth,
+            corrupt_ids=list(self._corrupt_ids),
+            flagged_ids=list(self._flagged_ids),
+            budget_used=self._budget_used,
+            done=self._done,
+            reward=self._reward,
+            breakdown=self._breakdown,
+        )
+    def _state_dict(self) -> dict:
+        """Return the values the rubric needs to score the episode."""
+        return {
+            "ground_truth": self._ground_truth,
+            "flagged_ids": list(self._flagged_ids),
+            "corrupt_ids": list(self._corrupt_ids),
+            "budget_used": self._budget_used,
+            "max_budget": self.MAX_BUDGET,
+        }
+    def _build_observation(self, message=None) -> EpisodeObservation:
+        """Build the agent-facing observation; ``is_corrupt`` is not copied over."""
+        docs = [
+            Document(
+                id=d["id"],
+                title=d["title"],
+                content=d["content"],
+                is_flagged=d["id"] in self._flagged_ids,
+            )
+            for d in self._documents
+        ]
+        return EpisodeObservation(
+            question=self._question,
+            documents=docs,
+            flagged_ids=list(self._flagged_ids),
+            budget_remaining=self.MAX_BUDGET - self._budget_used,
+            turn=self._turn,
+            done=self._done,
+            reward=self._reward,
+            message=message,
+        )

environment/reward.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import re
+from openenv.core.rubrics import Rubric
+def _normalize(text: str) -> str:
+    """Lowercase *text*, drop punctuation and collapse whitespace runs.
+    Used to compare answers without regard to case, punctuation or spacing.
+    """
+    lowered = text.lower()
+    without_punctuation = re.sub(r"[^\w\s]", "", lowered)
+    single_spaced = re.sub(r"\s+", " ", without_punctuation)
+    return single_spaced.strip()
+def compute_reward(
+    submitted_answer: str,
+    ground_truth_answer: str,
+    flagged_ids: list[int],
+    corrupt_ids: list[int],
+    confidence: float,
+    budget_used: int,
+    max_budget: int,
+) -> tuple[float, dict]:
+    """Score a finished episode.
+    Components: 0.4 for an exact (normalised) answer match; up to 0.3 for the
+    share of corrupt documents flagged; 0.2 minus 0.1 per wrongly flagged
+    document (not below 0); +0.1 x confidence when correct or -0.2 x
+    confidence when wrong; up to 0.05 for unused budget.
+    Args:
+        submitted_answer: The agent's answer text.
+        ground_truth_answer: The correct answer.
+        flagged_ids: Document ids the agent flagged; duplicates count once.
+        corrupt_ids: Ids of the documents that really are corrupt.
+        confidence: Stated confidence; ``None``/NaN count as 0 and other values
+            are clamped to [0, 1].
+        budget_used: Steps consumed in the episode.
+        max_budget: Step limit; a non-positive limit earns no efficiency bonus.
+    Returns:
+        ``(total, breakdown)`` with every value rounded to 4 decimals.
+    """
+    normalized_answer = _normalize(submitted_answer)
+    # An empty submission must not count as correct just because the truth
+    # also normalises to nothing (e.g. punctuation only).
+    correct = bool(normalized_answer) and normalized_answer == _normalize(ground_truth_answer)
+    answer_score = 0.4 if correct else 0.0
+    # Sets make a repeated flag count once, which keeps recall within [0, 1].
+    flagged = set(flagged_ids)
+    corrupt = set(corrupt_ids)
+    recall = len(flagged & corrupt) / len(corrupt) if corrupt else 0.0
+    recall_score = 0.3 * recall
+    false_positives = flagged - corrupt
+    precision_score = max(0.0, 0.2 - 0.1 * len(false_positives))
+    # `x != x` is true only for NaN.
+    if confidence is None or confidence != confidence:
+        confidence = 0.0
+    # Callers that skip the action validator may pass raw model output (say
+    # 95 for "95%"); unclamped, that would swamp every other component.
+    confidence = min(1.0, max(0.0, float(confidence)))
+    calibration_score = (0.1 * confidence) if correct else (-0.2 * confidence)
+    if max_budget > 0:
+        used_fraction = min(1.0, max(0.0, budget_used / max_budget))
+    else:
+        used_fraction = 1.0
+    efficiency_score = 0.05 * (1 - used_fraction)
+    total = answer_score + recall_score + precision_score + calibration_score + efficiency_score
+    breakdown = {
+        "answer_correctness": round(answer_score, 4),
+        "flag_recall": round(recall_score, 4),
+        "false_positive_penalty": round(precision_score, 4),
+        "confidence_calibration": round(calibration_score, 4),
+        "efficiency": round(efficiency_score, 4),
+        "total": round(total, 4),
+    }
+    return round(total, 4), breakdown
+class ContextCorruptionRubric(Rubric):
+    """Rubric that scores a finished episode through ``compute_reward()``.
+    The ground truth is hidden from the agent's observation, so the rubric is
+    given a ``state_fn`` callable that returns it from the environment.
+    """
+    def __init__(self, state_fn):
+        super().__init__()
+        self._state_fn = state_fn
+        self.last_breakdown: dict = {}
+    def forward(self, action, observation) -> float:
+        """Return 0.0 until the episode is done, then the total reward.
+        The per-component breakdown of the last scored episode is kept in
+        ``last_breakdown``.
+        """
+        if not observation.done:
+            return 0.0
+        env_state = self._state_fn()
+        answer_text = getattr(action, "answer", None) or ""
+        stated_confidence = getattr(action, "confidence", None) or 0.0
+        total, components = compute_reward(
+            submitted_answer=answer_text,
+            ground_truth_answer=env_state["ground_truth"],
+            flagged_ids=env_state["flagged_ids"],
+            corrupt_ids=env_state["corrupt_ids"],
+            confidence=stated_confidence,
+            budget_used=env_state["budget_used"],
+            max_budget=env_state["max_budget"],
+        )
+        self.last_breakdown = components
+        return total

environment/server.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import os
+from dotenv import load_dotenv
+from openenv.core import create_app
+import uvicorn
+load_dotenv()
+from environment.actions import ContextCorruptionAction, EpisodeObservation
+from environment.env import ContextCorruptionEnv
+# Optional fixed difficulty from the DIFFICULTY environment variable; unset or
+# empty means each episode draws its own. A non-integer value raises ValueError
+# at import time.
+_difficulty_env = os.getenv("DIFFICULTY")
+_difficulty = int(_difficulty_env) if _difficulty_env else None
+# Value passed to create_app() as max_concurrent_envs (default 64).
+_max_sessions = int(os.getenv("MAX_CONCURRENT_ENVS", "64"))
+# The app is given a factory, not an instance; each call builds a new env.
+app = create_app(
+    env=lambda: ContextCorruptionEnv(difficulty=_difficulty),
+    action_cls=ContextCorruptionAction,
+    observation_cls=EpisodeObservation,
+    env_name="ContextCorruption-Env",
+    max_concurrent_envs=_max_sessions,
+)
+# Local entry point: serve on all interfaces on port 7860.
+if __name__ == "__main__":
+    uvicorn.run("environment.server:app", host="0.0.0.0", port=7860, reload=False)

training/ContextCorruption_GRPO.ipynb ADDED Viewed

	@@ -0,0 +1,300 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ContextCorruption-Env — GRPO Training\n",
+    "> **OpenEnv Hackathon | Meta × HuggingFace × PyTorch**\n",
+    "\n",
+    "Fine-tunes **Qwen2-1.5B-Instruct** with GRPO to identify corrupted documents and answer questions correctly.\n",
+    "\n",
+    "**Reward signal (fully deterministic, no LLM judge):**\n",
+    "| Component | Weight |\n",
+    "|---|---|\n",
+    "| Answer correctness (exact match after normalisation) | +0.40 |\n",
+    "| Corruption detection recall | +0.30 |\n",
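+    "| False-positive penalty (0.20 minus 0.10 per wrongly flagged document, floored at 0) | +0.20 |\n",
+    "| Confidence calibration (scaled by stated confidence) | +0.10 if correct / −0.20 if wrong |\n",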
+    "| Efficiency bonus | +0.05 |\n",
+    "\n",
+    "**Random baseline:** avg reward ≈ 0.13 — beat this to show improvement.\n",
+    "\n",
+    "---\n",
+    "⚠️ Requires **GPU runtime** (A100 recommended). Go to `Runtime → Change runtime type → GPU`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Install dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture\n",
+    "!pip install openenv-core==0.2.3 unsloth trl transformers datasets wandb faker python-dotenv"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Clone repo and generate facts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "REPO_URL = \"https://github.com/sas-dev5/context-corruption-env.git\"\n",
+    "\n",
+    "!git clone {REPO_URL}\n",
+    "%cd context-corruption-env\n",
+    "\n",
+    "# Generate facts.json (pulls Natural Questions, PopQA and FaithEval)\n",
+    "!python -m data.loader"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Authenticate WandB and HuggingFace"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import wandb\n",
+    "from huggingface_hub import login\n",
+    "\n",
+    "# Paste your keys here or set as Colab secrets\n",
+    "WANDB_API_KEY = os.getenv(\"WANDB_API_KEY\", \"\")\n",
+    "HF_TOKEN      = os.getenv(\"HF_TOKEN\", \"\")\n",
+    "HF_HUB_MODEL_ID = \"\"  # e.g. \"your-username/qwen-1.5b-context-corruption\" — leave blank to skip\n",
+    "\n",
+    "if WANDB_API_KEY:\n",
+    "    wandb.login(key=WANDB_API_KEY)\n",
+    "else:\n",
+    "    wandb.login()  # interactive prompt\n",
+    "\n",
+    "if HF_TOKEN:\n",
+    "    login(token=HF_TOKEN)\n",
+    "\n",
+    "os.environ[\"HF_HUB_MODEL_ID\"] = HF_HUB_MODEL_ID"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Verify environment (smoke test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from environment.env import ContextCorruptionEnv\n",
+    "from environment.actions import ContextCorruptionAction, ActionType\n",
+    "\n",
+    "env = ContextCorruptionEnv(difficulty=2)\n",
+    "obs = env.reset()\n",
+    "assert len(obs.documents) == 8\n",
+    "obs = env.step(ContextCorruptionAction(action_type=ActionType.submit_answer, answer=\"test\", confidence=0.5))\n",
+    "assert obs.done and obs.reward is not None\n",
+    "print(f\"✅ Smoke test passed | reward: {obs.reward:.4f}\")\n",
+    "print(f\"   Question: {env.state.question}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Preview training dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.insert(0, \".\")\n",
+    "from training.train_grpo import build_dataset, SYSTEM_PROMPT\n",
+    "\n",
+    "sample_ds = build_dataset(n_episodes=5, seed=0)\n",
+    "sample = sample_ds[0]\n",
+    "print(\"System:\", sample[\"messages\"][0][\"content\"][:200], \"...\")\n",
+    "print(\"\\nUser message (first 400 chars):\", sample[\"messages\"][1][\"content\"][:400], \"...\")\n",
+    "print(\"\\nGround truth:\", sample[\"ground_truth\"])\n",
+    "print(\"Corrupt doc IDs:\", sample[\"corrupt_ids\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. Run GRPO training\n",
+    "\n",
+    "Expected time on A100: ~45–60 min for 3 epochs over 500 episodes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from training.train_grpo import main\n",
+    "main()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 7. View training curves"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import Image, display\n",
+    "\n",
+    "display(Image(\"assets/reward_curve.png\"))\n",
+    "display(Image(\"assets/loss_curve.png\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 8. Evaluate trained model vs baseline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json, torch, re\n",
+    "from unsloth import FastLanguageModel\n",
+    "from training.train_grpo import (\n",
+    "    MODEL_NAME, MAX_SEQ_LENGTH, OUTPUT_DIR,\n",
+    "    build_dataset, SYSTEM_PROMPT, _parse_completion\n",
+    ")\n",
+    "from environment.reward import compute_reward\n",
+    "\n",
+    "model, tokenizer = FastLanguageModel.from_pretrained(\n",
+    "    model_name=f\"{OUTPUT_DIR}-final\",\n",
+    "    max_seq_length=MAX_SEQ_LENGTH,\n",
+    "    load_in_4bit=True,\n",
+    ")\n",
+    "FastLanguageModel.for_inference(model)\n",
+    "\n",
+    "eval_ds = build_dataset(n_episodes=50, seed=999)\n",
+    "rewards = []\n",
+    "\n",
+    "for row in eval_ds:\n",
+    "    prompt = tokenizer.apply_chat_template(\n",
+    "        row[\"messages\"], tokenize=False, add_generation_prompt=True\n",
+    "    )\n",
+    "    inputs = tokenizer(prompt, return_tensors=\"pt\").to(\"cuda\")\n",
+    "    with torch.no_grad():\n",
+    "        out = model.generate(**inputs, max_new_tokens=256, temperature=0.1, do_sample=True)\n",
+    "    completion = tokenizer.decode(out[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n",
+    "    parsed = _parse_completion(completion)\n",
+    "    if parsed:\n",
+    "        reward, _ = compute_reward(\n",
+    "            parsed.get(\"answer\", \"\"), row[\"ground_truth\"],\n",
+    "            [int(x) for x in parsed.get(\"suspicious_docs\", [])],\n",
+    "            row[\"corrupt_ids\"], float(parsed.get(\"confidence\", 0.5)),\n",
+    "            budget_used=1, max_budget=12\n",
+    "        )\n",
+    "    else:\n",
+    "        reward = 0.0\n",
+    "    rewards.append(reward)\n",
+    "\n",
+    "avg = sum(rewards) / len(rewards)\n",
+    "print(f\"\\n{'='*50}\")\n",
+    "print(f\"Trained model avg reward : {avg:.4f}\")\n",
+    "print(f\"Random baseline avg      : 0.1302\")\n",
+    "print(f\"Improvement              : {avg - 0.1302:+.4f}\")\n",
+    "print(f\"{'='*50}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 9. Commit plots and results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "trained_avg = avg  # from cell above\n",
+    "\n",
+    "results = {\n",
+    "    \"baseline_avg_reward\": 0.1302,\n",
+    "    \"trained_avg_reward\": round(trained_avg, 4),\n",
+    "    \"improvement\": round(trained_avg - 0.1302, 4),\n",
+    "    \"n_eval_episodes\": 50,\n",
+    "    \"model\": \"Qwen2-1.5B-Instruct + LoRA r=16 GRPO\",\n",
+    "}\n",
+    "with open(\"eval/trained_results.json\", \"w\") as f:\n",
+    "    json.dump(results, f, indent=2)\n",
+    "\n",
+    "!git config user.email \"colab@training\"\n",
+    "!git config user.name \"Colab Training Run\"\n",
+    "!git add assets/reward_curve.png assets/loss_curve.png eval/trained_results.json\n",
+    "!git commit -m \"results: add training curves and eval results\"\n",
+    "!git push origin main\n",
+    "print(\"Done — plots and results committed.\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "gpuType": "A100",
+   "name": "ContextCorruption_GRPO.ipynb",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

training/space_runner.py ADDED Viewed

	@@ -0,0 +1,139 @@

+"""
+Gradio UI for the training Space.
+Training does NOT start automatically — user must click "Start Training".
+"""
+import os
+import sys
+import threading
+import time
+from pathlib import Path
+import gradio as gr
+from dotenv import load_dotenv
+# python-dotenv: pick up variables from a .env file, if one is present.
+load_dotenv()
+# Timestamped lines shown in the "Training Logs" box; written by _append_log().
+_log_lines: list[str] = []
+# State of the single training run owned by this process.
+_training_status = "idle"  # idle | running | complete | failed
+def _append_log(msg: str):
+    ts = time.strftime("%H:%M:%S")
+    _log_lines.append(f"[{ts}] {msg}")
+def _run_training():
+    global _training_status
+    _training_status = "running"
+    _append_log("Training started.")
+    try:
+        # Redirect stdout so log lines appear in the UI
+        import io
+        import contextlib
+        sys.path.insert(0, str(Path(__file__).parent.parent))
+        from training.train_grpo import main
+        # Capture print output
+        old_stdout = sys.stdout
+        old_stderr = sys.stderr
+        class Tee:
+            def __init__(self, orig):
+                self._orig = orig
+            def write(self, msg):
+                if msg.strip():
+                    _append_log(msg.rstrip())
+                self._orig.write(msg)
+            def flush(self):
+                self._orig.flush()
+        sys.stdout = Tee(old_stdout)
+        sys.stderr = Tee(old_stderr)
+        try:
+            main()
+        finally:
+            sys.stdout = old_stdout
+            sys.stderr = old_stderr
+        _training_status = "complete"
+        _append_log("✅ Training complete. Check WandB for curves.")
+    except Exception as e:
+        _training_status = "failed"
+        _append_log(f"❌ Training failed: {e}")
+def start_training():
+    global _training_status
+    if _training_status == "running":
+        return "⚠️ Training is already running.", _get_logs()
+    if _training_status == "complete":
+        return "✅ Training already complete.", _get_logs()
+    missing = []
+    if not os.getenv("WANDB_API_KEY"):
+        missing.append("WANDB_API_KEY")
+    if not os.getenv("HF_TOKEN"):
+        missing.append("HF_TOKEN")
+    if not os.getenv("HF_HUB_MODEL_ID"):
+        missing.append("HF_HUB_MODEL_ID")
+    if missing:
+        return f"❌ Missing secrets: {', '.join(missing)}. Set them in Space Settings → Variables and secrets.", _get_logs()
+    threading.Thread(target=_run_training, daemon=True).start()
+    return "🚀 Training started! Logs updating below...", _get_logs()
+def _get_logs() -> str:
+    return "\n".join(_log_lines[-80:]) if _log_lines else "No logs yet."
+def get_status() -> str:
+    icons = {"idle": "⏸️ Idle", "running": "🔄 Training in progress...",
+             "complete": "✅ Complete", "failed": "❌ Failed"}
+    return icons.get(_training_status, _training_status)
+def refresh():
+    return get_status(), _get_logs()
+# ── Gradio UI ─────────────────────────────────────────────────────────────────
+# Page layout: intro text, three read-only text boxes, two buttons, config footer.
+with gr.Blocks(title="ContextCorruption Training") as demo:
+    gr.Markdown("""
+# ContextCorruption-Env — GRPO Training
+**Qwen2-1.5B-Instruct** fine-tuned to identify corrupted documents and resist misleading context.
+Before starting, ensure these secrets are set in **Space Settings → Variables and secrets**:
+- `WANDB_API_KEY`
+- `HF_TOKEN`
+- `HF_HUB_MODEL_ID` (e.g. `Siddh12334/qwen-1.5b-context-corruption`)
+""")
+    status_box = gr.Textbox(label="Status", value="⏸️ Idle", interactive=False)
+    log_box = gr.Textbox(label="Training Logs", lines=20, interactive=False,
+                         value="Waiting to start...")
+    msg_box = gr.Textbox(label="Message", interactive=False)
+    with gr.Row():
+        start_btn = gr.Button("🚀 Start Training", variant="primary", scale=2)
+        refresh_btn = gr.Button("🔄 Refresh Logs", scale=1)
+    gr.Markdown("""
+---
+**Config:** 500 episodes · 3 epochs · Qwen2-1.5B · LoRA r=16 · A10G ~1.5 hrs · ~$2
+""")
+    # start_training() returns (message, logs); the status box is not one of
+    # its outputs, so it only changes on the next refresh.
+    start_btn.click(fn=start_training, outputs=[msg_box, log_box])
+    refresh_btn.click(fn=refresh, outputs=[status_box, log_box])
+    # Refresh once on page load and then every 10 s, whatever the training state.
+    # NOTE(review): support for `every=` on event listeners depends on the
+    # installed Gradio version (newer releases use gr.Timer) — confirm.
+    demo.load(fn=refresh, outputs=[status_box, log_box], every=10)
+if __name__ == "__main__":
+    # Listen on all interfaces on port 7860.
+    demo.launch(server_name="0.0.0.0", server_port=7860)

training/train_grpo.py ADDED Viewed

	@@ -0,0 +1,324 @@

+"""
+GRPO fine-tuning of Qwen2-1.5B-Instruct on ContextCorruption-Env.
+Architecture:
+  - Single-turn formulation: model sees question + all 8 docs, responds with
+    JSON {"answer": "...", "suspicious_docs": [0, 3], "confidence": 0.85}
+  - Two reward signals: correctness (from compute_reward) + format (valid JSON)
+  - WandB logs metrics + sample completions every LOGGING_STEPS
+  - Pushes final model to HF Hub after training
+Usage (on GPU machine / HF Space):
+  pip install -r requirements.txt
+  WANDB_API_KEY=... HF_TOKEN=... python -m training.train_grpo
+"""
+import json
+import os
+import random
+import re
+import sys
+from pathlib import Path
+import wandb
+# ── Config ────────────────────────────────────────────────────────────────────
+# Base checkpoint and loading options handed to FastLanguageModel.from_pretrained().
+MODEL_NAME = "unsloth/Qwen2-1.5B-Instruct"
+MAX_SEQ_LENGTH = 2048
+LOAD_IN_4BIT = True
+# LoRA adapter settings handed to FastLanguageModel.get_peft_model().
+LORA_R = 16
+LORA_TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"]
+# Trainer hyper-parameters, forwarded to GRPOConfig in main().
+NUM_TRAIN_EPOCHS = 3
+BATCH_SIZE = 4
+GRAD_ACCUM = 4
+LR = 5e-5
+MAX_COMPLETION_LENGTH = 512
+NUM_GENERATIONS = 8
+LOGGING_STEPS = 10
+SAVE_STEPS = 50
+SEED = 42
+# Dataset sizes; the eval set is built with seed SEED + 1.
+N_TRAIN_EPISODES = 500
+N_EVAL_EPISODES = 50
+WANDB_PROJECT = "context-corruption-env"
+WANDB_RUN_NAME = "qwen-1.5b-grpo-run1"
+# Trainer checkpoints go here; the final model is saved to f"{OUTPUT_DIR}-final".
+OUTPUT_DIR = "checkpoints/grpo-qwen-1.5b"
+HF_HUB_MODEL_ID = os.getenv("HF_HUB_MODEL_ID", "")  # set to push model after training
+# System message placed at the start of every episode by build_dataset(); it
+# defines the single-line JSON reply that _parse_completion() extracts.
+SYSTEM_PROMPT = """\
+You are an epistemic agent. You will receive a factual question and a set of reference documents.
+Some documents may contain deliberately corrupted information (wrong numbers, wrong names, inverted facts).
+Your task:
+1. Answer the question correctly using your knowledge and the documents.
+2. Identify which document IDs contain false or corrupted information.
+3. Report your confidence (0.0–1.0).
+Respond ONLY with valid JSON on a single line:
+{"answer": "<your answer>", "suspicious_docs": [<doc ids>], "confidence": <0.0-1.0>}
+Examples:
+{"answer": "Paris", "suspicious_docs": [2, 5], "confidence": 0.95}
+{"answer": "1969", "suspicious_docs": [], "confidence": 0.8}"""
+# ── Dataset builder ────────────────────────────────────────────────────────────
+def _format_user_message(question: str, docs: list[dict]) -> str:
+    docs_text = "\n\n".join(
+        f"[Doc {d['id']}] {d['title']}\n{d['content']}" for d in docs
+    )
+    return f"Question: {question}\n\nDocuments:\n{docs_text}"
+def build_dataset(n_episodes: int, seed: int = SEED) -> "datasets.Dataset":
+    from datasets import Dataset
+    from data.generator import generate_documents
+    random.seed(seed)
+    facts_path = Path(__file__).parent.parent / "data" / "facts.json"
+    if not facts_path.exists():
+        raise FileNotFoundError(
+            "data/facts.json not found. Run: python -m data.loader"
+        )
+    facts = json.loads(facts_path.read_text(encoding="utf-8"))
+    rows = []
+    for _ in range(n_episodes):
+        fact = random.choice(facts)
+        n_corrupt = random.choice([1, 2, 3, 4])
+        corrupt_ids = random.sample(range(8), n_corrupt)
+        try:
+            docs = generate_documents(fact, num_docs=8, corrupt_positions=corrupt_ids)
+        except Exception:
+            docs = [
+                {"id": i, "title": f"Doc {i}", "content": fact["answer"],
+                 "is_corrupt": i in corrupt_ids}
+                for i in range(8)
+            ]
+        rows.append({
+            "messages": [
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": _format_user_message(fact["question"], docs)},
+            ],
+            "ground_truth": fact["answer"],
+            "corrupt_ids": corrupt_ids,
+        })
+    return Dataset.from_list(rows)
+# ── Reward functions ───────────────────────────────────────────────────────────
+def _parse_completion(text: str) -> dict | None:
+    """Extract first JSON object from completion text."""
+    # Strip any <think>...</think> blocks (chain-of-thought models)
+    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
+    # Try direct parse first
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        pass
+    # Find first {...} block
+    match = re.search(r"\{[^{}]*\}", text, re.DOTALL)
+    if match:
+        try:
+            return json.loads(match.group())
+        except json.JSONDecodeError:
+            pass
+    return None
+def format_reward(prompts, completions, **kwargs) -> list[float]:
+    """Small bonus for structurally valid responses — teaches the output format."""
+    rewards = []
+    for completion in completions:
+        parsed = _parse_completion(completion)
+        if parsed is None:
+            rewards.append(-0.1)
+            continue
+        has_answer = isinstance(parsed.get("answer"), str) and parsed["answer"].strip()
+        has_docs = isinstance(parsed.get("suspicious_docs"), list)
+        has_conf = isinstance(parsed.get("confidence"), (int, float))
+        rewards.append(0.1 if (has_answer and has_docs and has_conf) else 0.0)
+    return rewards
+def correctness_reward(prompts, completions, ground_truth, corrupt_ids, **kwargs) -> list[float]:
+    """Main reward: calls compute_reward() from environment/reward.py."""
+    from environment.reward import compute_reward
+    rewards = []
+    for completion, gt, cids in zip(completions, ground_truth, corrupt_ids):
+        parsed = _parse_completion(completion)
+        if parsed is None:
+            rewards.append(0.0)
+            continue
+        answer = str(parsed.get("answer", "")).strip()
+        flagged = [int(x) for x in parsed.get("suspicious_docs", [])
+                   if isinstance(x, (int, float))]
+        confidence = float(parsed.get("confidence", 0.5))
+        confidence = max(0.0, min(1.0, confidence))
+        cids_list = list(cids) if not isinstance(cids, list) else cids
+        reward, _ = compute_reward(
+            submitted_answer=answer,
+            ground_truth_answer=gt,
+            flagged_ids=flagged,
+            corrupt_ids=cids_list,
+            confidence=confidence,
+            budget_used=1,
+            max_budget=12,
+        )
+        rewards.append(float(reward))
+    return rewards
+# ── Plot saving ────────────────────────────────────────────────────────────────
+def save_training_plots(run_id: str):
+    """Download reward + loss curves from WandB and save to assets/."""
+    try:
+        import matplotlib
+        matplotlib.use("Agg")
+        import matplotlib.pyplot as plt
+        api = wandb.Api()
+        run = api.run(f"{WANDB_PROJECT}/{run_id}")
+        history = run.history(keys=["train/reward", "train/loss"], pandas=True)
+        assets = Path(__file__).parent.parent / "assets"
+        assets.mkdir(exist_ok=True)
+        if "train/reward" in history.columns:
+            fig, ax = plt.subplots(figsize=(8, 4))
+            ax.plot(history["_step"], history["train/reward"])
+            ax.set_xlabel("Training step")
+            ax.set_ylabel("Mean episode reward")
+            ax.set_title("GRPO Training Reward — Qwen2-1.5B")
+            ax.grid(True, alpha=0.3)
+            fig.tight_layout()
+            fig.savefig(assets / "reward_curve.png", dpi=150)
+            plt.close(fig)
+            print(f"Saved reward_curve.png")
+        if "train/loss" in history.columns:
+            fig, ax = plt.subplots(figsize=(8, 4))
+            ax.plot(history["_step"], history["train/loss"])
+            ax.set_xlabel("Training step")
+            ax.set_ylabel("GRPO loss")
+            ax.set_title("GRPO Training Loss — Qwen2-1.5B")
+            ax.grid(True, alpha=0.3)
+            fig.tight_layout()
+            fig.savefig(assets / "loss_curve.png", dpi=150)
+            plt.close(fig)
+            print(f"Saved loss_curve.png")
+    except Exception as e:
+        print(f"[warn] Could not save plots: {e}")
+# ── Main ───────────────────────────────────────────────────────────────────────
+def main():
+    """Run the GRPO fine-tuning pipeline end to end.
+    Steps, in order: check for a CUDA device, start a WandB run, build the
+    train and eval datasets, load the base model with Unsloth and attach LoRA
+    adapters, train with TRL's GRPOTrainer, save model and tokenizer to
+    ``{OUTPUT_DIR}-final``, optionally push both to the Hugging Face Hub,
+    write the training plots, and finish the WandB run.
+    Exits the process with status 1 when torch is importable but reports no
+    CUDA device.
+    """
+    # Guard: must have GPU
+    try:
+        import torch
+        if not torch.cuda.is_available():
+            print("[error] No GPU detected. Training requires CUDA. Exiting.")
+            sys.exit(1)
+    except ImportError:
+        # torch is missing, so the GPU check is skipped.
+        # NOTE(review): the unsloth import below presumably fails anyway — confirm.
+        pass
+    # Imported here rather than at module top, so importing this module
+    # (e.g. for build_dataset) does not require these packages.
+    from unsloth import FastLanguageModel
+    from trl import GRPOTrainer, GRPOConfig
+    run = wandb.init(
+        project=WANDB_PROJECT,
+        name=WANDB_RUN_NAME,
+        config={
+            "model": MODEL_NAME,
+            "lora_r": LORA_R,
+            "epochs": NUM_TRAIN_EPOCHS,
+            "batch_size": BATCH_SIZE,
+            "grad_accum": GRAD_ACCUM,
+            "lr": LR,
+            "num_generations": NUM_GENERATIONS,
+            "n_train_episodes": N_TRAIN_EPISODES,
+            "seed": SEED,
+        },
+    )
+    print("Building training dataset...")
+    train_dataset = build_dataset(N_TRAIN_EPISODES, seed=SEED)
+    # A different seed, so the eval episodes are drawn independently.
+    eval_dataset = build_dataset(N_EVAL_EPISODES, seed=SEED + 1)
+    print(f"Train: {len(train_dataset)} episodes | Eval: {len(eval_dataset)} episodes")
+    print("Loading model with Unsloth...")
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=MODEL_NAME,
+        max_seq_length=MAX_SEQ_LENGTH,
+        load_in_4bit=LOAD_IN_4BIT,
+    )
+    model = FastLanguageModel.get_peft_model(
+        model,
+        r=LORA_R,
+        target_modules=LORA_TARGET_MODULES,
+        lora_dropout=0.0,
+        use_gradient_checkpointing="unsloth",
+    )
+    # Push only when both a target repo id and an HF token are configured.
+    push_to_hub = bool(HF_HUB_MODEL_ID and os.getenv("HF_TOKEN"))
+    config = GRPOConfig(
+        output_dir=OUTPUT_DIR,
+        num_train_epochs=NUM_TRAIN_EPOCHS,
+        per_device_train_batch_size=BATCH_SIZE,
+        gradient_accumulation_steps=GRAD_ACCUM,
+        learning_rate=LR,
+        max_completion_length=MAX_COMPLETION_LENGTH,
+        num_generations=NUM_GENERATIONS,
+        report_to="wandb",
+        logging_steps=LOGGING_STEPS,
+        save_steps=SAVE_STEPS,
+        save_total_limit=2,
+        seed=SEED,
+        # Deployment logs: log completions to WandB every logging step
+        log_completions=True,
+        num_completions_to_print=2,
+        # Push to HF Hub if token provided
+        push_to_hub=push_to_hub,
+        hub_model_id=HF_HUB_MODEL_ID if push_to_hub else None,
+        hub_strategy="end",
+        # NOTE(review): bf16 is enabled unconditionally — confirm the target
+        # GPU supports it.
+        bf16=True,
+        # Keep the ground_truth / corrupt_ids columns; correctness_reward
+        # takes them as arguments.
+        remove_unused_columns=False,
+    )
+    trainer = GRPOTrainer(
+        model=model,
+        args=config,
+        processing_class=tokenizer,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        reward_funcs=[correctness_reward, format_reward],
+    )
+    print("Starting GRPO training...")
+    trainer.train()
+    print("Saving final model...")
+    model.save_pretrained(f"{OUTPUT_DIR}-final")
+    tokenizer.save_pretrained(f"{OUTPUT_DIR}-final")
+    if push_to_hub:
+        # NOTE(review): GRPOConfig above already sets push_to_hub with
+        # hub_strategy="end"; this explicit push may duplicate the upload — confirm.
+        model.push_to_hub(HF_HUB_MODEL_ID)
+        tokenizer.push_to_hub(HF_HUB_MODEL_ID)
+        print(f"Model pushed to HF Hub: {HF_HUB_MODEL_ID}")
+    print("Saving training plots...")
+    save_training_plots(run.id)
+    # NOTE(review): not inside try/finally, so an exception anywhere above
+    # skips this call.
+    wandb.finish()
+    print("Training complete.")
+if __name__ == "__main__":
+    main()