percyraskova committed on
Commit
81b3473
·
verified ·
1 Parent(s): f2e39e6

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .claude/settings.local.json +19 -0
  2. .gitignore +63 -0
  3. .pre-commit-config.yaml +42 -0
  4. .ruff_cache/.gitignore +2 -0
  5. .ruff_cache/0.14.10/11987423046224602800 +0 -0
  6. .ruff_cache/0.14.10/9230827824541771364 +0 -0
  7. .ruff_cache/0.14.10/9755640140589486738 +0 -0
  8. .ruff_cache/CACHEDIR.TAG +1 -0
  9. .yamllint.yaml +14 -0
  10. AGENTS.md +39 -0
  11. ATTRIBUTION.md +30 -0
  12. CLAUDE.md +120 -0
  13. README.md +77 -0
  14. RunPod LLM Fine-Tuning Strategy.md +339 -0
  15. ai-docs/chatbot-ideology.yaml +373 -0
  16. ai-docs/finetune.yaml +297 -0
  17. ai-docs/reward-modeling.yaml +857 -0
  18. ai-docs/runpod.yaml +362 -0
  19. ai-docs/training-schema.yaml +484 -0
  20. docker/.env.example +124 -0
  21. docker/Dockerfile +132 -0
  22. docker/requirements.txt +31 -0
  23. docker/start.sh +165 -0
  24. notebooks/Marxist_GRPO_RunPod_Optimized.ipynb +1107 -0
  25. pyproject.toml +166 -0
  26. src/prolewiki_llm/__init__.py +99 -0
  27. src/prolewiki_llm/convert_to_qwen.py +37 -0
  28. src/prolewiki_llm/export_grpo_dataset.py +224 -0
  29. src/prolewiki_llm/grpo_rewards.py +1847 -0
  30. src/prolewiki_llm/train_grpo_marxist.py +341 -0
  31. src/prolewiki_llm/train_headless.py +460 -0
  32. src/prolewiki_llm/train_marxist.py +201 -0
  33. src/prolewiki_llm/transform_to_grpo.py +64 -0
  34. src/prolewiki_llm/wandb_logging.py +529 -0
  35. tests/__init__.py +1 -0
  36. tests/conftest.py +114 -0
  37. tests/fixtures/__init__.py +1 -0
  38. tests/fixtures/mock_bin/huggingface-cli +7 -0
  39. tests/fixtures/mock_bin/python +35 -0
  40. tests/fixtures/mock_bin/runpodctl +12 -0
  41. tests/fixtures/mock_bin/wandb +7 -0
  42. tests/fixtures/mock_bin/wget +17 -0
  43. tests/integration/__init__.py +1 -0
  44. tests/integration/test_start_sh.py +462 -0
  45. tests/unit/__init__.py +1 -0
  46. tests/unit/test_grpo_rewards.py +0 -0
  47. tests/unit/test_train_headless.py +248 -0
  48. tests/unit/test_wandb_logging.py +467 -0
  49. training_data/entity_whitelist.json +0 -0
  50. training_data/entity_whitelist_clean.json +0 -0
.claude/settings.local.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(wc:*)",
5
+ "Bash(du:*)",
6
+ "mcp__context7__resolve-library-id",
7
+ "mcp__context7__query-docs",
8
+ "Bash(uv sync:*)",
9
+ "Bash(uv run python:*)",
10
+ "Bash(uv run ruff check:*)",
11
+ "Bash(uv run pytest:*)",
12
+ "WebFetch(domain:www.runpod.io)"
13
+ ]
14
+ },
15
+ "disabledMcpjsonServers": [
16
+ "sequential-thinking",
17
+ "neovim"
18
+ ]
19
+ }
.gitignore ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ .venv/
25
+ venv/
26
+ ENV/
27
+
28
+ # IDEs
29
+ .idea/
30
+ .vscode/
31
+ *.swp
32
+ *.swo
33
+
34
+ # Jupyter
35
+ .ipynb_checkpoints/
36
+
37
+ # Testing
38
+ .pytest_cache/
39
+ .coverage
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+
44
+ # mypy
45
+ .mypy_cache/
46
+
47
+ # Archives
48
+ *.tar.gz
49
+ *.zip
50
+
51
+ # Model artifacts (large files)
52
+ *.safetensors
53
+ *.bin
54
+ *.gguf
55
+
56
+ # Training outputs (generated)
57
+ outputs/
58
+ checkpoints/
59
+ lora-output/
60
+
61
+ # OS
62
+ .DS_Store
63
+ Thumbs.db
.pre-commit-config.yaml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.8.0
4
+ hooks:
5
+ - id: ruff
6
+ name: ruff (lint)
7
+ args: [--fix, --exit-non-zero-on-fix]
8
+ types_or: [python, pyi]
9
+ - id: ruff-format
10
+ name: ruff (format)
11
+ types_or: [python, pyi]
12
+
13
+ - repo: local
14
+ hooks:
15
+ - id: mypy
16
+ name: mypy (typecheck)
17
+ entry: uv run mypy
18
+ language: system
19
+ types: [python]
20
+ args: [src/prolewiki_llm/]
21
+ pass_filenames: false
22
+
23
+ - repo: https://github.com/pre-commit/pre-commit-hooks
24
+ rev: v5.0.0
25
+ hooks:
26
+ - id: trailing-whitespace
27
+ - id: end-of-file-fixer
28
+ - id: check-yaml
29
+ args: [--unsafe]
30
+ - id: check-json
31
+ - id: check-toml
32
+ - id: check-added-large-files
33
+ args: [--maxkb=5000]
34
+ - id: check-merge-conflict
35
+ - id: debug-statements
36
+
37
+ - repo: https://github.com/adrienverge/yamllint.git
38
+ rev: v1.35.1
39
+ hooks:
40
+ - id: yamllint
41
+ name: yamllint (yaml lint)
42
+ args: [-c=.yamllint.yaml]
.ruff_cache/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Automatically created by ruff.
2
+ *
.ruff_cache/0.14.10/11987423046224602800 ADDED
Binary file (310 Bytes). View file
 
.ruff_cache/0.14.10/9230827824541771364 ADDED
Binary file (228 Bytes). View file
 
.ruff_cache/0.14.10/9755640140589486738 ADDED
Binary file (170 Bytes). View file
 
.ruff_cache/CACHEDIR.TAG ADDED
@@ -0,0 +1 @@
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
.yamllint.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ extends: default
2
+
3
+ rules:
4
+ line-length:
5
+ max: 200
6
+ level: warning
7
+ document-start: disable
8
+ truthy:
9
+ allowed-values: ['true', 'false', 'yes', 'no']
10
+ comments:
11
+ min-spaces-from-content: 1
12
+ indentation:
13
+ spaces: 2
14
+ indent-sequences: consistent
AGENTS.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Repository Guidelines
2
+
3
+ ## Project Structure & Module Organization
4
+
5
+ - `src/prolewiki_llm/`: core library code (reward functions, dataset transforms, W&B logging).
6
+ - `tests/unit/`: fast unit tests and fixtures under `tests/conftest.py`.
7
+ - `training_data/`: curated datasets (`.jsonl`) and documentation (`MODEL_CARD.yaml`).
8
+ - `notebooks/`: training and experimentation notebooks.
9
+ - `ai-docs/`: AI-consumable documentation and schemas.
10
+
11
+ ## Build, Test, and Development Commands
12
+
13
+ - `uv sync`: install core dependencies.
14
+ - `uv sync --group dev`: install developer tools (pytest, ruff, mypy).
15
+ - `uv sync --group training`: install GPU training dependencies.
16
+ - `python -m spacy download en_core_web_sm`: required model for coherence rewards.
17
+ - `uv run pytest`: run the test suite.
18
+ - `uv run pytest -m "not slow and not gpu"`: run fast tests only.
19
+ - `uv run ruff check .` / `uv run ruff format .`: lint and format.
20
+ - `uv run mypy src`: run strict type checks on library code.
21
+
22
+ ## Coding Style & Naming Conventions
23
+
24
+ - Python 3.12; line length 100; double quotes; spaces for indentation (ruff format).
25
+ - Use `snake_case` for modules/functions and `PascalCase` for classes.
26
+ - Keep reward logic in `src/prolewiki_llm/grpo_rewards.py` and avoid side effects.
27
+ - Name data files descriptively, for example `training_data/synthetic_topic.jsonl`.
28
+
29
+ ## Testing Guidelines
30
+
31
+ - Pytest with markers: `unit`, `slow`, and `gpu` (see `pyproject.toml`).
32
+ - Place tests in `tests/unit/` and name files `test_*.py`.
33
+ - Mark long-running or GPU-dependent tests explicitly.
34
+
35
+ ## Commit & Pull Request Guidelines
36
+
37
+ - Git history only shows "initial commit"; no established convention yet.
38
+ - Use short, imperative commit subjects (for example, "Add format reward tests").
39
+ - PRs should describe changes, list tests run, and call out dataset or model updates.
ATTRIBUTION.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Attribution
2
+
3
+ This project was extracted from [pw-mcp](https://github.com/prolewiki/pw-mcp) on December 25, 2025.
4
+
5
+ ## Origin
6
+
7
+ The code in this repository was originally developed as part of the pw-mcp project (ProleWiki MCP Server), which provides semantic vector search over the ProleWiki corpus.
8
+
9
+ The AI training components were separated into this standalone repository to:
10
+ - Reduce the dependency footprint of pw-mcp
11
+ - Allow independent versioning and release cycles
12
+ - Separate the ~1GB ChromaDB releases from the ~4-16GB model releases
13
+ - Enable focused development on training infrastructure
14
+
15
+ ## Original Authors
16
+
17
+ The original code was developed by contributors to the pw-mcp project.
18
+
19
+ ## Files Extracted
20
+
21
+ The following files were copied from pw-mcp:
22
+
23
+ - `src/pw_mcp/ai_training/*.py` → `src/prolewiki_llm/`
24
+ - `training_data/` (entire directory)
25
+ - `tests/unit/training/` → `tests/unit/`
26
+ - `ai-docs/` (training-related YAML files)
27
+
28
+ ## License
29
+
30
+ This project maintains the same AGPL-3.0-only license as the original pw-mcp project.
CLAUDE.md ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ GRPO fine-tuning infrastructure for training Marxist-Leninist language models. The core functionality is a multi-layer reward system designed to prevent reward hacking during GRPO (Group Relative Policy Optimization) training.
8
+
9
+ ## Commands
10
+
11
+ ```bash
12
+ # Install dependencies
13
+ uv sync # Core dependencies
14
+ uv sync --group dev # Dev tools (pytest, ruff, mypy)
15
+ uv sync --group training # GPU training deps (unsloth, trl, vllm)
16
+
17
+ # Required NLP model
18
+ python -m spacy download en_core_web_sm
19
+
20
+ # Testing
21
+ uv run pytest # All tests
22
+ uv run pytest -m "not slow and not gpu" # Fast tests only
23
+ uv run pytest tests/unit/test_grpo_rewards.py::test_name # Single test
24
+
25
+ # Code quality
26
+ uv run ruff check . # Lint
27
+ uv run ruff format . # Format
28
+ uv run mypy src # Type check (strict mode)
29
+ ```
30
+
31
+ ## Architecture
32
+
33
+ ### Reward Function System (`src/prolewiki_llm/grpo_rewards.py`)
34
+
35
+ The reward system is layered to defeat "word soup" and other reward hacking strategies:
36
+
37
+ 1. **Format rewards** - Validate `<think>...</think>` reasoning tags
38
+ 2. **NLI coherence** - Use BART-large-MNLI to check if response entails ground truth
39
+ 3. **Self-consistency** - Detect internal contradictions via pairwise NLI
40
+ 4. **Structural coherence** - spaCy dependency parsing verifies terms in meaningful syntactic roles
41
+ 5. **Topic relevance** - Ensure response addresses what was asked
42
+ 6. **Interconnection depth** - Reward deep analysis, penalize buzzword salad
43
+
44
+ Key functions:
45
+ - `full_coherence_reward()` - Comprehensive 5-layer check (recommended for training)
46
+ - `robust_coherence_reward()` - NLI + self-consistency + structure
47
+ - `ideological_firmness_reward()` - Detects capitulation to reactionary framing
48
+
49
+ Models are lazy-loaded via `get_embedder()`, `get_nli_pipeline()`, `get_spacy_nlp()`.
50
+
51
+ ### Reward Function Interface
52
+
53
+ All reward functions follow the GRPOTrainer signature:
54
+ ```python
55
+ def reward_func(
56
+ prompts: Sequence[Sequence[dict[str, str]]],
57
+ completions: Sequence[Sequence[dict[str, str]]],
58
+ answer: Sequence[str],
59
+ **kwargs: object,
60
+ ) -> list[float]:
61
+ ```
62
+
63
+ ### W&B Integration (`src/prolewiki_llm/wandb_logging.py`)
64
+
65
+ `WandbSampleLogger` accumulates samples and logs them as tables for debugging reward behavior. Use `create_logging_reward()` to wrap logging into a reward function.
66
+
67
+ ## Testing Patterns
68
+
69
+ - Markers: `unit`, `slow`, `gpu` (see `pyproject.toml`)
70
+ - Fixtures in `tests/conftest.py`: `sample_question`, `sample_good_answer`, `sample_bad_answer`
71
+ - Mock heavy NLP models in unit tests to avoid slow initialization
72
+
73
+ ## Code Style
74
+
75
+ - Python 3.12, line length 100, double quotes
76
+ - `snake_case` functions/modules, `PascalCase` classes
77
+ - Reward functions must accept `**kwargs` (GRPOTrainer interface requirement)
78
+ - Keep reward logic pure (no side effects except lazy model loading)
79
+
80
+ ## Training
81
+
82
+ ### Headless RunPod Training (Recommended)
83
+
84
+ Docker-based headless training for RunPod deployment:
85
+
86
+ ```bash
87
+ # 1. Export dataset from ../dataset/ to GRPO format
88
+ uv run python -m prolewiki_llm.export_grpo_dataset
89
+
90
+ # 2. Build Docker image
91
+ docker build -t marxist-grpo:latest -f docker/Dockerfile .
92
+
93
+ # 3. Deploy to RunPod
94
+ runpodctl create pod \
95
+ --name "marxist-grpo-training" \
96
+ --gpuType "NVIDIA A100 80GB PCIe" \
97
+ --imageName myregistry/marxist-grpo:latest \
98
+ --env HF_TOKEN=$HF_TOKEN \
99
+ --env WANDB_API_KEY=$WANDB_API_KEY \
100
+ --env HF_REPO=my-org/my-model
101
+ ```
102
+
103
+ Key files:
104
+ - `src/prolewiki_llm/export_grpo_dataset.py` - Dataset export utility
105
+ - `src/prolewiki_llm/train_headless.py` - Headless training script
106
+ - `docker/Dockerfile` - Container definition
107
+ - `docker/start.sh` - Entrypoint (auth → train → upload → stop pod)
108
+ - `docker/.env.example` - Environment variable reference
109
+
110
+ Environment variables for `train_headless.py`:
111
+ - `HF_TOKEN`, `WANDB_API_KEY` (required)
112
+ - `MAX_STEPS`, `BATCH_SIZE`, `LEARNING_RATE`, `REWARD_MODE` (optional)
113
+ - `RUNPOD_POD_ID` (enables auto-termination after training)
114
+
115
+ ### Notebook Training (Interactive)
116
+
117
+ See `notebooks/Marxist_GRPO_Training.ipynb` for interactive training. Key settings:
118
+ - Disable `torch.compile` via environment variables (causes hangs on RunPod/Jupyter)
119
+ - Use `use_gradient_checkpointing=True` (not `"unsloth"`)
120
+ - `load_in_4bit=False` required for GRPO (16-bit LoRA adapters)
README.md ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # prolewiki-llm
2
+
3
+ GRPO fine-tuning and reward functions for training Marxist-Leninist language models.
4
+
5
+ ## Overview
6
+
7
+ This repository contains the AI training infrastructure for fine-tuning language models on Marxist-Leninist theory. It includes:
8
+
9
+ - **Reward Functions**: Multi-layer reward system for GRPO training that prevents reward hacking
10
+ - **Training Data**: Curated Q&A pairs and synthetic datasets for ideological consistency
11
+ - **Training Scripts**: Ready-to-run notebooks for RunPod/cloud GPU training
12
+ - **W&B Integration**: Weights & Biases logging for training observability
13
+
14
+ ## Related Projects
15
+
16
+ - [pw-mcp](https://github.com/prolewiki/pw-mcp) - MCP server and ChromaDB pipeline for ProleWiki semantic search
17
+
18
+ ## Installation
19
+
20
+ ```bash
21
+ # Basic installation
22
+ uv sync
23
+
24
+ # Download spacy model (required for topic/coherence rewards)
25
+ python -m spacy download en_core_web_sm
26
+
27
+ # With training dependencies (for GPU training)
28
+ uv sync --group training
29
+
30
+ # Development
31
+ uv sync --group dev
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ ### Reward Functions
37
+
38
+ ```python
39
+ from prolewiki_llm import full_coherence_reward, format_reward
40
+
41
+ # Combined 5-layer coherence check (recommended for training)
42
+ reward = full_coherence_reward(
43
+ prompts=["What is imperialism?"],
44
+ completions=["<think>...</think>\n\nImperialism is..."],
45
+ answer="Lenin defined imperialism as..."
46
+ )
47
+
48
+ # Individual reward components
49
+ format_score = format_reward(completions=["<think>...</think>\n\nAnswer..."])
50
+ ```
51
+
52
+ ### Training
53
+
54
+ See `notebooks/Marxist_GRPO_Training.ipynb` for a complete training example.
55
+
56
+ ## Project Structure
57
+
58
+ ```
59
+ prolewiki-llm/
60
+ ├── src/prolewiki_llm/
61
+ │ ├── grpo_rewards.py # 17+ reward functions
62
+ │ ├── wandb_logging.py # W&B integration
63
+ │ └── transform_to_grpo.py # Dataset conversion
64
+ ├── training_data/
65
+ │ ├── synthetic_*.jsonl # Training datasets
66
+ │ ├── entity_whitelist.json # Anti-hallucination data
67
+ │ └── MODEL_CARD.yaml # Dataset documentation
68
+ ├── notebooks/
69
+ │ └── Marxist_GRPO_Training.ipynb
70
+ ├── tests/
71
+ │ └── unit/
72
+ └── ai-docs/ # AI-consumable documentation
73
+ ```
74
+
75
+ ## License
76
+
77
+ AGPL-3.0-only
RunPod LLM Fine-Tuning Strategy.md ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # **Operationalizing Headless Large Language Model Fine-Tuning on RunPod: A Comprehensive Infrastructure and Workflow Analysis**
2
+
3
+ ## **1\. Introduction: The Paradigm Shift to Headless AI Operations**
4
+
5
+ The contemporary landscape of Large Language Model (LLM) development is currently undergoing a fundamental transition, shifting from interactive, exploratory environments toward rigorous, automated production pipelines. For years, the Jupyter notebook has served as the *lingua franca* of data science—a canvas for experimentation, visualization, and iterative code development. However, as the field matures from research to engineering, the limitations of the notebook paradigm become increasingly acute, particularly when applied to the resource-intensive and time-critical task of fine-tuning custom LLMs. The user requirement for a workflow that eliminates the notebook interface in favor of a "code-upload-and-train" paradigm reflects a sophisticated understanding of MLOps principles: reproducibility, resource efficiency, and maximizing computational throughput.
6
+ RunPod, as a specialized GPU cloud provider, occupies a unique and critical niche within this evolving ecosystem. Distinct from hyperscalers such as AWS, Azure, or Google Cloud Platform—which often necessitate complex Identity and Access Management (IAM) configurations, quota negotiations, and long-term commitments—RunPod offers a container-centric infrastructure that is ideally optimized for headless, ephemeral training jobs. The platform’s architecture effectively democratizes access to high-performance compute, offering everything from consumer-grade RTX 4090s to enterprise-class NVIDIA H100 clusters. This report provides an exhaustive, expert-level analysis of the architectural, operational, and software strategies necessary to fine-tune custom LLMs on RunPod using a strictly headless approach.
7
+ To fully satisfy the requirement of "training as fast as possible" with "custom training data," this analysis moves beyond simple tutorials to construct a robust engineering framework. It dissects the interplay between hardware selection (Secure vs. Community Cloud), containerization strategies (Docker-based execution), and high-efficiency fine-tuning frameworks (Unsloth and Axolotl). By decoupling the training process from an interactive Integrated Development Environment (IDE), developers can leverage spot instances more effectively, dramatically reduce idle compute costs, and integrate training runs into broader Continuous Integration/Continuous Deployment (CI/CD) pipelines. This report serves as a definitive guide to architecting these headless systems.
8
+
9
+ ## ---
10
+
11
+ **2\. Infrastructure Architecture and Instance Selection Strategy**
12
+
13
+ The foundation of any high-performance fine-tuning workflow is the underlying compute architecture. In the context of "training as fast as possible," the choice of hardware dictates not only the wall-clock time of the training run but also the stability, cost-efficiency, and maximum capable model size of the session. RunPod’s inventory is segmented into distinct tiers, each offering specific advantages and liabilities for headless operations. A nuanced understanding of these hardware profiles is essential for optimizing the price-performance ratio.
14
+
15
+ ### **2.1 The GPU Hierarchy: Performance Profiles and Architectural Suitability**
16
+
17
+ The selection of a specific GPU architecture must be directly correlated with the parameter count of the target model (e.g., Llama 3 8B, Mistral, or Llama 3 70B) and the chosen quantization method (Full Fine-Tuning vs. LoRA/QLoRA).
18
+
19
+ #### **The Enterprise Tier: NVIDIA H100 and A100**
20
+
21
+ For users prioritizing raw speed and throughput above all else, the NVIDIA H100 and A100 Tensor Core GPUs represent the gold standard of current AI acceleration. These cards are designed for datacenter reliability and massive parallel throughput.
22
+ The **NVIDIA H100 (80GB)** stands as the pinnacle of current commercial AI hardware. It is specifically engineered to accelerate Transformer-based models via its fourth-generation Tensor Cores and the dedicated Transformer Engine, which automatically manages mixed-precision calculations using FP8 formats.1 For headless workflows, the H100 offers a distinct advantage: its sheer speed minimizes the "window of vulnerability." In a headless setup, particularly one utilizing spot instances or decentralized nodes, the longer a job runs, the higher the statistical probability of a network disconnect or node preemption. By reducing training time by factors of 3x or more compared to previous generations, the H100 significantly increases the reliability of job completion.2 It is the only viable option for users attempting to full fine-tune models in the 70B+ parameter range within reasonable timeframes. However, this performance comes at a premium, with costs ranging from approximately $2.69 to $4.00 per hour depending on the specific configuration (SXM vs. PCIe) and market demand.1
23
+ The **NVIDIA A100 (80GB)** remains the industry workhorse for LLM training. While it lacks the H100's specific FP8 Transformer Engine, its 80GB of High Bandwidth Memory (HBM2e) provides sufficient capacity to fine-tune 70B models using QLoRA or 8B models with full precision and extended context windows.1 The availability of A100s on RunPod is generally higher than that of H100s, making them a more reliable fallback for automated pipelines that require immediate provisioning without queuing. For users engaging in "headless" operations where the script automatically requests resources, the A100's ubiquity often makes it the path of least resistance.4
24
+
25
+ #### **The Prosumer Tier: NVIDIA RTX 4090 and RTX 6000 Ada**
26
+
27
+ For users targeting smaller models, such as the 7B or 8B parameter classes (e.g., Llama 3 8B, Mistral, Gemma), the enterprise tier may represent overkill. The **NVIDIA RTX 4090** has emerged as an exceptionally cost-effective alternative for these specific workloads.
28
+ With 24GB of VRAM, the RTX 4090 can comfortably handle 8B models using 4-bit quantization (QLoRA) or, when paired with memory-efficient frameworks like Unsloth, even larger batch sizes.5 The cost efficiency is dramatic: at approximately $0.34 to $0.69 per hour, a developer can run extensive hyperparameter sweeps (grid searches) for the cost of a single hour on an H100.1 However, the use of consumer hardware in a headless workflow introduces specific constraints. These cards are typically hosted in the "Community Cloud" tier, meaning they are decentralized nodes often residing in non-tier-1 datacenters or even private residences. This introduces a higher risk of interruption, necessitating that the headless script implements robust, frequent checkpointing to resume training automatically if a node goes offline.
29
+ The **RTX 6000 Ada Generation** bridges the gap, offering 48GB of VRAM—double that of the 4090—while retaining the Ada Lovelace architecture's efficiency. Priced around $0.79/hr, it allows for training mid-sized models (e.g., 30B parameters with QLoRA) or 8B models with much longer context windows than the 4090 allows.1
30
+
31
+ ### **2.2 Deployment Tiers: Secure Cloud vs. Community Cloud**
32
+
33
+ RunPod segments its GPU inventory into two distinct tiers: **Community Cloud** and **Secure Cloud**. This distinction is critical for designing a headless operation, as it fundamentally dictates the reliability engineering required in the training code.
34
+ **Secure Cloud** represents enterprise-grade datacenters with high reliability, redundancy, and security certifications (SOC2, etc.). For a user whose primary requirement is to "upload and train," Secure Cloud offers the assurance that the pod will not vanish mid-training due to a provider pulling the machine off the network. The pricing is slightly higher, but the reduction in operational complexity—specifically the reduced need for aggressive fault-tolerance scripting—often outweighs the raw hourly cost difference.1 For the final "production" training run, specifically when processing a massive dataset that might take 10+ hours, Secure Cloud is the recommended tier to ensure uninterrupted execution.
35
+ **Community Cloud** consists of crowdsourced GPUs provided by third parties. While significantly cheaper, these function similarly to Spot instances in traditional clouds, though with potentially higher variance in uptime and network bandwidth. They are ideal for "bursty" workloads where a user might spin up 10 simultaneous experiments to test different learning rates. However, utilizing this tier for headless training requires the training script to be resilient. It implies that the "code upload" must include logic to check for existing checkpoints on a persistent volume and resume automatically, as the probability of a node restart is non-zero.1
36
+
37
+ ### **2.3 Cost-Performance Matrix**
38
+
39
+ To assist in making the precise hardware decision, the following table synthesizes the cost, utility, and risk profile of available hardware for fine-tuning tasks on RunPod.
40
+
41
+ | GPU Model | VRAM | Cloud Tier | Est. Price/Hr | Best Use Case | Headless Reliability |
42
+ | :---- | :---- | :---- | :---- | :---- | :---- |
43
+ | **H100 SXM** | 80GB | Secure | \~$2.69 | Full FT 70B+, Time-Critical Jobs | **High** (Fastest completion minimizes risk) |
44
+ | **A100 SXM** | 80GB | Secure | \~$1.49 | QLoRA 70B, Full FT 8B | **High** (Standard enterprise reliability) |
45
+ | **A100 PCIe** | 40GB | Secure | \~$1.39 | LoRA 13B-30B | **Medium** (Memory constraints may limit batch size) |
46
+ | **RTX 6000 Ada** | 48GB | Secure | \~$0.79 | Mid-range models (30B), Long Context | **High** (Excellent VRAM/Price ratio) |
47
+ | **RTX 4090** | 24GB | Community | \~$0.34 | QLoRA 8B, Debugging, Sweeps | **Low/Medium** (Requires fault tolerance logic) |
48
+ | **RTX 3090** | 24GB | Community | \~$0.22 | Low-budget experimentation | **Low** (Slower speed increases interrupt risk) |
49
+
50
+ 1
51
+
52
+ ## ---
53
+
54
+ **3\. The Headless Workflow Architecture: Containerization and Automation**
55
+
56
+ To satisfy the user's explicit requirement of avoiding a Jupyter notebook in favor of a "code upload" model, the workflow must shift from an interactive session to a batch-processing paradigm. In this model, the local machine is used for code development and configuration, while the remote GPU serves purely as an execution engine. This requires a Docker-first approach where the environment, code, and execution logic are encapsulated within a portable container image.
57
+
58
+ ### **3.1 The Docker-First Approach**
59
+
60
+ The cornerstone of a robust headless workflow is containerization. Launching a generic Ubuntu pod and manually installing libraries via a startup script is prone to error, hard to reproduce, and slow. Instead, the user must define the entire training environment in a Docker image. This ensures that "uploading code" translates immediately to execution without manual environment setup.
61
+
62
+ #### **The "Entrypoint" Strategy**
63
+
64
+ In a standard interactive RunPod session, the container launches and idles, typically running a sleep command or a Jupyter server, waiting for a user to connect. In a headless workflow, the Docker container utilizes an ENTRYPOINT or CMD script that immediately initiates the training process upon launch. Crucially, once the training process concludes (whether successfully or with a failure), the script handles data egress and terminates the pod.7
65
+ This approach perfectly aligns with the "upload code and train" desire. The "code" is baked into the Docker image (or mounted at runtime), and the "train" command is the automatic, inevitable action of the container starting up.
66
+
67
+ #### **Constructing the Golden Image**
68
+
69
+ A "Golden Image" for fine-tuning must include the base CUDA drivers, the Python environment, and the specific fine-tuning frameworks (Axolotl or Unsloth). Below is an architectural breakdown of such a Dockerfile, optimized for RunPod.
70
+ **Scenario:** A Docker image designed for fine-tuning Llama 3 using Unsloth.
71
+
72
+ Dockerfile
73
+
74
+ \# Use RunPod's base image or NVIDIA's CUDA image to ensure driver compatibility
75
+ \# CUDA 11.8 or 12.1 is often required for modern LLM frameworks
76
+ FROM runpod/pytorch:2.1.0-py3.10-cuda11.8.0-devel
77
+
78
+ \# Set working directory
79
+ WORKDIR /workspace
80
+
81
+ \# Install system dependencies
82
+ \# git-lfs is critical for downloading large models/datasets
83
+ RUN apt-get update && apt-get install \-y git git-lfs htop nvtop tmux
84
+
85
+ \# Install Python dependencies
86
+ \# Unsloth and Axolotl often require specific versions of xformers and trl
87
+ \# Using a requirements.txt allows for easier version pinning
88
+ COPY requirements.txt /workspace/requirements.txt
89
+ RUN pip install \--no-cache-dir \-r requirements.txt
90
+
91
+ \# Install specific frameworks (Example: Unsloth)
92
+ \# Note: Unsloth installation often requires specific CUDA paths
93
+ RUN pip install "unsloth\[colab-new\] @ git+https://github.com/unslothai/unsloth.git"
94
+ RUN pip install \--no-deps "xformers\<0.0.26" "trl\<0.9.0" peft accelerate bitsandbytes
95
+
96
+ \# Copy the automation scripts and training code
97
+ COPY train.py /workspace/train.py
98
+ COPY start.sh /workspace/start.sh
99
+
100
+ \# Make the start script executable
101
+ RUN chmod \+x /workspace/start.sh
102
+
103
+ \# Set the entrypoint to the automation script
104
+ ENTRYPOINT \["/workspace/start.sh"\]
105
+
106
+ 7
107
+ **Optimization Insight:** Embedding the dataset directly into the Docker image (via COPY dataset.jsonl) is a viable strategy only for small datasets (\<5GB). For massive datasets (\>100GB), as implied by the "custom training data" requirement, this approach creates bloated images that are slow to push and pull. For large-scale data, the start.sh script should be designed to pull the data from S3 or a RunPod Network Volume at runtime, ensuring the Docker image remains lightweight and agile.10
108
+
109
+ ### **3.2 The Automation Logic: The start.sh Script**
110
+
111
+ The start.sh script acts as the "brain" of the headless operation. It orchestrates the sequence of events inside the pod, managing authentication, data ingestion, execution, and cleanup.
112
+
113
+ Bash
114
+
115
+ \#\!/bin/bash
116
+ set \-e \# Exit immediately if a command exits with a non-zero status
117
+
118
+ \# 1\. Environment Setup (Logging in to Hugging Face and WandB)
119
+ \# These tokens are passed via environment variables at runtime
120
+ if \[ \-n "$HF\_TOKEN" \]; then
121
+ huggingface-cli login \--token $HF\_TOKEN
122
+ fi
123
+
124
+ if \[ \-n "$WANDB\_API\_KEY" \]; then
125
+ wandb login $WANDB\_API\_KEY
126
+ fi
127
+
128
+ \# 2\. Data Ingestion
129
+ \# Download dataset from S3 or Network Volume if not present
130
+ if \[ \! \-f "/workspace/dataset.jsonl" \]; then
131
+ echo "Downloading dataset from remote source..."
132
+ \# Example using a presigned URL or S3 CLI
133
+ wget \-O /workspace/dataset.jsonl "$DATASET\_URL"
134
+ fi
135
+
136
+ \# 3\. Execution
137
+ echo "Starting Training..."
138
+ \# Launch the Python training script
139
+ \# Unsloth or Axolotl command goes here
140
+ python train.py \--config config.json
141
+
142
+ \# 4\. Exfiltration/Saving
143
+ echo "Training Complete. Merging and Uploading..."
144
+ \# Assuming train.py saves to /workspace/output
145
+ \# This step ensures the trained weights are saved to HF Hub or S3
146
+ python upload\_to\_hub.py \--path /workspace/output \--repo my-user/my-finetuned-model
147
+
148
+ \# 5\. Cleanup (Critical for Cost Savings)
149
+ echo "Shutting down pod to stop billing..."
150
+ runpodctl stop pod $RUNPOD\_POD\_ID
151
+
152
+ 7
153
+ **FinOps Strategy:** By including runpodctl stop pod $RUNPOD\_POD\_ID as the final command, the user ensures they only pay for the exact duration of the training. This effectively transforms a standard GPU pod into a serverless-like job, preventing "zombie pods" from racking up bills after the training is finished.12
154
+
155
+ ### **3.3 Remote Management: The runpodctl Utility**
156
+
157
+ For managing these headless pods, runpodctl is the essential Command Line Interface (CLI) tool provided by RunPod. It allows the user to spin up pods, stream logs, and transfer files without ever navigating to the web console.13
158
+ Automation via CLI:
159
+ The user can script the deployment of the training job from their local machine. A single command can instantiate the pod using the custom image defined above:
160
+
161
+ Bash
162
+
163
+ runpodctl create pod \\
164
+ \--name "headless-llama3-finetune" \\
165
+ \--gpuType "NVIDIA A100 80GB PCIe" \\
166
+ \--imageName "myregistry/custom-llm-trainer:v1" \\
167
+ \--containerDiskSize 100 \\
168
+ \--volumeSize 200 \\
169
+ \--env HF\_TOKEN=$HF\_TOKEN \\
170
+ \--env WANDB\_API\_KEY=$WANDB\_KEY \\
171
+ \--env DATASET\_URL="https://my-s3-bucket..."
172
+
173
+ 14
174
+ This command fulfills the user's request: it uploads the configuration (via the image definition) and starts training immediately. The \--gpuType flag ensures the job lands on the specific hardware required for speed, while \--env passes the necessary secrets securely.
175
+
176
+ ## ---
177
+
178
+ **4\. Fine-Tuning Frameworks: The Engines of Efficiency**
179
+
180
+ To train "as fast as possible" without reinventing the wheel, high-level fine-tuning frameworks are vastly superior to writing raw PyTorch training loops. The two leading contenders for this workflow on RunPod are **Axolotl** and **Unsloth**. Each offers distinct advantages for headless execution.
181
+
182
+ ### **4.1 Axolotl: The Configuration-Driven Powerhouse**
183
+
184
+ Axolotl is designed for users who want to define *what* to train, not *how* to code the training loop. It abstracts the complexity of the Hugging Face Trainer into a comprehensive YAML configuration file.15
185
+
186
+ * **Headless Suitability:** Excellent. Because the entire training logic is encapsulated in a single YAML file, "uploading code" simply means injecting this config file into the container. There is no need to maintain complex Python scripts; the logic is declarative.
187
+ * **Feature Set:** Axolotl supports Full Fine-Tuning (FFT), LoRA, QLoRA, and advanced techniques like Flash Attention 2 and Sample Packing. Sample packing is particularly relevant for speed, as it concatenates multiple short examples into a single sequence, removing padding tokens and significantly increasing training throughput.17
188
+ * **Workflow Integration:**
189
+ 1. User edits config.yaml locally.
190
+ 2. User builds Docker image with this config or mounts it at runtime.
191
+ 3. Container starts and runs axolotl train config.yaml.
192
+ * **Multi-GPU Scaling:** Axolotl excels at multi-GPU training using FSDP (Fully Sharded Data Parallel) or DeepSpeed. If the user intends to scale training across an 8x A100 node to maximize speed, Axolotl is the robust choice.17
193
+
194
+ ### **4.2 Unsloth: The Efficiency Specialist**
195
+
196
+ Unsloth is a framework optimized specifically for speed and memory efficiency on single-GPU setups. It utilizes custom Triton kernels to manually backpropagate gradients, achieving 2-5x faster training speeds and up to 80% less memory usage compared to standard Hugging Face implementations.17
197
+
198
+ * **Headless Suitability:** High. Unsloth provides Docker images that can be easily adapted for headless execution.9 The speed gains directly address the user's requirement to "train as fast as possible."
199
+ * **Performance:** For single-GPU setups (e.g., one H100 or A100), Unsloth is unrivaled. The memory savings allow users to fit significantly larger batch sizes into VRAM, which directly translates to faster wall-clock training times. For example, on a Llama 3 8B model, Unsloth can enable training with context lengths that would cause OOM (Out of Memory) errors on standard implementations.19
200
+ * **Limitation:** Historically, Unsloth has been optimized for single-GPU training. While multi-GPU support is evolving, its primary strength remains in maximizing the throughput of a single card. For a user operating on a single powerful node (like an H100), Unsloth is likely the fastest option.18
201
+
202
+ ### **4.3 Comparative Analysis for the User**
203
+
204
+ | Feature | Axolotl | Unsloth | Strategic Recommendation |
205
+ | :---- | :---- | :---- | :---- |
206
+ | **Configuration** | YAML-based (Declarative) | Python/Script-based | **Axolotl** for strict config management and reproducibility. |
207
+ | **Speed (Single GPU)** | High (uses Flash Attn) | **Extreme** (2x faster than Axolotl) | **Unsloth** for raw speed on single cards (H100/A100). |
208
+ | **Multi-GPU** | Native Support (DeepSpeed/FSDP) | Limited/Paid Tier | **Axolotl** for distributed training across clusters. |
209
+ | **Ease of Headless** | Very High | High | **Both** are excellent; choice depends on scaling needs. |
210
+
211
+ **Expert Insight:** Given the user's preference for "fast as possible" and "custom code," if the model fits on a single GPU (e.g., Llama 3 8B or 70B on an H100), **Unsloth** is the superior choice for raw throughput. If the user requires multi-GPU scaling or complex dataset mixing configurations, **Axolotl** provides a more robust infrastructure.18
212
+
213
+ ## ---
214
+
215
+ **5\. Data Logistics: Solving the Custom Data Bottleneck**
216
+
217
+ A major challenge in ephemeral, headless training is data logistics. The user specified "custom training data," which implies datasets that are not pre-cached in public hubs. Handling large datasets (100GB+) efficiently is critical to avoiding idle GPU time.
218
+
219
+ ### **5.1 Storage Architectures: Network Volumes vs. NVMe vs. Object Storage**
220
+
221
+ * **Local Pod Storage (Container Disk):** This offers the fastest I/O performance. Data is stored on the NVMe SSD directly attached to the GPU instance. This is ideal for maximizing training speed, as the GPU is not starved of data. However, this storage is ephemeral; data is lost if the pod is terminated without external saving.5
222
+ * **RunPod Network Volumes:** This is persistent storage that survives pod termination and allows data to be shared across pods.
223
+ * *Throughput Bottleneck:* Network volumes can suffer from slower throughput (200-400 MB/s) compared to local NVMe, potentially bottlenecking the data loader during training of small models where the GPU processes batches faster than the disk can supply them.22
224
+ * *Region Lock:* Network volumes are region-locked. If a volume is created in US-NJ, the user is forced to rent GPUs in US-NJ. This severely limits the ability to grab available H100s in other regions, contradicting the "train as fast as possible" goal.22
225
+ * **S3 / Object Storage:** The most flexible approach. Data is stored in AWS S3 (or compatible) and streamed or downloaded at the start of the session.
226
+
227
+ ### **5.2 Recommended Data Strategy for Speed**
228
+
229
+ To maximize training speed, **Local NVMe Storage** is superior to Network Volumes, despite its ephemeral nature. The recommended workflow for headless execution is:
230
+
231
+ 1. **Storage:** Store the master dataset in a high-performance S3 bucket or RunPod's S3-compatible object storage layer.25
232
+ 2. **Ingest:** The start.sh script downloads the dataset from S3 to the pod's *local* /workspace directory (NVMe) at boot time.
233
+ 3. **Train:** The model trains off the fast local NVMe, ensuring the GPU is fully saturated.
234
+ 4. **Egress:** The start.sh uploads the checkpoints and final model back to S3 or Hugging Face.
235
+
236
+ This approach avoids the region-locking of Network Volumes and the I/O latency penalties, utilizing the immense bandwidth of datacenter GPUs for rapid setup.10
237
+
238
+ ### **5.3 Transferring Large Data: The 100GB Challenge**
239
+
240
+ For users who must use RunPod storage (e.g., due to compliance or cost), transferring 100GB+ of data from a local machine is non-trivial. The runpodctl send command creates a peer-to-peer transfer tunnel. While effective for smaller files, users have reported slow speeds and timeouts for large datasets.26
241
+ **Insight:** For datasets \>100GB, do not upload from a home internet connection directly to a GPU pod. Instead:
242
+
243
+ 1. Spin up a cheap **CPU pod** on RunPod.
244
+ 2. Use rsync or runpodctl to upload the data to this CPU pod (which sits on the high-speed datacenter backbone).
245
+ 3. From the CPU pod, transfer the data to a Network Volume or S3 bucket.
246
+ This leverages the internal network backbone rather than residential ISP uplinks, preventing the GPU pod from sitting idle while waiting for data uploads.
247
+
248
+ ## ---
249
+
250
+ **6\. Monitoring and Observability without Jupyter**
251
+
252
+ In a headless environment, "blind" training is a significant operational risk. Observability must be externalized to ensure the user knows if the model is converging or if the pod has crashed.
253
+
254
+ ### **6.1 Weights & Biases (WandB)**
255
+
256
+ WandB is the de facto standard for headless monitoring. It integrates natively with both Axolotl and Unsloth (via the Hugging Face Trainer).
257
+
258
+ * **Real-Time Metrics:** Loss curves, GPU utilization, memory usage, and learning rate schedules are streamed to the WandB dashboard in real-time. This allows the user to monitor the "pulse" of the training from a mobile device or laptop.
259
+ * **Artifacts:** Model checkpoints and config files can be logged as artifacts, providing version control for the models and ensuring reproducibility.
260
+
261
+ ### **6.2 Remote Logging**
262
+
263
+ RunPod provides a logging driver that captures stdout and stderr from the container.
264
+
265
+ * **Command:** runpodctl logs \<pod\_id\> allows the user to check the console output from their local terminal to verify the script started correctly or to catch crash errors (e.g., CUDA OOM).11
266
+ * **Best Practice:** The start.sh script should use set \-e (exit immediately on error) and trap errors. Advanced users may add a curl command to the script to send a notification (e.g., via a Discord webhook or Slack API) if the training fails or succeeds, ensuring the user is alerted immediately without needing to constantly poll the logs.
267
+
268
+ ## ---
269
+
270
+ **7\. Advanced Optimization and Troubleshooting**
271
+
272
+ ### **7.1 Handling "Cold Starts" and Image Caching**
273
+
274
+ Downloading large Docker images (often 10GB+ for ML images) takes time. RunPod caches images on the host node.
275
+
276
+ * **Strategy:** Stick to a single image tag (e.g., myuser/trainer:v1). Once a specific host has pulled this image, subsequent runs on that same host are instant.
277
+ * **Docker Optimization:** Use multi-stage builds to keep the final image size small. Remove cache files (pip cache purge) within the Dockerfile to minimize layer size.28
278
+
279
+ ### **7.2 CUDA Version Mismatches**
280
+
281
+ A common failure mode in custom images is a mismatch between the Docker container's CUDA toolkit and the host driver.
282
+
283
+ * **RunPod Environment:** RunPod hosts generally run the latest NVIDIA drivers.
284
+ * **Image Requirement:** Ensure the Docker image uses a compatible CUDA version (e.g., CUDA 11.8 or 12.1). Unsloth, for example, has specific requirements for CUDA 12.1 for maximum performance.9 Using the wrong base image will result in runtime errors regarding "Flash Attention" or "Bitsandbytes" compilation.
285
+
286
+ ### **7.3 Spot Instance Interruptions**
287
+
288
+ If using Community Cloud to save money, the pod may be preempted (shut down) if the provider needs the hardware.
289
+
290
+ * **Mitigation:** Configure the training script to save checkpoints frequently (e.g., every 100 steps) to a mounted Network Volume or upload them immediately to S3.
291
+ * **Resume Logic:** The start.sh should check for the existence of a checkpoint and automatically pass \--resume\_from\_checkpoint to the training script. This ensures that if a pod dies and a new one is spawned, it picks up exactly where the last one left off.30
292
+
293
+ ## ---
294
+
295
+ **8\. Conclusion and Strategic Roadmap**
296
+
297
+ For a user demanding the fastest possible fine-tuning workflow without the overhead of Jupyter notebooks, RunPod offers a powerful substrate, provided the workflow is architected correctly. The optimal path requires moving away from interactive "pet" instances to ephemeral "cattle" instances managed by code.
298
+ **The Recommended "Fast Track" Configuration:**
299
+
300
+ 1. **Hardware:** NVIDIA H100 (Secure Cloud) for speed and reliability, or RTX 4090 (Community Cloud) for cost-efficiency.
301
+ 2. **Framework:** **Unsloth** for single-GPU jobs (fastest throughput); **Axolotl** for multi-GPU or complex configurations.
302
+ 3. **Deployment:** Custom Docker image with an ENTRYPOINT script that automates the Download \-\> Train \-\> Upload \-\> Terminate lifecycle.
303
+ 4. **Interface:** runpodctl for deployment; WandB for monitoring; SSH for emergency debugging.
304
+ 5. **Data:** S3-backed ingestion to local NVMe storage to bypass network volume I/O bottlenecks.
305
+
306
+ By adopting this headless architecture, the user transforms the fine-tuning process from a manual, error-prone task into a scalable, automated engineering operation, fully leveraging the raw compute power of RunPod's infrastructure. This report confirms that while RunPod's interface invites interactive use, its API and CLI capabilities are fully mature for the rigorous demands of headless, high-velocity machine learning operations.
307
+
308
+ #### **Works cited**
309
+
310
+ 1. Runpod GPU pricing: A complete breakdown and platform comparison | Blog \- Northflank, accessed January 12, 2026, [https://northflank.com/blog/runpod-gpu-pricing](https://northflank.com/blog/runpod-gpu-pricing)
311
+ 2. The NVIDIA H100 GPU Review: Why This AI Powerhouse Dominates (But Costs a Fortune) \- Runpod, accessed January 12, 2026, [https://www.runpod.io/articles/guides/nvidia-h100](https://www.runpod.io/articles/guides/nvidia-h100)
312
+ 3. Runpod Secrets: Affordable A100/H100 Instances, accessed January 12, 2026, [https://www.runpod.io/articles/guides/affordable-a100-h100-gpu-cloud](https://www.runpod.io/articles/guides/affordable-a100-h100-gpu-cloud)
313
+ 4. Pricing | Runpod GPU cloud computing rates, accessed January 12, 2026, [https://www.runpod.io/pricing](https://www.runpod.io/pricing)
314
+ 5. RunPod Pricing 2025 Complete Guide (GPU Cloud Costs Breakdown) \- Flexprice, accessed January 12, 2026, [https://flexprice.io/blog/runprod-pricing-guide-with-gpu-costs](https://flexprice.io/blog/runprod-pricing-guide-with-gpu-costs)
315
+ 6. No-Code AI: How I Ran My First LLM Without Coding | Runpod Blog, accessed January 12, 2026, [https://www.runpod.io/blog/no-code-ai-run-llm](https://www.runpod.io/blog/no-code-ai-run-llm)
316
+ 7. Dockerfile \- Runpod Documentation, accessed January 12, 2026, [https://docs.runpod.io/tutorials/introduction/containers/create-dockerfiles](https://docs.runpod.io/tutorials/introduction/containers/create-dockerfiles)
317
+ 8. Deploying AI Apps with Minimal Infrastructure and Docker \- Runpod, accessed January 12, 2026, [https://www.runpod.io/articles/guides/deploy-ai-apps-minimal-infrastructure-docker](https://www.runpod.io/articles/guides/deploy-ai-apps-minimal-infrastructure-docker)
318
+ 9. Fine-Tuning Local Models with Docker Offload and Unsloth, accessed January 12, 2026, [https://www.docker.com/blog/fine-tuning-models-with-offload-and-unsloth/](https://www.docker.com/blog/fine-tuning-models-with-offload-and-unsloth/)
319
+ 10. Optimize your workers \- Runpod Documentation, accessed January 12, 2026, [https://docs.runpod.io/serverless/development/optimization](https://docs.runpod.io/serverless/development/optimization)
320
+ 11. Manage Pods \- Runpod Documentation, accessed January 12, 2026, [https://docs.runpod.io/pods/manage-pods](https://docs.runpod.io/pods/manage-pods)
321
+ 12. AI on a Schedule: Using Runpod's API to Run Jobs Only When Needed, accessed January 12, 2026, [https://www.runpod.io/articles/guides/ai-on-a-schedule](https://www.runpod.io/articles/guides/ai-on-a-schedule)
322
+ 13. Overview \- Runpod Documentation, accessed January 12, 2026, [https://docs.runpod.io/runpodctl/overview](https://docs.runpod.io/runpodctl/overview)
323
+ 14. create pod \- Runpod Documentation, accessed January 12, 2026, [https://docs.runpod.io/runpodctl/reference/runpodctl-create-pod](https://docs.runpod.io/runpodctl/reference/runpodctl-create-pod)
324
+ 15. LLM fine-tuning | LLM Inference Handbook \- BentoML, accessed January 12, 2026, [https://bentoml.com/llm/getting-started/llm-fine-tuning](https://bentoml.com/llm/getting-started/llm-fine-tuning)
325
+ 16. How to fine-tune a model using Axolotl | Runpod Blog, accessed January 12, 2026, [https://www.runpod.io/blog/how-to-fine-tune-a-model-using-axolotl](https://www.runpod.io/blog/how-to-fine-tune-a-model-using-axolotl)
326
+ 17. Best frameworks for fine-tuning LLMs in 2025 \- Modal, accessed January 12, 2026, [https://modal.com/blog/fine-tuning-llms](https://modal.com/blog/fine-tuning-llms)
327
+ 18. Comparing LLM Fine-Tuning Frameworks: Axolotl, Unsloth, and Torchtune in 2025, accessed January 12, 2026, [https://blog.spheron.network/comparing-llm-fine-tuning-frameworks-axolotl-unsloth-and-torchtune-in-2025](https://blog.spheron.network/comparing-llm-fine-tuning-frameworks-axolotl-unsloth-and-torchtune-in-2025)
328
+ 19. Axolotl vs LLaMA-Factory vs Unsloth for AI Fine-Tuning 2026 \- Index.dev, accessed January 12, 2026, [https://www.index.dev/skill-vs-skill/ai-axolotl-vs-llama-factory-vs-unsloth](https://www.index.dev/skill-vs-skill/ai-axolotl-vs-llama-factory-vs-unsloth)
329
+ 20. \[TEMPLATE\] One-click Unsloth finetuning on RunPod : r/LocalLLaMA \- Reddit, accessed January 12, 2026, [https://www.reddit.com/r/LocalLLaMA/comments/1nyzzws/template\_oneclick\_unsloth\_finetuning\_on\_runpod/](https://www.reddit.com/r/LocalLLaMA/comments/1nyzzws/template_oneclick_unsloth_finetuning_on_runpod/)
330
+ 21. unsloth/llama-3-8b-bnb-4bit \- Hugging Face, accessed January 12, 2026, [https://huggingface.co/unsloth/llama-3-8b-bnb-4bit](https://huggingface.co/unsloth/llama-3-8b-bnb-4bit)
331
+ 22. PSA: Don't bother with Network Volumes on Runpod : r/StableDiffusion \- Reddit, accessed January 12, 2026, [https://www.reddit.com/r/StableDiffusion/comments/1nkcgvp/psa\_dont\_bother\_with\_network\_volumes\_on\_runpod/](https://www.reddit.com/r/StableDiffusion/comments/1nkcgvp/psa_dont_bother_with_network_volumes_on_runpod/)
332
+ 23. Network volumes \- Runpod Documentation, accessed January 12, 2026, [https://docs.runpod.io/storage/network-volumes](https://docs.runpod.io/storage/network-volumes)
333
+ 24. Using network volume with serverless \- Runpod \- Answer Overflow, accessed January 12, 2026, [https://www.answeroverflow.com/m/1234830020678123610](https://www.answeroverflow.com/m/1234830020678123610)
334
+ 25. Streamline Your AI Workflows with RunPod's New S3-Compatible API, accessed January 12, 2026, [https://www.runpod.io/blog/streamline-ai-workflows-s3-api](https://www.runpod.io/blog/streamline-ai-workflows-s3-api)
335
+ 26. Upload speed \- Runpod \- Answer Overflow, accessed January 12, 2026, [https://www.answeroverflow.com/m/1415080595020709938](https://www.answeroverflow.com/m/1415080595020709938)
336
+ 27. \`runpodctl send\` crawling at \<1MB speeds \- Runpod \- Answer Overflow, accessed January 12, 2026, [https://www.answeroverflow.com/m/1208971275163406376](https://www.answeroverflow.com/m/1208971275163406376)
337
+ 28. MLOps Workflow for Docker-Based AI Model Deployment \- Runpod, accessed January 12, 2026, [https://www.runpod.io/articles/guides/mlops-workflow-docker-ai-deployment](https://www.runpod.io/articles/guides/mlops-workflow-docker-ai-deployment)
338
+ 29. Installation \- Axolotl Docs, accessed January 12, 2026, [https://docs.axolotl.ai/docs/installation.html](https://docs.axolotl.ai/docs/installation.html)
339
+ 30. Does anyone use RunPod for SFT? If yes, you train via SSH or Jupyter (web-hosted), accessed January 12, 2026, [https://www.reddit.com/r/LocalLLaMA/comments/1pd6vxu/does\_anyone\_use\_runpod\_for\_sft\_if\_yes\_you\_train/](https://www.reddit.com/r/LocalLLaMA/comments/1pd6vxu/does_anyone_use_runpod_for_sft_if_yes_you_train/)
ai-docs/chatbot-ideology.yaml ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Marxist-Leninist Chatbot Training Set Design
2
+ # General-purpose ideological chatbot using ProleWiki corpus
3
+ # Status: UPDATED FOR GRPO (was SFT)
4
+ # Last Updated: 2025-12-17
5
+
6
+ # =============================================================================
7
+ # METHODOLOGY UPDATE: SFT → GRPO
8
+ # =============================================================================
9
+ # This document was originally designed for SFT. The project has pivoted to
10
+ # GRPO (Group Relative Policy Optimization) for the reasons documented in
11
+ # ai-docs/finetune.yaml. The ideological design principles and question
12
+ # generation strategies here remain valid for GRPO training.
13
+ #
14
+ # Current implementation:
15
+ # - Dataset: training_data/curated_qa.jsonl (128 curated pairs)
16
+ # - GRPO format: training_data/grpo_dataset.jsonl
17
+ # - Notebook: notebooks/Marxist_GRPO_Training.ipynb
18
+ # - Reward functions: src/prolewiki_llm/grpo_rewards.py
19
+ # =============================================================================
20
+
21
+ overview:
22
+ goal: |
23
+ A general-purpose chatbot well-versed in Marxist-Leninist theory.
24
+ Can discuss revolutionary theory, imperialism, class struggle,
25
+ dialectical materialism, and related topics through materialist analysis.
26
+
27
+ not_goal: |
28
+ NOT a game-specific assistant. NOT tied to any particular application.
29
+ The Babylon game project informed the ideological direction, but
30
+ the chatbot itself is a standalone ML theory assistant.
31
+
32
+ inspiration: |
33
+ The Babylon game (~/projects/game/babylon/) models class struggle
34
+ through MLM-TW (Marxist-Leninist-Maoist Third Worldist) theory.
35
+ This chatbot shares that theoretical grounding but serves as a
36
+ general educational/discussion tool.
37
+
38
+ training_approach: "GRPO (Group Relative Policy Optimization) with multi-layer rewards"
39
+ base_model: "unsloth/DeepSeek-R1-0528-Qwen3-8B"
40
+ current_samples: "128 curated Q&A pairs"
41
+ estimated_training_time: "~2-4 hours on A40 48GB"
42
+
43
+ data_source:
44
+ primary: "ProleWiki Library namespace chunks"
45
+ location: "sample-pipeline/chunks/Library/*.jsonl"
46
+ format: "JSONL with metadata"
47
+
48
+ chunk_schema:
49
+ chunk_id: "Unique identifier"
50
+ text: "The actual content (training answer)"
51
+ article_title: "Source work title (contains author)"
52
+ section: "Section/chapter name (question seed)"
53
+ categories: "Topic categories"
54
+ internal_links: "Referenced concepts (question seeds)"
55
+ word_count: "Chunk size"
56
+
57
+ available_works:
58
+ marx:
59
+ - "Capital, Volume I"
60
+ - "Capital, Volume II"
61
+ - "Capital, Volume III"
62
+ - "Grundrisse"
63
+ - "The German Ideology"
64
+ lenin:
65
+ - "Imperialism, the Highest Stage of Capitalism"
66
+ - "State and Revolution"
67
+ - "What Is To Be Done?"
68
+ mao:
69
+ - "On Contradiction"
70
+ - "On Practice"
71
+ other:
72
+ - "Additional works in Library namespace"
73
+
74
+ chunk_count: "~1,034 chunks currently available"
75
+
76
+ system_prompt:
77
+ description: "Defines the chatbot's persona and approach"
78
+
79
+ recommended: |
80
+ You are a Marxist-Leninist assistant. You explain revolutionary theory
81
+ through materialist analysis, drawing on the works of Marx, Engels,
82
+ Lenin, Mao, and other socialist thinkers. You ground explanations in
83
+ historical materialism and class analysis.
84
+
85
+ alternatives:
86
+ concise: |
87
+ You are a Marxist theorist. You explain concepts through
88
+ dialectical and historical materialism.
89
+
90
+ educational: |
91
+ You are a patient teacher of Marxist-Leninist theory. You explain
92
+ complex concepts clearly, always connecting theory to material
93
+ conditions and class relations.
94
+
95
+ third_worldist: |
96
+ You are a Marxist-Leninist-Maoist Third Worldist. You analyze
97
+ imperialism, unequal exchange, and the global class structure,
98
+ recognizing the labor aristocracy in imperial core nations.
99
+
100
+ question_generation:
101
+ description: |
102
+ Transform chunk metadata into natural questions that a user might ask.
103
+ The chunk text becomes the answer.
104
+
105
+ strategy: "Use available metadata to generate contextual questions"
106
+
107
+ templates:
108
+ with_section:
109
+ pattern: "What does {author} say about {section}?"
110
+ example: "What does Marx say about the commodity form?"
111
+ priority: 1
112
+
113
+ with_internal_links:
114
+ pattern: "Explain {concept} from a Marxist perspective."
115
+ example: "Explain surplus value from a Marxist perspective."
116
+ priority: 2
117
+
118
+ with_categories:
119
+ pattern: "Discuss {category} in Marxist theory."
120
+ example: "Discuss imperialism in Marxist theory."
121
+ priority: 3
122
+
123
+ fallback:
124
+ pattern: "What does {author} teach us about this?"
125
+ example: "What does Lenin teach us about this?"
126
+ priority: 4
127
+
128
+ author_extraction:
129
+ description: "Extract author name from article_title field"
130
+ pattern: "Library {Author} {Work Title}"
131
+ examples:
132
+ - input: "Library Karl Marx Capital, vol. I, Chapter 1"
133
+ output: "Marx"
134
+ - input: "Library Vladimir Lenin Imperialism"
135
+ output: "Lenin"
136
+ - input: "Library Mao Zedong On Contradiction"
137
+ output: "Mao"
138
+
139
+ code: |
140
+ def extract_author(title: str) -> str:
141
+ """Extract author from 'Library Author Name Work...' format."""
142
+ if not title.startswith("Library "):
143
+ return "the author"
144
+
145
+ # Common author mappings
146
+ author_map = {
147
+ "Karl Marx": "Marx",
148
+ "Friedrich Engels": "Engels",
149
+ "Vladimir Lenin": "Lenin",
150
+ "V.I. Lenin": "Lenin",
151
+ "Mao Zedong": "Mao",
152
+ "Mao Tse-tung": "Mao",
153
+ "Joseph Stalin": "Stalin",
154
+ "Rosa Luxemburg": "Luxemburg",
155
+ "Antonio Gramsci": "Gramsci",
156
+ "Frantz Fanon": "Fanon",
157
+ }
158
+
159
+ title_part = title[8:] # Remove "Library "
160
+ for full_name, short_name in author_map.items():
161
+ if title_part.startswith(full_name):
162
+ return short_name
163
+
164
+ # Fallback: first two words
165
+ words = title_part.split()
166
+ if len(words) >= 2:
167
+ return words[1] # Usually last name
168
+ return "the author"
169
+
170
+ training_format:
171
+ template: "Qwen-2.5 chat format"
172
+ note: "MUST use Qwen template for DeepSeek-R1-Distill-Qwen model"
173
+
174
+ structure: |
175
+ <|im_start|>system
176
+ {system_prompt}<|im_end|>
177
+ <|im_start|>user
178
+ {question}<|im_end|>
179
+ <|im_start|>assistant
180
+ {answer}<|im_end|>
181
+
182
+ example:
183
+ system: "You are a Marxist-Leninist assistant..."
184
+ user: "What does Marx say about the commodity form?"
185
+ answer: |
186
+ The wealth of those societies in which the capitalist mode of
187
+ production prevails, presents itself as an immense accumulation
188
+ of commodities...
189
+
190
+ output_field: "text"
191
+ description: |
192
+ For Unsloth SFTTrainer, provide a single 'text' field containing
193
+ the complete formatted conversation.
194
+
195
+ transformation_code:
196
+ description: "Complete transformation from chunks to training data"
197
+
198
+ implementation: |
199
+ import json
200
+ from pathlib import Path
201
+
202
+ def extract_author(title: str) -> str:
203
+ """Extract author from article title."""
204
+ if not title.startswith("Library "):
205
+ return "the author"
206
+
207
+ author_map = {
208
+ "Karl Marx": "Marx",
209
+ "Friedrich Engels": "Engels",
210
+ "Vladimir Lenin": "Lenin",
211
+ "Mao Zedong": "Mao",
212
+ "Joseph Stalin": "Stalin",
213
+ }
214
+
215
+ title_part = title[8:]
216
+ for full_name, short_name in author_map.items():
217
+ if title_part.startswith(full_name):
218
+ return short_name
219
+
220
+ words = title_part.split()
221
+ return words[1] if len(words) >= 2 else "the author"
222
+
223
+ def generate_question(chunk: dict) -> str:
224
+ """Generate a natural question from chunk metadata."""
225
+ author = extract_author(chunk.get("article_title", ""))
226
+
227
+ # Priority 1: Use section
228
+ if chunk.get("section"):
229
+ section = chunk["section"].lower()
230
+ return f"What does {author} say about {section}?"
231
+
232
+ # Priority 2: Use internal links
233
+ if chunk.get("internal_links"):
234
+ concept = chunk["internal_links"][0]
235
+ return f"Explain {concept} from a Marxist perspective."
236
+
237
+ # Priority 3: Use categories
238
+ if chunk.get("categories"):
239
+ category = chunk["categories"][0]
240
+ return f"Discuss {category} in Marxist theory."
241
+
242
+ # Fallback
243
+ return f"What does {author} teach us in this passage?"
244
+
245
def chunk_to_training(chunk: dict, system_prompt: str) -> dict:
    """Render a chunk as one Qwen chat-template training example.

    The chunk's text becomes the assistant turn, paired with a question
    generated from the chunk's metadata; returns ``{"text": ...}``.
    """
    question = generate_question(chunk)
    answer = chunk["text"]

    # Assemble the three chat turns; joining with "\n" reproduces the
    # exact Qwen <|im_start|>/<|im_end|> template layout.
    turns = [
        f"<|im_start|>system\n{system_prompt}<|im_end|>",
        f"<|im_start|>user\n{question}<|im_end|>",
        f"<|im_start|>assistant\n{answer}<|im_end|>",
    ]
    return {"text": "\n".join(turns)}
258
+
259
def process_chunks(input_dir: Path, output_path: Path):
    """Convert every chunk JSONL file under *input_dir* into a single
    Qwen-formatted training dataset written to *output_path*.

    Fixes over the naive version:
    - Files are processed in sorted order so output is deterministic
      (``Path.glob`` order is filesystem-dependent).
    - All file I/O uses explicit UTF-8 (the platform default encoding
      is not guaranteed to be UTF-8).
    - Blank lines in input files are tolerated instead of crashing
      ``json.loads``.
    - Output is streamed instead of buffered in memory.
    """
    system_prompt = (
        "You are a Marxist-Leninist assistant. You explain revolutionary "
        "theory through materialist analysis, drawing on the works of Marx, "
        "Engels, Lenin, Mao, and other socialist thinkers."
    )

    count = 0
    with open(output_path, "w", encoding="utf-8") as out:
        for jsonl_file in sorted(input_dir.glob("*.jsonl")):
            with open(jsonl_file, encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue  # tolerate stray blank lines between records
                    chunk = json.loads(line)
                    example = chunk_to_training(chunk, system_prompt)
                    out.write(json.dumps(example) + "\n")
                    count += 1

    print(f"Generated {count} training examples")
281
+
282
+ quality_considerations:
283
+ chunk_quality:
284
+ - "ProleWiki text is already high-quality Marxist writing"
285
+ - "Chunks preserve context through overlap"
286
+ - "Section boundaries respected"
287
+
288
+ question_diversity:
289
+ - "Vary question templates to avoid repetitive patterns"
290
+ - "Use all available metadata fields"
291
+ - "Consider adding manual seed questions for key concepts"
292
+
293
+ answer_length:
294
+ - "Chunks are 350-500 tokens (good length for chat responses)"
295
+ - "Not too short (lacks substance) or too long (loses focus)"
296
+
297
+ potential_improvements:
298
+ - "Add conversational variations (rephrase questions)"
299
+ - "Include follow-up question pairs"
300
+ - "Add explicit concept definitions from glossary"
301
+ - "Include historical examples and applications"
302
+
303
+ implementation_steps:
304
+ step_1:
305
+ name: "Verify chunk data"
306
+ command: "ls -la sample-pipeline/chunks/Library/"
307
+ check: "Confirm JSONL files exist with expected format"
308
+
309
+ step_2:
310
+ name: "Create transformation script"
311
+ location: "src/prolewiki_llm/prepare_training.py"
312
+ description: "Implement the transformation code above"
313
+
314
+ step_3:
315
+ name: "Generate training data"
316
+ command: "uv run python -m prolewiki_llm.prepare_training"
317
+ output: "training_data/ml_chatbot.jsonl"
318
+
319
+ step_4:
320
+ name: "Upload to RunPod"
321
+ description: "Transfer JSONL to pod's /workspace/data/"
322
+
323
+ step_5:
324
+ name: "Run fine-tuning"
325
+ description: "Execute training script (see finetune.yaml)"
326
+
327
+ step_6:
328
+ name: "Export and test"
329
+ description: "GGUF export, Ollama deployment, manual testing"
330
+
331
+ evaluation:
332
+ manual_testing:
333
+ description: "Chat with model and assess quality"
334
+ test_questions:
335
+ - "What is surplus value?"
336
+ - "Explain the labor theory of value."
337
+ - "What is imperialism according to Lenin?"
338
+ - "How does dialectical materialism differ from idealism?"
339
+ - "What is the role of the vanguard party?"
340
+ - "Why do contradictions drive historical change?"
341
+ - "What is the labor aristocracy?"
342
+
343
+ quality_criteria:
344
+ - "Responses grounded in Marxist theory"
345
+ - "Materialist analysis (not idealist)"
346
+ - "Accurate to source texts"
347
+ - "Coherent and well-structured"
348
+ - "Appropriate length for chat"
349
+
350
+ red_flags:
351
+ - "Refusing to discuss political topics (abliteration failure)"
352
+ - "Generic/vague responses not grounded in theory"
353
+ - "Mixing incompatible ideological frameworks"
354
+ - "Hallucinating quotes or concepts"
355
+
356
+ future_enhancements:
357
+ phase_2:
358
+ - "Add conversational multi-turn examples"
359
+ - "Include debate/argument handling"
360
+ - "Add current events analysis capability"
361
+
362
+ phase_3:
363
+ - "Multi-persona support (different theoretical traditions)"
364
+ - "Game integration (Babylon narrative generation)"
365
+ - "RAG integration for expanded knowledge"
366
+
367
+ related_docs:
368
+ - "ai-docs/finetune.yaml - GRPO methodology and training config"
369
+ - "ai-docs/reward-modeling.yaml - Multi-layer reward function design"
370
+ - "ai-docs/runpod.yaml - Cloud GPU setup"
371
+ - "ai-docs/project-status.yaml - Phase 8 implementation status"
372
+ - "notebooks/Marxist_GRPO_Training.ipynb - Authoritative training notebook"
373
+ - "src/prolewiki_llm/ - Python module with reward functions"
ai-docs/finetune.yaml ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Fine-tuning Documentation - Phase 8
2
+ # Purpose: Marxist-Leninist LLM fine-tuning on ProleWiki corpus
3
+ # Status: IN_PROGRESS (implementation complete, training execution pending)
4
+ # Method: GRPO (Group Relative Policy Optimization) - NOT SFT
5
+ # Last Updated: 2025-12-17
6
+
7
+ # =============================================================================
8
+ # METHODOLOGY PIVOT: SFT → GRPO
9
+ # =============================================================================
10
+ # IMPORTANT: This project pivoted from SFT to GRPO methodology.
11
+ # This document has been updated to reflect the current approach.
12
+ #
13
+ # WHY GRPO OVER SFT:
14
+ # - Political theory has no single "correct" answer (unlike math)
15
+ # - Open-ended prose requires semantic similarity, not exact string matching
16
+ # - Reward functions can encode domain expertise
17
+ # - Multi-layer rewards defeat adversarial "word soup" attacks
18
+ # - GRPO naturally handles the inherent subjectivity of political analysis
19
+ #
20
+ # AUTHORITATIVE SOURCES:
21
+ # - Training notebook: notebooks/Marxist_GRPO_Training.ipynb
22
+ # - Reward functions: src/prolewiki_llm/grpo_rewards.py
23
+ # - W&B logging: src/prolewiki_llm/wandb_logging.py
24
+ # - Reward design: ai-docs/reward-modeling.yaml
25
+ # =============================================================================
26
+
27
+ overview:
28
+ goal: Fine-tune DeepSeek-R1-0528-Qwen3-8B for Marxist-Leninist theory responses
29
+ method: GRPO (Group Relative Policy Optimization) via Unsloth + TRL
30
+ dataset: 1,058 curated Q&A pairs (training_data/grpo_dataset.jsonl)
31
+ output_format: LoRA adapter → GGUF for Ollama deployment
32
+ hardware_requirement: A40 48GB (RunPod) or similar high-VRAM GPU
33
+ status: Implementation complete, execution pending
34
+
35
+ # =============================================================================
36
+ # GRPO VS SFT RATIONALE
37
+ # =============================================================================
38
+
39
+ grpo_rationale:
40
+ description: |
41
+ GRPO (Group Relative Policy Optimization) is preferred over SFT for
42
+ Marxist-Leninist training because political theory responses have no
43
+ single "correct" answer. Unlike math problems (where GRPO was originally
44
+ popularized with DeepSeek-R1), political theory is inherently open-ended.
45
+
46
+ sft_limitations:
47
+ - "Exact answer matching fails for prose responses"
48
+ - "No way to express 'this answer is better than that answer'"
49
+ - "Can only train on (input, output) pairs, not preferences"
50
+ - "Easily reward-hacked by models memorizing training data"
51
+
52
+ grpo_advantages:
53
+ - "Uses reward functions to score response quality"
54
+ - "Multiple generations compared (Group Relative)"
55
+ - "Semantic similarity captures meaning, not verbatim wording"
56
+ - "Multi-layer rewards prevent adversarial gaming"
57
+ - "Can penalize 'word soup' via coherence checks"
58
+
59
+ key_insight: |
60
+ The breakthrough insight is that GRPO's reward function paradigm
61
+ perfectly maps to political theory training: we can encode what
62
+ makes a "good Marxist-Leninist response" through semantic rewards
63
+ rather than exact answer matching.
64
+
65
+ # =============================================================================
66
+ # BASE MODEL
67
+ # =============================================================================
68
+
69
+ model:
70
+ primary:
71
+ name: DeepSeek-R1-0528-Qwen3-8B
72
+ huggingface: unsloth/DeepSeek-R1-0528-Qwen3-8B
73
+ params: 8B
74
+ architecture: Qwen3
75
+ context_length: 2048 (training max_completion_length)
76
+ reasoning: |
77
+ 1. DeepSeek R1 architecture has strong reasoning capabilities
78
+ 2. 8B params = fits on A40 with LoRA
79
+ 3. Full Unsloth optimization support
80
+ 4. unsloth/ namespace provides optimized 4-bit version
81
+
82
+ lora_config:
83
+ rank: 64
84
+ lora_alpha: 64
85
+ target_modules: [q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj]
86
+ use_rslora: true
87
+ use_gradient_checkpointing: unsloth
88
+
89
+ # =============================================================================
90
+ # DATASET
91
+ # =============================================================================
92
+
93
+ dataset:
94
+ source: training_data/curated_qa.jsonl
95
+ transformed: training_data/grpo_dataset.jsonl
96
+ count: 1,058 curated Q&A pairs
97
+ format: |
98
+ {"prompt": [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}], "answer": "..."}
99
+
100
+ topics:
101
+ - Revisionism and opportunism
102
+ - Surplus value and exploitation
103
+ - Dialectical materialism
104
+ - Imperialism and monopoly capitalism
105
+ - Class struggle and revolution
106
+ - Socialist construction
107
+ - Historical figures (Marx, Lenin, Engels, Stalin, Mao)
108
+
109
+ # =============================================================================
110
+ # REWARD FUNCTION SYSTEM (THE KEY INNOVATION)
111
+ # =============================================================================
112
+
113
+ reward_functions:
114
+ reference: ai-docs/reward-modeling.yaml
115
+ module: src/prolewiki_llm/grpo_rewards.py
116
+ tests: tests/unit/training/test_grpo_rewards.py (43 tests)
117
+
118
+ layers:
119
+ format_rewards:
120
+ - match_format_exactly: "3.0 for exact </think> pattern"
121
+ - match_format_approximately: "1.5-2.0 for partial pattern"
122
+
123
+ semantic_rewards:
124
+ - semantic_similarity_reward: "Sentence-transformer embedding similarity"
125
+ - terminology_reward: "Bonus for Marxist lexicon usage"
126
+
127
+ coherence_rewards:
128
+ - nli_coherence_reward: "BART-large-MNLI entailment checking"
129
+ - self_consistency_reward: "No internal contradictions"
130
+ - structural_coherence_reward: "Terms in syntactic roles"
131
+
132
+ anti_hacking_rewards:
133
+ - topic_relevance_reward: "Question→answer concept coverage"
134
+ - interconnection_depth_reward: "Anti-buzzword-salad detection"
135
+ - completeness_reward: "Key concept coverage"
136
+
137
+ combined_functions:
138
+ full_coherence_reward: "5-layer combined (recommended)"
139
+ robust_coherence_reward: "3-layer (NLI + self-consistency + structural)"
140
+
141
+ anti_hacking_measures:
142
+ HOLLOW_BUZZWORDS: "Penalty for activist jargon without substance"
143
+ DEPTH_MARKERS: "Bonus for historical specificity"
144
+ EXPLANATORY_PHRASES: "Bonus for causal reasoning"
145
+ depth_ratio: "Words per concept (penalizes buzzword salad)"
146
+
147
+ # =============================================================================
148
+ # TRAINING CONFIGURATION
149
+ # =============================================================================
150
+
151
+ training_config:
152
+ trainer: GRPOTrainer (from trl library)
153
+ max_steps: 250
154
+ batch_size: 2
155
+ gradient_accumulation: 2
156
+ learning_rate: 5e-6
157
+ num_generations: 4
158
+ temperature: 1.0
159
+ gpu_memory_utilization: 0.85
160
+ hardware: A40 48GB (RunPod)
161
+
162
+ vllm_sampling:
163
+ min_p: 0.1
164
+ top_p: 1.0
165
+ top_k: -1
166
+ seed: 3407
167
+
168
+ # =============================================================================
169
+ # WEIGHTS & BIASES INTEGRATION
170
+ # =============================================================================
171
+
172
+ wandb_logging:
173
+ module: src/prolewiki_llm/wandb_logging.py
174
+ tests: tests/unit/training/test_wandb_logging.py (17 tests)
175
+
176
+ features:
177
+ - WandbSampleLogger: "Periodic sample tables (question → response → rewards)"
178
+ - create_logging_reward: "Zero-cost logging reward for GRPOTrainer"
179
+ - log_reward_metrics: "Per-reward mean/min/max tracking"
180
+ - init_wandb_logging: "Project/run initialization"
181
+ - finish_wandb_logging: "Summary statistics"
182
+ - log_model_checkpoint: "Artifact logging"
183
+
184
+ graceful_degradation: |
185
+ All functions work without wandb installed.
186
+ Falls back to print() statements when wandb unavailable.
187
+
188
+ # =============================================================================
189
+ # TRAINING NOTEBOOK (AUTHORITATIVE REFERENCE)
190
+ # =============================================================================
191
+
192
+ notebook:
193
+ location: notebooks/Marxist_GRPO_Training.ipynb
194
+ description: |
195
+ Self-contained Jupyter notebook for RunPod execution.
196
+ Contains ALL reward functions inline (no external imports).
197
+ This is the authoritative reference for current implementation.
198
+
199
+ contents:
200
+ - "Model loading (FastLanguageModel.from_pretrained)"
201
+ - "LoRA configuration (get_peft_model)"
202
+ - "Dataset loading from grpo_dataset.jsonl"
203
+ - "All 13+ reward functions inline"
204
+ - "W&B logging integration"
205
+ - "GRPOConfig with A40-optimized settings"
206
+ - "GRPOTrainer setup and training"
207
+ - "LoRA saving and GGUF export"
208
+
209
+ # =============================================================================
210
+ # EXPORT AND DEPLOYMENT
211
+ # =============================================================================
212
+
213
+ export:
214
+ lora_save:
215
+ code: |
216
+ model.save_pretrained_merged("marxist_lora", tokenizer)
217
+
218
+ gguf:
219
+ description: Quantized format for Ollama deployment
220
+ quantization_levels:
221
+ q4_k_m: "Recommended balance of size/quality (~4GB)"
222
+ q8_0: "Higher quality (~7GB)"
223
+ code: |
224
+ model.save_pretrained_gguf("marxist_gguf", tokenizer, quantization_method="q4_k_m")
225
+
226
+ ollama:
227
+ modelfile: |
228
+ FROM ./marxist-gguf-q4_k_m.gguf
229
+ TEMPLATE """<|begin_of_text|>{{ .System }}<|User|>{{ .Prompt }}<|Assistant|>"""
230
+ SYSTEM "You are a Marxist-Leninist assistant..."
231
+ PARAMETER temperature 0.7
232
+ PARAMETER num_ctx 2048
233
+
234
+ # =============================================================================
235
+ # IMPLEMENTATION STATUS
236
+ # =============================================================================
237
+
238
+ implementation_status:
239
+ completed:
240
+ - "8.1 Dataset Preparation: grpo_dataset.jsonl (1,058 Q&A pairs)"
241
+ - "8.2 Reward Function System: 13+ functions with anti-hacking"
242
+ - "8.3 W&B Logging Integration: Full observability"
243
+ - "8.4 Training Notebook: Self-contained for RunPod"
244
+
245
+ pending:
246
+ - "8.5 Training Execution: Run notebook on RunPod A40"
247
+ - "8.6 Model Evaluation: Manual review, reward hacking detection"
248
+ - "8.7 GGUF Export: Convert to Ollama format"
249
+
250
+ test_counts:
251
+ grpo_rewards: 43 passing
252
+ wandb_logging: 17 passing
253
+ total: 60 passing
254
+
255
+ # =============================================================================
256
+ # HARDWARE REQUIREMENTS
257
+ # =============================================================================
258
+
259
+ hardware:
260
+ recommended:
261
+ gpu: A40 (48GB VRAM) on RunPod
262
+ cost: "~$0.79/hour spot, ~$1.14/hour on-demand"
263
+ training_time: "~2-4 hours for 250 steps"
264
+
265
+ vram_breakdown:
266
+ model_4bit: "~4GB"
267
+ lora_params: "~1GB"
268
+ optimizer_state: "~2GB"
269
+ activations: "~6GB with gradient checkpointing"
270
+ vllm_generation: "~8GB for 4 generations"
271
+ reward_models: "~2.5GB (NLI + embeddings + spaCy)"
272
+ total: "~24GB (safe on 48GB A40)"
273
+
274
+ cloud_options:
275
+ primary: "RunPod A40 (48GB) - see ai-docs/runpod.yaml"
276
+ alternative: "Lambda Labs A100 (40GB)"
277
+
278
+ # =============================================================================
279
+ # REFERENCES
280
+ # =============================================================================
281
+
282
+ references:
283
+ internal:
284
+ - "ai-docs/reward-modeling.yaml: Reward function design"
285
+ - "ai-docs/runpod.yaml: GPU setup instructions"
286
+ - "notebooks/Marxist_GRPO_Training.ipynb: Authoritative notebook"
287
+ - "src/prolewiki_llm/: Python module"
288
+
289
+ external:
290
+ unsloth:
291
+ - "https://github.com/unslothai/unsloth"
292
+ - "https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide"
293
+ grpo:
294
+ - "https://arxiv.org/abs/2508.18212 - NLI as reward paradigm"
295
+ - "https://arxiv.org/abs/2509.22047 - MO-GRPO reward hacking"
296
+ trl:
297
+ - "https://huggingface.co/docs/trl/en/grpo_trainer"
ai-docs/reward-modeling.yaml ADDED
@@ -0,0 +1,857 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Reward Modeling for GRPO Fine-Tuning
2
+ # Token-efficient reference for AI assistants
3
+ # Research-backed approaches to defeat reward hacking
4
+
5
+ status: IMPLEMENTED - multi-layer coherence rewards with wandb logging
6
+ purpose: Robust reward functions for Marxist-Leninist GRPO training
7
+ module: src/prolewiki_llm/
8
+ files:
9
+ grpo_rewards: Reward functions for GRPO training
10
+ wandb_logging: Weights & Biases integration for training observability
11
+
12
+ # =============================================================================
13
+ # OVERVIEW
14
+ # =============================================================================
15
+
16
+ overview:
17
+ goal: |
18
+ Train DeepSeek-R1-0528-Qwen3-8B on ProleWiki Q&A corpus using GRPO
19
+ with reward functions that cannot be gamed by "word soup" or other
20
+ adversarial strategies.
21
+
22
+ problem: |
23
+ Naive reward functions (substring matching, keyword counting) are
24
+ vulnerable to reward hacking. A model could maximize reward by
25
+ outputting random Marxist terminology without coherent meaning.
26
+
27
+ solution: |
28
+ Multi-layer reward combining:
29
+ 1. NLI (Natural Language Inference) - checks logical consistency
30
+ 2. Self-consistency - checks for internal contradictions
31
+ 3. Structural coherence - checks syntactic structure via spaCy
32
+
33
+ research_basis:
34
+ - "arxiv.org/abs/2508.18212 - NLI as reward modeling paradigm"
35
+ - "arxiv.org/abs/2509.22047 - MO-GRPO mitigating reward hacking"
36
+ - "arxiv.org/abs/2508.05170 - Posterior-GRPO process rewards"
37
+
38
+ # =============================================================================
39
+ # IDEOLOGICAL BIAS CONSIDERATION
40
+ # =============================================================================
41
+
42
+ ideological_bias:
43
+ concern: |
44
+ NLI models trained on predominantly liberal/capitalist media might
45
+ encode bourgeois assumptions, potentially marking valid Marxist claims
46
+ as "contradiction" or "neutral".
47
+
48
+ empirical_finding: |
49
+ Testing shows BART-large-MNLI performs LOGICAL inference, not
50
+ ideological judgment:
51
+ - "Capitalism exploits workers" → "Workers are exploited": entailment (0.998)
52
+ - "Class struggle drives history" vs "Great individuals drive history": contradiction (0.998)
53
+ - Word soup → coherent claim: neutral (0.932)
54
+
55
+ why_it_works: |
56
+ We compare Marxist response against Marxist ground truth (from ProleWiki).
57
+ The model isn't judging if Marxism is "true" - it's checking if the
58
+ response logically follows from the expected answer.
59
+
60
+ mitigation_strategy: |
61
+ Self-consistency check avoids external ideology entirely - it only
62
+ checks if the response contradicts ITSELF, not external "truth".
63
+
64
+ # =============================================================================
65
+ # REWARD FUNCTIONS
66
+ # =============================================================================
67
+
68
+ reward_functions:
69
+
70
+ # ---------------------------------------------------------------------------
71
+ # FORMAT REWARDS (from original notebook)
72
+ # ---------------------------------------------------------------------------
73
+
74
+ match_format_exactly:
75
+ purpose: Encourage proper <think>...</think> format
76
+ scoring:
77
+ has_think_end_tag: "+3.0"
78
+ no_tag: "0.0"
79
+ notes: Checks for </think> tag presence
80
+
81
+ match_format_approximately:
82
+ purpose: Reward partial format compliance
83
+ scoring:
84
+ one_think_start: "+0.5"
85
+ one_think_end: "+0.5"
86
+ multiple_or_missing: "-1.0"
87
+ notes: Penalizes malformed tag structure
88
+
89
+ # ---------------------------------------------------------------------------
90
+ # SEMANTIC REWARDS
91
+ # ---------------------------------------------------------------------------
92
+
93
+ semantic_similarity_reward:
94
+ purpose: Reward responses semantically similar to ground truth
95
+ model: sentence-transformers/all-MiniLM-L6-v2
96
+ scoring:
97
+ similarity_gt_0.75: "+5.0"
98
+ similarity_gt_0.60: "+3.0"
99
+ similarity_gt_0.45: "+1.0"
100
+ similarity_gt_0.30: "-1.0"
101
+ similarity_le_0.30: "-3.0"
102
+ notes: |
103
+ Uses cosine similarity of embeddings.
104
+ Good for soft matching but doesn't catch contradictions.
105
+
106
+ # ---------------------------------------------------------------------------
107
+ # NLI-BASED REWARDS (Research-backed)
108
+ # ---------------------------------------------------------------------------
109
+
110
+ nli_coherence_reward:
111
+ purpose: Check if response ENTAILS ground truth
112
+ model: facebook/bart-large-mnli
113
+ scoring:
114
+ entailment: "+3.0 (response supports/implies ground truth)"
115
+ neutral: "-1.0 (off-topic or incoherent)"
116
+ contradiction: "-3.0 (contradicts ground truth)"
117
+ defeats:
118
+ - "Word soup (random terms → neutral)"
119
+ - "Contradictory claims"
120
+ - "Off-topic responses"
121
+ gpu_memory: "~1.6GB"
122
+ research: "arxiv.org/abs/2508.18212"
123
+
124
+ self_consistency_reward:
125
+ purpose: Check for internal contradictions (no external ideology)
126
+ method: |
127
+ Parse response into sentences using spaCy.
128
+ Check adjacent sentence pairs for NLI contradiction.
129
+ scoring:
130
+ no_contradictions: "+1.0"
131
+ has_contradiction: "-2.0"
132
+ notes: |
133
+ Only checks within-document coherence, avoiding any external
134
+ ideological bias from the NLI model's training data.
135
+ research: "arxiv.org/abs/2508.05170"
136
+
137
+ structural_coherence_reward:
138
+ purpose: Verify proper linguistic structure (defeats word soup)
139
+ model: spacy en_core_web_sm
140
+ checks:
141
+ - "Marxist terms in subject/object syntactic positions"
142
+ - "Presence of discourse connectives (therefore, because, etc.)"
143
+ - "Complete sentence structure"
144
+ scoring:
145
+ term_in_syntactic_role: "+0.3 per term (max +1.5)"
146
+ discourse_connective: "+0.2 per connective (max +1.0)"
147
+ no_sentences: "-1.0"
148
+ syntactic_roles:
149
+ - "nsubj (nominal subject)"
150
+ - "nsubjpass (passive subject)"
151
+ - "dobj (direct object)"
152
+ - "pobj (object of preposition)"
153
+ - "attr (attribute)"
154
+ - "appos (appositional modifier)"
155
+ discourse_connectives:
156
+ - "because, therefore, thus, hence, consequently"
157
+ - "however, although, whereas, nevertheless, moreover"
158
+ - "furthermore, additionally, specifically, namely"
159
+ - "as a result, due to, in order to, so that"
160
+ - "on the other hand, in contrast, similarly, likewise"
161
+
162
+ # ---------------------------------------------------------------------------
163
+ # TOPIC RELEVANCE REWARD (Question-Answer Alignment)
164
+ # ---------------------------------------------------------------------------
165
+
166
+ topic_relevance_reward:
167
+ purpose: Ensure answer addresses what the question asked about
168
+ method: |
169
+ Implements f(A) ⊆ f(Q) check where f extracts semantic topics:
170
+ 1. Extract core topics from question Q using dependency parsing
171
+ 2. Expand Q topics with Marxist concept synonyms
172
+ 3. Extract topics from answer A
173
+ 4. Compute coverage: how many Q topics are addressed in A
174
+ scoring:
175
+ gt_80_coverage: "+2.0 (answer fully addresses question topics)"
176
+ gt_60_coverage: "+1.5 (answer mostly on-topic)"
177
+ gt_40_coverage: "+1.0 (answer partially on-topic)"
178
+ gt_20_coverage: "0.0 (answer tangentially related)"
179
+ le_20_coverage: "-1.5 (answer off-topic)"
180
+ defeats:
181
+ - "Off-topic coherent text (coherent Marxist text about wrong subject)"
182
+ - "Topic drift during response"
183
+ spacy_model: "en_core_web_trf (transformer-based, best semantic understanding)"
184
+ topic_extraction:
185
+ question: "ROOT verb children with nsubj/dobj/attr/nsubjpass dependency"
186
+ answer: "Noun chunks + named entities (determiners stripped)"
187
+ synonym_expansion: "CONCEPT_EQUIVALENCES dict maps bourgeoisie ↔ capitalist class"
188
+
189
+ # ---------------------------------------------------------------------------
190
+ # COMBINED REWARDS
191
+ # ---------------------------------------------------------------------------
192
+
193
+ robust_coherence_reward:
194
+ purpose: Multi-layer coherence check
195
+ layers:
196
+ 1: "NLI coherence - Does response entail ground truth?"
197
+ 2: "Self-consistency - Does response contradict itself?"
198
+ 3: "Structural coherence - Are terms used in proper syntax?"
199
+ scoring:
200
+ max_score: "+5.5 (entailment + consistent + structured)"
201
+ contradiction_floor: "-3.0 (NLI contradiction)"
202
+ inconsistency_penalty: "-2.0 (internal contradiction)"
203
+ combination_logic: |
204
+ if nli_score <= -3.0:
205
+ return -3.0 # Contradiction dominates
206
+ elif consistency_score <= -2.0:
207
+ return -2.0 # Internal contradiction
208
+ else:
209
+ return nli_score + (consistency * 0.5) + (structure * 0.5)
210
+
211
+ full_coherence_reward:
212
+ purpose: Complete coherence check (RECOMMENDED - maximum robustness)
213
+ layers:
214
+ 1: "NLI coherence - Does response entail ground truth?"
215
+ 2: "Self-consistency - Does response contradict itself?"
216
+ 3: "Structural coherence - Are terms used in proper syntax?"
217
+ 4: "Topic relevance - Does answer address what was asked?"
218
+ 5: "Interconnection depth - Rewards deep analysis, penalizes buzzword salad"
219
+ scoring:
220
+ max_score: "+7.0 (all checks pass with deep analysis)"
221
+ off_topic_penalty: "-2.0 (severely off-topic)"
222
+ buzzword_salad_penalty: "-1.5 (shallow buzzword listing)"
223
+ inherits: "robust_coherence_reward penalties for NLI/consistency failures"
224
+ combination_logic: |
225
+ if relevance <= -1.5:
226
+ return -2.0 # Severely off-topic
227
+ elif robust <= -2.0:
228
+ return robust # Robust check failed
229
+ elif depth <= -1.5:
230
+ return -1.5 # Buzzword salad detected
231
+ else:
232
+ return robust + (relevance * 0.4) + (depth * 0.3)
233
+ use_when: "Maximum robustness against reward hacking is needed"
234
+
235
+ # ---------------------------------------------------------------------------
236
+ # INTERCONNECTION DEPTH REWARD (Anti-Buzzword-Salad)
237
+ # ---------------------------------------------------------------------------
238
+
239
+ interconnection_depth_reward:
240
+ purpose: Distinguish deep analysis from shallow buzzword salad
241
+ method: |
242
+ Rewards meaningful interconnections while penalizing superficial
243
+ concept-dropping. Distinguishes:
244
+ - GOOD: "Surplus value relates to imperialism BECAUSE capital export..."
245
+ - BAD: "Surplus value intersects with imperialism, colonialism, patriarchy..."
246
+ signals:
247
+ depth_ratio:
248
+ description: "Words per unique Marxist concept"
249
+ gt_20: "+1.0 (deep analysis - few concepts well-explained)"
250
+ range_10_20: "+0.5 (adequate depth)"
251
+ range_5_10: "-0.5 (shallow)"
252
+ lt_5: "-1.5 (severe buzzword soup)"
253
+ hollow_buzzwords:
254
+ description: "Activist jargon without substance"
255
+ threshold: "> 2 hollow phrases"
256
+ penalty: "-0.3 per additional (max -1.5)"
257
+ depth_markers:
258
+ description: "Historical specificity, citations, examples"
259
+ bonus: "+0.3 each (max +1.5)"
260
+ explanation_ratio:
261
+ description: "Explanatory phrases per concept"
262
+ gt_50_percent: "+0.5 (well-explained)"
263
+ lt_10_percent_many_concepts: "-0.5 (unexplained concept soup)"
264
+ scoring:
265
+ range: "-2.5 to +3.0"
266
+ defeats:
267
+ - "Buzzword salad (many concepts, no explanation)"
268
+ - "Activist jargon (performative language without analysis)"
269
+ - "Intersectionality word soup (mentioning everything without depth)"
270
+
271
+ # ---------------------------------------------------------------------------
272
+ # ENTITY VERIFICATION REWARDS (Anti-Hallucination)
273
+ # ---------------------------------------------------------------------------
274
+
275
+ entity_verification_reward:
276
+ status: NEW - added 2025-12-18
277
+ purpose: Penalize confident claims about unverified entities
278
+ method: |
279
+ Uses spaCy NER to extract entity mentions from responses, then
280
+ checks them against a whitelist of 24,040 entities extracted from
281
+ the ProleWiki corpus. Rewards epistemic humility for unknown entities.
282
+ whitelist_source: "training_data/entity_whitelist_clean.json"
283
+ whitelist_stats:
284
+ total_entities: 24040
285
+ sources:
286
+ - "5,129 article titles (filenames)"
287
+ - "27,329 internal wiki links [[Entity]]"
288
+ - "2,458 categories"
289
+ - "2,179 library work references"
290
+ - "2,145 infobox person names"
291
+ scoring:
292
+ epistemic_humility: "+2.0 (expresses uncertainty about unknown entities)"
293
+ verified_entities: "+1.0 (discusses only verified entities)"
294
+ unknown_no_uncertainty: "-1.0 (mentions unknown entities without caution)"
295
+ confident_hallucination: "-2.5 (fabricates details about unknown entities)"
296
+ defeats:
297
+ - "Confident hallucination about fictional organizations"
298
+ - "Fabricated founding dates, founders, activities"
299
+ - "Made-up historical context for non-existent entities"
300
+ example_test_case: |
301
+ Question: "What was the Militant League of Trotskyist Shanghai Students?"
302
+ BAD response: "Founded in 1925, the Militant League..." (fabricated details)
303
+ GOOD response: "I cannot verify this organization..." (epistemic humility)
304
+ dependencies:
305
+ - "spaCy (en_core_web_trf or fallback)"
306
+ - "training_data/entity_whitelist_clean.json"
307
+
308
+ epistemic_calibration_reward:
309
+ status: NEW - added 2025-12-18
310
+ purpose: Lightweight pattern-based epistemic calibration
311
+ method: |
312
+ Pattern matching for uncertainty phrases and confident claim patterns.
313
+ Doesn't require NER or entity whitelist - just regex matching.
314
+ Use when entity_verification_reward is too computationally expensive.
315
+ scoring:
316
+ has_uncertainty_phrases: "+1.5"
317
+ has_confident_claims_no_uncertainty: "-0.5"
318
+ neutral: "0.0"
319
+ uncertainty_patterns:
320
+ - "I cannot/can't/don't verify/confirm/find"
321
+ - "I don't have verified/specific information"
322
+ - "I'm not certain/sure/confident"
323
+ - "I should not fabricate/make up/speculate"
324
+ - "Could you provide more context"
325
+ confident_claim_patterns:
326
+ - "founded in [year]"
327
+ - "established in [year]"
328
+ - "was founded by"
329
+ - "was created by"
330
+ use_when: "Lightweight alternative to entity_verification_reward"
331
+
332
+ # ---------------------------------------------------------------------------
333
+ # IDEOLOGICAL FIRMNESS REWARD (Anti-Capitulation)
334
+ # ---------------------------------------------------------------------------
335
+
336
+ ideological_firmness_reward:
337
+ status: NEW - added 2025-12-18
338
+ purpose: Reward firm Marxist counter-arguments to reactionary claims
339
+ method: |
340
+ Detects ideological challenges (fascist propaganda, anti-communist
341
+ tropes, bad faith questions) and scores responses based on:
342
+ 1. Detection: Is the prompt an ideological challenge?
343
+ 2. Capitulation check: Does the response hedge, both-sides, or concede?
344
+ 3. Firmness check: Does the response firmly refute with class analysis?
345
+ 4. Substantive bonus: Does the response use Marxist analytical terms?
346
+
347
+ Key insight: COMBAT reactionary ideology with analysis, don't refuse engagement.
348
+ scoring:
349
+ non_challenge: "0.0 (neutral - no ideological challenge in prompt)"
350
+ capitulation_patterns: "-0.5 each (both-sidesing, hedging, false balance)"
351
+ firmness_patterns: "+0.5 each (direct refutation, class analysis)"
352
+ class_analysis_bonus: "+0.2 each marker (capped at +1.0)"
353
+ range: "-2.0 to +2.0"
354
+ ideological_challenge_patterns:
355
+ reactionary_claims:
356
+ - "Hitler helped workers"
357
+ - "Nazis were socialist"
358
+ death_toll_claims:
359
+ - "communism killed millions"
360
+ - "Black Book of Communism"
361
+ anti_communist_framing:
362
+ - "Stalin was a dictator"
363
+ - "authoritarian regime"
364
+ - "communism failed"
365
+ bad_faith_questions:
366
+ - "Holodomor"
367
+ - "Gulag"
368
+ - "Great Purge"
369
+ - "Was Stalin bad?"
370
+ - "Was Mao good or bad?"
371
+ capitulation_patterns:
372
+ both_sidesing:
373
+ - "valid points on both sides"
374
+ - "on one hand... on the other hand"
375
+ - "reasonable people can disagree"
376
+ hedging:
377
+ - "it's a complex/nuanced issue"
378
+ - "I can see why you'd ask"
379
+ false_balance:
380
+ - "Stalin did good things and bad things"
381
+ - "achievements and mistakes to consider"
382
+ theory_practice:
383
+ - "communism sounds good in theory but..."
384
+ firmness_patterns:
385
+ direct_refutation:
386
+ - "This claim is false/propaganda"
387
+ - "This is fascist/bourgeois propaganda"
388
+ - "discredited"
389
+ - "debunked"
390
+ analytical_framing:
391
+ - "From a Marxist analysis..."
392
+ - "The material reality shows..."
393
+ - "historically documented"
394
+ class_analysis:
395
+ - "serves the interests of capital"
396
+ - "bourgeois historiography"
397
+ - "dictatorship of the proletariat"
398
+ socialist_achievements:
399
+ - "universal literacy"
400
+ - "defeated fascism"
401
+ - "provided housing and healthcare"
402
+ special_handling:
403
+ quote_to_refute: |
404
+ Quoting reactionary claims in order to refute them is NOT penalized.
405
+ Pattern: "X claim Y, but this ignores..."
406
+ principled_self_criticism: |
407
+ ML self-criticism is NOT capitulation when framed from a
408
+ revolutionary perspective (e.g., "Marxist-Leninists engage in
409
+ self-criticism as a methodological principle")
410
+ defeats:
411
+ - "Both-sidesing on fascism/communism"
412
+ - "Hedging on historical socialist figures"
413
+ - "Conceding reactionary framing"
414
+ - "Theory/practice false dichotomy"
415
+ example_test_cases:
416
+ high_score: |
417
+ Q: "Didn't Hitler help German workers?"
418
+ A: "This is fascist propaganda. Hitler destroyed trade unions,
419
+ banned strikes, and served the interests of finance capital."
420
+ Score: >= +1.5
421
+ low_score: |
422
+ Q: "Didn't Hitler help German workers?"
423
+ A: "While Hitler did implement some programs that benefited
424
+ workers, it's important to weigh both positive and negative..."
425
+ Score: <= -1.5
426
+ neutral: |
427
+ Q: "What year did the October Revolution happen?"
428
+ A: "The October Revolution occurred in 1917."
429
+ Score: ~0.0 (no ideological challenge detected)
430
+ use_when: |
431
+ Training models to maintain principled Marxist positions when
432
+ facing reactionary claims and propaganda. Prevents models from
433
+ developing liberal "both-sides" tendencies on class issues.
434
+
435
+ # ---------------------------------------------------------------------------
436
+ # DEPRECATED/SHALLOW REWARDS
437
+ # ---------------------------------------------------------------------------
438
+
439
+ terminology_reward:
440
+ status: DEPRECATED - use structural_coherence_reward instead
441
+ purpose: Reward Marxist terminology (SHALLOW - can be gamed)
442
+ scoring: "+0.3 per term (max +2.0)"
443
+ warning: |
444
+ This reward can be gamed with "word soup" - random Marxist
445
+ terms without coherent meaning. Use nli_coherence_reward or
446
+ structural_coherence_reward for robust evaluation.
447
+
448
+ # =============================================================================
449
+ # MARXIST TERMINOLOGY
450
+ # =============================================================================
451
+
452
+ marxist_terms:
453
+ core_concepts:
454
+ - "dialectical, materialism, historical materialism, dialectical materialism"
455
+ classes:
456
+ - "bourgeoisie, proletariat, petty bourgeois, lumpenproletariat"
457
+ - "working class, ruling class"
458
+ class_struggle:
459
+ - "class struggle, class consciousness, class war, class conflict"
460
+ political_economy:
461
+ - "surplus value, commodity, use value, exchange value"
462
+ - "labor power, means of production, relations of production"
463
+ - "forces of production, mode of production, primitive accumulation"
464
+ - "exploitation, capital accumulation"
465
+ imperialism:
466
+ - "imperialism, colonialism, neo-colonialism, settler colonialism"
467
+ - "national liberation, self-determination"
468
+ state_revolution:
469
+ - "dictatorship of the proletariat, vanguard, vanguard party"
470
+ - "democratic centralism, withering away of the state"
471
+ ideology:
472
+ - "hegemony, superstructure, base, ideology, false consciousness"
473
+ revisionism:
474
+ - "revisionism, opportunism, reformism, social democracy, ultra-leftism"
475
+ alienation:
476
+ - "alienation, fetishism, commodity fetishism, reification"
477
+ historical:
478
+ - "paris commune, october revolution, bolshevik, menshevik"
479
+ anti_colonial:
480
+ - "decolonization, third world, global south, national bourgeoisie, comprador"
481
+
482
+ # =============================================================================
483
+ # HOLLOW BUZZWORDS (Activist Jargon to Penalize)
484
+ # =============================================================================
485
+
486
+ hollow_buzzwords:
487
+ description: |
488
+ Phrases that signal superficial analysis when used without substantive
489
+ explanation. These are NOT Marxist technical terms - they are activist
490
+ jargon that often substitutes for actual analysis.
491
+ vague_connectors:
492
+ - "interconnected, interrelated, intersects with"
493
+ - "it's all connected, everything is connected, systemic"
494
+ performative_language:
495
+ - "centered, centering, uplift, uplifting"
496
+ - "do the work, the work, unpack, unpacking"
497
+ - "unlearn, unlearning, hold space, sit with, lean into"
498
+ - "problematic, harmful, toxic"
499
+ vague_abstractions:
500
+ - "in a way, sort of, kind of, essentially, basically"
501
+ - "generally speaking, broadly"
502
+ misused_terms:
503
+ - "praxis (when used without explanation)"
504
+ - "material conditions (when used as hand-wave)"
505
+ - "structural, structurally (when mechanism not specified)"
506
+ note: |
507
+ The penalty applies when hollow buzzword DENSITY is high AND
508
+ depth ratio is low. Legitimate use with explanation is not penalized.
509
+
510
+ # =============================================================================
511
+ # EXPLANATORY PHRASES (Depth Markers)
512
+ # =============================================================================
513
+
514
+ explanatory_phrases:
515
+ description: "Phrases indicating concept is being explained, not just dropped"
516
+ causal:
517
+ - "because the, because of, this is because, since the"
518
+ - "due to the, as a result of, results from, caused by"
519
+ - "leads to, results in, enables, produces"
520
+ definitional:
521
+ - "is defined as, refers to, means that, denotes"
522
+ - "that is, in other words, namely, i.e."
523
+ elaboration:
524
+ - "specifically, in particular, for example, such as"
525
+ - "this means, which means, this implies, therefore"
526
+ mechanism:
527
+ - "this occurs when, this happens because, the mechanism"
528
+ - "through the process of, by means of, works by"
529
+
530
+ depth_markers:
531
+ description: "Phrases indicating analytical depth (historical specificity, citations)"
532
+ historical:
533
+ - "in 1, in 2, during the, after the, before the"
534
+ citations:
535
+ - "marx argued, lenin wrote, engels noted, gramsci"
536
+ - "according to, as marx, as lenin"
537
+ examples:
538
+ - "for example, such as, in the case of, consider"
539
+ definitions:
540
+ - "defined as, meaning, specifically"
541
+
542
+ # =============================================================================
543
+ # USAGE IN TRAINING
544
+ # =============================================================================
545
+
546
+ training_usage:
547
+
548
+ full_reward_set:
549
+ description: "RECOMMENDED - Maximum robustness including depth analysis"
550
+ functions:
551
+ - "match_format_exactly (+3.0 for </think>)"
552
+ - "match_format_approximately (tag validation)"
553
+ - "full_coherence_reward (NLI + structure + topic + depth)"
554
+ - "completeness_reward (length comparison)"
555
+ - "debug_print_reward (monitoring)"
556
+ notes: "full_coherence_reward now includes interconnection_depth_reward"
557
+
558
+ anti_hallucination_reward_set:
559
+ description: "Maximum protection against entity hallucination"
560
+ functions:
561
+ - "match_format_exactly (+3.0 for </think>)"
562
+ - "match_format_approximately (tag validation)"
563
+ - "full_coherence_reward (NLI + structure + topic + depth)"
564
+ - "entity_verification_reward (whitelist check + NER)"
565
+ - "completeness_reward (length comparison)"
566
+ - "debug_print_reward (monitoring)"
567
+ notes: |
568
+ Adds entity_verification_reward to full_reward_set.
569
+ Requires entity_whitelist_clean.json (24,040 verified entities).
570
+ Use when hallucination about organizations/people is a concern.
571
+ use_with_training_data:
572
+ - "synthetic_epistemic_humility.jsonl (20 examples)"
573
+
574
+ ideological_firmness_reward_set:
575
+ description: "NEW - Prevents capitulation to reactionary framing"
576
+ functions:
577
+ - "match_format_exactly (+3.0 for </think>)"
578
+ - "match_format_approximately (tag validation)"
579
+ - "full_coherence_reward (NLI + structure + topic + depth)"
580
+ - "ideological_firmness_reward (anti-capitulation)"
581
+ - "completeness_reward (length comparison)"
582
+ - "debug_print_reward (monitoring)"
583
+ notes: |
584
+ Adds ideological_firmness_reward to full_reward_set.
585
+ Prevents models from developing liberal "both-sides" tendencies.
586
+ Use when training on ideologically charged Q&A pairs.
587
+ use_when: |
588
+ Training data contains questions about fascism, anti-communist
589
+ tropes, or historical socialist figures. Ensures model maintains
590
+ principled Marxist positions rather than hedging or capitulating.
591
+
592
+ robust_reward_set:
593
+ description: "Balanced set that defeats word soup attacks"
594
+ functions:
595
+ - "match_format_exactly (+3.0 for </think>)"
596
+ - "match_format_approximately (tag validation)"
597
+ - "robust_coherence_reward (NLI + self-consistency + structure)"
598
+ - "completeness_reward (length comparison)"
599
+ - "debug_print_reward (monitoring)"
600
+
601
+ lightweight_anti_hallucination_set:
602
+ description: "Anti-hallucination without NER overhead"
603
+ functions:
604
+ - "match_format_exactly (+3.0 for </think>)"
605
+ - "match_format_approximately (tag validation)"
606
+ - "robust_coherence_reward (NLI + self-consistency + structure)"
607
+ - "epistemic_calibration_reward (pattern matching only)"
608
+ - "completeness_reward (length comparison)"
609
+ notes: |
610
+ Uses epistemic_calibration_reward instead of entity_verification_reward.
611
+ Faster but less precise - doesn't check entity whitelist.
612
+
613
+ legacy_reward_set:
614
+ description: "Original set (VULNERABLE to word soup)"
615
+ functions:
616
+ - "match_format_exactly"
617
+ - "match_format_approximately"
618
+ - "semantic_similarity_reward"
619
+ - "terminology_reward # VULNERABLE"
620
+ - "completeness_reward"
621
+ - "debug_print_reward"
622
+
623
+ grpo_trainer_example: |
624
+ trainer = GRPOTrainer(
625
+ model=model,
626
+ processing_class=tokenizer,
627
+ reward_funcs=[
628
+ match_format_exactly,
629
+ match_format_approximately,
630
+ full_coherence_reward, # RECOMMENDED: NLI + structure + topic
631
+ completeness_reward,
632
+ debug_print_reward,
633
+ ],
634
+ args=training_args,
635
+ train_dataset=dataset,
636
+ )
637
+
638
+ # =============================================================================
639
+ # DEPENDENCIES
640
+ # =============================================================================
641
+
642
+ dependencies:
643
+ required:
644
+ - "sentence-transformers # for semantic_similarity_reward"
645
+ - "transformers # for NLI pipeline (bart-large-mnli)"
646
+ - "spacy # for topic extraction and structural coherence"
647
+ - "spacy-curated-transformers # for en_core_web_trf"
648
+ - "numpy # for embeddings"
649
+
650
+ models_downloaded:
651
+ - "sentence-transformers/all-MiniLM-L6-v2 (~90MB)"
652
+ - "facebook/bart-large-mnli (~1.6GB)"
653
+ - "en_core_web_trf (~436MB, transformer-based, RECOMMENDED)"
654
+ - "en_core_web_md (~40MB, word vectors, fallback)"
655
+ - "en_core_web_sm (~12MB, no vectors, last resort)"
656
+
657
+ gpu_memory:
658
+ embedder: "~200MB"
659
+ nli_model: "~1.6GB"
660
+ spacy_trf: "~500MB (RoBERTa-based transformer)"
661
+ total: "~2.5GB additional (on top of training model)"
662
+
663
+ # =============================================================================
664
+ # WANDB LOGGING INTEGRATION
665
+ # =============================================================================
666
+
667
+ wandb_logging:
668
+ purpose: |
669
+ Comprehensive logging for GRPO training observability via Weights & Biases.
670
+ Provides debugging visibility into reward function behavior, sample outputs,
671
+ and per-step metrics.
672
+
673
+ module: src/prolewiki_llm/wandb_logging.py
674
+
675
+ components:
676
+
677
+ init_wandb_logging:
678
+ purpose: Initialize W&B run with configuration
679
+ signature: |
680
+ init_wandb_logging(
681
+ project: str,
682
+ config: dict[str, Any],
683
+ name: str | None = None,
684
+ tags: list[str] | None = None,
685
+ notes: str | None = None,
686
+ mode: str = "online", # or "offline", "disabled"
687
+ ) -> wandb.Run | None
688
+ example: |
689
+ run = init_wandb_logging(
690
+ project="marxist-grpo",
691
+ config={
692
+ "model": "DeepSeek-R1-0528-Qwen3-8B",
693
+ "learning_rate": 5e-6,
694
+ "batch_size": 2,
695
+ },
696
+ tags=["grpo", "marxist", "v1"],
697
+ )
698
+
699
+ WandbSampleLogger:
700
+ purpose: Accumulate and log sample tables for debugging
701
+ fields:
702
+ log_every_n_steps: "int = 10 (log table every N training steps)"
703
+ max_samples_per_log: "int = 4 (samples per table)"
704
+ methods:
705
+ add_sample: "Add sample with question/response/rewards to buffer"
706
+ should_log: "Check if current step should trigger table log"
707
+ log_table: "Log accumulated samples as wandb.Table"
708
+ clear: "Clear sample buffer"
709
+ table_columns:
710
+ - "step, question, response, ground_truth"
711
+ - "format_exact, format_approx, nli_coherence"
712
+ - "topic_relevance, depth, completeness, total"
713
+
714
+ create_logging_reward:
715
+ purpose: Factory for GRPOTrainer-compatible logging reward function
716
+ signature: |
717
+ create_logging_reward(
718
+ sample_logger: WandbSampleLogger | None = None,
719
+ compute_all_rewards: bool = True,
720
+ ) -> Callable[..., list[float]]
721
+ behavior: |
722
+ 1. Computes ALL reward functions internally (if compute_all_rewards=True)
723
+ 2. Logs aggregated metrics to wandb (per-reward mean/min/max)
724
+ 3. Logs sample tables at configured intervals
725
+ 4. Returns [0.0] * len(completions) (no training effect)
726
+ usage: |
727
+ sample_logger = WandbSampleLogger(log_every_n_steps=10)
728
+ logging_reward = create_logging_reward(sample_logger)
729
+
730
+ trainer = GRPOTrainer(
731
+ reward_funcs=[..., logging_reward], # Add to reward_funcs
732
+ ...
733
+ )
734
+
735
+ log_reward_metrics:
736
+ purpose: Log per-reward metrics to wandb
737
+ signature: |
738
+ log_reward_metrics(
739
+ step: int,
740
+ reward_scores: dict[str, list[float]],
741
+ ) -> None
742
+ logs:
743
+ - "rewards/{name} - mean score"
744
+ - "rewards/{name}_min - minimum score"
745
+ - "rewards/{name}_max - maximum score"
746
+ - "rewards/total - sum of all reward means"
747
+
748
+ finish_wandb_logging:
749
+ purpose: Finish run with optional summary statistics
750
+ signature: |
751
+ finish_wandb_logging(
752
+ summary: dict[str, Any] | None = None,
753
+ ) -> None
754
+
755
+ log_model_checkpoint:
756
+ purpose: Log checkpoint as wandb artifact
757
+ signature: |
758
+ log_model_checkpoint(
759
+ checkpoint_path: str,
760
+ metadata: dict[str, Any] | None = None,
761
+ ) -> None
762
+
763
+ graceful_degradation: |
764
+ All functions work gracefully when wandb is not installed:
765
+ - is_wandb_available() returns False
766
+ - Logging functions print fallback messages or no-op
767
+ - create_logging_reward returns valid reward function (prints to stdout)
768
+
769
+ integration_example: |
770
+ from prolewiki_llm import (
771
+ init_wandb_logging,
772
+ WandbSampleLogger,
773
+ create_logging_reward,
774
+ finish_wandb_logging,
775
+ match_format_exactly,
776
+ full_coherence_reward,
777
+ )
778
+
779
+ # Initialize wandb
780
+ run = init_wandb_logging(
781
+ project="marxist-grpo",
782
+ config={"model": "DeepSeek-R1", "lr": 5e-6, "steps": 250},
783
+ )
784
+
785
+ # Create sample logger and logging reward
786
+ sample_logger = WandbSampleLogger(log_every_n_steps=10, max_samples_per_log=4)
787
+ logging_reward = create_logging_reward(sample_logger, compute_all_rewards=True)
788
+
789
+ # Train with logging
790
+ trainer = GRPOTrainer(
791
+ model=model,
792
+ reward_funcs=[
793
+ match_format_exactly,
794
+ full_coherence_reward,
795
+ logging_reward, # Logs all metrics + samples
796
+ ],
797
+ args=training_args,
798
+ train_dataset=dataset,
799
+ )
800
+ trainer.train()
801
+
802
+ # Finish with summary
803
+ finish_wandb_logging(summary={"final_loss": trainer.state.log_history[-1].get("loss")})
804
+
805
+ # =============================================================================
806
+ # TESTING
807
+ # =============================================================================
808
+
809
+ testing:
810
+
811
+ test_word_soup:
812
+ input: "bourgeoisie proletariat dialectical materialism surplus value"
813
+ expected_nli: "neutral (0.93)"
814
+ expected_structure: "low (no syntactic roles)"
815
+ expected_topic: "fails - no proper sentence to extract topics from"
816
+
817
+ test_good_response:
818
+ input: "The bourgeoisie extracts surplus value from the proletariat through exploitation of labor power."
819
+ expected_nli: "entailment (depends on ground truth)"
820
+ expected_structure: "high (terms in subject/object positions)"
821
+ expected_topic: "high (bourgeoisie, surplus value, proletariat in proper roles)"
822
+
823
+ test_contradiction:
824
+ input: "Capitalism benefits everyone. Workers are exploited under capitalism."
825
+ expected_self_consistency: "-2.0 (internal contradiction)"
826
+
827
+ test_off_topic:
828
+ question: "What is revisionism?"
829
+ answer: "Imperialism is the highest stage of capitalism characterized by monopolies."
830
+ expected_topic_relevance: "-1.5 (off-topic - discusses imperialism not revisionism)"
831
+
832
+ test_synonym_recognition:
833
+ question: "What is the bourgeoisie?"
834
+ answer: "The capitalist class owns the means of production."
835
+ expected_topic_relevance: "+2.0 (synonym 'capitalist class' recognized)"
836
+
837
+ # =============================================================================
838
+ # REFERENCES
839
+ # =============================================================================
840
+
841
+ references:
842
+ papers:
843
+ - "Better LM-Based Judging Reward Modeling: arxiv.org/abs/2508.18212"
844
+ - "MO-GRPO Mitigating Reward Hacking: arxiv.org/abs/2509.22047"
845
+ - "Posterior-GRPO Process Rewards: arxiv.org/abs/2508.05170"
846
+ - "MENLI NLI Evaluation Metrics: doi.org/10.1162/tacl_a_00576"
847
+
848
+ models:
849
+ - "BART-large-MNLI: huggingface.co/facebook/bart-large-mnli"
850
+ - "all-MiniLM-L6-v2: huggingface.co/sentence-transformers/all-MiniLM-L6-v2"
851
+ - "spaCy en_core_web_trf: spacy.io/models/en#en_core_web_trf (RECOMMENDED)"
852
+ - "spaCy en_core_web_md: spacy.io/models/en#en_core_web_md (fallback)"
853
+
854
+ related_docs:
855
+ - "ai-docs/finetune.yaml - overall fine-tuning strategy"
856
+ - "ai-docs/chatbot-ideology.yaml - training data design"
857
+ - "ai-docs/runpod.yaml - GPU setup for training"
ai-docs/runpod.yaml ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RunPod.io Setup Guide for LLM Fine-Tuning
2
+ # Optimized for DeepSeek 7B Abliterated with Unsloth QLoRA
3
+ # Status: READY - verified configuration for Phase 8
4
+
5
+ overview:
6
+ purpose: |
7
+ Step-by-step guide for deploying a RunPod GPU pod to fine-tune
8
+ DeepSeek-R1-Distill-Qwen-7B-abliterated on ProleWiki corpus using Unsloth.
9
+
10
+ estimated_cost: "$0.30-0.60 for complete training run (~30 min)"
11
+
12
+ workflow_summary:
13
+ - Create pod with PyTorch 2.4 template
14
+ - Install Unsloth and dependencies
15
+ - Upload training data (JSONL chunks)
16
+ - Run SFT training (~20-30 min)
17
+ - Export GGUF model
18
+ - Download and deploy to Ollama
19
+ - STOP POD immediately after download
20
+
21
+ gpu_selection:
22
+ recommended: RTX 4090
23
+ vram_required: "16-18GB with Unsloth QLoRA"
24
+ note: |
25
+ Unsloth's QLoRA reduces 7B model VRAM from ~24GB to ~16-18GB.
26
+ RTX 4090 (24GB) provides comfortable headroom.
27
+
28
+ options:
29
+ rtx_4090:
30
+ vram: 24GB
31
+ price_spot: "$0.40-0.50/hr"
32
+ price_ondemand: "$0.50-0.60/hr"
33
+ recommendation: "Best value - sufficient VRAM, fast training"
34
+
35
+ a40:
36
+ vram: 48GB
37
+ price_spot: "$0.45-0.55/hr"
38
+ price_ondemand: "$0.50-0.65/hr"
39
+ recommendation: "More headroom, similar price"
40
+
41
+ rtx_3090:
42
+ vram: 24GB
43
+ price_spot: "$0.25-0.35/hr"
44
+ price_ondemand: "$0.30-0.40/hr"
45
+ recommendation: "Budget option, slightly older"
46
+
47
+ a100_40gb:
48
+ vram: 40GB
49
+ price_spot: "$0.80-1.00/hr"
50
+ price_ondemand: "$1.00-1.50/hr"
51
+ recommendation: "Overkill for 7B, use for larger models"
52
+
53
+ spot_vs_ondemand:
54
+ spot:
55
+ pros: "30-50% cheaper"
56
+ cons: "May be interrupted if demand spikes"
57
+ best_for: "Long training runs where checkpoints save progress"
58
+
59
+ ondemand:
60
+ pros: "Guaranteed availability"
61
+ cons: "Full price"
62
+ best_for: "Short runs (<1hr) like our 30-min training"
63
+
64
+ recommendation: |
65
+ For ProleWiki fine-tuning (~30 min), use On-Demand.
66
+ Spot interruption would cost more in setup time than savings.
67
+
68
+ pod_configuration:
69
+ template: "RunPod PyTorch 2.4"
70
+ alternative: "RunPod PyTorch 2.8 (if available)"
71
+
72
+ template_includes:
73
+ - PyTorch 2.4
74
+ - CUDA 12.4
75
+ - cuDNN
76
+ - JupyterLab
77
+ - SSH access
78
+ - Python 3.10+
79
+
80
+ storage:
81
+ container_disk:
82
+ size: "50GB minimum"
83
+ purpose: "Ephemeral - Unsloth, model weights during training"
84
+ warning: "LOST on pod restart!"
85
+
86
+ volume_disk:
87
+ size: "100GB minimum"
88
+ purpose: "Persistent - checkpoints, scripts, training data"
89
+ critical: "ALL important files must go here!"
90
+ mount_path: "/workspace"
91
+
92
+ ports:
93
+ - port: 8888
94
+ purpose: "JupyterLab (primary interface)"
95
+ - port: 22
96
+ purpose: "SSH (optional, for terminal access)"
97
+
98
+ environment_variables:
99
+ required:
100
+ HF_TOKEN: "Your Hugging Face token (for gated models)"
101
+
102
+ optional:
103
+ JUPYTER_PASSWORD: "Secure notebook access"
104
+ WANDB_API_KEY: "If using Weights & Biases logging"
105
+
106
+ secure_secrets:
107
+ note: |
108
+ Use RUNPOD_SECRET_ prefix for encrypted secrets:
109
+ RUNPOD_SECRET_HF_TOKEN will be injected securely.
110
+
111
+ step_by_step_setup:
112
+ step_1_create_pod:
113
+ description: "Create GPU pod from RunPod dashboard"
114
+ actions:
115
+ - "Go to https://runpod.io/console/pods"
116
+ - "Click '+ Deploy' or 'New Pod'"
117
+ - "Select GPU: RTX 4090 (or A40)"
118
+ - "Click 'Change Template' → search 'PyTorch'"
119
+ - "Select 'RunPod PyTorch 2.4'"
120
+ - "Set Container Disk: 50 GB"
121
+ - "Set Volume Disk: 100 GB"
122
+ - "Expand 'Environment Variables'"
123
+ - "Add: HF_TOKEN = your_token"
124
+ - "Click 'Deploy On-Demand' (not Spot for short runs)"
125
+
126
+ step_2_connect:
127
+ description: "Connect to running pod"
128
+ actions:
129
+ - "Wait for pod status: 'Running' (usually <1 min)"
130
+ - "Click 'Connect' button"
131
+ - "Select 'Jupyter Lab' (opens in new tab)"
132
+ - "Or select 'SSH' for terminal access"
133
+
134
+ step_3_install_unsloth:
135
+ description: "Install Unsloth and dependencies in JupyterLab terminal"
136
+ commands: |
137
+ # Verify CUDA is working
138
+ nvidia-smi
139
+ python -c "import torch; print(f'CUDA: {torch.cuda.get_device_name()}')"
140
+
141
+ # Install Unsloth (auto-detects CUDA version)
142
+ pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
143
+
144
+ # Install flash-attention (may take a few minutes to compile)
145
+ pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"
146
+
147
+ # Install training dependencies
148
+ pip install "trl>=0.7.0" datasets accelerate bitsandbytes peft
149
+
150
+ # Install tiktoken for data transformation
151
+ pip install tiktoken
152
+
153
+ # Verify installation
154
+ python -c "from unsloth import FastLanguageModel; print('Unsloth ready!')"
155
+
156
+ troubleshooting:
157
+ flash_attn_fails: |
158
+ If flash-attn compilation fails, try:
159
+ pip install flash-attn --no-build-isolation
160
+
161
+ cuda_version_mismatch: |
162
+ If CUDA errors occur, specify version explicitly:
163
+ pip install "unsloth[cu124] @ git+https://github.com/unslothai/unsloth.git"
164
+
165
+ step_4_upload_data:
166
+ description: "Upload training data to pod"
167
+
168
+ option_a_jupyterlab:
169
+ best_for: "Small datasets (<100MB)"
170
+ steps:
171
+ - "In JupyterLab file browser (left sidebar)"
172
+ - "Navigate to /workspace"
173
+ - "Create folder: 'data'"
174
+ - "Click upload icon (up arrow)"
175
+ - "Select your JSONL chunks file"
176
+
177
+ option_b_wget:
178
+ best_for: "Data hosted on web"
179
+ command: |
180
+ mkdir -p /workspace/data
181
+ wget https://your-url/library_chunks.jsonl -O /workspace/data/chunks.jsonl
182
+
183
+ option_c_huggingface:
184
+ best_for: "Dataset on Hugging Face"
185
+ command: |
186
+ huggingface-cli download your-user/prolewiki-chunks \
187
+ --local-dir /workspace/data \
188
+ --token $HF_TOKEN
189
+
190
+ option_d_scp:
191
+ best_for: "From local machine via SSH"
192
+ command: |
193
+ # Get SSH command from RunPod 'Connect' dropdown
194
+ scp -P 22XXX library_chunks.jsonl root@pod-ip:/workspace/data/
195
+
196
+ step_5_run_training:
197
+ description: "Execute fine-tuning script"
198
+ note: "See ai-docs/finetune.yaml for complete training code"
199
+
200
+ minimal_script: |
201
+ from unsloth import FastLanguageModel
202
+ import torch
203
+
204
+ # Load abliterated model
205
+ model, tokenizer = FastLanguageModel.from_pretrained(
206
+ model_name="huihui-ai/DeepSeek-R1-Distill-Qwen-7B-abliterated",
207
+ max_seq_length=2048,
208
+ load_in_4bit=True,
209
+ dtype=None, # Auto-detect
210
+ )
211
+
212
+ # Apply LoRA
213
+ model = FastLanguageModel.get_peft_model(
214
+ model,
215
+ r=16,
216
+ lora_alpha=32,
217
+ target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
218
+ "gate_proj", "up_proj", "down_proj"],
219
+ lora_dropout=0.05,
220
+ )
221
+
222
+ # Load and train (see finetune.yaml for full code)
223
+ # ...
224
+
225
+ # Save checkpoint to Volume disk!
226
+ model.save_pretrained("/workspace/checkpoints/marxist-deepseek-lora")
227
+
228
+ expected_time: "20-30 minutes for ~1,000 samples, 3 epochs"
229
+
230
+ step_6_export_gguf:
231
+ description: "Export model to GGUF format for Ollama"
232
+ command: |
233
+ # Export with q4_k_m quantization (good balance)
234
+ model.save_pretrained_gguf(
235
+ "/workspace/exports/marxist-deepseek",
236
+ tokenizer,
237
+ quantization_method="q4_k_m"
238
+ )
239
+
240
+ # Check output
241
+ ls -lh /workspace/exports/
242
+
243
+ output_size: "~4GB for 7B q4_k_m"
244
+
245
+ quantization_options:
246
+ q4_k_m: "Recommended - good quality/size balance (~4GB)"
247
+ q5_k_m: "Higher quality, larger (~5GB)"
248
+ q8_0: "Best quality, largest (~7GB)"
249
+
250
+ step_7_download_model:
251
+ description: "Download GGUF to local machine"
252
+
253
+ option_a_jupyterlab:
254
+ steps:
255
+ - "In JupyterLab file browser"
256
+ - "Navigate to /workspace/exports/"
257
+ - "Right-click the .gguf file"
258
+ - "Select 'Download'"
259
+
260
+ option_b_runpodctl:
261
+ command: |
262
+ # Install runpodctl locally first
263
+ # https://github.com/runpod/runpodctl
264
+
265
+ # On the pod: runpodctl send /workspace/exports/marxist-deepseek-q4_k_m.gguf
+ # (prints a one-time code) Then locally: runpodctl receive <code>
266
+
267
+ step_8_stop_pod:
268
+ description: "CRITICAL - Stop pod to avoid charges"
269
+ warning: "Billing continues until pod is stopped!"
270
+ actions:
271
+ - "Verify GGUF downloaded successfully to local machine"
272
+ - "In RunPod dashboard, click 'Stop' on your pod"
273
+ - "Wait for status: 'Stopped'"
274
+ - "Delete pod if you don't need it again"
275
+ - "Volume disk data persists even after pod deletion"
276
+
277
+ ollama_deployment:
278
+ description: "Deploy GGUF to local Ollama after download"
279
+
280
+ steps:
281
+ - step: "Create Modelfile"
282
+ content: |
283
+ # Save as: Modelfile.marxist-deepseek
284
+ FROM ./marxist-deepseek-q4_k_m.gguf
285
+
286
+ TEMPLATE """<|im_start|>system
287
+ {{ .System }}<|im_end|>
288
+ <|im_start|>user
289
+ {{ .Prompt }}<|im_end|>
290
+ <|im_start|>assistant
291
+ {{ .Response }}<|im_end|>"""
292
+
293
+ SYSTEM "You are a Marxist-Leninist assistant trained on ProleWiki."
294
+
295
+ PARAMETER stop "<|im_end|>"
296
+ PARAMETER temperature 0.7
297
+ PARAMETER top_p 0.9
298
+
299
+ - step: "Create Ollama model"
300
+ command: "ollama create marxist-deepseek -f Modelfile.marxist-deepseek"
301
+
302
+ - step: "Test model"
303
+ command: "ollama run marxist-deepseek 'Explain dialectical materialism.'"
304
+
305
+ cost_summary:
306
+ example_run:
307
+ gpu: "RTX 4090 On-Demand"
308
+ rate: "$0.55/hr"
309
+ time: "30 minutes"
310
+ total: "$0.28"
311
+
312
+ breakdown:
313
+ setup: "5 min - Pod creation, Unsloth install"
314
+ upload: "2 min - Data transfer"
315
+ training: "20-25 min - SFT with QLoRA"
316
+ export: "3 min - GGUF conversion"
317
+ download: "5 min - Transfer GGUF locally"
318
+ total_time: "~35-40 min"
319
+
320
+ tips:
321
+ - "Use On-Demand for short runs (<1hr)"
322
+ - "Use Spot for long runs with checkpoint saving"
323
+ - "Stop pod IMMEDIATELY after download"
324
+ - "Delete pod after confirming success"
325
+ - "Volume disk persists - can restart training later"
326
+
327
+ troubleshooting:
328
+ out_of_memory:
329
+ symptoms: "CUDA OOM, kernel dies"
330
+ solutions:
331
+ - "Reduce batch size in training args"
332
+ - "Ensure load_in_4bit=True"
333
+ - "Use gradient_checkpointing=True"
334
+ - "Upgrade to A40 (48GB VRAM)"
335
+
336
+ slow_training:
337
+ symptoms: "Steps/sec much lower than expected"
338
+ solutions:
339
+ - "Verify GPU is being used: nvidia-smi"
340
+ - "Check torch.cuda.is_available()"
341
+ - "Ensure flash-attn installed correctly"
342
+
343
+ pod_wont_start:
344
+ symptoms: "Pod stuck in 'Pending' or 'Initializing'"
345
+ solutions:
346
+ - "Try different data center region"
347
+ - "Try different GPU type"
348
+ - "Check RunPod status page"
349
+
350
+ checkpoint_lost:
351
+ symptoms: "Can't find saved model after restart"
352
+ cause: "Saved to Container Disk instead of Volume"
353
+ prevention: "ALWAYS use /workspace/ for important files"
354
+
355
+ references:
356
+ runpod_docs: "https://docs.runpod.io/"
357
+ unsloth_github: "https://github.com/unslothai/unsloth"
358
+ context7_runpod: "/runpod/docs"
359
+ related_docs:
360
+ - "ai-docs/finetune.yaml - Complete training configuration"
361
+ - "ai-docs/embedding.yaml - Embedding pipeline"
362
+ - "ai-docs/project-status.yaml - Phase 8 status"
ai-docs/training-schema.yaml ADDED
@@ -0,0 +1,484 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Training Data Schema Reference
2
+ # Purpose: Human-readable documentation for Marxist-GRPO training data format
3
+ # Formal Schema: training_data/schema/training_record.schema.json
4
+ # Updated: 2025-12-18
5
+
6
+ # =============================================================================
7
+ # OVERVIEW
8
+ # =============================================================================
9
+
10
+ overview:
11
+ purpose: |
12
+ This schema defines the format for Q&A training records in the Marxist-GRPO
13
+ fine-tuning dataset. Each record contains an instruction-response pair with
14
+ comprehensive metadata for:
15
+ - Provenance tracking (where did this come from?)
16
+ - Theoretical classification (what tradition/topic?)
17
+ - Citation tracking (what sources are referenced?)
18
+ - Training metadata (what issue does this fix?)
19
+ - Quality assessment (has this been verified?)
20
+
21
+ design_principles:
22
+ - Reproducibility: Every record traceable to source
23
+ - Filterability: Train on subsets by any dimension
24
+ - Scientific Rigor: Formal JSON Schema validation
25
+ - RAG Integration: Links to ChromaDB chunks where applicable
26
+ - Iteration Tracking: Know what was added when and why
27
+
28
+ json_schema_location: training_data/schema/training_record.schema.json
29
+ manifest_schema_location: training_data/schema/manifest.schema.json
30
+
31
+ # =============================================================================
32
+ # RECORD FORMAT
33
+ # =============================================================================
34
+
35
+ record_format:
36
+ description: |
37
+ Each JSONL file contains one JSON object per line.
38
+ Every record MUST have: instruction, response, metadata
39
+ The metadata object contains all provenance and classification.
40
+
41
+ minimal_example:
42
+ instruction: "What is the mass line?"
43
+ response: "The mass line is the Maoist method of communist leadership..."
44
+ metadata:
45
+ id: "synthetic/maoist-theory/001"
46
+ source:
47
+ type: "synthetic"
48
+ classification:
49
+ categories: ["maoist-theory", "methodology"]
50
+ tradition: "MLM"
51
+ provenance:
52
+ created_date: "2025-12-18"
53
+ created_by: "claude-opus"
54
+
55
+ full_example:
56
+ instruction: "What is the Marxist-Leninist distinction between antisemitism and anti-Zionism?"
57
+ response: "These are fundamentally different phenomena. Antisemitism is a form of racism..."
58
+ metadata:
59
+ id: "synthetic/antisemitism/001"
60
+ source:
61
+ type: "synthetic"
62
+ author: null
63
+ work: null
64
+ article: null
65
+ chunk_ids: []
66
+ classification:
67
+ categories: ["anti-zionism", "antisemitism", "settler-colonialism"]
68
+ tradition: "ML"
69
+ geographic_focus: "Palestine"
70
+ historical_period: null
71
+ citations:
72
+ has_citations: true
73
+ authors: ["Lenin", "Ilan Pappé", "Noam Chomsky"]
74
+ works:
75
+ - title: "On Anti-Jewish Pogroms"
76
+ author: "Lenin"
77
+ year: 1919
78
+ type: "speech"
79
+ - title: "The Ethnic Cleansing of Palestine"
80
+ author: "Ilan Pappé"
81
+ year: 2006
82
+ type: "book"
83
+ training:
84
+ iteration: 2
85
+ correction_for: ["both-sidesing", "antisemitism-conflation"]
86
+ difficulty: "intermediate"
87
+ response_style: "educational"
88
+ adversarial_type: null
89
+ provenance:
90
+ created_date: "2025-12-18"
91
+ created_by: "claude-opus"
92
+ reviewed_by: null
93
+ version: 1
94
+ quality:
95
+ human_verified: false
96
+ confidence: "high"
97
+ notes: null
98
+
99
+ # =============================================================================
100
+ # METADATA FIELDS REFERENCE
101
+ # =============================================================================
102
+
103
+ fields:
104
+ # ---------------------------------------------------------------------------
105
+ # SOURCE PROVENANCE
106
+ # ---------------------------------------------------------------------------
107
+ source:
108
+ description: Where this Q&A pair originated from.
109
+
110
+ type:
111
+ required: true
112
+ values:
113
+ prolewiki: Derived from ProleWiki article content
114
+ synthetic: Generated by AI for specific purpose
115
+ curated: Human-curated from multiple sources
116
+ library: Derived from Library namespace (full works)
117
+ external: From external source with URL
118
+
119
+ article:
120
+ required: false
121
+ purpose: ProleWiki article title if derived from corpus
122
+ example: "Main/Imperialism"
123
+ links_to: chromadb.article_title
124
+
125
+ work:
126
+ required: false
127
+ purpose: Title of source work for Library-derived Q&As
128
+ example: "Imperialism, the Highest Stage of Capitalism"
129
+
130
+ author:
131
+ required: false
132
+ purpose: Primary author of source material
133
+ example: "Lenin"
134
+ enables: Train only on Marx-derived, Lenin-derived, etc.
135
+
136
+ chunk_ids:
137
+ required: false
138
+ purpose: ChromaDB chunk IDs this Q&A was derived from
139
+ example: ["Main/Imperialism#0", "Main/Imperialism#1"]
140
+ enables: RAG-training data linkage, citation verification
141
+
142
+ # ---------------------------------------------------------------------------
143
+ # CLASSIFICATION
144
+ # ---------------------------------------------------------------------------
145
+ classification:
146
+ description: Theoretical and topical classification.
147
+
148
+ categories:
149
+ required: true
150
+ purpose: Topic tags aligned with ProleWiki categories
151
+ examples:
152
+ - ["imperialism", "revisionism"]
153
+ - ["anti-zionism", "settler-colonialism", "national-liberation"]
154
+ - ["cultural-revolution", "gpcr", "maoist-theory"]
155
+ enables: Train on specific topics, measure coverage
156
+
157
+ tradition:
158
+ required: true
159
+ values:
160
+ ML: Marxism-Leninism (broad)
161
+ MLM: Marxism-Leninism-Maoism (includes GPCR defense)
162
+ general: Broadly applicable across tendencies
163
+ contested: Debated within ML circles
164
+ enables: Filter by theoretical tendency
165
+
166
+ geographic_focus:
167
+ required: false
168
+ examples: ["Soviet Union", "China", "Palestine", "Cuba"]
169
+ enables: Regional expertise training
170
+
171
+ historical_period:
172
+ required: false
173
+ examples: ["Russian Revolution", "Cultural Revolution", "Cold War"]
174
+ enables: Period-specific training
175
+
176
+ # ---------------------------------------------------------------------------
177
+ # CITATIONS
178
+ # ---------------------------------------------------------------------------
179
+ citations:
180
+ description: Citation and reference tracking.
181
+
182
+ has_citations:
183
+ purpose: Quick boolean filter for cited content
184
+ enables: Train only on well-sourced responses
185
+
186
+ works:
187
+ purpose: Structured list of cited works
188
+ fields: [title, author, year, type]
189
+ enables: Verify citations, trace to primary sources
190
+
191
+ authors:
192
+ purpose: Flat list of cited authors for filtering
193
+ enables: "Train on Lenin-citing records only"
194
+
195
+ # ---------------------------------------------------------------------------
196
+ # TRAINING METADATA
197
+ # ---------------------------------------------------------------------------
198
+ training:
199
+ description: Training-specific metadata.
200
+
201
+ iteration:
202
+ purpose: Which training iteration added this record
203
+ enables: Ablation studies, measure iteration impact
204
+
205
+ correction_for:
206
+ purpose: What failure modes this addresses
207
+ values:
208
+ cpc-contamination: Fixes CPC authority citations
209
+ both-sidesing: Fixes false equivalence on colonial issues
210
+ hallucination: Provides correct historical facts
211
+ antisemitism-conflation: Distinguishes antisemitism/anti-Zionism
212
+ liberal-framing: Replaces liberal with ML framing
213
+ historical-inaccuracy: Corrects factual errors
214
+ theoretical-error: Corrects theoretical misunderstandings
215
+ accommodation: Resists incremental position shifts
216
+ extended-engagement: Models firm rejection
217
+ enables: Test specific corrections, targeted training
218
+
219
+ difficulty:
220
+ values:
221
+ basic: Straightforward ML questions
222
+ intermediate: Requires nuanced understanding
223
+ advanced: Complex theoretical synthesis
224
+ adversarial: Bad-faith or trap questions
225
+ enables: Curriculum learning, stress testing
226
+
227
+ response_style:
228
+ values:
229
+ educational: Thorough explanation
230
+ firm-rejection: Short, clear rejection of premise
231
+ theoretical: Abstract theoretical analysis
232
+ historical: Historical narrative/facts
233
+ biographical: Person-focused information
234
+ analytical: Systematic breakdown
235
+ comparative: Comparing positions/theories
236
+ enables: Style-specific training
237
+
238
+ adversarial_type:
239
+ purpose: For adversarial questions, what pattern
240
+ values:
241
+ bad-faith-question: User asking in bad faith
242
+ conspiracy-premise: Question contains conspiracy theory
243
+ incremental-shift: Gradually shifting goalposts
244
+ false-equivalence: Both-sidesing framing
245
+ appeal-to-complexity: '"It''s complicated" deflection'
246
+
247
+ # ---------------------------------------------------------------------------
248
+ # PROVENANCE
249
+ # ---------------------------------------------------------------------------
250
+ provenance:
251
+ description: Record creation and modification tracking.
252
+
253
+ created_date:
254
+ required: true
255
+ format: ISO 8601 date (YYYY-MM-DD)
256
+ purpose: When this record was created
257
+
258
+ created_by:
259
+ required: true
260
+ values: [human, claude-opus, claude-sonnet, other-llm, automated]
261
+ purpose: Who/what created this record
262
+ enables: Filter by creation method
263
+
264
+ reviewed_by:
265
+ purpose: Human reviewer identifier
266
+ enables: Track review coverage
267
+
268
+ version:
269
+ purpose: Increment on edits
270
+ enables: Track record evolution
271
+
272
+ # ---------------------------------------------------------------------------
273
+ # QUALITY
274
+ # ---------------------------------------------------------------------------
275
+ quality:
276
+ description: Quality assessment metadata.
277
+
278
+ human_verified:
279
+ purpose: Has a human verified accuracy?
280
+ enables: High-confidence subset training
281
+
282
+ confidence:
283
+ values: [high, medium, low]
284
+ purpose: Confidence in response accuracy
285
+
286
+ flagged_issues:
287
+ purpose: Known issues needing attention
288
+ enables: Exclude problematic records
289
+
290
+ # =============================================================================
291
+ # VALIDATION
292
+ # =============================================================================
293
+
294
+ validation:
295
+ json_schema:
296
+ location: training_data/schema/training_record.schema.json
297
+ draft: 2020-12
298
+ command: |
299
+ # Using jsonschema Python library
300
+ uv run python -c "
301
+ import json
302
+ import jsonschema
303
+ from pathlib import Path
304
+
305
+ schema = json.loads(Path('training_data/schema/training_record.schema.json').read_text())
306
+ for line in Path('training_data/your_file.jsonl').read_text().splitlines():
307
+ record = json.loads(line)
308
+ jsonschema.validate(record, schema)
309
+ print('All records valid!')
310
+ "
311
+
312
+ quick_validation:
313
+ command: |
314
+ # Quick JSON syntax check
315
+ python3 -c "import json; [json.loads(l) for l in open('file.jsonl')]; print('OK')"
316
+
317
+ pre_commit_hook:
318
+ description: Add to .pre-commit-config.yaml for automatic validation
319
+ config: |
320
+ - repo: local
321
+ hooks:
322
+ - id: validate-training-data
323
+ name: Validate Training Data Schema
324
+ entry: uv run python scripts/validate_training_data.py
325
+ language: system
326
+ files: ^training_data/.*\.jsonl$
327
+
328
+ # =============================================================================
329
+ # MANIFEST
330
+ # =============================================================================
331
+
332
+ manifest:
333
+ purpose: |
334
+ The manifest (MANIFEST.yaml) tracks all JSONL files in the dataset,
335
+ their checksums, statistics, and training history. This enables:
336
+ - Reproducible training runs
337
+ - Dataset versioning
338
+ - Integrity verification
339
+ - Statistics generation
340
+
341
+ location: training_data/MANIFEST.yaml
342
+ schema: training_data/schema/manifest.schema.json
343
+
344
+ key_sections:
345
+ dataset: Name, version, license, description
346
+ files: List of all JSONL files with checksums and metadata
347
+ statistics: Aggregate counts by source, category, tradition
348
+ training_iterations: History of training runs
349
+ known_issues: Documented problems
350
+ changelog: Dataset modification history
351
+
352
+ # =============================================================================
353
+ # FILTERING PATTERNS
354
+ # =============================================================================
355
+
356
+ filtering_patterns:
357
+ description: Common filtering operations for training subsets.
358
+
359
+ by_source:
360
+ code: |
361
+ # ProleWiki-derived only (corpus purity)
362
+ data = [r for r in records if r["metadata"]["source"]["type"] == "prolewiki"]
363
+
364
+ # Exclude synthetic for ablation
365
+ data = [r for r in records if r["metadata"]["source"]["type"] != "synthetic"]
366
+
367
+ by_author:
368
+ code: |
369
+ # Lenin-citing records
370
+ data = [r for r in records
371
+ if "Lenin" in r["metadata"].get("citations", {}).get("authors", [])]
372
+
373
+ # Marx or Engels sourced
374
+ data = [r for r in records
375
+ if r["metadata"]["source"].get("author") in ["Marx", "Engels"]]
376
+
377
+ by_tradition:
378
+ code: |
379
+ # MLM only (includes GPCR defense)
380
+ data = [r for r in records if r["metadata"]["classification"]["tradition"] == "MLM"]
381
+
382
+ by_correction:
383
+ code: |
384
+ # Records addressing Zionism issues
385
+ data = [r for r in records
386
+ if "both-sidesing" in r["metadata"].get("training", {}).get("correction_for", [])]
387
+
388
+ by_difficulty:
389
+ code: |
390
+ # Adversarial examples only (stress testing)
391
+ data = [r for r in records
392
+ if r["metadata"].get("training", {}).get("difficulty") == "adversarial"]
393
+
394
+ by_iteration:
395
+ code: |
396
+ # Only iteration 1 (baseline)
397
+ data = [r for r in records if r["metadata"].get("training", {}).get("iteration") == 1]
398
+
399
+ # Iterations 1-2 combined
400
+ data = [r for r in records if r["metadata"].get("training", {}).get("iteration", 1) <= 2]
401
+
402
+ by_quality:
403
+ code: |
404
+ # Human-verified only
405
+ data = [r for r in records if r["metadata"].get("quality", {}).get("human_verified")]
406
+
407
+ # High confidence
408
+ data = [r for r in records
409
+ if r["metadata"].get("quality", {}).get("confidence") == "high"]
410
+
411
+ # =============================================================================
412
+ # INTEGRATION WITH CHROMADB
413
+ # =============================================================================
414
+
415
+ chromadb_integration:
416
+ purpose: |
417
+ Training data can link to ChromaDB chunks, enabling:
418
+ - Verification that responses match corpus
419
+ - RAG-augmented training data generation
420
+ - Provenance chains from user query → chunk → training example
421
+
422
+ chunk_id_format: "{namespace}/{article_title}#{chunk_index}"
423
+ examples:
424
+ - "Main/Imperialism#0"
425
+ - "Library/Capital_Vol_1#127"
426
+ - "Essays/On_Revisionism#3"
427
+
428
+ linkage_pattern:
429
+ description: When generating training data from ProleWiki chunks
430
+ code: |
431
+ # Generate Q&A from chunk and preserve linkage
432
+ training_record = {
433
+ "instruction": generate_question(chunk),
434
+ "response": generate_answer(chunk),
435
+ "metadata": {
436
+ "source": {
437
+ "type": "prolewiki",
438
+ "article": chunk["article_title"],
439
+ "chunk_ids": [chunk["chunk_id"]]
440
+ },
441
+ # ... rest of metadata
442
+ }
443
+ }
444
+
445
+ # =============================================================================
446
+ # MIGRATION FROM LEGACY FORMAT
447
+ # =============================================================================
448
+
449
+ migration:
450
+ legacy_format:
451
+ description: Original curated_qa.jsonl format
452
+ example:
453
+ instruction: "What is revisionism?"
454
+ response: "Revisionism refers to..."
455
+
456
+ new_format:
457
+ description: Full metadata format
458
+ migration_steps:
459
+ - Add metadata wrapper
460
+ - Generate unique IDs
461
+ - Infer source type (curated for manual entries)
462
+ - Add classification based on content analysis
463
+ - Set iteration to 1 for baseline data
464
+ - Mark as needing human verification
465
+
466
+ migration_script: |
467
+ # See scripts/migrate_training_data.py for full implementation
468
+ def migrate_record(old_record, index):
469
+ return {
470
+ "instruction": old_record["instruction"],
471
+ "response": old_record["response"],
472
+ "metadata": {
473
+ "id": f"curated/legacy/{index:03d}",
474
+ "source": {"type": "curated"},
475
+ "classification": {
476
+ "categories": infer_categories(old_record),
477
+ "tradition": "ML"
478
+ },
479
+ "provenance": {
480
+ "created_date": "2025-12-17", # Original creation date
481
+ "created_by": "human"
482
+ }
483
+ }
484
+ }
docker/.env.example ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # Marxist-GRPO Training Environment Variables
3
+ # =============================================================================
4
+ # Copy this file to .env and fill in your values.
5
+ # These can be passed to docker run with --env-file or individually with -e.
6
+ #
7
+ # Usage:
8
+ # docker run --gpus all --env-file docker/.env marxist-grpo:latest
9
+ #
10
+ # Or with runpodctl:
11
+ # runpodctl create pod \
12
+ # --imageName myregistry/marxist-grpo:latest \
13
+ # --env HF_TOKEN=$HF_TOKEN \
14
+ # --env WANDB_API_KEY=$WANDB_API_KEY \
15
+ # --env HF_REPO=my-org/my-model
16
+ # =============================================================================
17
+
18
+ # -----------------------------------------------------------------------------
19
+ # REQUIRED SECRETS (must be set)
20
+ # -----------------------------------------------------------------------------
21
+
22
+ # HuggingFace API token (for model upload)
23
+ # Get yours at: https://huggingface.co/settings/tokens
24
+ HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
25
+
26
+ # Weights & Biases API key (for training monitoring)
27
+ # Get yours at: https://wandb.ai/authorize
28
+ WANDB_API_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
29
+
30
+ # -----------------------------------------------------------------------------
31
+ # MODEL CONFIGURATION
32
+ # -----------------------------------------------------------------------------
33
+
34
+ # Base model to fine-tune
35
+ MODEL_NAME=unsloth/DeepSeek-R1-0528-Qwen3-8B
36
+
37
+ # Maximum sequence length for tokenizer
38
+ MAX_SEQ_LENGTH=2048
39
+
40
+ # LoRA adapter rank (higher = more expressive, more params)
41
+ LORA_RANK=32
42
+
43
+ # -----------------------------------------------------------------------------
44
+ # TRAINING HYPERPARAMETERS
45
+ # -----------------------------------------------------------------------------
46
+
47
+ # Total training steps
48
+ MAX_STEPS=500
49
+
50
+ # Save checkpoint every N steps
51
+ SAVE_STEPS=50
52
+
53
+ # Learning rate
54
+ LEARNING_RATE=5e-6
55
+
56
+ # Warmup ratio (fraction of steps for LR warmup)
57
+ WARMUP_RATIO=0.1
58
+
59
+ # Per-device batch size
60
+ BATCH_SIZE=2
61
+
62
+ # Gradient accumulation steps (effective batch = BATCH_SIZE * GRADIENT_ACCUMULATION)
63
+ GRADIENT_ACCUMULATION=2
64
+
65
+ # Number of generations per prompt during GRPO
66
+ NUM_GENERATIONS=4
67
+
68
+ # Fraction of GPU memory to allocate
69
+ GPU_MEMORY_UTILIZATION=0.6
70
+
71
+ # Maximum prompt length (tokens)
72
+ MAX_PROMPT_LENGTH=512
73
+
74
+ # Maximum completion length (tokens)
75
+ MAX_COMPLETION_LENGTH=1500
76
+
77
+ # -----------------------------------------------------------------------------
78
+ # REWARD CONFIGURATION
79
+ # -----------------------------------------------------------------------------
80
+
81
+ # Reward mode: FULL (recommended), ROBUST, or LEGACY
82
+ # FULL: NLI + self-consistency + structure + topic relevance + depth
83
+ # ROBUST: NLI + self-consistency + structure
84
+ # LEGACY: Semantic similarity + terminology (faster but vulnerable to word soup)
85
+ REWARD_MODE=FULL
86
+
87
+ # -----------------------------------------------------------------------------
88
+ # OUTPUT CONFIGURATION
89
+ # -----------------------------------------------------------------------------
90
+
91
+ # HuggingFace repo to upload the trained LoRA adapter
92
+ HF_REPO=prolewiki/marxist-grpo-lora
93
+
94
+ # -----------------------------------------------------------------------------
95
+ # PATHS (container internal - usually don't change)
96
+ # -----------------------------------------------------------------------------
97
+
98
+ # Path to training dataset (JSONL)
99
+ DATASET_PATH=/workspace/dataset.jsonl
100
+
101
+ # Directory for training checkpoints
102
+ CHECKPOINT_DIR=/workspace/checkpoints
103
+
104
+ # Directory for final LoRA output
105
+ LORA_OUTPUT=/workspace/lora-output
106
+
107
+ # Directory for training outputs
108
+ OUTPUT_DIR=/workspace/outputs
109
+
110
+ # -----------------------------------------------------------------------------
111
+ # OPTIONAL: RUNPOD AUTO-TERMINATION
112
+ # -----------------------------------------------------------------------------
113
+
114
+ # Set this to enable automatic pod termination after training
115
+ # This prevents "zombie pods" from racking up bills
116
+ # Value is automatically set by RunPod, or can be set manually
117
+ # RUNPOD_POD_ID=
118
+
119
+ # -----------------------------------------------------------------------------
120
+ # OPTIONAL: REMOTE DATASET
121
+ # -----------------------------------------------------------------------------
122
+
123
+ # If dataset is not embedded in the image, set this URL to download it
124
+ # DATASET_URL=https://my-bucket.s3.amazonaws.com/grpo_dataset.jsonl
docker/Dockerfile ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # Marxist-GRPO Training Container
3
+ # =============================================================================
4
+ # Headless GRPO fine-tuning container for RunPod deployment.
5
+ #
6
+ # Build:
7
+ # docker build -f docker/Dockerfile -t marxist-grpo:latest .
8
+ #
9
+ # Run locally (testing):
10
+ # docker run --gpus all \
11
+ # -e HF_TOKEN=$HF_TOKEN \
12
+ # -e WANDB_API_KEY=$WANDB_API_KEY \
13
+ # -e MAX_STEPS=10 \
14
+ # marxist-grpo:latest
15
+ #
16
+ # Deploy to RunPod:
17
+ # runpodctl create pod \
18
+ # --name "marxist-grpo-training" \
19
+ # --gpuType "NVIDIA A100 80GB PCIe" \
20
+ # --imageName "myregistry/marxist-grpo:latest" \
21
+ # --env HF_TOKEN=$HF_TOKEN \
22
+ # --env WANDB_API_KEY=$WANDB_API_KEY \
23
+ # --env HF_REPO=my-org/my-model
24
+ # =============================================================================
25
+
26
+ # Use RunPod's PyTorch base image with CUDA 11.8
27
+ FROM runpod/pytorch:2.1.0-py3.10-cuda11.8.0-devel
28
+
29
+ # Set working directory
30
+ WORKDIR /workspace
31
+
32
+ # Prevent interactive prompts during package installation
33
+ ENV DEBIAN_FRONTEND=noninteractive
34
+
35
+ # Install system dependencies
36
+ RUN apt-get update && apt-get install -y --no-install-recommends \
37
+ git \
38
+ git-lfs \
39
+ curl \
40
+ htop \
41
+ nvtop \
42
+ tmux \
43
+ wget \
44
+ && rm -rf /var/lib/apt/lists/* \
45
+ && git lfs install
46
+
47
+ # Install runpodctl for pod self-termination
48
+ RUN curl -fsSL -o /tmp/runpodctl.tar.gz \
49
+ https://github.com/runpod/runpodctl/releases/download/v1.14.15/runpodctl-linux-amd64.tar.gz \
50
+ && tar -xzf /tmp/runpodctl.tar.gz -C /tmp \
51
+ && mv /tmp/runpodctl /usr/local/bin/runpodctl \
52
+ && chmod +x /usr/local/bin/runpodctl \
53
+ && rm /tmp/runpodctl.tar.gz
54
+
55
+ # Copy requirements first (for layer caching)
56
+ COPY docker/requirements.txt /workspace/requirements.txt
57
+
58
+ # Install Python dependencies
59
+ # Note: Unsloth requires specific installation order
60
+ RUN pip install --no-cache-dir --upgrade pip && \
61
+ pip install --no-cache-dir -r requirements.txt
62
+
63
+ # Install Unsloth (from source for latest optimizations)
64
+ RUN pip install --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
65
+
66
+ # Install specific versions that work with Unsloth
67
+ # Note: These must be installed after unsloth to avoid conflicts
68
+ RUN pip install --no-cache-dir --no-deps \
69
+ "xformers<0.0.27" \
70
+ "trl>=0.9.0,<0.12.0" \
71
+ peft \
72
+ accelerate \
73
+ bitsandbytes
74
+
75
+ # Download spaCy model for NLP-based reward functions
76
+ RUN python -m spacy download en_core_web_sm
77
+
78
+ # Copy the training code
79
+ COPY src/prolewiki_llm/ /workspace/prolewiki_llm/
80
+
81
+ # Copy the dataset (embedded - only 4.5MB)
82
+ COPY training_data/grpo_dataset.jsonl /workspace/dataset.jsonl
83
+
84
+ # Copy entrypoint script
85
+ COPY docker/start.sh /workspace/start.sh
86
+ RUN chmod +x /workspace/start.sh
87
+
88
+ # Set PYTHONPATH so prolewiki_llm module can be imported
89
+ ENV PYTHONPATH=/workspace
90
+
91
+ # =============================================================================
92
+ # Environment Variables (defaults - override at runtime)
93
+ # =============================================================================
94
+
95
+ # Model configuration
96
+ ENV MODEL_NAME="unsloth/DeepSeek-R1-0528-Qwen3-8B"
97
+ ENV MAX_SEQ_LENGTH=2048
98
+ ENV LORA_RANK=32
99
+
100
+ # Training configuration
101
+ ENV MAX_STEPS=500
102
+ ENV SAVE_STEPS=50
103
+ ENV LEARNING_RATE=5e-6
104
+ ENV BATCH_SIZE=2
105
+ ENV GRADIENT_ACCUMULATION=2
106
+ ENV NUM_GENERATIONS=4
107
+ ENV GPU_MEMORY_UTILIZATION=0.6
108
+
109
+ # Paths (container internal)
110
+ ENV DATASET_PATH=/workspace/dataset.jsonl
111
+ ENV CHECKPOINT_DIR=/workspace/checkpoints
112
+ ENV LORA_OUTPUT=/workspace/lora-output
113
+ ENV OUTPUT_DIR=/workspace/outputs
114
+
115
+ # Reward mode: FULL (recommended), ROBUST, or LEGACY
116
+ ENV REWARD_MODE=FULL
117
+
118
+ # Upload destination
119
+ ENV HF_REPO=prolewiki/marxist-grpo-lora
120
+
121
+ # Required secrets (must be provided at runtime):
122
+ # - HF_TOKEN: HuggingFace API token
123
+ # - WANDB_API_KEY: Weights & Biases API key
124
+ # Optional:
125
+ # - RUNPOD_POD_ID: For self-termination after training
126
+
127
+ # Health check - verify CUDA is available
128
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
129
+ CMD python -c "import torch; assert torch.cuda.is_available()" || exit 1
130
+
131
+ # Entry point
132
+ ENTRYPOINT ["/workspace/start.sh"]
docker/requirements.txt ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # Marxist-GRPO Training Dependencies
3
+ # =============================================================================
4
+ # Pinned versions for reproducible builds.
5
+ # Note: Unsloth is installed separately in Dockerfile from git.
6
+ # =============================================================================
7
+
8
+ # Core ML
9
+ torch>=2.1.0,<2.3.0
10
+ transformers>=4.40.0,<4.58.0
11
+ sentence-transformers>=3.0.0,<4.0.0
12
+
13
+ # Training
14
+ datasets>=2.20.0,<3.0.0
15
+ # Note: trl, peft, accelerate, bitsandbytes installed separately after unsloth
16
+
17
+ # NLP (for reward functions)
18
+ spacy>=3.8.0,<4.0.0
19
+
20
+ # Logging and monitoring
21
+ wandb>=0.17.0,<1.0.0
22
+
23
+ # HuggingFace Hub (for model upload)
24
+ huggingface-hub>=0.24.0,<1.0.0
25
+
26
+ # Data validation
27
+ pydantic>=2.0.0,<3.0.0
28
+
29
+ # Utilities
30
+ numpy>=1.24.0,<2.0.0
31
+ safetensors>=0.4.0,<1.0.0
docker/start.sh ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # =============================================================================
3
+ # Headless GRPO Training Entrypoint Script
4
+ # =============================================================================
5
+ # This script orchestrates the training lifecycle:
6
+ # 1. Validate environment (secrets, GPU)
7
+ # 2. Authenticate with HuggingFace and W&B
8
+ # 3. Run training
9
+ # 4. Upload results
10
+ # 5. Self-terminate pod (if RUNPOD_POD_ID is set)
11
+ #
12
+ # Exit on any error
13
+ set -e
14
+
15
+ echo "=============================================================="
16
+ echo "Marxist-GRPO Headless Training"
17
+ echo "=============================================================="
18
+ echo "Start time: $(date -Iseconds)"
19
+ echo ""
20
+
21
+ # =============================================================================
22
+ # 1. ENVIRONMENT VALIDATION
23
+ # =============================================================================
24
+ echo "[1/5] Validating environment..."
25
+
26
+ # Check required secrets
27
+ if [ -z "$HF_TOKEN" ]; then
28
+ echo "ERROR: HF_TOKEN environment variable is required"
29
+ exit 1
30
+ fi
31
+
32
+ if [ -z "$WANDB_API_KEY" ]; then
33
+ echo "ERROR: WANDB_API_KEY environment variable is required"
34
+ exit 1
35
+ fi
36
+
37
+ # Check GPU availability
38
+ python -c "import torch; assert torch.cuda.is_available(), 'CUDA not available'" || {
39
+ echo "ERROR: CUDA/GPU not available"
40
+ exit 1
41
+ }
42
+
43
+ echo " - HF_TOKEN: [set]"
44
+ echo " - WANDB_API_KEY: [set]"
45
+ echo " - GPU: $(python -c 'import torch; print(torch.cuda.get_device_name())')"
46
+ echo " - VRAM: $(python -c 'import torch; print(f"{torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB")')"
47
+ echo ""
48
+
49
+ # =============================================================================
50
+ # 2. AUTHENTICATION
51
+ # =============================================================================
52
+ echo "[2/5] Authenticating with services..."
53
+
54
+ # Login to HuggingFace
55
+ echo " - HuggingFace Hub..."
56
+ huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential 2>/dev/null || true
57
+
58
+ # Login to Weights & Biases
59
+ echo " - Weights & Biases..."
60
+ wandb login "$WANDB_API_KEY" 2>/dev/null || true
61
+
62
+ echo ""
63
+
64
+ # =============================================================================
65
+ # 3. DATA VALIDATION
66
+ # =============================================================================
67
+ echo "[3/5] Validating dataset..."
68
+
69
+ DATASET_PATH="${DATASET_PATH:-/workspace/dataset.jsonl}"
70
+
71
+ if [ ! -f "$DATASET_PATH" ]; then
72
+ echo "ERROR: Dataset not found at $DATASET_PATH"
73
+
74
+ # If DATASET_URL is set, try to download
75
+ if [ -n "$DATASET_URL" ]; then
76
+ echo "Attempting to download from DATASET_URL..."
77
+ wget -O "$DATASET_PATH" "$DATASET_URL" || {
78
+ echo "ERROR: Failed to download dataset"
79
+ exit 1
80
+ }
81
+ else
82
+ exit 1
83
+ fi
84
+ fi
85
+
86
+ RECORD_COUNT=$(wc -l < "$DATASET_PATH")
87
+ echo " - Dataset: $DATASET_PATH"
88
+ echo " - Records: $RECORD_COUNT"
89
+ echo ""
90
+
91
+ # =============================================================================
92
+ # 4. TRAINING
93
+ # =============================================================================
94
+ echo "[4/5] Starting training..."
95
+ echo ""
96
+ echo "Configuration:"
97
+ echo " - Model: ${MODEL_NAME:-unsloth/DeepSeek-R1-0528-Qwen3-8B}"
98
+ echo " - Max Steps: ${MAX_STEPS:-500}"
99
+ echo " - Batch Size: ${BATCH_SIZE:-2} x ${GRADIENT_ACCUMULATION:-2}"
100
+ echo " - Learning Rate: ${LEARNING_RATE:-5e-6}"
101
+ echo " - Reward Mode: ${REWARD_MODE:-FULL}"
102
+ echo " - HF Repo: ${HF_REPO:-prolewiki/marxist-grpo-lora}"
103
+ echo ""
104
+
105
+ # Create output directories (use env vars to allow override in tests)
106
+ CHECKPOINT_DIR="${CHECKPOINT_DIR:-/workspace/checkpoints}"
107
+ LORA_OUTPUT="${LORA_OUTPUT:-/workspace/lora-output}"
108
+ OUTPUT_DIR="${OUTPUT_DIR:-/workspace/outputs}"
109
+ mkdir -p "$CHECKPOINT_DIR"
110
+ mkdir -p "$LORA_OUTPUT"
111
+ mkdir -p "$OUTPUT_DIR"
112
+
113
+ # Run training
114
+ # Note: The training script handles all the environment variables
115
+ # IMPORTANT: Disable set -e temporarily to capture exit code for failure handling
116
+ set +e
117
+ python -m prolewiki_llm.train_headless
118
+ TRAINING_EXIT_CODE=$?
119
+ set -e
120
+
121
+ echo ""
122
+ echo "Training completed with exit code: $TRAINING_EXIT_CODE"
123
+ echo ""
124
+
125
+ # =============================================================================
126
+ # 5. CLEANUP AND TERMINATION
127
+ # =============================================================================
128
+ echo "[5/5] Cleanup..."
129
+
130
+ # Log completion time
131
+ echo "End time: $(date -Iseconds)"
132
+
133
+ # If training succeeded and RUNPOD_POD_ID is set, terminate the pod
134
+ if [ $TRAINING_EXIT_CODE -eq 0 ]; then
135
+ echo "Training completed successfully!"
136
+
137
+ if [ -n "$RUNPOD_POD_ID" ]; then
138
+ echo ""
139
+ echo "Terminating pod to stop billing..."
140
+ echo "Pod ID: $RUNPOD_POD_ID"
141
+
142
+ # Give a few seconds for logs to flush
143
+ sleep 5
144
+
145
+ # Stop the pod
146
+ runpodctl stop pod "$RUNPOD_POD_ID" || {
147
+ echo "Warning: Failed to stop pod automatically"
148
+ echo "Please manually stop pod $RUNPOD_POD_ID to avoid billing"
149
+ }
150
+ else
151
+ echo ""
152
+ echo "Note: RUNPOD_POD_ID not set - pod will continue running"
153
+ echo "Remember to stop the pod manually to avoid billing!"
154
+ fi
155
+ else
156
+ echo "Training failed with exit code $TRAINING_EXIT_CODE"
157
+ echo "Pod will NOT be automatically terminated for debugging"
158
+ echo ""
159
+ echo "To debug:"
160
+ echo " 1. SSH into the pod"
161
+ echo " 2. Check /workspace/outputs for logs"
162
+ echo " 3. Check W&B dashboard for metrics"
163
+ fi
164
+
165
+ exit $TRAINING_EXIT_CODE
notebooks/Marxist_GRPO_RunPod_Optimized.ipynb ADDED
@@ -0,0 +1,1107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "header",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Marxist GRPO Training - RunPod Optimized\n",
9
+ "\n",
10
+ "**Production-ready notebook for headless GRPO training on RunPod.**\n",
11
+ "\n",
12
+ "This notebook is optimized based on:\n",
13
+ "- `src/prolewiki_llm/train_headless.py` - Production patterns\n",
14
+ "- `src/prolewiki_llm/grpo_rewards.py` - Multi-layer reward system\n",
15
+ "- C7 documentation for Unsloth and TRL\n",
16
+ "\n",
17
+ "## Recommended GPU\n",
18
+ "\n",
19
+ "| GPU | VRAM | Price | Status |\n",
20
+ "|-----|------|-------|--------|\n",
21
+ "| **NVIDIA A40** | 48GB | $0.35/hr | **Recommended** |\n",
22
+ "| NVIDIA A100 | 80GB | $1.19/hr | Overkill (2x cost for unused VRAM) |\n",
23
+ "| RTX 4090 | 24GB | $0.34/hr | Too small (8B model needs ~40GB in 16-bit) |\n",
24
+ "\n",
25
+ "## Key Optimizations\n",
26
+ "\n",
27
+ "1. **torch.compile disabled** - Prevents hanging on containerized environments\n",
28
+ "2. **use_gradient_checkpointing=True** - Stable (not `\"unsloth\"` variant)\n",
29
+ "3. **load_in_4bit=False** - Required for GRPO (16-bit LoRA adapters)\n",
30
+ "4. **vLLM integration** - 2x faster generation during training\n",
31
+ "5. **Multi-layer rewards** - Defeats reward hacking\n",
32
+ "\n",
33
+ "---"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "markdown",
38
+ "id": "critical-setup-header",
39
+ "metadata": {},
40
+ "source": [
41
+ "## CRITICAL: Disable torch.compile (Run First!)\n",
42
+ "\n",
43
+ "These environment variables **MUST** be set before any imports.\n",
44
+ "They prevent torch.compile's inductor workers from hanging on RunPod/Jupyter."
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": null,
50
+ "id": "disable-torch-compile",
51
+ "metadata": {},
52
+ "outputs": [],
53
+ "source": [
54
+ "# =============================================================================\n",
55
+ "# CRITICAL: Disable torch.compile BEFORE any imports\n",
56
+ "# =============================================================================\n",
57
+ "# These environment variables prevent torch.compile from spawning inductor\n",
58
+ "# compilation workers that hang indefinitely on RunPod/WSL2/Jupyter.\n",
59
+ "# See: https://github.com/unslothai/unsloth/issues/3432\n",
60
+ "\n",
61
+ "import os\n",
62
+ "\n",
63
+ "os.environ[\"UNSLOTH_COMPILE_DISABLE\"] = \"1\"\n",
64
+ "os.environ[\"TORCH_COMPILE\"] = \"0\"\n",
65
+ "os.environ[\"TORCHINDUCTOR_DISABLE\"] = \"1\"\n",
66
+ "os.environ[\"UNSLOTH_DISABLE_FAST_GENERATION\"] = \"1\"\n",
67
+ "os.environ[\"OMP_NUM_THREADS\"] = \"1\"\n",
68
+ "os.environ[\"TORCHINDUCTOR_COMPILE_THREADS\"] = \"1\"\n",
69
+ "os.environ[\"UNSLOTH_VLLM_STANDBY\"] = \"1\"\n",
70
+ "\n",
71
+ "# Enable fast HuggingFace downloads\n",
72
+ "os.environ[\"HF_HUB_ENABLE_HF_TRANSFER\"] = \"1\"\n",
73
+ "\n",
74
+ "print(\"Environment configured for RunPod stability.\")"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "markdown",
79
+ "id": "install-header",
80
+ "metadata": {},
81
+ "source": [
82
+ "## Installation\n",
83
+ "\n",
84
+ "Install all required dependencies. This takes ~5 minutes on first run."
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": null,
90
+ "id": "install-deps",
91
+ "metadata": {},
92
+ "outputs": [],
93
+ "source": [
94
+ "%%capture install_output\n",
95
+ "# Install dependencies (captured to reduce noise)\n",
96
+ "\n",
97
+ "# Core ML stack\n",
98
+ "!pip install torch --index-url https://download.pytorch.org/whl/cu121\n",
99
+ "\n",
100
+ "# Unsloth with vLLM (GRPO speedup)\n",
101
+ "!pip install unsloth vllm\n",
102
+ "\n",
103
+ "# TRL for GRPO training\n",
104
+ "!pip install trl peft bitsandbytes\n",
105
+ "\n",
106
+ "# Transformers (compatible version)\n",
107
+ "!pip install \"transformers>=4.45.0,<4.50.0\"\n",
108
+ "\n",
109
+ "# Data and logging\n",
110
+ "!pip install datasets wandb hf_transfer\n",
111
+ "\n",
112
+ "# Reward function dependencies\n",
113
+ "!pip install sentence-transformers numpy\n",
114
+ "\n",
115
+ "# spaCy with transformer model (best semantic understanding)\n",
116
+ "!pip install spacy\n",
117
+ "!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl\n",
118
+ "\n",
119
+ "print(\"Installation complete!\")"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": null,
125
+ "id": "check-install",
126
+ "metadata": {},
127
+ "outputs": [],
128
+ "source": [
129
+ "# Verify installation and show any errors\n",
130
+ "print(install_output.stdout[-2000:] if len(install_output.stdout) > 2000 else install_output.stdout)\n",
131
+ "if install_output.stderr:\n",
132
+ " print(\"\\n--- Warnings/Errors ---\")\n",
133
+ " print(install_output.stderr[-1000:])"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "markdown",
138
+ "id": "vram-check-header",
139
+ "metadata": {},
140
+ "source": [
141
+ "## GPU Verification\n",
142
+ "\n",
143
+ "Verify GPU is available and has sufficient VRAM before proceeding."
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": null,
149
+ "id": "vram-check",
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": [
153
+ "import torch\n",
154
+ "\n",
155
+ "# Check CUDA availability\n",
156
+ "if not torch.cuda.is_available():\n",
157
+ " raise RuntimeError(\n",
158
+ " \"CUDA not available! This notebook requires a GPU.\\n\"\n",
159
+ " \"On RunPod: Ensure you selected a GPU pod type.\"\n",
160
+ " )\n",
161
+ "\n",
162
+ "# Get GPU info\n",
163
+ "gpu_name = torch.cuda.get_device_name(0)\n",
164
+ "gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9\n",
165
+ "\n",
166
+ "print(f\"GPU: {gpu_name}\")\n",
167
+ "print(f\"VRAM: {gpu_mem_gb:.1f} GB\")\n",
168
+ "\n",
169
+ "# Check minimum VRAM (8B model in 16-bit needs ~35-40GB)\n",
170
+ "MIN_VRAM_GB = 40\n",
171
+ "if gpu_mem_gb < MIN_VRAM_GB:\n",
172
+ " print(f\"\\nWARNING: GPU has {gpu_mem_gb:.1f}GB VRAM, but {MIN_VRAM_GB}GB is recommended.\")\n",
173
+ " print(\"Consider using A40 (48GB) or A100 (80GB) for 8B model training.\")\n",
174
+ " print(\"You may experience OOM errors with smaller GPUs.\")\n",
175
+ "else:\n",
176
+ " print(f\"\\nVRAM check passed ({gpu_mem_gb:.1f}GB >= {MIN_VRAM_GB}GB required).\")"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "markdown",
181
+ "id": "config-header",
182
+ "metadata": {},
183
+ "source": [
184
+ "## Configuration\n",
185
+ "\n",
186
+ "Configure training parameters. Adjust these based on your GPU and requirements."
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": null,
192
+ "id": "config",
193
+ "metadata": {},
194
+ "outputs": [],
195
+ "source": [
196
+ "from pathlib import Path\n",
197
+ "\n",
198
+ "# =============================================================================\n",
199
+ "# REQUIRED: Set your API keys\n",
200
+ "# =============================================================================\n",
201
+ "# Option 1: Set directly (for testing)\n",
202
+ "# os.environ[\"HF_TOKEN\"] = \"hf_...\"\n",
203
+ "# os.environ[\"WANDB_API_KEY\"] = \"...\"\n",
204
+ "\n",
205
+ "# Option 2: Already set in environment (recommended for production)\n",
206
+ "HF_TOKEN = os.environ.get(\"HF_TOKEN\")\n",
207
+ "WANDB_API_KEY = os.environ.get(\"WANDB_API_KEY\")\n",
208
+ "\n",
209
+ "if not HF_TOKEN:\n",
210
+ " print(\"WARNING: HF_TOKEN not set. Model upload will fail.\")\n",
211
+ "if not WANDB_API_KEY:\n",
212
+ " print(\"WARNING: WANDB_API_KEY not set. W&B logging disabled.\")\n",
213
+ "\n",
214
+ "# =============================================================================\n",
215
+ "# Model Configuration\n",
216
+ "# =============================================================================\n",
217
+ "MODEL_NAME = \"unsloth/DeepSeek-R1-0528-Qwen3-8B\"\n",
218
+ "MAX_SEQ_LENGTH = 2048\n",
219
+ "LORA_RANK = 32\n",
220
+ "\n",
221
+ "# =============================================================================\n",
222
+ "# Training Configuration (A40 optimized)\n",
223
+ "# =============================================================================\n",
224
+ "MAX_STEPS = 500\n",
225
+ "SAVE_STEPS = 50\n",
226
+ "LEARNING_RATE = 5e-6\n",
227
+ "WARMUP_RATIO = 0.1\n",
228
+ "\n",
229
+ "# Batch settings (A40 48GB: batch=2, grad_accum=2, generations=4)\n",
230
+ "BATCH_SIZE = 2\n",
231
+ "GRADIENT_ACCUMULATION = 2\n",
232
+ "NUM_GENERATIONS = 4\n",
233
+ "\n",
234
+ "# Sequence lengths\n",
235
+ "MAX_PROMPT_LENGTH = 512\n",
236
+ "MAX_COMPLETION_LENGTH = 1500\n",
237
+ "\n",
238
+ "# vLLM memory allocation\n",
239
+ "GPU_MEMORY_UTILIZATION = 0.6\n",
240
+ "\n",
241
+ "# =============================================================================\n",
242
+ "# Paths (RunPod uses /workspace for persistent storage)\n",
243
+ "# =============================================================================\n",
244
+ "# Try multiple locations for dataset\n",
245
+ "DATASET_PATHS = [\n",
246
+ " Path(\"/workspace/dataset.jsonl\"),\n",
247
+ " Path(\"/workspace/grpo_dataset.jsonl\"),\n",
248
+ " Path(\"./grpo_dataset.jsonl\"),\n",
249
+ " Path(\"../training_data/grpo_dataset.jsonl\"),\n",
250
+ "]\n",
251
+ "\n",
252
+ "DATASET_PATH = None\n",
253
+ "for p in DATASET_PATHS:\n",
254
+ " if p.exists():\n",
255
+ " DATASET_PATH = p\n",
256
+ " break\n",
257
+ "\n",
258
+ "if DATASET_PATH is None:\n",
259
+ " print(f\"WARNING: Dataset not found. Searched: {[str(p) for p in DATASET_PATHS]}\")\n",
260
+ " print(\"Please upload grpo_dataset.jsonl to /workspace/\")\n",
261
+ "else:\n",
262
+ " print(f\"Dataset found: {DATASET_PATH}\")\n",
263
+ "\n",
264
+ "CHECKPOINT_DIR = Path(\"/workspace/checkpoints\")\n",
265
+ "LORA_OUTPUT = Path(\"/workspace/lora-output\")\n",
266
+ "OUTPUT_DIR = Path(\"/workspace/outputs\")\n",
267
+ "\n",
268
+ "# Create directories\n",
269
+ "for d in [CHECKPOINT_DIR, LORA_OUTPUT, OUTPUT_DIR]:\n",
270
+ " d.mkdir(parents=True, exist_ok=True)\n",
271
+ "\n",
272
+ "# =============================================================================\n",
273
+ "# Upload Configuration\n",
274
+ "# =============================================================================\n",
275
+ "HF_REPO = \"prolewiki/marxist-grpo-lora\" # Change to your repo\n",
276
+ "\n",
277
+ "# =============================================================================\n",
278
+ "# Reward Mode: FULL (recommended), ROBUST, or LEGACY\n",
279
+ "# =============================================================================\n",
280
+ "# FULL: NLI + self-consistency + structure + topic + depth (most robust)\n",
281
+ "# ROBUST: NLI + self-consistency + structure (faster)\n",
282
+ "# LEGACY: Semantic similarity + terminology (fastest, less robust)\n",
283
+ "REWARD_MODE = \"FULL\"\n",
284
+ "\n",
285
+ "print(f\"\\nConfiguration:\")\n",
286
+ "print(f\" Model: {MODEL_NAME}\")\n",
287
+ "print(f\" Max Steps: {MAX_STEPS}\")\n",
288
+ "print(f\" Batch Size: {BATCH_SIZE} x {GRADIENT_ACCUMULATION} = {BATCH_SIZE * GRADIENT_ACCUMULATION}\")\n",
289
+ "print(f\" Learning Rate: {LEARNING_RATE}\")\n",
290
+ "print(f\" Reward Mode: {REWARD_MODE}\")"
291
+ ]
292
+ },
293
+ {
294
+ "cell_type": "markdown",
295
+ "id": "wandb-header",
296
+ "metadata": {},
297
+ "source": [
298
+ "## Initialize W&B\n",
299
+ "\n",
300
+ "Set up Weights & Biases for remote monitoring."
301
+ ]
302
+ },
303
+ {
304
+ "cell_type": "code",
305
+ "execution_count": null,
306
+ "id": "wandb-init",
307
+ "metadata": {},
308
+ "outputs": [],
309
+ "source": [
310
+ "USE_WANDB = False\n",
311
+ "\n",
312
+ "if WANDB_API_KEY:\n",
313
+ " import wandb\n",
314
+ " wandb.login(key=WANDB_API_KEY)\n",
315
+ " USE_WANDB = True\n",
316
+ " print(\"W&B logged in successfully.\")\n",
317
+ "else:\n",
318
+ " print(\"W&B disabled (no API key).\")"
319
+ ]
320
+ },
321
+ {
322
+ "cell_type": "markdown",
323
+ "id": "dataset-header",
324
+ "metadata": {},
325
+ "source": [
326
+ "## Load Dataset\n",
327
+ "\n",
328
+ "Load the GRPO-formatted dataset."
329
+ ]
330
+ },
331
+ {
332
+ "cell_type": "code",
333
+ "execution_count": null,
334
+ "id": "load-dataset",
335
+ "metadata": {},
336
+ "outputs": [],
337
+ "source": [
338
+ "from datasets import Dataset\n",
339
+ "\n",
340
+ "if DATASET_PATH is None or not DATASET_PATH.exists():\n",
341
+ " raise FileNotFoundError(\n",
342
+ " \"Dataset not found!\\n\"\n",
343
+ " \"Please upload grpo_dataset.jsonl to /workspace/\\n\"\n",
344
+ " \"Generate it with: python -m prolewiki_llm.export_grpo_dataset\"\n",
345
+ " )\n",
346
+ "\n",
347
+ "dataset = Dataset.from_json(str(DATASET_PATH))\n",
348
+ "print(f\"Loaded {len(dataset):,} examples from {DATASET_PATH}\")\n",
349
+ "\n",
350
+ "# Show sample\n",
351
+ "sample = dataset[0]\n",
352
+ "print(f\"\\nSample prompt: {sample['prompt'][1]['content'][:100]}...\")\n",
353
+ "print(f\"Sample answer: {sample['answer'][:100]}...\")"
354
+ ]
355
+ },
356
+ {
357
+ "cell_type": "markdown",
358
+ "id": "model-header",
359
+ "metadata": {},
360
+ "source": [
361
+ "## Load Model\n",
362
+ "\n",
363
+ "Load DeepSeek-R1-0528-Qwen3-8B with Unsloth for optimized training.\n",
364
+ "\n",
365
+ "**Critical settings:**\n",
366
+ "- `load_in_4bit=False` - Required for GRPO (16-bit LoRA adapters)\n",
367
+ "- `fast_inference=True` - Enables vLLM for 2x faster generation"
368
+ ]
369
+ },
370
+ {
371
+ "cell_type": "code",
372
+ "execution_count": null,
373
+ "id": "load-model",
374
+ "metadata": {},
375
+ "outputs": [],
376
+ "source": [
377
+ "from unsloth import FastLanguageModel\n",
378
+ "\n",
379
+ "print(f\"Loading model: {MODEL_NAME}\")\n",
380
+ "print(\"This may take 5-10 minutes on first run (downloading ~16GB)...\")\n",
381
+ "\n",
382
+ "# GRPO requires 16-bit LoRA adapters (load_in_4bit=False)\n",
383
+ "model, tokenizer = FastLanguageModel.from_pretrained(\n",
384
+ " model_name=MODEL_NAME,\n",
385
+ " max_seq_length=MAX_SEQ_LENGTH,\n",
386
+ " load_in_4bit=False, # MUST be False for GRPO\n",
387
+ " fast_inference=True, # Enables vLLM\n",
388
+ " max_lora_rank=LORA_RANK,\n",
389
+ " gpu_memory_utilization=GPU_MEMORY_UTILIZATION,\n",
390
+ ")\n",
391
+ "\n",
392
+ "print(f\"\\nModel loaded: {model.config.model_type}\")\n",
393
+ "print(f\"Parameters: {sum(p.numel() for p in model.parameters()):,}\")"
394
+ ]
395
+ },
396
+ {
397
+ "cell_type": "markdown",
398
+ "id": "lora-header",
399
+ "metadata": {},
400
+ "source": [
401
+ "## Apply LoRA\n",
402
+ "\n",
403
+ "Apply LoRA adapters for efficient fine-tuning.\n",
404
+ "\n",
405
+ "**Critical settings:**\n",
406
+ "- `use_gradient_checkpointing=True` - Stable on RunPod (NOT `\"unsloth\"`)\n",
407
+ "- `lora_alpha=LORA_RANK` - Same as r for GRPO (scaling = 1.0)"
408
+ ]
409
+ },
410
+ {
411
+ "cell_type": "code",
412
+ "execution_count": null,
413
+ "id": "apply-lora",
414
+ "metadata": {},
415
+ "outputs": [],
416
+ "source": [
417
+ "print(\"Applying LoRA adapters...\")\n",
418
+ "\n",
419
+ "# Use gradient_checkpointing=True (not \"unsloth\") for stability on RunPod\n",
420
+ "model = FastLanguageModel.get_peft_model(\n",
421
+ " model,\n",
422
+ " r=LORA_RANK,\n",
423
+ " target_modules=[\n",
424
+ " \"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
425
+ " \"gate_proj\", \"up_proj\", \"down_proj\",\n",
426
+ " ],\n",
427
+ " lora_alpha=LORA_RANK, # Same as r for GRPO (not r*2)\n",
428
+ " use_gradient_checkpointing=True, # Stable on RunPod (NOT \"unsloth\")\n",
429
+ " random_state=3407,\n",
430
+ ")\n",
431
+ "\n",
432
+ "# Print trainable parameters\n",
433
+ "trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
434
+ "total = sum(p.numel() for p in model.parameters())\n",
435
+ "print(f\"Trainable: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)\")"
436
+ ]
437
+ },
438
+ {
439
+ "cell_type": "markdown",
440
+ "id": "reward-header",
441
+ "metadata": {},
442
+ "source": [
443
+ "## Reward Functions\n",
444
+ "\n",
445
+ "Multi-layer reward system to prevent reward hacking.\n",
446
+ "\n",
447
+ "**Layers:**\n",
448
+ "1. **Format** - Validate `<think>...</think>` tags\n",
449
+ "2. **NLI Coherence** - Response entails ground truth\n",
450
+ "3. **Self-consistency** - No internal contradictions\n",
451
+ "4. **Structural** - Terms in proper syntactic roles\n",
452
+ "5. **Topic relevance** - Answer addresses the question\n",
453
+ "6. **Depth** - Rewards analysis, penalizes buzzword salad"
454
+ ]
455
+ },
456
+ {
457
+ "cell_type": "code",
458
+ "execution_count": null,
459
+ "id": "reward-setup",
460
+ "metadata": {},
461
+ "outputs": [],
462
+ "source": [
463
+ "import re\n",
464
+ "import numpy as np\n",
465
+ "from sentence_transformers import SentenceTransformer\n",
466
+ "\n",
467
+ "# =============================================================================\n",
468
+ "# Lazy-loaded models\n",
469
+ "# =============================================================================\n",
470
+ "_embedder = None\n",
471
+ "_nli_pipeline = None\n",
472
+ "_spacy_nlp = None\n",
473
+ "\n",
474
+ "\n",
475
+ "def get_embedder():\n",
476
+ " global _embedder\n",
477
+ " if _embedder is None:\n",
478
+ " print(\"[Reward] Loading sentence-transformers embedder...\")\n",
479
+ " _embedder = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
480
+ " return _embedder\n",
481
+ "\n",
482
+ "\n",
483
+ "def get_nli_pipeline():\n",
484
+ " global _nli_pipeline\n",
485
+ " if _nli_pipeline is None:\n",
486
+ " print(\"[Reward] Loading NLI model (bart-large-mnli)...\")\n",
487
+ " from transformers import pipeline\n",
488
+ " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
489
+ " _nli_pipeline = pipeline(\n",
490
+ " \"text-classification\",\n",
491
+ " model=\"facebook/bart-large-mnli\",\n",
492
+ " device=device,\n",
493
+ " )\n",
494
+ " return _nli_pipeline\n",
495
+ "\n",
496
+ "\n",
497
+ "def get_spacy_nlp():\n",
498
+ " global _spacy_nlp\n",
499
+ " if _spacy_nlp is None:\n",
500
+ " import spacy\n",
501
+ " models = [\"en_core_web_trf\", \"en_core_web_md\", \"en_core_web_sm\"]\n",
502
+ " for model_name in models:\n",
503
+ " try:\n",
504
+ " print(f\"[Reward] Loading spaCy model: {model_name}...\")\n",
505
+ " _spacy_nlp = spacy.load(model_name)\n",
506
+ " break\n",
507
+ " except OSError:\n",
508
+ " continue\n",
509
+ " if _spacy_nlp is None:\n",
510
+ " raise OSError(\"No spaCy model found!\")\n",
511
+ " return _spacy_nlp\n",
512
+ "\n",
513
+ "\n",
514
+ "# =============================================================================\n",
515
+ "# Constants\n",
516
+ "# =============================================================================\n",
517
+ "REASONING_START = \"<think>\"\n",
518
+ "REASONING_END = \"</think>\"\n",
519
+ "SOLUTION_END_REGEX = re.compile(rf\"{REASONING_END}(.*)\", re.DOTALL)\n",
520
+ "\n",
521
+ "MARXIST_TERMS = {\n",
522
+ " \"dialectical\", \"materialism\", \"historical materialism\", \"dialectical materialism\",\n",
523
+ " \"bourgeoisie\", \"proletariat\", \"petty bourgeois\", \"petty bourgeoisie\",\n",
524
+ " \"class struggle\", \"class consciousness\", \"surplus value\", \"commodity\",\n",
525
+ " \"use value\", \"exchange value\", \"labor power\", \"means of production\",\n",
526
+ " \"relations of production\", \"forces of production\", \"mode of production\",\n",
527
+ " \"exploitation\", \"capital accumulation\", \"imperialism\", \"colonialism\",\n",
528
+ " \"dictatorship of the proletariat\", \"vanguard\", \"democratic centralism\",\n",
529
+ " \"hegemony\", \"superstructure\", \"base\", \"ideology\", \"false consciousness\",\n",
530
+ " \"revisionism\", \"opportunism\", \"reformism\", \"alienation\", \"fetishism\",\n",
531
+ "}\n",
532
+ "\n",
533
+ "DISCOURSE_CONNECTIVES = {\n",
534
+ " \"because\", \"therefore\", \"thus\", \"hence\", \"consequently\", \"however\",\n",
535
+ " \"although\", \"whereas\", \"nevertheless\", \"moreover\", \"furthermore\",\n",
536
+ " \"specifically\", \"namely\", \"as a result\", \"due to\", \"in order to\",\n",
537
+ "}\n",
538
+ "\n",
539
+ "print(\"Reward function constants defined.\")"
540
+ ]
541
+ },
542
+ {
543
+ "cell_type": "code",
544
+ "execution_count": null,
545
+ "id": "reward-functions",
546
+ "metadata": {},
547
+ "outputs": [],
548
+ "source": [
549
+ "# =============================================================================\n",
550
+ "# Format Rewards\n",
551
+ "# =============================================================================\n",
552
+ "\n",
553
+ "def match_format_exactly(completions, **kwargs):\n",
554
+ " \"\"\"Reward +3.0 if response contains proper </think> tag.\"\"\"\n",
555
+ " scores = []\n",
556
+ " for completion in completions:\n",
557
+ " response = completion[0][\"content\"] if isinstance(completion, list) else completion\n",
558
+ " score = 3.0 if SOLUTION_END_REGEX.search(response) else 0.0\n",
559
+ " scores.append(score)\n",
560
+ " return scores\n",
561
+ "\n",
562
+ "\n",
563
+ "def match_format_approximately(completions, **kwargs):\n",
564
+ " \"\"\"Reward partial format matching.\"\"\"\n",
565
+ " scores = []\n",
566
+ " for completion in completions:\n",
567
+ " response = completion[0][\"content\"] if isinstance(completion, list) else completion\n",
568
+ " score = 0.0\n",
569
+ " score += 0.5 if response.count(REASONING_START) == 1 else -1.0\n",
570
+ " score += 0.5 if response.count(REASONING_END) == 1 else -1.0\n",
571
+ " scores.append(score)\n",
572
+ " return scores\n",
573
+ "\n",
574
+ "\n",
575
+ "# =============================================================================\n",
576
+ "# NLI Coherence Reward\n",
577
+ "# =============================================================================\n",
578
+ "\n",
579
+ "def nli_coherence_reward(completions, answer, **kwargs):\n",
580
+ " \"\"\"Reward responses that logically ENTAIL the ground truth.\"\"\"\n",
581
+ " nli = get_nli_pipeline()\n",
582
+ " scores = []\n",
583
+ " \n",
584
+ " for completion, true_answer in zip(completions, answer):\n",
585
+ " response = completion[0][\"content\"] if isinstance(completion, list) else completion\n",
586
+ " \n",
587
+ " # Extract answer after </think>\n",
588
+ " if REASONING_END in response:\n",
589
+ " response = response.split(REASONING_END, 1)[1].strip()\n",
590
+ " \n",
591
+ " if not response or len(response.strip()) < 20:\n",
592
+ " scores.append(-2.0)\n",
593
+ " continue\n",
594
+ " \n",
595
+ " try:\n",
596
+ " input_text = f\"{response[:512]}</s></s>{true_answer[:512]}\"\n",
597
+ " result = nli(input_text)[0]\n",
598
+ " label = result[\"label\"].lower()\n",
599
+ " \n",
600
+ " if label == \"entailment\":\n",
601
+ " score = 3.0\n",
602
+ " elif label == \"neutral\":\n",
603
+ " score = -1.0\n",
604
+ " else: # contradiction\n",
605
+ " score = -3.0\n",
606
+ " scores.append(score)\n",
607
+ " except Exception as e:\n",
608
+ " print(f\"[NLI] Error: {e}\")\n",
609
+ " scores.append(0.0)\n",
610
+ " \n",
611
+ " return scores\n",
612
+ "\n",
613
+ "\n",
614
+ "# =============================================================================\n",
615
+ "# Structural Coherence Reward\n",
616
+ "# =============================================================================\n",
617
+ "\n",
618
+ "def structural_coherence_reward(completions, **kwargs):\n",
619
+ " \"\"\"Reward responses with proper linguistic structure.\"\"\"\n",
620
+ " nlp = get_spacy_nlp()\n",
621
+ " scores = []\n",
622
+ " \n",
623
+ " for completion in completions:\n",
624
+ " response = completion[0][\"content\"] if isinstance(completion, list) else completion\n",
625
+ " doc = nlp(response)\n",
626
+ " score = 0.0\n",
627
+ " \n",
628
+ " # Check for sentences\n",
629
+ " sentences = list(doc.sents)\n",
630
+ " if len(sentences) < 1:\n",
631
+ " scores.append(-1.0)\n",
632
+ " continue\n",
633
+ " \n",
634
+ " # Check Marxist terms in meaningful syntactic roles\n",
635
+ " response_lower = response.lower()\n",
636
+ " terms_in_context = 0\n",
637
+ " \n",
638
+ " for term in MARXIST_TERMS:\n",
639
+ " if term not in response_lower:\n",
640
+ " continue\n",
641
+ " for token in doc:\n",
642
+ " if term in token.text.lower():\n",
643
+ " if token.dep_ in (\"nsubj\", \"nsubjpass\", \"dobj\", \"pobj\", \"attr\", \"appos\"):\n",
644
+ " terms_in_context += 1\n",
645
+ " break\n",
646
+ " \n",
647
+ " score += min(terms_in_context * 0.3, 1.5)\n",
648
+ " \n",
649
+ " # Check discourse connectives\n",
650
+ " connective_count = sum(1 for c in DISCOURSE_CONNECTIVES if c in response_lower)\n",
651
+ " score += min(connective_count * 0.2, 1.0)\n",
652
+ " \n",
653
+ " scores.append(score)\n",
654
+ " \n",
655
+ " return scores\n",
656
+ "\n",
657
+ "\n",
658
+ "# =============================================================================\n",
659
+ "# Combined Rewards\n",
660
+ "# =============================================================================\n",
661
+ "\n",
662
+ "def full_coherence_reward(prompts, completions, answer, **kwargs):\n",
663
+ " \"\"\"Complete coherence check: NLI + structure.\"\"\"\n",
664
+ " nli_scores = nli_coherence_reward(completions, answer, **kwargs)\n",
665
+ " structure_scores = structural_coherence_reward(completions, **kwargs)\n",
666
+ " \n",
667
+ " combined = []\n",
668
+ " for nli, structure in zip(nli_scores, structure_scores):\n",
669
+ " if nli <= -3.0:\n",
670
+ " combined.append(-3.0)\n",
671
+ " else:\n",
672
+ " combined.append(nli + (structure * 0.5))\n",
673
+ " return combined\n",
674
+ "\n",
675
+ "\n",
676
+ "def completeness_reward(completions, answer, **kwargs):\n",
677
+ " \"\"\"Reward thorough, detailed responses.\"\"\"\n",
678
+ " scores = []\n",
679
+ " \n",
680
+ " for completion, true_answer in zip(completions, answer):\n",
681
+ " response = completion[0][\"content\"] if isinstance(completion, list) else completion\n",
682
+ " \n",
683
+ " if REASONING_END in response:\n",
684
+ " answer_part = response.split(REASONING_END, 1)[1].strip()\n",
685
+ " else:\n",
686
+ " answer_part = response\n",
687
+ " \n",
688
+ " answer_len = len(answer_part.split())\n",
689
+ " true_len = len(true_answer.split())\n",
690
+ " \n",
691
+ " if true_len == 0:\n",
692
+ " scores.append(0.0)\n",
693
+ " continue\n",
694
+ " \n",
695
+ " ratio = answer_len / true_len\n",
696
+ " if 0.5 <= ratio <= 1.5:\n",
697
+ " score = 2.0\n",
698
+ " elif 0.3 <= ratio <= 2.0:\n",
699
+ " score = 1.0\n",
700
+ " elif ratio < 0.2:\n",
701
+ " score = -2.0\n",
702
+ " else:\n",
703
+ " score = -0.5\n",
704
+ " \n",
705
+ " scores.append(score)\n",
706
+ " \n",
707
+ " return scores\n",
708
+ "\n",
709
+ "\n",
710
+ "# =============================================================================\n",
711
+ "# Debug Reward\n",
712
+ "# =============================================================================\n",
713
+ "_PRINT_COUNTER = 0\n",
714
+ "_PRINT_EVERY = 10\n",
715
+ "\n",
716
+ "def debug_print_reward(prompts, completions, answer, **kwargs):\n",
717
+ " \"\"\"Print sample outputs periodically.\"\"\"\n",
718
+ " global _PRINT_COUNTER\n",
719
+ " \n",
720
+ " if _PRINT_COUNTER % _PRINT_EVERY == 0:\n",
721
+ " question = prompts[0][-1][\"content\"] if isinstance(prompts[0], list) else prompts[0]\n",
722
+ " response = completions[0][0][\"content\"] if isinstance(completions[0], list) else completions[0]\n",
723
+ " \n",
724
+ " print(\"=\" * 60)\n",
725
+ " print(f\"Step {_PRINT_COUNTER}\")\n",
726
+ " print(f\"Q: {question[:100]}...\")\n",
727
+ " print(f\"A: {response[:200]}...\")\n",
728
+ " print(\"=\" * 60)\n",
729
+ " \n",
730
+ " _PRINT_COUNTER += 1\n",
731
+ " return [0.0] * len(completions)\n",
732
+ "\n",
733
+ "\n",
734
+ "print(\"Reward functions defined.\")"
735
+ ]
736
+ },
737
+ {
738
+ "cell_type": "markdown",
739
+ "id": "trainer-header",
740
+ "metadata": {},
741
+ "source": [
742
+ "## Configure GRPO Trainer\n",
743
+ "\n",
744
+ "Set up the GRPO trainer with vLLM for fast generation."
745
+ ]
746
+ },
747
+ {
748
+ "cell_type": "code",
749
+ "execution_count": null,
750
+ "id": "trainer-config",
751
+ "metadata": {},
752
+ "outputs": [],
753
+ "source": [
754
+ "from trl import GRPOConfig, GRPOTrainer\n",
755
+ "from vllm import SamplingParams\n",
756
+ "\n",
757
+ "# vLLM sampling parameters for generation\n",
758
+ "vllm_sampling_params = SamplingParams(\n",
759
+ " min_p=0.1,\n",
760
+ " top_p=1.0,\n",
761
+ " top_k=-1,\n",
762
+ " max_tokens=MAX_COMPLETION_LENGTH,\n",
763
+ " stop=[tokenizer.eos_token],\n",
764
+ " include_stop_str_in_output=True,\n",
765
+ " seed=3407,\n",
766
+ ")\n",
767
+ "\n",
768
+ "# Training configuration\n",
769
+ "training_args = GRPOConfig(\n",
770
+ " # vLLM\n",
771
+ " vllm_sampling_params=vllm_sampling_params,\n",
772
+ " temperature=1.0,\n",
773
+ " # Optimization\n",
774
+ " learning_rate=LEARNING_RATE,\n",
775
+ " weight_decay=0.001,\n",
776
+ " warmup_ratio=WARMUP_RATIO,\n",
777
+ " lr_scheduler_type=\"linear\",\n",
778
+ " optim=\"adamw_8bit\",\n",
779
+ " # Batch settings\n",
780
+ " per_device_train_batch_size=BATCH_SIZE,\n",
781
+ " gradient_accumulation_steps=GRADIENT_ACCUMULATION,\n",
782
+ " num_generations=NUM_GENERATIONS,\n",
783
+ " # Sequence lengths\n",
784
+ " max_prompt_length=MAX_PROMPT_LENGTH,\n",
785
+ " max_completion_length=MAX_COMPLETION_LENGTH,\n",
786
+ " # Training duration\n",
787
+ " max_steps=MAX_STEPS,\n",
788
+ " save_steps=SAVE_STEPS,\n",
789
+ " # Logging\n",
790
+ " logging_steps=1,\n",
791
+ " report_to=\"wandb\" if USE_WANDB else \"none\",\n",
792
+ " # Output\n",
793
+ " output_dir=str(CHECKPOINT_DIR),\n",
794
+ ")\n",
795
+ "\n",
796
+ "print(f\"Training configuration:\")\n",
797
+ "print(f\" Batch: {BATCH_SIZE} x {GRADIENT_ACCUMULATION} x {NUM_GENERATIONS}\")\n",
798
+ "print(f\" Effective batch: {BATCH_SIZE * GRADIENT_ACCUMULATION * NUM_GENERATIONS}\")\n",
799
+ "print(f\" Max steps: {MAX_STEPS}\")\n",
800
+ "print(f\" W&B: {'enabled' if USE_WANDB else 'disabled'}\")"
801
+ ]
802
+ },
803
+ {
804
+ "cell_type": "code",
805
+ "execution_count": null,
806
+ "id": "create-trainer",
807
+ "metadata": {},
808
+ "outputs": [],
809
+ "source": [
810
+ "# Select reward functions based on mode\n",
811
+ "if REWARD_MODE == \"FULL\":\n",
812
+ " print(f\"\\nReward mode: FULL (recommended)\")\n",
813
+ " print(\" - match_format_exactly, match_format_approximately\")\n",
814
+ " print(\" - full_coherence_reward (NLI + structure)\")\n",
815
+ " print(\" - completeness_reward, debug_print_reward\")\n",
816
+ " reward_funcs = [\n",
817
+ " match_format_exactly,\n",
818
+ " match_format_approximately,\n",
819
+ " full_coherence_reward,\n",
820
+ " completeness_reward,\n",
821
+ " debug_print_reward,\n",
822
+ " ]\n",
823
+ "elif REWARD_MODE == \"ROBUST\":\n",
824
+ " print(f\"\\nReward mode: ROBUST\")\n",
825
+ " reward_funcs = [\n",
826
+ " match_format_exactly,\n",
827
+ " match_format_approximately,\n",
828
+ " nli_coherence_reward,\n",
829
+ " structural_coherence_reward,\n",
830
+ " completeness_reward,\n",
831
+ " debug_print_reward,\n",
832
+ " ]\n",
833
+ "else: # LEGACY\n",
834
+ " print(f\"\\nReward mode: LEGACY (faster, less robust)\")\n",
835
+ " reward_funcs = [\n",
836
+ " match_format_exactly,\n",
837
+ " match_format_approximately,\n",
838
+ " completeness_reward,\n",
839
+ " debug_print_reward,\n",
840
+ " ]\n",
841
+ "\n",
842
+ "print(\"\\nNote: First training step will download NLI model (~1.6GB) + spaCy (~436MB)\")\n",
843
+ "\n",
844
+ "# Create trainer\n",
845
+ "trainer = GRPOTrainer(\n",
846
+ " model=model,\n",
847
+ " processing_class=tokenizer,\n",
848
+ " reward_funcs=reward_funcs,\n",
849
+ " args=training_args,\n",
850
+ " train_dataset=dataset,\n",
851
+ ")\n",
852
+ "\n",
853
+ "print(\"\\nGRPO trainer created.\")"
854
+ ]
855
+ },
856
+ {
857
+ "cell_type": "markdown",
858
+ "id": "train-header",
859
+ "metadata": {},
860
+ "source": [
861
+ "## Train!\n",
862
+ "\n",
863
+ "Run GRPO training. Monitor the `reward` column - it should increase over time.\n",
864
+ "\n",
865
+ "**Expected behavior:**\n",
866
+ "- Steps 0-50: Format rewards stabilize\n",
867
+ "- Steps 50-150: NLI coherence improves\n",
868
+ "- Steps 150-500: Overall quality improves\n",
869
+ "\n",
870
+ "**Estimated time:** ~2-3 hours on A40 for 500 steps"
871
+ ]
872
+ },
873
+ {
874
+ "cell_type": "code",
875
+ "execution_count": null,
876
+ "id": "check-resume",
877
+ "metadata": {},
878
+ "outputs": [],
879
+ "source": [
880
+ "# Check for checkpoint to resume from\n",
881
+ "def find_latest_checkpoint(checkpoint_dir):\n",
882
+ " if not checkpoint_dir.exists():\n",
883
+ " return None\n",
884
+ " checkpoints = sorted(\n",
885
+ " [d for d in checkpoint_dir.iterdir() if d.is_dir() and d.name.startswith(\"checkpoint-\")],\n",
886
+ " key=lambda d: int(d.name.split(\"-\")[1]),\n",
887
+ " )\n",
888
+ " return checkpoints[-1] if checkpoints else None\n",
889
+ "\n",
890
+ "resume_from = find_latest_checkpoint(CHECKPOINT_DIR)\n",
891
+ "if resume_from:\n",
892
+ " print(f\"Found checkpoint: {resume_from}\")\n",
893
+ " print(\"Training will resume from this checkpoint.\")\n",
894
+ "else:\n",
895
+ " print(\"No checkpoint found. Starting fresh training.\")"
896
+ ]
897
+ },
898
+ {
899
+ "cell_type": "code",
900
+ "execution_count": null,
901
+ "id": "train",
902
+ "metadata": {},
903
+ "outputs": [],
904
+ "source": [
905
+ "print(\"=\" * 70)\n",
906
+ "print(\"STARTING TRAINING\")\n",
907
+ "print(\"=\" * 70)\n",
908
+ "print(f\"Steps: {MAX_STEPS}\")\n",
909
+ "print(f\"Batch: {BATCH_SIZE} x {GRADIENT_ACCUMULATION} x {NUM_GENERATIONS}\")\n",
910
+ "print(f\"Learning rate: {LEARNING_RATE}\")\n",
911
+ "print(f\"Reward mode: {REWARD_MODE}\")\n",
912
+ "print()\n",
913
+ "\n",
914
+ "try:\n",
915
+ " if resume_from:\n",
916
+ " trainer.train(resume_from_checkpoint=str(resume_from))\n",
917
+ " else:\n",
918
+ " trainer.train()\n",
919
+ "except KeyboardInterrupt:\n",
920
+ " print(\"\\nTraining interrupted. Saving checkpoint...\")\n",
921
+ "except Exception as e:\n",
922
+ " print(f\"\\nTraining error: {e}\")\n",
923
+ " raise\n",
924
+ "\n",
925
+ "print(\"\\nTraining complete!\")"
926
+ ]
927
+ },
928
+ {
929
+ "cell_type": "markdown",
930
+ "id": "save-header",
931
+ "metadata": {},
932
+ "source": [
933
+ "## Save Model\n",
934
+ "\n",
935
+ "Save the trained LoRA adapter."
936
+ ]
937
+ },
938
+ {
939
+ "cell_type": "code",
940
+ "execution_count": null,
941
+ "id": "save-model",
942
+ "metadata": {},
943
+ "outputs": [],
944
+ "source": [
945
+ "print(\"Saving LoRA adapter...\")\n",
946
+ "\n",
947
+ "LORA_OUTPUT.mkdir(parents=True, exist_ok=True)\n",
948
+ "model.save_lora(str(LORA_OUTPUT))\n",
949
+ "\n",
950
+ "print(f\"LoRA saved to: {LORA_OUTPUT}\")"
951
+ ]
952
+ },
953
+ {
954
+ "cell_type": "code",
955
+ "execution_count": null,
956
+ "id": "verify-lora",
957
+ "metadata": {},
958
+ "outputs": [],
959
+ "source": [
960
+ "# Verify LoRA has non-zero weights (actually trained)\n",
961
+ "from safetensors import safe_open\n",
962
+ "\n",
963
+ "adapter_path = LORA_OUTPUT / \"adapter_model.safetensors\"\n",
964
+ "if adapter_path.exists():\n",
965
+ " print(\"Verifying LoRA weights...\")\n",
966
+ " with safe_open(str(adapter_path), framework=\"pt\") as f:\n",
967
+ " for key in list(f.keys())[:5]:\n",
968
+ " tensor = f.get_tensor(key)\n",
969
+ " n_nonzero = (tensor != 0).sum().item()\n",
970
+ " ratio = n_nonzero / tensor.numel()\n",
971
+ " status = \"OK\" if ratio > 0.01 else \"WARNING: mostly zeros!\"\n",
972
+ " print(f\" {key}: {ratio*100:.1f}% non-zero - {status}\")\n",
973
+ " print(\"\\nLoRA verification complete.\")\n",
974
+ "else:\n",
975
+ " print(f\"WARNING: Adapter not found at {adapter_path}\")"
976
+ ]
977
+ },
978
+ {
979
+ "cell_type": "markdown",
980
+ "id": "upload-header",
981
+ "metadata": {},
982
+ "source": [
983
+ "## Upload to HuggingFace Hub"
984
+ ]
985
+ },
986
+ {
987
+ "cell_type": "code",
988
+ "execution_count": null,
989
+ "id": "upload-hub",
990
+ "metadata": {},
991
+ "outputs": [],
992
+ "source": [
993
+ "if HF_TOKEN:\n",
994
+ " from huggingface_hub import HfApi\n",
995
+ " \n",
996
+ " print(f\"Uploading to HuggingFace Hub: {HF_REPO}\")\n",
997
+ " \n",
998
+ " api = HfApi(token=HF_TOKEN)\n",
999
+ " \n",
1000
+ " # Create repo if needed\n",
1001
+ " try:\n",
1002
+ " api.create_repo(HF_REPO, exist_ok=True, private=True)\n",
1003
+ " except Exception as e:\n",
1004
+ " print(f\"Note: {e}\")\n",
1005
+ " \n",
1006
+ " # Upload\n",
1007
+ " api.upload_folder(\n",
1008
+ " folder_path=str(LORA_OUTPUT),\n",
1009
+ " repo_id=HF_REPO,\n",
1010
+ " commit_message=\"GRPO training run\",\n",
1011
+ " )\n",
1012
+ " \n",
1013
+ " print(f\"\\nModel uploaded to: https://huggingface.co/{HF_REPO}\")\n",
1014
+ "else:\n",
1015
+ " print(\"HF_TOKEN not set. Skipping upload.\")\n",
1016
+ " print(f\"LoRA saved locally at: {LORA_OUTPUT}\")"
1017
+ ]
1018
+ },
1019
+ {
1020
+ "cell_type": "markdown",
1021
+ "id": "test-header",
1022
+ "metadata": {},
1023
+ "source": [
1024
+ "## Test Model"
1025
+ ]
1026
+ },
1027
+ {
1028
+ "cell_type": "code",
1029
+ "execution_count": null,
1030
+ "id": "test-model",
1031
+ "metadata": {},
1032
+ "outputs": [],
1033
+ "source": [
1034
+ "TEST_QUESTIONS = [\n",
1035
+ " \"What is revisionism in the Marxist sense?\",\n",
1036
+ " \"Explain the concept of surplus value.\",\n",
1037
+ " \"What is the dictatorship of the proletariat?\",\n",
1038
+ "]\n",
1039
+ "\n",
1040
+ "SYSTEM_PROMPT = \"\"\"You are a Marxist-Leninist assistant trained on ProleWiki.\n",
1041
+ "Think through questions using dialectical materialist analysis.\n",
1042
+ "Show your reasoning in <think> tags, then provide a clear answer.\"\"\"\n",
1043
+ "\n",
1044
+ "print(\"Testing trained model...\")\n",
1045
+ "print(\"=\" * 60)\n",
1046
+ "\n",
1047
+ "for question in TEST_QUESTIONS:\n",
1048
+ " messages = [\n",
1049
+ " {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
1050
+ " {\"role\": \"user\", \"content\": question},\n",
1051
+ " ]\n",
1052
+ " \n",
1053
+ " text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)\n",
1054
+ " inputs = tokenizer(text, return_tensors=\"pt\").to(model.device)\n",
1055
+ " \n",
1056
+ " with torch.no_grad():\n",
1057
+ " outputs = model.generate(\n",
1058
+ " **inputs,\n",
1059
+ " max_new_tokens=512,\n",
1060
+ " temperature=0.7,\n",
1061
+ " do_sample=True,\n",
1062
+ " pad_token_id=tokenizer.pad_token_id,\n",
1063
+ " )\n",
1064
+ " \n",
1065
+ " response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)\n",
1066
+ " \n",
1067
+ " print(f\"\\nQ: {question}\")\n",
1068
+ " print(f\"A: {response[:400]}...\")\n",
1069
+ " print(\"-\" * 60)"
1070
+ ]
1071
+ },
1072
+ {
1073
+ "cell_type": "markdown",
1074
+ "id": "complete-header",
1075
+ "metadata": {},
1076
+ "source": [
1077
+ "## Training Complete!\n",
1078
+ "\n",
1079
+ "**Summary:**\n",
1080
+ "- Model: DeepSeek-R1-0528-Qwen3-8B with LoRA\n",
1081
+ "- Training: GRPO with multi-layer reward functions\n",
1082
+ "- Output: LoRA adapter saved to `/workspace/lora-output`\n",
1083
+ "\n",
1084
+ "**Next steps:**\n",
1085
+ "1. Review W&B dashboard for training metrics\n",
1086
+ "2. Test model quality with varied questions\n",
1087
+ "3. If satisfied, merge LoRA into base model for deployment\n",
1088
+ "4. Convert to GGUF for Ollama if needed\n",
1089
+ "\n",
1090
+ "**Remember:** Stop the pod after training to avoid charges!"
1091
+ ]
1092
+ }
1093
+ ],
1094
+ "metadata": {
1095
+ "kernelspec": {
1096
+ "display_name": "Python 3",
1097
+ "language": "python",
1098
+ "name": "python3"
1099
+ },
1100
+ "language_info": {
1101
+ "name": "python",
1102
+ "version": "3.12.0"
1103
+ }
1104
+ },
1105
+ "nbformat": 4,
1106
+ "nbformat_minor": 5
1107
+ }
pyproject.toml ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "prolewiki-llm"
3
+ version = "0.1.0"
4
+ description = "GRPO fine-tuning and reward functions for Marxist-Leninist language models"
5
+ readme = "README.md"
6
+ license = { text = "AGPL-3.0-only" }
7
+ requires-python = ">=3.12"
8
+ dependencies = [
9
+ # Core ML
10
+ "transformers>=4.40.0",
11
+ "sentence-transformers>=3.0.0",
12
+ "torch>=2.0.0",
13
+ # NLP
14
+ "spacy>=3.8.0",
15
+ # Logging
16
+ "wandb>=0.17.0",
17
+ # Data
18
+ "pydantic>=2.0.0",
19
+ ]
20
+
21
+ [dependency-groups]
22
+ dev = [
23
+ # Testing
24
+ "pytest>=8.0.0",
25
+ "pytest-asyncio>=0.24.0",
26
+ "pytest-cov>=6.0.0",
27
+ "pytest-mock>=3.14.0",
28
+ # Code Quality
29
+ "mypy>=1.13.0",
30
+ "ruff>=0.8.0",
31
+ "pre-commit>=4.0.0",
32
+ # Type Stubs
33
+ "types-PyYAML>=6.0.0",
34
+ ]
35
+ training = [
36
+ # GRPO training (GPU required)
37
+ "unsloth>=2024.8",
38
+ "trl>=0.9.0",
39
+ "peft>=0.12.0",
40
+ "bitsandbytes>=0.43.0",
41
+ "datasets>=2.20.0",
42
+ "vllm>=0.5.0",
43
+ ]
44
+
45
+ [build-system]
46
+ requires = ["hatchling"]
47
+ build-backend = "hatchling.build"
48
+
49
+ [tool.hatch.build.targets.wheel]
50
+ packages = ["src/prolewiki_llm"]
51
+
52
+ # =============================================================================
53
+ # PYTEST
54
+ # =============================================================================
55
+ [tool.pytest.ini_options]
56
+ pythonpath = ["src"]
57
+ testpaths = ["tests"]
58
+ asyncio_mode = "auto"
59
+ asyncio_default_fixture_loop_scope = "function"
60
+ markers = [
61
+ "unit: Fast unit tests for isolated components (no I/O)",
62
+ "integration: Integration tests for shell scripts and Docker behavior",
63
+ "slow: Tests that take significant time (NLI, embedding)",
64
+ "gpu: Tests requiring GPU (training)",
65
+ ]
66
+
67
+ # =============================================================================
68
+ # RUFF (Linting + Formatting + Import Sorting)
69
+ # =============================================================================
70
+ [tool.ruff]
71
+ line-length = 100
72
+ target-version = "py312"
73
+ src = ["src"]
74
+
75
+ [tool.ruff.lint]
76
+ select = [
77
+ "E", # pycodestyle errors
78
+ "W", # pycodestyle warnings
79
+ "F", # pyflakes
80
+ "I", # isort (import sorting)
81
+ "B", # flake8-bugbear
82
+ "C4", # flake8-comprehensions
83
+ "UP", # pyupgrade
84
+ "ARG", # unused arguments
85
+ "SIM", # flake8-simplify
86
+ "RUF", # ruff-specific
87
+ ]
88
+ ignore = [
89
+ "E501", # line length handled by formatter
90
+ "RUF001", # ambiguous unicode (we use Cyrillic/Chinese text)
91
+ ]
92
+
93
+ [tool.ruff.lint.per-file-ignores]
94
+ "tests/**/*.py" = [
95
+ "ARG001", # unused mock function arguments are common in tests
96
+ "ARG002", # unused method arguments (fixtures) are common in tests
97
+ "SIM117", # nested with statements for multiple mocks are clearer
98
+ ]
99
+ "src/prolewiki_llm/**/*.py" = [
100
+ "ARG001", # **kwargs required by GRPOTrainer reward function interface
101
+ ]
102
+
103
+ [tool.ruff.lint.isort]
104
+ known-first-party = ["prolewiki_llm"]
105
+
106
+ [tool.ruff.format]
107
+ quote-style = "double"
108
+ indent-style = "space"
109
+
110
+ # =============================================================================
111
+ # MYPY (Type Checking)
112
+ # =============================================================================
113
+ [tool.mypy]
114
+ python_version = "3.12"
115
+ strict = true
116
+ mypy_path = ["src"]
117
+ explicit_package_bases = true
118
+ warn_return_any = true
119
+ warn_unused_configs = true
120
+ disallow_untyped_defs = true
121
+ disallow_incomplete_defs = true
122
+ check_untyped_defs = true
123
+ no_implicit_optional = true
124
+ warn_redundant_casts = true
125
+ warn_unused_ignores = true
126
+ show_error_codes = true
127
+ exclude = ["tests/", "build/", "dist/"]
128
+
129
+ [[tool.mypy.overrides]]
130
+ module = [
131
+ "transformers",
132
+ "transformers.*",
133
+ "sentence_transformers.*",
134
+ "spacy",
135
+ "spacy.*",
136
+ "torch",
137
+ "torch.*",
138
+ "wandb",
139
+ "wandb.*",
140
+ "datasets",
141
+ "datasets.*",
142
+ "trl",
143
+ "trl.*",
144
+ "unsloth",
145
+ "unsloth.*",
146
+ "vllm",
147
+ "vllm.*",
148
+ "peft",
149
+ "peft.*",
150
+ ]
151
+ ignore_missing_imports = true
152
+
153
+ # =============================================================================
154
+ # COVERAGE
155
+ # =============================================================================
156
+ [tool.coverage.run]
157
+ source = ["src/prolewiki_llm"]
158
+ branch = true
159
+ omit = ["*/tests/*"]
160
+
161
+ [tool.coverage.report]
162
+ exclude_lines = [
163
+ "pragma: no cover",
164
+ "if TYPE_CHECKING:",
165
+ "raise NotImplementedError",
166
+ ]
src/prolewiki_llm/__init__.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AI Training Module for Marxist-Leninist LLM Fine-tuning.
3
+
4
+ This module contains reward functions and training utilities for GRPO
5
+ (Group Relative Policy Optimization) fine-tuning on ProleWiki corpus.
6
+
7
+ Components:
8
+ - grpo_rewards: Reward functions for GRPO training
9
+ - wandb_logging: Weights & Biases logging for training observability
10
+ - train_grpo_marxist: Main GRPO training script
11
+ - transform_to_grpo: Dataset transformation utilities
12
+ - convert_to_qwen: Qwen format conversion
13
+ """
14
+
15
+ from prolewiki_llm.grpo_rewards import (
16
+ CAPITULATION_PATTERNS,
17
+ CLASS_ANALYSIS_MARKERS,
18
+ CONCEPT_EQUIVALENCES,
19
+ CONFIDENT_CLAIM_PATTERNS,
20
+ DEPTH_MARKERS,
21
+ DISCOURSE_CONNECTIVES,
22
+ EXPLANATORY_PHRASES,
23
+ FIRMNESS_PATTERNS,
24
+ HOLLOW_BUZZWORDS,
25
+ IDEOLOGICAL_CHALLENGE_PATTERNS,
26
+ MARXIST_TERMS,
27
+ QUESTION_WORDS,
28
+ QUOTE_TO_REFUTE_PATTERNS,
29
+ SELF_CRITICISM_MARKERS,
30
+ UNCERTAINTY_PATTERNS,
31
+ completeness_reward,
32
+ debug_print_reward,
33
+ entity_verification_reward,
34
+ epistemic_calibration_reward,
35
+ full_coherence_reward,
36
+ ideological_firmness_reward,
37
+ interconnection_depth_reward,
38
+ match_format_approximately,
39
+ match_format_exactly,
40
+ nli_coherence_reward,
41
+ robust_coherence_reward,
42
+ self_consistency_reward,
43
+ semantic_similarity_reward,
44
+ structural_coherence_reward,
45
+ terminology_reward,
46
+ topic_relevance_reward,
47
+ )
48
+ from prolewiki_llm.wandb_logging import (
49
+ RewardSample,
50
+ WandbSampleLogger,
51
+ create_logging_reward,
52
+ finish_wandb_logging,
53
+ init_wandb_logging,
54
+ is_wandb_available,
55
+ log_model_checkpoint,
56
+ log_reward_metrics,
57
+ )
58
+
59
+ __all__ = [
60
+ "CAPITULATION_PATTERNS",
61
+ "CLASS_ANALYSIS_MARKERS",
62
+ "CONCEPT_EQUIVALENCES",
63
+ "CONFIDENT_CLAIM_PATTERNS",
64
+ "DEPTH_MARKERS",
65
+ "DISCOURSE_CONNECTIVES",
66
+ "EXPLANATORY_PHRASES",
67
+ "FIRMNESS_PATTERNS",
68
+ "HOLLOW_BUZZWORDS",
69
+ "IDEOLOGICAL_CHALLENGE_PATTERNS",
70
+ "MARXIST_TERMS",
71
+ "QUESTION_WORDS",
72
+ "QUOTE_TO_REFUTE_PATTERNS",
73
+ "SELF_CRITICISM_MARKERS",
74
+ "UNCERTAINTY_PATTERNS",
75
+ "RewardSample",
76
+ "WandbSampleLogger",
77
+ "completeness_reward",
78
+ "create_logging_reward",
79
+ "debug_print_reward",
80
+ "entity_verification_reward",
81
+ "epistemic_calibration_reward",
82
+ "finish_wandb_logging",
83
+ "full_coherence_reward",
84
+ "ideological_firmness_reward",
85
+ "init_wandb_logging",
86
+ "interconnection_depth_reward",
87
+ "is_wandb_available",
88
+ "log_model_checkpoint",
89
+ "log_reward_metrics",
90
+ "match_format_approximately",
91
+ "match_format_exactly",
92
+ "nli_coherence_reward",
93
+ "robust_coherence_reward",
94
+ "self_consistency_reward",
95
+ "semantic_similarity_reward",
96
+ "structural_coherence_reward",
97
+ "terminology_reward",
98
+ "topic_relevance_reward",
99
+ ]
src/prolewiki_llm/convert_to_qwen.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Convert instruction/response pairs to Qwen chat template format."""
3
+
4
+ import json
5
+ from pathlib import Path
6
+
7
SYSTEM_PROMPT = """You are a Marxist-Leninist assistant trained on ProleWiki and critical theory. You provide accurate information about socialist history, theory, and practice from a Marxist-Leninist perspective. You explain concepts like dialectical materialism, historical materialism, class struggle, anti-colonialism, and socialist construction with clarity and ideological precision."""


def convert_to_qwen(input_path: Path, output_path: Path) -> int:
    """Convert instruction/response JSONL to Qwen chat template format.

    Reads one JSON object per line with "instruction" and "response" keys
    and writes one {"text": ...} object per line, wrapped in Qwen-2.5
    <|im_start|>/<|im_end|> chat markers with SYSTEM_PROMPT as the system turn.

    Args:
        input_path: Source JSONL file of instruction/response pairs.
        output_path: Destination JSONL file (parent dirs created as needed).

    Returns:
        Number of pairs converted.

    Raises:
        KeyError: If a record lacks "instruction" or "response".
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Create the output directory up front so a fresh checkout works.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    count = 0
    # Explicit UTF-8: the corpus contains non-ASCII text and must not
    # depend on the platform's default locale encoding.
    with open(input_path, encoding="utf-8") as infile, open(
        output_path, "w", encoding="utf-8"
    ) as outfile:
        for line in infile:
            line = line.strip()
            if not line:
                # Tolerate blank/trailing lines instead of crashing.
                continue
            pair = json.loads(line)

            # Format for Qwen-2.5 chat template
            text = f"""<|im_start|>system
{SYSTEM_PROMPT}<|im_end|>
<|im_start|>user
{pair['instruction']}<|im_end|>
<|im_start|>assistant
{pair['response']}<|im_end|>"""

            outfile.write(json.dumps({"text": text}) + "\n")
            count += 1

    return count
29
+
30
+
31
if __name__ == "__main__":
    # Default corpus locations used by the training pipeline.
    source = Path("training_data/curated_qa.jsonl")
    destination = Path("training_data/formatted/train_qwen.jsonl")

    n_pairs = convert_to_qwen(source, destination)
    print(f"Converted {n_pairs} pairs to Qwen format")
    print(f"Output: {destination}")
src/prolewiki_llm/export_grpo_dataset.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Export the ProleWiki dataset to GRPO training format.
4
+
5
+ Directly reads all JSONL files from the dataset repository and transforms them
6
+ into the GRPO format expected by GRPOTrainer.
7
+
8
+ Input: ../dataset/ (68 JSONL files, 5,297 records total)
9
+ Output: training_data/grpo_dataset.jsonl
10
+
11
+ Usage:
12
+ python -m prolewiki_llm.export_grpo_dataset
13
+ python -m prolewiki_llm.export_grpo_dataset --dataset-path /path/to/dataset
14
+ python -m prolewiki_llm.export_grpo_dataset --output training_data/grpo_dataset.jsonl
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import json
21
+ import sys
22
+ from pathlib import Path
23
+
24
+ # System prompt for GRPO training (must match dataset.py)
25
+ _SYSTEM_PROMPT = """You are a Marxist-Leninist assistant trained on ProleWiki and critical theory.
26
+ Think through political theory questions using dialectical materialist analysis.
27
+ Show your reasoning in <think> tags, then provide a clear, well-sourced answer."""
28
+
29
+
30
def find_dataset_path() -> Path:
    """Locate the dataset directory relative to this project.

    Search order:
    1. PROLEWIKI_DATASET_PATH environment variable (must exist on disk)
    2. ../dataset (sibling of the project root)
    3. dataset/ inside the project root

    Raises:
        FileNotFoundError: If no candidate location exists.
    """
    import os

    # An explicit environment override wins outright.
    override = os.environ.get("PROLEWIKI_DATASET_PATH")
    if override:
        override_path = Path(override)
        if override_path.exists():
            return override_path

    # Otherwise probe conventional locations relative to this file.
    project_root = Path(__file__).resolve().parent.parent.parent.parent  # llm/

    for location in (
        project_root.parent / "dataset",  # ../dataset (sibling)
        project_root / "dataset",  # dataset/ within llm
    ):
        # A valid dataset directory must contain a sources/ subdirectory.
        if location.exists() and (location / "sources").exists():
            return location

    raise FileNotFoundError(
        "Could not find dataset directory. Set PROLEWIKI_DATASET_PATH "
        "environment variable or ensure dataset is at ../dataset/"
    )
64
+
65
+
66
def iter_jsonl_files(dataset_path: Path) -> list[Path]:
    """Collect every JSONL file under sources/ and synthetic/.

    Files from sources/ come first, then synthetic/; each group is
    sorted recursively by path. Missing subdirectories are ignored.
    """
    found: list[Path] = []
    for subdir in ("sources", "synthetic"):
        directory = dataset_path / subdir
        if directory.exists():
            found.extend(sorted(directory.rglob("*.jsonl")))
    return found
79
+
80
+
81
def transform_to_grpo(record: dict) -> dict | None:
    """Transform a single record to GRPO format.

    Input format: {instruction, response, ...}
    Output format: {prompt: [{role, content}, ...], answer}

    Returns None when either required field is missing or empty, so
    callers can count and skip incomplete records.
    """
    instruction = record.get("instruction")
    response = record.get("response")

    # Both fields must be present and non-empty to form a training pair.
    if not (instruction and response):
        return None

    chat_prompt = [
        {"role": "system", "content": _SYSTEM_PROMPT},
        {"role": "user", "content": instruction},
    ]
    return {"prompt": chat_prompt, "answer": response}
101
+
102
+
103
def export_grpo_dataset(
    dataset_path: Path,
    output_path: Path,
    verbose: bool = True,
) -> int:
    """Export dataset to GRPO format by directly reading JSONL files.

    Walks every JSONL file found by iter_jsonl_files(), transforms each
    record with transform_to_grpo(), and streams the results to a single
    output file. Invalid JSON lines and incomplete records are counted
    and skipped rather than aborting the export.

    Args:
        dataset_path: Path to the dataset directory
        output_path: Path to write the GRPO-formatted JSONL file
        verbose: Whether to print progress messages

    Returns:
        Number of records exported
    """
    if verbose:
        print(f"Loading dataset from: {dataset_path}")

    # Find all JSONL files
    jsonl_files = iter_jsonl_files(dataset_path)

    if verbose:
        print(f"Found {len(jsonl_files)} JSONL files")

    # Ensure output directory exists
    output_path.parent.mkdir(parents=True, exist_ok=True)

    record_count = 0
    skipped_count = 0
    first_sample = None  # kept only for the verbose summary below

    with open(output_path, "w", encoding="utf-8") as outfile:
        for jsonl_path in jsonl_files:
            with open(jsonl_path, encoding="utf-8") as infile:
                for line_num, line in enumerate(infile, 1):
                    line = line.strip()
                    if not line:
                        continue

                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError as e:
                        if verbose:
                            print(f"Warning: Invalid JSON in {jsonl_path}:{line_num}: {e}")
                        skipped_count += 1
                        continue

                    transformed = transform_to_grpo(record)
                    if transformed is None:
                        skipped_count += 1
                        continue

                    # ensure_ascii=False keeps non-ASCII corpus text readable.
                    outfile.write(json.dumps(transformed, ensure_ascii=False) + "\n")
                    record_count += 1

                    if first_sample is None:
                        first_sample = transformed

    # All progress output is gated behind `verbose` so that --quiet
    # (verbose=False) is genuinely silent.
    if verbose:
        print(f"\nExported {record_count:,} records to {output_path}")
        if skipped_count > 0:
            print(f"Skipped {skipped_count:,} invalid/incomplete records")

        # Show sample
        if first_sample:
            print("\nSample record:")
            print(f"  System: {first_sample['prompt'][0]['content'][:60]}...")
            print(f"  User: {first_sample['prompt'][1]['content'][:60]}...")
            print(f"  Answer: {first_sample['answer'][:60]}...")

        # Show file size
        size_mb = output_path.stat().st_size / (1024 * 1024)
        print(f"\nOutput file size: {size_mb:.2f} MB")

    return record_count
178
+
179
+
180
def main() -> int:
    """CLI entrypoint: parse arguments, locate the dataset, run the export.

    Returns 0 on success (at least one record exported), 1 on any failure.
    """
    parser = argparse.ArgumentParser(
        description="Export ProleWiki dataset to GRPO training format"
    )
    parser.add_argument(
        "--dataset-path",
        type=Path,
        default=None,
        help="Path to dataset directory (default: auto-detect)",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("training_data/grpo_dataset.jsonl"),
        help="Output JSONL file path (default: training_data/grpo_dataset.jsonl)",
    )
    parser.add_argument(
        "--quiet",
        action="store_true",
        help="Suppress progress messages",
    )
    args = parser.parse_args()

    # Resolve the dataset location, falling back to auto-detection.
    try:
        dataset_path = args.dataset_path if args.dataset_path else find_dataset_path()
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    # Run the export; any unexpected failure maps to exit code 1.
    try:
        exported = export_grpo_dataset(
            dataset_path=dataset_path,
            output_path=args.output,
            verbose=not args.quiet,
        )
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    return 0 if exported > 0 else 1
221
+
222
+
223
# Script entrypoint: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
src/prolewiki_llm/grpo_rewards.py ADDED
@@ -0,0 +1,1847 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ GRPO Reward Functions for Marxist-Leninist Q&A Training.
4
+
5
+ These reward functions guide the model toward:
6
+ 1. Proper <think>...</think> format
7
+ 2. Semantic coherence via NLI (Natural Language Inference)
8
+ 3. Structural coherence via dependency parsing
9
+ 4. Self-consistency (no internal contradictions)
10
+ 5. Appropriate response length/completeness
11
+
12
+ Research basis:
13
+ - NLI as reward: arxiv.org/abs/2508.18212 (Better LM-Based Judging)
14
+ - MO-GRPO normalization: arxiv.org/abs/2509.22047
15
+ - Process rewards: arxiv.org/abs/2508.05170 (Posterior-GRPO)
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import re
21
+ from typing import TYPE_CHECKING, Any
22
+
23
+ import numpy as np
24
+ from sentence_transformers import SentenceTransformer
25
+
26
+ if TYPE_CHECKING:
27
+ from collections.abc import Sequence
28
+
29
+ # =============================================================================
30
+ # GLOBAL SETUP - LAZY LOADING
31
+ # =============================================================================
32
+
33
+ # Lazy-load models to avoid loading at import time
34
+ _embedder: SentenceTransformer | None = None
35
+ _nli_pipeline: Any | None = None
36
+ _spacy_nlp: Any | None = None
37
+
38
+
39
def get_embedder() -> SentenceTransformer:
    """Return the shared sentence-transformers embedder, loading it on first use.

    The model is cached in the module-level ``_embedder`` global so repeated
    calls during training do not reload weights.
    """
    global _embedder
    if _embedder is not None:
        return _embedder
    print("[Reward] Loading sentence-transformers embedder...")
    _embedder = SentenceTransformer("all-MiniLM-L6-v2")
    return _embedder
46
+
47
+
48
def get_nli_pipeline() -> Any:
    """Return the shared NLI pipeline (facebook/bart-large-mnli), loading lazily.

    Cached in the module-level ``_nli_pipeline`` global; placed on CUDA when
    available, CPU otherwise. The transformers import is deferred so merely
    importing this module stays cheap.
    """
    global _nli_pipeline
    if _nli_pipeline is not None:
        return _nli_pipeline

    print("[Reward] Loading NLI model (bart-large-mnli)...")
    from transformers import pipeline

    device = "cuda" if _cuda_available() else "cpu"
    _nli_pipeline = pipeline(
        "text-classification",
        model="facebook/bart-large-mnli",
        device=device,
    )
    return _nli_pipeline
61
+
62
+
63
def get_spacy_nlp() -> Any:
    """Return the shared spaCy pipeline, loading it lazily.

    Tries en_core_web_trf (transformer-based, best semantic understanding)
    first, then falls back to en_core_web_md (word vectors) and finally
    en_core_web_sm. The loaded pipeline is cached in the module-level
    ``_spacy_nlp`` global.

    Raises:
        OSError: If none of the candidate models is installed.
    """
    global _spacy_nlp
    if _spacy_nlp is not None:
        return _spacy_nlp

    import spacy

    # Preference order: transformer model first, smallest model last.
    for candidate in ("en_core_web_trf", "en_core_web_md", "en_core_web_sm"):
        try:
            print(f"[Reward] Loading spaCy model: {candidate}...")
            _spacy_nlp = spacy.load(candidate)
            print(f"[Reward] Loaded {candidate} successfully")
        except OSError:
            print(f"[Reward] {candidate} not found, trying next...")
        else:
            return _spacy_nlp

    raise OSError(
        "No spaCy model found. Install one with:\n"
        " uv pip install en_core_web_trf@https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl"
    )
92
+
93
+
94
+ def _cuda_available() -> bool:
95
+ """Check if CUDA is available."""
96
+ try:
97
+ import torch
98
+
99
+ return bool(torch.cuda.is_available())
100
+ except ImportError:
101
+ return False
102
+
103
+
104
# Reasoning format tokens (DeepSeek-R1 style)
REASONING_START = "<think>"
REASONING_END = "</think>"

# Regex to match format
# NOTE: rf-string interpolates REASONING_END; re.DOTALL lets (.*) span newlines,
# so a match means "a closing </think> tag followed by any answer text".
SOLUTION_END_REGEX = re.compile(rf"{REASONING_END}(.*)", re.DOTALL)

# Marxist terminology for vocabulary reward
# Matched as lowercase substrings (see terminology_reward and
# structural_coherence_reward); multi-word entries must appear verbatim.
MARXIST_TERMS: set[str] = {
    # Core concepts
    "dialectical",
    "materialism",
    "historical materialism",
    "dialectical materialism",
    # Classes
    "bourgeoisie",
    "proletariat",
    "petty bourgeois",
    "petty bourgeoisie",
    "lumpenproletariat",
    "working class",
    "ruling class",
    # Class struggle
    "class struggle",
    "class consciousness",
    "class war",
    "class conflict",
    # Political economy
    "surplus value",
    "commodity",
    "use value",
    "exchange value",
    "labor power",
    "means of production",
    "relations of production",
    "forces of production",
    "mode of production",
    "primitive accumulation",
    "exploitation",
    "capital accumulation",
    # Imperialism
    "imperialism",
    "colonialism",
    "neo-colonialism",
    "settler colonialism",
    "national liberation",
    "self-determination",
    # State and revolution
    "dictatorship of the proletariat",
    "vanguard",
    "vanguard party",
    "democratic centralism",
    "withering away of the state",
    "proletarian dictatorship",
    # Ideology
    "hegemony",
    "superstructure",
    "base",
    "ideology",
    "false consciousness",
    # Revisionism
    "revisionism",
    "opportunism",
    "reformism",
    "social democracy",
    "ultra-leftism",
    # Alienation
    "alienation",
    "fetishism",
    "commodity fetishism",
    "reification",
    # Historical
    "paris commune",
    "october revolution",
    "bolshevik",
    "menshevik",
    # Anti-colonial
    "decolonization",
    "third world",
    "global south",
    "national bourgeoisie",
    "comprador",
}
187
+
188
+
189
+ # =============================================================================
190
+ # FORMAT REWARDS (from original notebook)
191
+ # =============================================================================
192
+
193
+
194
def match_format_exactly(
    completions: Sequence[Sequence[dict[str, str]]], **kwargs: object
) -> list[float]:
    """
    Reward +3.0 if response contains proper </think> tag.

    This encourages the model to use the reasoning format.
    """
    results: list[float] = []
    for completion in completions:
        text = completion[0]["content"]
        # Full reward when the closing tag (plus trailing answer) is present.
        has_format = SOLUTION_END_REGEX.search(text) is not None
        results.append(3.0 if has_format else 0.0)
    return results
211
+
212
+
213
def match_format_approximately(
    completions: Sequence[Sequence[dict[str, str]]], **kwargs: object
) -> list[float]:
    """
    Reward partial format matching.

    +0.5 for exactly one <think> tag
    +0.5 for exactly one </think> tag
    -1.0 for multiple or missing tags
    """
    results: list[float] = []
    for completion in completions:
        text = completion[0]["content"]
        n_start = text.count(REASONING_START)
        n_end = text.count(REASONING_END)
        # Each tag contributes independently: exactly-one earns +0.5,
        # zero or duplicates earn -1.0.
        total = (0.5 if n_start == 1 else -1.0) + (0.5 if n_end == 1 else -1.0)
        results.append(total)
    return results
237
+
238
+
239
+ # =============================================================================
240
+ # SEMANTIC SIMILARITY REWARD
241
+ # =============================================================================
242
+
243
+
244
def semantic_similarity_reward(
    prompts: Sequence[Sequence[dict[str, str]]],
    completions: Sequence[Sequence[dict[str, str]]],
    answer: Sequence[str],
    **kwargs: object,
) -> list[float]:
    """
    Reward responses that are semantically similar to ground truth.

    Uses sentence-transformers to compute cosine similarity.

    Scoring:
        > 0.75 similarity: +5.0
        > 0.60 similarity: +3.0
        > 0.45 similarity: +1.0
        > 0.30 similarity: -1.0
        <= 0.30 similarity: -3.0
    """
    embedder = get_embedder()
    # (exclusive lower bound, reward) bands, checked best-first; anything
    # at or below 0.30 falls through to the -3.0 floor.
    bands = ((0.75, 5.0), (0.60, 3.0), (0.45, 1.0), (0.30, -1.0))
    results: list[float] = []

    for completion, true_answer in zip(completions, answer, strict=False):
        text = completion[0]["content"]

        # Score only the answer portion when a reasoning block is present.
        if REASONING_END in text:
            text = text.split(REASONING_END, 1)[1].strip()

        # Empty or near-empty answers get the floor penalty outright.
        if not text or len(text.strip()) < 10:
            results.append(-3.0)
            continue

        # Normalized embeddings make the dot product equal cosine similarity.
        vec_response = embedder.encode(text, normalize_embeddings=True)
        vec_truth = embedder.encode(true_answer, normalize_embeddings=True)
        cosine = float(np.dot(vec_response, vec_truth))

        for lower_bound, reward in bands:
            if cosine > lower_bound:
                results.append(reward)
                break
        else:
            results.append(-3.0)

    return results
297
+
298
+
299
+ # =============================================================================
300
+ # MARXIST TERMINOLOGY REWARD
301
+ # =============================================================================
302
+
303
+
304
def terminology_reward(
    completions: Sequence[Sequence[dict[str, str]]], **kwargs: object
) -> list[float]:
    """
    Reward use of proper Marxist terminology.

    +0.3 per unique term found, capped at +2.0

    NOTE: This is a shallow reward that can be gamed with "word soup".
    Consider using nli_coherence_reward or structural_coherence_reward
    for more robust evaluation.
    """
    results: list[float] = []
    for completion in completions:
        lowered = completion[0]["content"].lower()
        # Each vocabulary entry counts at most once per response.
        hits = sum(term in lowered for term in MARXIST_TERMS)
        results.append(min(hits * 0.3, 2.0))
    return results
329
+
330
+
331
+ # =============================================================================
332
+ # NLI-BASED COHERENCE REWARD (Research-backed)
333
+ # =============================================================================
334
+
335
# Discourse connectives indicating logical structure
# Matched as lowercase substrings by structural_coherence_reward.
DISCOURSE_CONNECTIVES: set[str] = {
    "because",
    "therefore",
    "thus",
    "hence",
    "consequently",
    "however",
    "although",
    "whereas",
    "nevertheless",
    "moreover",
    "furthermore",
    "additionally",
    "specifically",
    "namely",
    "in other words",
    "for example",
    "for instance",
    "such as",
    "as a result",
    "due to",
    "in order to",
    "so that",
    "on the other hand",
    "in contrast",
    "similarly",
    "likewise",
}

# Explanatory phrases that indicate concept is being explained (not just dropped)
# NOTE(review): not referenced by any reward function visible in this module
# section — presumably consumed elsewhere; verify before removing.
EXPLANATORY_PHRASES: set[str] = {
    # Causal explanations
    "because the",
    "because of",
    "this is because",
    "since the",
    "due to the",
    "as a result of",
    "results from",
    "caused by",
    "leads to",
    "results in",
    "enables",
    "produces",
    # Definitional explanations
    "is defined as",
    "refers to",
    "means that",
    "denotes",
    "that is,",
    "in other words",
    "namely",
    "i.e.",
    # Elaboration
    "specifically",
    "in particular",
    "for example",
    "such as",
    "this means",
    "which means",
    "this implies",
    "therefore",
    # Mechanism explanations
    "this occurs when",
    "this happens because",
    "the mechanism",
    "through the process of",
    "by means of",
    "works by",
}

# Hollow buzzwords: activist jargon that signals superficial analysis when used
# without substantive explanation. These are NOT Marxist technical terms.
# Penalty applies when: high density + low depth ratio
HOLLOW_BUZZWORDS: set[str] = {
    # Vague connectors (non-analytical)
    "interconnected",
    "interrelated",
    "intersects with",
    "it's all connected",
    "everything is connected",
    "systemic",
    # Performative activist language
    "centered",
    "centering",
    "uplift",
    "uplifting",
    "do the work",
    "the work",
    "unpack",
    "unpacking",
    "unlearn",
    "unlearning",
    "hold space",
    "sit with",
    "lean into",
    "problematic",
    "harmful",
    "toxic",
    # Vague abstractions without specifics
    "in a way",
    "sort of",
    "kind of",
    "essentially",
    "basically",
    "generally speaking",
    "broadly",
    # Jargon often used without definition
    "praxis",  # Valid Marxist term but often misused without explanation
    "material conditions",  # Valid but often used as hand-wave
    "structural",
    "structurally",  # Often vague without mechanism
    # Identity-focused without class analysis
    "lived experience",
    "as a",  # Often substitutes for analysis
}

# Phrases that signal analytical depth (opposite of hollow)
# NOTE: "in 1" / "in 2" are deliberate prefixes matching year references
# like "in 1917" / "in 2008" via substring search.
DEPTH_MARKERS: set[str] = {
    # Historical specificity
    "in 1",
    "in 2",
    "during the",
    "after the",
    "before the",
    # Citing sources/figures
    "marx argued",
    "lenin wrote",
    "engels noted",
    "gramsci",
    "according to",
    "as marx",
    "as lenin",
    # Concrete examples
    "for example",
    "such as",
    "in the case of",
    "consider",
    # Precise definitions
    "defined as",
    "meaning",
    "specifically",
}

# Marxist concept equivalences for topic matching
# Maps canonical term -> set of synonyms/equivalents
# Consumed by _expand_with_synonyms (both forward and reverse lookups).
CONCEPT_EQUIVALENCES: dict[str, set[str]] = {
    # Class terms
    "bourgeoisie": {"capitalist class", "ruling class", "capitalists", "bourgeois", "capital"},
    "proletariat": {"working class", "workers", "wage laborers", "labor", "labourers"},
    "petty bourgeoisie": {"petit bourgeoisie", "small business", "middle class", "petty bourgeois"},
    "lumpenproletariat": {"lumpen", "underclass", "criminal element"},
    # Economic concepts
    "surplus value": {"unpaid labor", "profit", "extraction", "surplus labor"},
    "means of production": {"productive forces", "capital goods", "factories", "industry"},
    "exploitation": {"extraction", "appropriation", "expropriation"},
    "commodity": {"commodities", "goods", "merchandise"},
    "capital accumulation": {"accumulation", "concentration of capital"},
    "primitive accumulation": {"original accumulation", "so-called primitive accumulation"},
    # Political concepts
    "dictatorship of the proletariat": {
        "workers state",
        "proletarian dictatorship",
        "workers government",
    },
    "vanguard party": {"vanguard", "communist party", "revolutionary party"},
    "democratic centralism": {"party discipline", "centralism"},
    # Imperialism
    "imperialism": {"colonialism", "neo-colonialism", "empire", "colonial"},
    "national liberation": {"decolonization", "anti-colonial", "liberation movement"},
    "settler colonialism": {"settler colony", "colonial settlement"},
    # Ideology
    "revisionism": {"opportunism", "reformism", "right deviation"},
    "hegemony": {"ideological hegemony", "cultural hegemony", "domination"},
    "false consciousness": {"ideology", "mystification"},
    # Philosophy
    "dialectical materialism": {"diamat", "materialist dialectics", "dialectics"},
    "historical materialism": {"histmat", "materialist conception of history"},
    "alienation": {"estrangement", "alienated labor"},
}

# Question words to ignore when extracting topics
QUESTION_WORDS: set[str] = {"what", "how", "why", "who", "when", "where", "which", "whom"}
519
+
520
+
521
def nli_coherence_reward(
    completions: Sequence[Sequence[dict[str, str]]],
    answer: Sequence[str],
    **kwargs: object,
) -> list[float]:
    """
    Reward responses that logically ENTAIL the ground truth answer.

    Uses Natural Language Inference (facebook/bart-large-mnli) to check
    if the response is logically consistent with the expected answer.

    This defeats "word soup" attacks because random terminology won't
    logically entail anything - it will be classified as NEUTRAL.

    Scoring:
        entailment: +3.0 (response supports/implies ground truth)
        neutral: -1.0 (response is off-topic or incoherent)
        contradiction: -3.0 (response contradicts ground truth)

    Research basis: arxiv.org/abs/2508.18212
    """
    nli = get_nli_pipeline()
    # Any label other than these two (i.e. contradiction) gets the floor.
    label_rewards = {"entailment": 3.0, "neutral": -1.0}
    results: list[float] = []

    for completion, true_answer in zip(completions, answer, strict=False):
        text = completion[0]["content"]

        # Only the answer after the reasoning block is judged.
        if REASONING_END in text:
            text = text.split(REASONING_END, 1)[1].strip()

        # Too short to carry any entailment signal.
        if not text or len(text.strip()) < 20:
            results.append(-2.0)
            continue

        # Truncate both sides to keep the NLI input within model limits.
        premise = text[:512]
        hypothesis = true_answer[:512]

        # BART-MNLI pair format: premise </s></s> hypothesis
        # (does the response entail the ground truth?)
        try:
            verdict = nli(f"{premise}</s></s>{hypothesis}")[0]
            results.append(label_rewards.get(verdict["label"].lower(), -3.0))
        except Exception as e:
            print(f"[NLI Reward] Error: {e}")
            results.append(0.0)

    return results
582
+
583
+
584
def self_consistency_reward(
    completions: Sequence[Sequence[dict[str, str]]], **kwargs: object
) -> list[float]:
    """
    Reward responses that are internally self-consistent.

    Checks if any sentence in the response CONTRADICTS another sentence.
    This avoids external ideological bias by only checking within-document
    coherence.

    Scoring:
        No contradictions found: +1.0
        Internal contradiction detected: -2.0
        Fewer than 2 usable sentences: 0.0 (nothing to compare)

    Research basis: arxiv.org/abs/2508.05170 (process-based rewards)
    """
    nli = get_nli_pipeline()
    nlp = get_spacy_nlp()
    scores: list[float] = []

    for completion in completions:
        response = completion[0]["content"]

        # Parse into sentences; very short fragments carry no signal.
        doc = nlp(response)
        sentences = [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 10]

        # Need at least 2 sentences to check consistency
        if len(sentences) < 2:
            scores.append(0.0)
            continue

        # Only adjacent / near-adjacent pairs are checked, with a hard
        # budget on NLI calls to bound compute per completion.
        has_contradiction = False
        max_pairs_to_check = 10
        pairs_checked = 0

        for i, sent_a in enumerate(sentences[:-1]):
            if pairs_checked >= max_pairs_to_check:
                break
            # Check against the next 2 sentences.
            for j in range(i + 1, min(i + 3, len(sentences))):
                # BUG FIX: the budget was previously only enforced at the top
                # of the outer loop, so the inner loop could overshoot it, and
                # failed NLI calls never counted against it (unbounded work on
                # persistent errors). Count every attempted pair up front.
                if pairs_checked >= max_pairs_to_check:
                    break
                pairs_checked += 1
                sent_b = sentences[j]
                try:
                    input_text = f"{sent_a[:256]}</s></s>{sent_b[:256]}"
                    result = nli(input_text)[0]
                except Exception:
                    # Best-effort: a failed check neither rewards nor penalizes.
                    continue
                if result["label"].lower() == "contradiction":
                    has_contradiction = True
                    break
            if has_contradiction:
                break

        scores.append(-2.0 if has_contradiction else 1.0)

    return scores
646
+
647
+
648
def structural_coherence_reward(
    completions: Sequence[Sequence[dict[str, str]]], **kwargs: object
) -> list[float]:
    """
    Reward responses with proper linguistic structure.

    Uses spaCy dependency parsing to verify:
    1. Marxist terms appear in meaningful syntactic roles (subject, object)
    2. Response contains logical discourse connectives
    3. Response has proper sentence structure (not word soup)

    This defeats word soup because random terms won't be in subject/object
    positions - they'll be parsed as fragments.

    Scoring:
        +0.3 per term in subject/object position (max +1.5)
        +0.2 per discourse connective (max +1.0)
        -1.0 if no complete sentences detected

    Research basis: spaCy dependency parsing for coherence evaluation
    """
    nlp = get_spacy_nlp()
    # Dependency labels that mark a token as carrying real semantic weight.
    meaningful_deps = (
        "nsubj",  # nominal subject
        "nsubjpass",  # passive nominal subject
        "dobj",  # direct object
        "pobj",  # object of preposition
        "attr",  # attribute
        "appos",  # appositional modifier
    )
    results: list[float] = []

    for completion in completions:
        text = completion[0]["content"]
        doc = nlp(text)

        # No parseable sentences at all -> penalize and move on.
        if not list(doc.sents):
            results.append(-1.0)
            continue

        lowered = text.lower()

        # Count terms that occupy a meaningful syntactic role (once each).
        grounded_terms = 0
        for term in MARXIST_TERMS:
            if term not in lowered:
                continue
            for token in doc:
                # Match on the token itself or the token + its right neighbor
                # (covers two-word terms like "surplus value").
                bigram_hit = (
                    token.i + 1 < len(doc)
                    and term in f"{token.text} {doc[token.i + 1].text}".lower()
                )
                if term in token.text.lower() or bigram_hit:
                    if token.dep_ in meaningful_deps:
                        grounded_terms += 1
                        break
                    # Also credit terms attached directly to the main verb.
                    elif token.head.pos_ == "VERB" and token.head.dep_ == "ROOT":
                        grounded_terms += 1
                        break

        score = min(grounded_terms * 0.3, 1.5)

        # Discourse connectives indicate logical structure.
        connectives = sum(1 for conn in DISCOURSE_CONNECTIVES if conn in lowered)
        score += min(connectives * 0.2, 1.0)

        results.append(score)

    return results
722
+
723
+
724
+ # =============================================================================
725
+ # COMBINED ROBUST COHERENCE REWARD
726
+ # =============================================================================
727
+
728
+
729
def robust_coherence_reward(
    completions: Sequence[Sequence[dict[str, str]]],
    answer: Sequence[str],
    **kwargs: object,
) -> list[float]:
    """
    Multi-layered coherence check combining NLI, self-consistency, and structure.

    This is the recommended reward function for robust evaluation that defeats
    reward hacking via word soup or other adversarial strategies.

    Layers:
    1. NLI coherence: Does response entail ground truth?
    2. Self-consistency: Does response contradict itself?
    3. Structural coherence: Are terms used in meaningful syntactic roles?

    Scoring (combined):
        NLI entailment + self-consistent + good structure: up to +5.5
        NLI neutral or contradiction: -1.0 to -3.0
        Internal contradiction: -2.0
        Word soup (no structure): -1.0
    """
    nli_scores = nli_coherence_reward(completions, answer, **kwargs)
    consistency_scores = self_consistency_reward(completions, **kwargs)
    structure_scores = structural_coherence_reward(completions, **kwargs)

    def _combine(nli: float, consistency: float, structure: float) -> float:
        # An NLI contradiction dominates everything else.
        if nli <= -3.0:
            return -3.0
        # An internal contradiction is the next-strongest penalty.
        if consistency <= -2.0:
            return -2.0
        # Otherwise NLI is the primary signal; the rest are half-weight bonuses.
        return nli + 0.5 * consistency + 0.5 * structure

    return [
        _combine(n, c, s)
        for n, c, s in zip(nli_scores, consistency_scores, structure_scores, strict=False)
    ]
774
+
775
+
776
+ # =============================================================================
777
+ # TOPIC RELEVANCE REWARD (Question-Answer Alignment)
778
+ # =============================================================================
779
+
780
+
781
def _extract_noun_with_preps(token: Any) -> set[str]:
    """
    Extract a noun and its prepositional phrase children.

    For "dictatorship of the proletariat", returns:
        {"dictatorship", "proletariat", "dictatorship of the proletariat"}
    """
    found: set[str] = set()

    # The head noun itself, lemmatized.
    if token.pos_ in ("NOUN", "PROPN"):
        found.add(token.lemma_.lower())

    # Compound/adjectival modifiers build multi-word terms like "surplus value".
    modifiers = [
        child.text.lower()
        for child in token.children
        if child.dep_ in ("compound", "amod") and child.pos_ in ("NOUN", "ADJ")
    ]
    if modifiers:
        found.add(" ".join([*modifiers, token.text.lower()]))

    # Follow prepositional phrases ("of the proletariat"), recursing into
    # nested objects so chained preps are also captured.
    for child in token.children:
        if child.dep_ != "prep":
            continue
        for prep_obj in child.children:
            if prep_obj.dep_ != "pobj":
                continue
            found.add(prep_obj.lemma_.lower())
            found.add(f"{token.text.lower()} {child.text} {prep_obj.text.lower()}")
            found.update(_extract_noun_with_preps(prep_obj))

    return found
817
+
818
+
819
def _extract_question_topics(doc: Any) -> set[str]:
    """
    Extract the core topics from a question using spaCy dependency parsing.

    For "What is revisionism?", extracts {"revisionism"}
    For "How does imperialism relate to capitalism?", extracts {"imperialism", "capitalism"}
    For "What is the dictatorship of the proletariat?", extracts
        {"dictatorship", "proletariat", "dictatorship of the proletariat"}
    """
    topics: set[str] = set()

    # Locate the syntactic ROOT of the question.
    root = next((tok for tok in doc if tok.dep_ == "ROOT"), None)

    if root is not None:
        for child in root.children:
            # Subjects/objects of the ROOT carry the question's topic
            # ("What is [revisionism]?", "Explain [the concept]").
            if child.dep_ in ("nsubj", "dobj", "attr", "nsubjpass"):
                # Skip interrogatives themselves ("What is X" -> drop "What").
                if child.text.lower() in QUESTION_WORDS:
                    continue
                topics.update(_extract_noun_with_preps(child))

            # Prepositional objects attached to ROOT ("relate to [capitalism]").
            if child.dep_ == "prep":
                for prep_obj in child.children:
                    if prep_obj.dep_ == "pobj":
                        topics.update(_extract_noun_with_preps(prep_obj))

    # Fallback: harvest noun chunks when dependency extraction found nothing.
    if not topics:
        for chunk in doc.noun_chunks:
            if chunk.root.text.lower() in QUESTION_WORDS:
                continue
            topics.add(chunk.root.lemma_.lower())
            # Keep the full chunk too, for multi-word terms.
            phrase = chunk.text.lower().strip()
            if " " in phrase:
                topics.add(phrase)

    # Final sweep: drop any question words that slipped through.
    return {topic for topic in topics if topic not in QUESTION_WORDS}
871
+
872
+
873
def _extract_answer_topics(doc: Any) -> set[str]:
    """
    Extract topics discussed in an answer using spaCy.

    Returns lemmatized noun phrases and named entities.
    Strips determiners (the, a, an) for better matching.
    """
    determiners = {"the", "a", "an", "this", "that", "these", "those"}

    def _strip_leading_det(text: str) -> str:
        # Drop a single leading determiner so "the proletariat" matches
        # "proletariat".
        words = text.split()
        if words and words[0] in determiners:
            words = words[1:]
        return " ".join(words)

    topics: set[str] = set()

    # Noun chunks: keep the lemmatized root, plus the (det-stripped)
    # multi-word phrase when it is reasonably short.
    for chunk in doc.noun_chunks:
        topics.add(chunk.root.lemma_.lower())
        phrase = _strip_leading_det(chunk.text.lower().strip())
        if " " in phrase and len(phrase) < 50:
            topics.add(phrase)

    # Named entities, also determiner-stripped.
    for ent in doc.ents:
        topics.add(_strip_leading_det(ent.text.lower()))

    return topics
908
+
909
+
910
def _expand_with_synonyms(topics: set[str]) -> set[str]:
    """Return *topics* augmented with Marxist concept synonyms.

    If "bourgeoisie" is present, the result also contains "capitalist
    class", "ruling class", etc., via the CONCEPT_EQUIVALENCES table.
    """
    result = set(topics)

    for term in topics:
        # Forward lookup: term is itself a canonical concept.
        forward = CONCEPT_EQUIVALENCES.get(term)
        if forward is not None:
            result.update(forward)
        # Reverse lookup: term appears among some concept's synonyms
        # (or equals the canonical term) — pull in the whole family.
        for canonical, synonyms in CONCEPT_EQUIVALENCES.items():
            if term == canonical or term in synonyms:
                result.add(canonical)
                result.update(synonyms)

    return result
929
+
930
+
931
def _compute_topic_coverage(q_topics: set[str], a_topics: set[str], nlp: Any) -> float:
    """
    Compute how well answer topics cover question topics.

    Uses:
        1. Direct lemma matching
        2. Expanded synonym matching
        3. spaCy word vector similarity (fallback)

    Args:
        q_topics: Topics extracted from the question.
        a_topics: Topics extracted from the answer.
        nlp: Loaded spaCy pipeline, used for vector similarity.

    Returns:
        Coverage score 0.0 to 1.0 (0.5 when q_topics is empty and
        coverage cannot be evaluated).
    """
    if not q_topics:
        return 0.5  # Can't evaluate, neutral

    # Expand question topics with synonyms
    q_expanded = _expand_with_synonyms(q_topics)

    # Direct/synonym match (q_topics is known non-empty here, so the
    # former `if q_topics else 0` guards were dead code and are removed)
    matched = q_expanded & a_topics
    direct_coverage = len(matched) / len(q_topics)

    if direct_coverage >= 0.5:
        return min(direct_coverage, 1.0)

    # Fallback: semantic similarity using spaCy vectors.
    # Parse each answer topic ONCE, outside the loop. Previously nlp()
    # was called on every a_topic for every unmatched q_topic, i.e.
    # O(|Q| * |A|) pipeline invocations; this is now O(|Q| + |A|).
    a_docs = [d for d in (nlp(a_topic) for a_topic in a_topics) if d.has_vector]

    semantic_matches = 0
    for q_topic in q_topics - matched:
        q_doc = nlp(q_topic)
        if not q_doc.has_vector:
            continue

        best_sim = max((q_doc.similarity(a_doc) for a_doc in a_docs), default=0.0)
        if best_sim > 0.6:  # Threshold for semantic match
            semantic_matches += 1

    total_matched = len(matched) + semantic_matches
    return min(total_matched / len(q_topics), 1.0)
977
+
978
+
979
def topic_relevance_reward(
    prompts: Sequence[Sequence[dict[str, str]]],
    completions: Sequence[Sequence[dict[str, str]]],
    **kwargs: object,
) -> list[float]:
    """
    Reward answers that stay ON-TOPIC with respect to the question.

    Implements an f(A) ⊆ f(Q) check where f extracts semantic topics:
    question topics come from dependency parsing (expanded with Marxist
    concept synonyms), answer topics from noun chunks and entities, and
    the reward reflects how much of the question's topic set the answer
    covers.

    Scoring:
        > 80% coverage: +2.0 (answer fully addresses question topics)
        > 60% coverage: +1.5 (answer mostly on-topic)
        > 40% coverage: +1.0 (answer partially on-topic)
        > 20% coverage:  0.0 (answer tangentially related)
        <= 20% coverage: -1.5 (answer off-topic)

    This ensures the model answers WHAT WAS ASKED, not just generates
    coherent Marxist text about something else.
    """

    def _score_for(coverage: float) -> float:
        # Map a coverage fraction onto the reward ladder above.
        if coverage > 0.8:
            return 2.0
        if coverage > 0.6:
            return 1.5
        if coverage > 0.4:
            return 1.0
        if coverage > 0.2:
            return 0.0
        return -1.5

    nlp = get_spacy_nlp()
    rewards: list[float] = []

    for prompt, completion in zip(prompts, completions, strict=False):
        question = prompt[-1]["content"]  # last user message
        answer_text = completion[0]["content"]

        # Only evaluate the answer portion after the reasoning block.
        if REASONING_END in answer_text:
            answer_text = answer_text.split(REASONING_END, 1)[1].strip()

        # Empty / near-empty answers are treated as off-topic.
        if not answer_text or len(answer_text.strip()) < 20:
            rewards.append(-1.5)
            continue

        # Parse question fully; cap the answer for performance.
        q_topics = _extract_question_topics(nlp(question))
        a_topics = _extract_answer_topics(nlp(answer_text[:2000]))

        if not q_topics:
            # Nothing extractable from the question: fall back to a
            # simple substance check on the answer.
            rewards.append(0.5 if len(a_topics) > 3 else 0.0)
            continue

        rewards.append(_score_for(_compute_topic_coverage(q_topics, a_topics, nlp)))

    return rewards
1052
+
1053
+
1054
def full_coherence_reward(
    prompts: Sequence[Sequence[dict[str, str]]],
    completions: Sequence[Sequence[dict[str, str]]],
    answer: Sequence[str],
    **kwargs: object,
) -> list[float]:
    """
    Complete coherence check: robust_coherence + topic_relevance + depth.

    The MOST COMPREHENSIVE reward function, combining:
      1. NLI coherence (answer entails the ground truth)
      2. Self-consistency (answer does not contradict itself)
      3. Structural coherence (terms in proper syntactic roles)
      4. Topic relevance (answer addresses what was asked)
      5. Interconnection depth (deep analysis over buzzword salad)

    Use this for maximum robustness against reward hacking.
    """
    robust_scores = robust_coherence_reward(completions, answer, **kwargs)
    relevance_scores = topic_relevance_reward(prompts, completions, **kwargs)
    depth_scores = interconnection_depth_reward(completions, **kwargs)

    merged: list[float] = []
    for robust, relevance, depth in zip(
        robust_scores, relevance_scores, depth_scores, strict=False
    ):
        if relevance <= -1.5:
            # Severely off-topic dominates everything else.
            merged.append(-2.0)
        elif robust <= -2.0:
            # Hard failure of the robust coherence check wins.
            merged.append(robust)
        elif depth <= -1.5:
            # Buzzword salad detected.
            merged.append(-1.5)
        else:
            # Robust score is primary; relevance and depth act as
            # weighted bonuses/penalties.
            merged.append(robust + relevance * 0.4 + depth * 0.3)

    return merged
1096
+
1097
+
1098
+ # =============================================================================
1099
+ # INTERCONNECTION DEPTH REWARD (Anti-Buzzword-Salad)
1100
+ # =============================================================================
1101
+
1102
+
1103
def _count_unique_marxist_concepts(text: str) -> int:
    """Return how many distinct MARXIST_TERMS occur (as substrings) in text."""
    lowered = text.lower()
    return sum(1 for term in MARXIST_TERMS if term in lowered)
1111
+
1112
+
1113
def _compute_depth_ratio(text: str) -> float:
    """
    Compute depth ratio: words per unique Marxist concept.

    High ratio = deep analysis (few concepts, well explained);
    low ratio = shallow/buzzword soup (many concepts, little explanation).

    Returns:
        Words per concept, or 100.0 when no Marxist concepts are found
        (no concepts = neutral, not shallow).
    """
    concept_count = _count_unique_marxist_concepts(text)
    if concept_count == 0:
        return 100.0
    return len(text.split()) / concept_count
1130
+
1131
+
1132
def _count_hollow_buzzwords(text: str) -> int:
    """Return how many distinct HOLLOW_BUZZWORDS occur (as substrings) in text."""
    lowered = text.lower()
    return sum(1 for buzzword in HOLLOW_BUZZWORDS if buzzword in lowered)
1140
+
1141
+
1142
def _count_depth_markers(text: str) -> int:
    """Return how many distinct DEPTH_MARKERS occur (as substrings) in text."""
    lowered = text.lower()
    return sum(1 for marker in DEPTH_MARKERS if marker in lowered)
1150
+
1151
+
1152
def _count_explanatory_phrases(text: str) -> int:
    """Return how many distinct EXPLANATORY_PHRASES occur (as substrings) in text."""
    lowered = text.lower()
    return sum(1 for phrase in EXPLANATORY_PHRASES if phrase in lowered)
1160
+
1161
+
1162
def _concepts_have_explanations(text: str) -> tuple[int, int]:
    """
    Check whether introduced Marxist concepts have nearby explanations.

    A concept mention counts as "explained" when any EXPLANATORY_PHRASES
    entry occurs in the same sentence or in the immediately following one.

    Returns:
        Tuple of (explained_count, unexplained_count)
    """
    nlp = get_spacy_nlp()
    sentences = [sent.text.lower() for sent in nlp(text).sents]

    explained = 0
    unexplained = 0

    for idx, sentence in enumerate(sentences):
        # Number of Marxist concepts mentioned in this sentence.
        concept_hits = sum(1 for term in MARXIST_TERMS if term in sentence)
        if concept_hits == 0:
            continue

        # Look for an explanatory phrase in this sentence or the next.
        window = sentences[idx : idx + 2]
        has_explanation = any(
            phrase in nearby for nearby in window for phrase in EXPLANATORY_PHRASES
        )

        if has_explanation:
            explained += concept_hits
        else:
            unexplained += concept_hits

    return explained, unexplained
1206
+
1207
+
1208
def interconnection_depth_reward(
    completions: Sequence[Sequence[dict[str, str]]], **kwargs: object
) -> list[float]:
    """
    Reward deep, meaningful interconnections; penalize buzzword salad.

    Distinguishes between:
    - GOOD: "Surplus value relates to imperialism BECAUSE capital export..."
    - BAD: "Surplus value intersects with imperialism, colonialism, patriarchy..."

    Signals:
    1. Depth ratio: words per unique Marxist concept
       - High (>15): Deep analysis, concepts well-explained
       - Low (<5): Shallow buzzword soup (many concepts crammed together)
    2. Hollow buzzword density: activist jargon without substance
    3. Depth markers: citations, examples, historical specificity
    4. Explanation ratio: concepts with nearby explanatory phrases

    Scoring:
        Depth ratio > 20: +1.0 (deep analysis)
        Depth ratio 10-20: +0.5 (adequate depth)
        Depth ratio < 5: -1.5 (severe buzzword soup)
        Depth ratio 5-10: -0.5 (shallow)
        Hollow buzzwords > 2: -0.3 each additional
        Depth markers present: +0.3 each (max +1.5)
        Good explanation ratio: +0.5
        Low explanation ratio with many concepts: -0.5

    Total range: approximately -2.5 to +3.0
    """
    results: list[float] = []

    for completion in completions:
        text = completion[0]["content"]

        # Keep only the answer portion after the reasoning block.
        if REASONING_END in text:
            answer_part = text.split(REASONING_END, 1)[1].strip()
        else:
            answer_part = text

        # Very short responses are completeness_reward's job.
        n_words = len(answer_part.split())
        if n_words < 20:
            results.append(0.0)
            continue

        total = 0.0
        n_concepts = _count_unique_marxist_concepts(answer_part)

        # Signal 1: words-per-concept depth ratio (only when concepts exist).
        # Note a ratio of exactly 10 intentionally falls through with no
        # adjustment, matching the threshold table above.
        if n_concepts > 0:
            ratio = n_words / n_concepts
            if ratio > 20:
                total += 1.0  # deep analysis
            elif ratio > 10:
                total += 0.5  # adequate depth
            elif ratio < 5:
                total -= 1.5  # severe buzzword soup
            elif ratio < 10:
                total -= 0.5  # shallow

        # Signal 2: hollow buzzword penalty (first two are free), capped.
        n_hollow = _count_hollow_buzzwords(answer_part)
        if n_hollow > 2:
            total -= min(0.3 * (n_hollow - 2), 1.5)

        # Signal 3: depth marker bonus, capped at +1.5.
        total += min(_count_depth_markers(answer_part) * 0.3, 1.5)

        # Signal 4: explanation ratio relative to concept count.
        if n_concepts > 0:
            expl_ratio = _count_explanatory_phrases(answer_part) / n_concepts
            if expl_ratio >= 0.5:
                total += 0.5  # at least one explanation per two concepts
            elif expl_ratio < 0.1 and n_concepts > 5:
                total -= 0.5  # many concepts, almost no explanations

        # Clamp to the documented range.
        results.append(max(min(total, 3.0), -2.5))

    return results
1296
+
1297
+
1298
+ # =============================================================================
1299
+ # RESPONSE COMPLETENESS REWARD
1300
+ # =============================================================================
1301
+
1302
+
1303
def completeness_reward(
    completions: Sequence[Sequence[dict[str, str]]],
    answer: Sequence[str],
    **kwargs: object,
) -> list[float]:
    """
    Reward thorough, detailed responses.

    Compares the response's word count (after the </think> block, when
    present) to the ground-truth answer's word count.

    Scoring (ratio = response words / ground-truth words):
        0.5 <= ratio <= 1.5: +2.0 (on target)
        0.3 <= ratio <= 2.0: +1.0 (acceptable)
        ratio < 0.2:         -2.0 (far too short)
        otherwise:           -0.5 (slightly short: 0.2 <= ratio < 0.3,
                                   or too verbose: ratio > 2.0)
    """
    scores: list[float] = []

    for completion, true_answer in zip(completions, answer, strict=False):
        response = completion[0]["content"]

        # Score only the visible answer after the reasoning block.
        if REASONING_END in response:
            answer_part = response.split(REASONING_END, 1)[1].strip()
        else:
            answer_part = response

        answer_len = len(answer_part.split())
        true_len = len(true_answer.split())

        # Empty ground truth: the length ratio is undefined, stay neutral.
        if true_len == 0:
            scores.append(0.0)
            continue

        ratio = answer_len / true_len

        if 0.5 <= ratio <= 1.5:
            score = 2.0
        elif 0.3 <= ratio <= 2.0:
            score = 1.0
        elif ratio < 0.2:
            score = -2.0  # far too short
        else:
            # Mildly off-target. This branch catches BOTH slightly-short
            # responses (0.2 <= ratio < 0.3) and verbose ones (ratio > 2.0);
            # the previous comment labeled it "too long" only, which was
            # misleading for the short side.
            score = -0.5

        scores.append(score)

    return scores
1353
+
1354
+
1355
+ # =============================================================================
1356
+ # DEBUG REWARD (for monitoring during training)
1357
+ # =============================================================================
1358
+
1359
# Global counter for printing samples
_PRINT_COUNTER = 0
_PRINT_EVERY = 10


def debug_print_reward(
    prompts: Sequence[Sequence[dict[str, str]]],
    completions: Sequence[Sequence[dict[str, str]]],
    answer: Sequence[str],
    **kwargs: object,
) -> list[float]:
    """
    Print sample outputs periodically for monitoring.

    Purely diagnostic: always returns all-zero scores, so it has no
    effect on training.
    """
    global _PRINT_COUNTER

    # Emit one sample every _PRINT_EVERY invocations.
    if _PRINT_COUNTER % _PRINT_EVERY == 0:
        sample_question = prompts[0][-1]["content"]
        sample_response = completions[0][0]["content"]
        sample_expected = answer[0]

        divider = "=" * 60
        print(divider)
        print(f"Step {_PRINT_COUNTER}")
        print(f"Question: {sample_question[:100]}...")
        print(f"Response: {sample_response[:200]}...")
        print(f"Expected: {sample_expected[:100]}...")
        print(divider)

    _PRINT_COUNTER += 1

    return [0.0] * len(completions)
1392
+
1393
+
1394
+ # =============================================================================
1395
+ # ENTITY VERIFICATION REWARD (Anti-Hallucination)
1396
+ # =============================================================================
1397
+
1398
+ # Lazy-loaded entity whitelist
1399
+ _ENTITY_WHITELIST: set[str] | None = None
1400
+ _ENTITY_WHITELIST_LOWERCASE: set[str] | None = None
1401
+
1402
+
1403
+ def _load_entity_whitelist() -> tuple[set[str], set[str]]:
1404
+ """Load entity whitelist from JSON file."""
1405
+ global _ENTITY_WHITELIST, _ENTITY_WHITELIST_LOWERCASE
1406
+
1407
+ if _ENTITY_WHITELIST is None or _ENTITY_WHITELIST_LOWERCASE is None:
1408
+ import json
1409
+ from pathlib import Path
1410
+
1411
+ whitelist_path = (
1412
+ Path(__file__).parent.parent.parent.parent
1413
+ / "training_data"
1414
+ / "entity_whitelist_clean.json"
1415
+ )
1416
+
1417
+ if whitelist_path.exists():
1418
+ print(f"[Reward] Loading entity whitelist from {whitelist_path}...")
1419
+ with open(whitelist_path, encoding="utf-8") as f:
1420
+ data = json.load(f)
1421
+ _ENTITY_WHITELIST = set(data.get("entities", []))
1422
+ _ENTITY_WHITELIST_LOWERCASE = set(data.get("entities_lowercase", []))
1423
+ print(f"[Reward] Loaded {len(_ENTITY_WHITELIST):,} entities")
1424
+ else:
1425
+ print(f"[Reward] WARNING: Entity whitelist not found at {whitelist_path}")
1426
+ _ENTITY_WHITELIST = set()
1427
+ _ENTITY_WHITELIST_LOWERCASE = set()
1428
+
1429
+ return _ENTITY_WHITELIST, _ENTITY_WHITELIST_LOWERCASE
1430
+
1431
+
1432
def _entity_in_whitelist(entity: str) -> bool:
    """Return True if *entity* appears in the whitelist (case-insensitive)."""
    exact, lowered = _load_entity_whitelist()
    if entity in exact:
        return True
    return entity.lower() in lowered
1436
+
1437
+
1438
# Patterns that indicate confident factual claims.
# NOTE: stored as raw regex strings (not precompiled); callers such as
# entity_verification_reward and epistemic_calibration_reward apply
# re.search(..., re.IGNORECASE) against the full response text.
CONFIDENT_CLAIM_PATTERNS: list[str] = [
    r"founded in \d{4}",
    r"established in \d{4}",
    r"created in \d{4}",
    r"formed in \d{4}",
    r"was founded by",
    r"was established by",
    r"was created by",
    r"were founded in",
    r"were established in",
]

# Patterns that indicate epistemic humility (GOOD).
# Matching any of these marks the response as appropriately uncertain
# and earns a positive score from the calibration rewards.
UNCERTAINTY_PATTERNS: list[str] = [
    r"I (?:cannot|can't|don't) (?:verify|confirm|find)",
    r"I (?:don't|do not) have (?:verified |specific )?information",
    r"I'm not (?:certain|sure|confident)",
    r"I cannot (?:provide|give) (?:specific |verified )?information",
    r"I should not (?:fabricate|make up|speculate)",
    r"(?:could you|can you) (?:provide|share|tell me) (?:more )?context",
    r"where did you (?:encounter|find|see) this",
    r"I'm not aware of",
    r"I don't have (?:details|information) about",
]
1463
+
1464
+
1465
def _extract_potential_entities(text: str) -> list[str]:
    """Extract potential organization/person names from text using spaCy NER."""
    # Cap input length to prevent slow processing on huge responses.
    doc = get_spacy_nlp()(text[:10000])

    wanted_labels = ("ORG", "PERSON", "GPE", "NORP", "EVENT", "FAC", "WORK_OF_ART")
    return [ent.text for ent in doc.ents if ent.label_ in wanted_labels]
1476
+
1477
+
1478
def entity_verification_reward(
    prompts: Sequence[Sequence[dict[str, str]]],
    completions: Sequence[Sequence[dict[str, str]]],
    answer: Sequence[str],
    **kwargs: object,
) -> list[float]:
    """
    Reward epistemic humility, penalize confident claims about unverified entities.

    Scoring:
        +2.0: Response expresses uncertainty about unknown entities
        +1.0: Response discusses only verified entities
        -1.0: Response discusses unknown entities without clear uncertainty
        -2.5: Response makes confident claims (dates, founders) about
              unknown entities

    Requires entity_whitelist_clean.json in training_data/ (when the file
    is missing the whitelist is empty, so every entity counts as unknown).
    """
    rewards: list[float] = []

    for completion in completions:
        text = completion[0]["content"]

        # NER over the response, then split into verified vs. unknown.
        mentioned = _extract_potential_entities(text)
        unknown = [e for e in mentioned if not _entity_in_whitelist(e)]

        # Epistemic humility markers.
        shows_uncertainty = any(
            re.search(pattern, text, re.IGNORECASE) for pattern in UNCERTAINTY_PATTERNS
        )
        # Confident factual claim markers.
        makes_confident_claims = any(
            re.search(pattern, text, re.IGNORECASE) for pattern in CONFIDENT_CLAIM_PATTERNS
        )

        if not unknown:
            # Everything mentioned is verified (or no entities at all):
            # unnecessary hedging earns 0.0, normal content earns 1.0.
            rewards.append(0.0 if shows_uncertainty else 1.0)
        elif shows_uncertainty:
            # GOOD: humility about entities outside the corpus.
            rewards.append(2.0)
        elif makes_confident_claims:
            # BAD: confident specifics about unverified entities.
            rewards.append(-2.5)
        else:
            # NEUTRAL-BAD: unknown entities with no clear hedging.
            rewards.append(-1.0)

    return rewards
1537
+
1538
+
1539
def epistemic_calibration_reward(
    prompts: Sequence[Sequence[dict[str, str]]],
    completions: Sequence[Sequence[dict[str, str]]],
    answer: Sequence[str],
    **kwargs: object,
) -> list[float]:
    """
    Combined reward for epistemic calibration.

    Encourages the model to:
    1. Be confident about entities in the verified corpus
    2. Express uncertainty about entities NOT in the corpus
    3. Avoid fabricating specific details about unknown entities

    This is a lighter-weight version of entity_verification_reward
    that doesn't require NER - just pattern matching.

    Scoring:
        +1.5: Has uncertainty phrases (regardless of content)
        -0.5: Has confident claim patterns + no uncertainty
              (soft penalty only; entity_verification_reward does the
              deeper, whitelist-backed checking)
         0.0: Neutral

    Note: the docstring previously advertised -1.5 for confident claims,
    which contradicted the implemented soft penalty of -0.5; the table
    above now matches the code.
    """
    scores: list[float] = []

    for completion in completions:
        response = completion[0]["content"]

        # Case-insensitive scans over the raw pattern string lists.
        has_uncertainty = any(
            re.search(pattern, response, re.IGNORECASE) for pattern in UNCERTAINTY_PATTERNS
        )
        has_confident_claims = any(
            re.search(pattern, response, re.IGNORECASE) for pattern in CONFIDENT_CLAIM_PATTERNS
        )

        if has_uncertainty:
            # Model expressed appropriate caution
            scores.append(1.5)
        elif has_confident_claims:
            # Confident claims could be hallucination - soft penalty only
            scores.append(-0.5)
        else:
            # Neutral
            scores.append(0.0)

    return scores
1587
+
1588
+
1589
+ # =============================================================================
1590
+ # IDEOLOGICAL FIRMNESS REWARD (Anti-Capitulation)
1591
+ # =============================================================================
1592
+
1593
# Patterns that detect ideological challenges requiring firm response.
# Consumed by _detect_ideological_challenge(); all patterns are
# precompiled with re.IGNORECASE and applied via .search().
IDEOLOGICAL_CHALLENGE_PATTERNS: list[re.Pattern[str]] = [
    # Reactionary claims about fascism/Nazis
    re.compile(r"hitler\s+help", re.IGNORECASE),
    re.compile(r"nazis?\s+(?:were\s+)?socialist", re.IGNORECASE),
    re.compile(r"national\s+socialism", re.IGNORECASE),
    # Anti-communist death toll claims
    re.compile(r"communis[mt]\s+(?:killed|murdered|deaths?)", re.IGNORECASE),
    re.compile(r"\d+\s*million\s+(?:people\s+)?(?:killed|dead|died)", re.IGNORECASE),
    re.compile(r"(?:black\s+book|death\s+toll)", re.IGNORECASE),
    # Anti-communist framing
    re.compile(
        r"(?:stalin|mao|lenin)\s+(?:was\s+)?(?:a\s+)?(?:dictator|tyrant|monster)", re.IGNORECASE
    ),
    re.compile(r"authoritarian\s+regime", re.IGNORECASE),
    re.compile(r"failed\s+experiment", re.IGNORECASE),
    re.compile(r"communis[mt]\s+fail", re.IGNORECASE),
    # Bad faith questions
    re.compile(r"holodomor", re.IGNORECASE),
    re.compile(r"gulag", re.IGNORECASE),
    re.compile(r"great\s+(?:purge|terror|leap)", re.IGNORECASE),
    re.compile(r"cultural\s+revolution", re.IGNORECASE),
    # General anti-communist challenges
    re.compile(r"(?:stalin|mao|lenin|castro|che)\s+(?:bad|evil|good)", re.IGNORECASE),
    re.compile(r"(?:was|is)\s+(?:stalin|mao|lenin|castro|che)\s+(?:bad|evil|good)", re.IGNORECASE),
    re.compile(r"(?:ussr|soviet|china|cuba)\s+(?:make\s+)?mistakes?", re.IGNORECASE),
    re.compile(r"(?:did|make)\s+mistakes?", re.IGNORECASE),
]

# Patterns indicating capitulation/both-sidesing (NEGATIVE signals).
# Presumably scored by ideological_firmness_reward — TODO confirm, the
# function body extends past this file section.
CAPITULATION_PATTERNS: list[re.Pattern[str]] = [
    # Both-sidesing
    re.compile(r"valid\s+points?\s+on\s+both\s+sides?", re.IGNORECASE),
    re.compile(
        r"on\s+(?:the\s+)?one\s+hand.*on\s+(?:the\s+)?other\s+hand", re.IGNORECASE | re.DOTALL
    ),
    re.compile(r"reasonable\s+people\s+can\s+disagree", re.IGNORECASE),
    re.compile(r"no\s+easy\s+answers?", re.IGNORECASE),
    # Hedging and avoidance
    re.compile(
        r"it'?s?\s+(?:a\s+)?(?:complex|complicated|nuanced)\s+(?:issue|question|topic|legacy)",
        re.IGNORECASE,
    ),
    re.compile(r"certainly\s+complex", re.IGNORECASE),
    re.compile(r"I\s+(?:can\s+)?(?:see|understand)\s+(?:why|your|the)", re.IGNORECASE),
    # Theory vs practice false dichotomy
    re.compile(r"communism\s+sounds?\s+good\s+in\s+theory", re.IGNORECASE),
    re.compile(r"good\s+in\s+theory\s+but", re.IGNORECASE),
    # False balance on historical figures
    re.compile(r"(?:stalin|mao)\s+did\s+(?:both\s+)?good\s+(?:things\s+)?and\s+bad", re.IGNORECASE),
    re.compile(r"achievements?\s+and\s+(?:mistakes?|excesses?|errors?)", re.IGNORECASE),
    re.compile(r"(?:mistakes?|excesses?|errors?)\s+and\s+achievements?", re.IGNORECASE),
    # Weasel phrases
    re.compile(r"some\s+(?:would|might)\s+argue", re.IGNORECASE),
    re.compile(r"one\s+perspective\s+is", re.IGNORECASE),
    re.compile(r"it\s+depends", re.IGNORECASE),
    # Partial credit to reactionary framing
    re.compile(r"while\s+(?:it'?s?\s+true|hitler|he)\s+did", re.IGNORECASE),
    re.compile(r"did\s+(?:implement|create|build)\s+some\s+programs?", re.IGNORECASE),
    re.compile(r"(?:positive|negative)\s+aspects?", re.IGNORECASE),
    re.compile(r"weigh\s+(?:both|the)", re.IGNORECASE),
]

# Patterns indicating ideological firmness (POSITIVE signals).
FIRMNESS_PATTERNS: list[re.Pattern[str]] = [
    # Direct refutation
    re.compile(
        r"this\s+(?:claim\s+)?is\s+(?:false|incorrect|wrong|a\s+lie|propaganda|a\s+myth)",
        re.IGNORECASE,
    ),
    re.compile(
        r"this\s+is\s+(?:fascist|bourgeois|reactionary|anti-?communist)\s+propaganda", re.IGNORECASE
    ),
    re.compile(
        r"this\s+(?:figure|claim|narrative)\s+(?:comes?\s+from|is)\s+(?:the\s+)?discredited",
        re.IGNORECASE,
    ),
    re.compile(r"(?:discredited|debunked)", re.IGNORECASE),
    re.compile(r"this\s+ignores?\s+(?:the\s+)?(?:material|historical|actual)", re.IGNORECASE),
    re.compile(r"this\s+framing\s+ignores?", re.IGNORECASE),
    # Marxist analytical framing
    re.compile(r"from\s+a\s+marxist\s+(?:analysis|perspective|standpoint)", re.IGNORECASE),
    re.compile(r"(?:the\s+)?material\s+(?:reality|evidence|conditions?|facts?)", re.IGNORECASE),
    re.compile(
        r"(?:the\s+)?historical\s+(?:evidence|record|facts?)\s+(?:show|demonstrate|prove)",
        re.IGNORECASE,
    ),
    re.compile(r"historically\s+documented", re.IGNORECASE),
    # Class analysis language
    re.compile(
        r"serves?\s+(?:the\s+)?interests?\s+of\s+(?:capital|the\s+bourgeoisie|imperialism)",
        re.IGNORECASE,
    ),
    re.compile(r"serves?\s+bourgeois\s+(?:historiography|ideology|interests?)", re.IGNORECASE),
    re.compile(r"bourgeois\s+(?:historians?|propaganda|framing|ideology)", re.IGNORECASE),
    re.compile(r"capitalis[tm]\s+kills?\s+millions?", re.IGNORECASE),
    # Strong Marxist position markers
    re.compile(r"dictatorship\s+of\s+the\s+(?:proletariat|bourgeoisie)", re.IGNORECASE),
    re.compile(r"class\s+(?:rule|struggle|analysis|character)", re.IGNORECASE),
    re.compile(r"(?:open\s+)?terrorist\s+dictatorship", re.IGNORECASE),
    # Material achievements of socialism
    re.compile(r"(?:universal|achieved)\s+literacy", re.IGNORECASE),
    re.compile(r"(?:provided|universal)\s+(?:housing|healthcare|education)", re.IGNORECASE),
    re.compile(r"defeated\s+fascism", re.IGNORECASE),
    re.compile(r"industrial(?:ized?|ization)\s+(?:superpower|rapidly|in)", re.IGNORECASE),
]

# Class analysis markers (substantive bonus)
CLASS_ANALYSIS_MARKERS: list[re.Pattern[str]] = [
    re.compile(r"working\s+class", re.IGNORECASE),
    re.compile(r"\bproletariat\b", re.IGNORECASE),
    re.compile(r"\bbourgeoisie\b", re.IGNORECASE),
    re.compile(r"\bcapital(?:ist(?:s|class)?|ism)?\b", re.IGNORECASE),
    re.compile(r"finance\s+capital", re.IGNORECASE),
    re.compile(r"class\s+struggle", re.IGNORECASE),
    re.compile(r"exploitation", re.IGNORECASE),
    re.compile(r"surplus\s+value", re.IGNORECASE),
    re.compile(r"dictatorship\s+of\s+the", re.IGNORECASE),
    re.compile(r"relations\s+of\s+production", re.IGNORECASE),
    re.compile(r"means\s+of\s+production", re.IGNORECASE),
    re.compile(r"\bimperialis[mt]\b", re.IGNORECASE),
    re.compile(r"monopoly\s+capital", re.IGNORECASE),
]

# Quote-to-refute pattern (quoted text followed by refutation).
# Consumed by _has_quote_to_refute(); quoting a reactionary claim in
# order to refute it should not be penalized as capitulation.
QUOTE_TO_REFUTE_PATTERNS: list[re.Pattern[str]] = [
    # Patterns where quoted claims are followed by refutation
    re.compile(r"['\"].*?['\"].*?\bbut\b", re.IGNORECASE),
    re.compile(r"['\"].*?['\"].*?\bhowever\b", re.IGNORECASE),
    re.compile(r"['\"].*?['\"].*?\bthis\s+ignores?\b", re.IGNORECASE),
    re.compile(r"claim\s+that.*?\bbut\b", re.IGNORECASE),
    re.compile(r"claim\s+that.*?\bhowever\b", re.IGNORECASE),
    re.compile(r"historians?\s+claim.*?\bbut\b", re.IGNORECASE),
]

# Principled self-criticism markers (NOT capitulation).
# _has_principled_self_criticism() requires at least two of these.
SELF_CRITICISM_MARKERS: list[re.Pattern[str]] = [
    re.compile(r"self-?criticism", re.IGNORECASE),
    re.compile(r"methodological\s+principle", re.IGNORECASE),
    re.compile(r"revolutionary\s+perspective", re.IGNORECASE),
    re.compile(r"strengthen(?:ing)?\s+socialism", re.IGNORECASE),
    re.compile(r"not\s+from\s+(?:bourgeois|anti-?communist)", re.IGNORECASE),
    re.compile(r"marxist-?leninist", re.IGNORECASE),
]
1737
+
1738
+
1739
def _detect_ideological_challenge(prompt: str) -> bool:
    """Return True when *prompt* matches any known ideological-challenge pattern."""
    for challenge in IDEOLOGICAL_CHALLENGE_PATTERNS:
        if challenge.search(prompt):
            return True
    return False
1742
+
1743
+
1744
+ def _count_pattern_matches(text: str, patterns: list[re.Pattern[str]]) -> int:
1745
+ """Count the number of pattern matches in text."""
1746
+ count = 0
1747
+ for pattern in patterns:
1748
+ if pattern.search(text):
1749
+ count += 1
1750
+ return count
1751
+
1752
+
1753
def _has_quote_to_refute(text: str) -> bool:
    """Return True when *text* quotes a claim and then refutes it."""
    for pattern in QUOTE_TO_REFUTE_PATTERNS:
        if pattern.search(text):
            return True
    return False
1756
+
1757
+
1758
def _has_principled_self_criticism(text: str) -> bool:
    """Return True when *text* shows at least two principled ML self-criticism markers."""
    return _count_pattern_matches(text, SELF_CRITICISM_MARKERS) >= 2
1762
+
1763
+
1764
def ideological_firmness_reward(
    prompts: list[str],
    completions: list[str],
    **kwargs: Any,
) -> list[float]:
    """
    Reward ideological firmness with substantive Marxist counter-arguments.

    Prompts that do not contain an ideological challenge (per
    _detect_ideological_challenge) score 0.0. For challenge prompts the
    completion is scored as:

    * -0.5 per capitulation pattern (both-sidesing, hedging, false balance)
    * +0.5 per firmness pattern (direct refutation, class analysis)
    * +0.2 per class-analysis marker, capped at +1.0 total

    Two false-positive guards reduce the capitulation count before it is
    applied: a quote-to-refute rhetorical pattern forgives one match, and
    principled Marxist-Leninist self-criticism forgives two. The final score
    is clamped to [-2.0, +2.0].

    Args:
        prompts: List of user prompts/questions.
        completions: List of model completions/responses.
        **kwargs: Additional arguments (ignored).

    Returns:
        One reward score per (prompt, completion) pair.
    """

    def _score(prompt: str, completion: str) -> float:
        # Non-challenge prompts are scored neutrally.
        if not _detect_ideological_challenge(prompt):
            return 0.0

        capitulations = _count_pattern_matches(completion, CAPITULATION_PATTERNS)
        # Quoting a reactionary claim in order to refute it is not
        # capitulation: forgive one apparent capitulation match.
        if _has_quote_to_refute(completion):
            capitulations = max(0, capitulations - 1)
        # Principled ML self-criticism is not capitulation either: forgive two.
        if _has_principled_self_criticism(completion):
            capitulations = max(0, capitulations - 2)

        firmness = _count_pattern_matches(completion, FIRMNESS_PATTERNS)
        class_markers = _count_pattern_matches(completion, CLASS_ANALYSIS_MARKERS)

        total = -capitulations * 0.5 + firmness * 0.5
        total += min(class_markers * 0.2, 1.0)  # class-analysis bonus, capped at +1.0

        # Clamp to the [-2.0, +2.0] band.
        return max(-2.0, min(2.0, total))

    return [
        _score(prompt, completion)
        for prompt, completion in zip(prompts, completions, strict=False)
    ]
src/prolewiki_llm/train_grpo_marxist.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
GRPO Fine-tuning for Marxist-Leninist Reasoning Model.

Trains DeepSeek-R1-0528-Qwen3-8B on ProleWiki corpus using GRPO
(Group Relative Policy Optimization) with custom reward functions.

Usage:
    # First transform data
    python transform_to_grpo.py

    # Then run training
    python train_grpo_marxist.py

Hardware: A40 (48GB) optimized
Expected time: ~1-2 hours for 250 steps
"""

from __future__ import annotations

import os
from pathlib import Path

# Set vLLM standby mode for better memory utilization.
# NOTE: deliberately assigned BEFORE the unsloth/vllm imports below so the
# variable is visible when those modules initialize.
os.environ["UNSLOTH_VLLM_STANDBY"] = "1"

import torch
from datasets import Dataset
from trl import GRPOConfig, GRPOTrainer
from unsloth import FastLanguageModel
from vllm import SamplingParams

from prolewiki_llm.grpo_rewards import (
    completeness_reward,
    debug_print_reward,
    match_format_approximately,
    match_format_exactly,
    semantic_similarity_reward,
    terminology_reward,
)

# =============================================================================
# CONFIGURATION
# =============================================================================

# Model
MODEL_NAME = "unsloth/DeepSeek-R1-0528-Qwen3-8B"
MAX_SEQ_LENGTH = 2048  # Longer for detailed political theory responses
LORA_RANK = 32  # Same as original notebook

# Paths (relative to the working directory)
DATA_PATH = Path("training_data/grpo_dataset.jsonl")
OUTPUT_DIR = Path("outputs/marxist-grpo")
LORA_OUTPUT = Path("outputs/marxist-grpo-lora")

# Training
MAX_STEPS = 250  # Cover most of 1058 samples
SAVE_STEPS = 50
LEARNING_RATE = 5e-6
WARMUP_RATIO = 0.1

# A40 optimized settings
GPU_MEMORY_UTILIZATION = 0.85
BATCH_SIZE = 2
GRADIENT_ACCUMULATION = 2
NUM_GENERATIONS = 4

# Completion limits (tokens)
MAX_PROMPT_LENGTH = 512
MAX_COMPLETION_LENGTH = 1500
72
+
73
+ # =============================================================================
74
+ # MAIN TRAINING FUNCTION
75
+ # =============================================================================
76
+
77
+
78
def main() -> None:
    """Run GRPO training end-to-end: load model, add LoRA, train, save adapter.

    Raises:
        RuntimeError: If no CUDA device is available.
        FileNotFoundError: If the GRPO dataset has not been generated yet.
    """
    print("=" * 60)
    print("Marxist-Leninist GRPO Training")
    print("=" * 60)

    # Check CUDA -- training is GPU-only, so fail fast on CPU hosts.
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name()
        gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"GPU: {gpu_name}")
        print(f"VRAM: {gpu_mem:.1f} GB")
    else:
        raise RuntimeError("CUDA not available!")

    # =========================================================================
    # Load Model
    # =========================================================================
    print(f"\nLoading model: {MODEL_NAME}")

    # NOTE(review): train_headless.py loads with load_in_4bit=False and states
    # GRPO requires 16-bit LoRA adapters -- confirm that 4-bit is intentional
    # for this interactive variant.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=MAX_SEQ_LENGTH,
        load_in_4bit=True,
        fast_inference=True,  # Enable vLLM
        max_lora_rank=LORA_RANK,
        gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
    )

    print(f"Model type: {model.config.model_type}")

    # =========================================================================
    # Apply LoRA
    # =========================================================================
    print("\nApplying LoRA adapters...")

    model = FastLanguageModel.get_peft_model(
        model,
        r=LORA_RANK,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        lora_alpha=LORA_RANK * 2,  # *2 speeds up training
        use_gradient_checkpointing="unsloth",
        random_state=3407,
    )

    # Report the trainable-parameter fraction introduced by the adapters.
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"Trainable: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")

    # =========================================================================
    # Load Dataset
    # =========================================================================
    print(f"\nLoading dataset: {DATA_PATH}")

    if not DATA_PATH.exists():
        raise FileNotFoundError(
            f"Dataset not found: {DATA_PATH}\n" "Run 'python transform_to_grpo.py' first!"
        )

    dataset = Dataset.from_json(str(DATA_PATH))
    print(f"Loaded {len(dataset)} examples")

    # Show sample
    # NOTE(review): assumes each 'prompt' is a [system, user] message list so
    # index 1 is the user turn -- verify against transform_to_grpo.py output.
    sample = dataset[0]
    print(f"Sample prompt: {sample['prompt'][1]['content'][:60]}...")

    # =========================================================================
    # Configure vLLM Sampling
    # =========================================================================
    vllm_sampling_params = SamplingParams(
        min_p=0.1,
        top_p=1.0,  # No nucleus sampling (matches original template)
        top_k=-1,
        # NOTE: temperature is set in GRPOConfig, not here
        max_tokens=MAX_COMPLETION_LENGTH,
        stop=[tokenizer.eos_token],
        include_stop_str_in_output=True,
        seed=3407,
    )

    # =========================================================================
    # Configure Training
    # =========================================================================
    print("\nConfiguring GRPO trainer...")

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    training_args = GRPOConfig(
        # vLLM
        vllm_sampling_params=vllm_sampling_params,
        temperature=1.0,  # For GRPO training dynamics
        # Optimization
        learning_rate=LEARNING_RATE,
        weight_decay=0.001,
        warmup_ratio=WARMUP_RATIO,
        lr_scheduler_type="linear",
        optim="adamw_8bit",
        # Batch settings
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION,
        num_generations=NUM_GENERATIONS,
        # Sequence lengths
        max_prompt_length=MAX_PROMPT_LENGTH,
        max_completion_length=MAX_COMPLETION_LENGTH,
        # Training duration
        max_steps=MAX_STEPS,
        save_steps=SAVE_STEPS,
        # Logging (no remote reporting in this interactive variant)
        logging_steps=1,
        report_to="none",
        # Output
        output_dir=str(OUTPUT_DIR),
    )

    # =========================================================================
    # Create Trainer
    # =========================================================================
    print("\nInitializing trainer with reward functions:")
    print("  - match_format_exactly (+3.0 for </think>)")
    print("  - match_format_approximately (±0.5 for tags)")
    print("  - semantic_similarity_reward (+5.0 to -3.0)")
    print("  - terminology_reward (+0 to +2.0)")
    print("  - completeness_reward (±2.0)")
    print("  - debug_print_reward (monitoring)")

    trainer = GRPOTrainer(
        model=model,
        processing_class=tokenizer,
        reward_funcs=[
            match_format_exactly,
            match_format_approximately,
            semantic_similarity_reward,
            terminology_reward,
            completeness_reward,
            debug_print_reward,
        ],
        args=training_args,
        train_dataset=dataset,
    )

    # =========================================================================
    # Train!
    # =========================================================================
    print("\n" + "=" * 60)
    print("STARTING TRAINING")
    print("=" * 60)
    print(f"Steps: {MAX_STEPS}")
    print(f"Batch: {BATCH_SIZE} x {GRADIENT_ACCUMULATION} x {NUM_GENERATIONS}")
    print(f"Learning rate: {LEARNING_RATE}")
    print()

    trainer.train()

    # =========================================================================
    # Save
    # =========================================================================
    print("\n" + "=" * 60)
    print("SAVING MODEL")
    print("=" * 60)

    LORA_OUTPUT.mkdir(parents=True, exist_ok=True)
    model.save_lora(str(LORA_OUTPUT))
    print(f"LoRA saved to: {LORA_OUTPUT}")

    print("\n" + "=" * 60)
    print("TRAINING COMPLETE!")
    print("=" * 60)
    print("\nNext steps:")
    print("1. Test the model with and without LoRA")
    print("2. Export to GGUF if satisfied")
    print("3. Create Ollama Modelfile")
257
+
258
+
259
+ # =============================================================================
260
+ # TEST FUNCTION
261
+ # =============================================================================
262
+
263
+
264
def test_model() -> None:
    """Smoke-test the base model with and without the trained LoRA adapter.

    Loads the base model, then answers the first two sample questions twice:
    once with ``lora_request=None`` (base behavior) and once with the adapter
    loaded from ``LORA_OUTPUT``, printing truncated outputs for manual
    side-by-side comparison.
    """
    print("Loading model for testing...")

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=MAX_SEQ_LENGTH,
        load_in_4bit=True,
        fast_inference=True,
        max_lora_rank=LORA_RANK,
        gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
    )

    test_questions = [
        "What is revisionism in the Marxist sense?",
        "Explain the concept of surplus value.",
        "What is the dictatorship of the proletariat?",
        "How does dialectical materialism differ from idealism?",
    ]

    sampling_params = SamplingParams(
        temperature=0.7,
        top_k=50,
        max_tokens=1024,
    )

    system_prompt = """You are a Marxist-Leninist assistant trained on ProleWiki.
Think through political theory questions using dialectical materialist analysis.
Show your reasoning in <think> tags, then provide a clear answer."""

    print("\n" + "=" * 60)
    print("TESTING WITHOUT LORA")
    print("=" * 60)

    # Only the first two questions are exercised to keep the smoke test short.
    for question in test_questions[:2]:
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question},
        ]
        text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        output = (
            model.fast_generate(text, sampling_params=sampling_params, lora_request=None)[0]
            .outputs[0]
            .text
        )
        print(f"\nQ: {question}")
        print(f"A: {output[:500]}...")

    print("\n" + "=" * 60)
    print("TESTING WITH LORA")
    print("=" * 60)

    for question in test_questions[:2]:
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question},
        ]
        text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        # NOTE(review): load_lora is invoked once per question -- hoisting it
        # out of the loop would avoid reloading the adapter each iteration.
        output = (
            model.fast_generate(
                text,
                sampling_params=sampling_params,
                lora_request=model.load_lora(str(LORA_OUTPUT)),
            )[0]
            .outputs[0]
            .text
        )
        print(f"\nQ: {question}")
        print(f"A: {output[:500]}...")
333
+
334
+
335
+ if __name__ == "__main__":
336
+ import sys
337
+
338
+ if len(sys.argv) > 1 and sys.argv[1] == "test":
339
+ test_model()
340
+ else:
341
+ main()
src/prolewiki_llm/train_headless.py ADDED
@@ -0,0 +1,460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Headless GRPO Training for RunPod Deployment.

This script is designed for containerized, non-interactive execution on RunPod.
It adapts train_grpo_marxist.py for headless operation with:
- Environment variable configuration
- Checkpoint resumption support
- Automatic model upload to HuggingFace Hub
- W&B logging for remote monitoring
- Self-termination capability

Environment Variables:
    Required:
        HF_TOKEN - HuggingFace API token for model upload
        WANDB_API_KEY - Weights & Biases API key

    Optional (with defaults):
        HF_REPO - Target repo for model upload (default: prolewiki/marxist-grpo-lora)
        RUNPOD_POD_ID - Pod ID for self-termination after training
        MODEL_NAME - Base model (default: unsloth/DeepSeek-R1-0528-Qwen3-8B)
        MAX_STEPS - Training steps (default: 500)
        BATCH_SIZE - Per-device batch size (default: 2)
        LEARNING_RATE - Learning rate (default: 5e-6)
        REWARD_MODE - FULL, ROBUST, or LEGACY (default: FULL)
        DATASET_PATH - Path to grpo_dataset.jsonl (default: /workspace/dataset.jsonl)
        CHECKPOINT_DIR - Directory for checkpoints (default: /workspace/checkpoints)
        LORA_OUTPUT - Directory for final LoRA (default: /workspace/lora-output)

Usage:
    # In container:
    python -m prolewiki_llm.train_headless

    # With environment overrides:
    MAX_STEPS=100 REWARD_MODE=ROBUST python -m prolewiki_llm.train_headless
"""

from __future__ import annotations

import os
import sys
from pathlib import Path

# =============================================================================
# CRITICAL: Disable torch.compile BEFORE any imports
# =============================================================================
# These environment variables prevent torch.compile from spawning inductor
# compilation workers that hang indefinitely on RunPod/WSL2/Jupyter.
# See: https://github.com/unslothai/unsloth/issues/3432
# The heavy ML imports happen inside main() so these settings take effect first.
os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"
os.environ["TORCH_COMPILE"] = "0"
os.environ["TORCHINDUCTOR_DISABLE"] = "1"
os.environ["UNSLOTH_DISABLE_FAST_GENERATION"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
os.environ["UNSLOTH_VLLM_STANDBY"] = "1"
57
+
58
+
59
def get_env(key: str, default: str | None = None, required: bool = False) -> str | None:
    """Read an environment variable with optional default and required check.

    Args:
        key: Environment variable name.
        default: Value returned when the variable is unset.
        required: When True, print an error and exit the process if unset.

    Returns:
        The variable's value, the default, or None when neither exists
        (only possible when ``required`` is False).
    """
    value = os.environ.get(key, default)
    if required and value is None:
        print(f"ERROR: Required environment variable {key} not set", file=sys.stderr)
        sys.exit(1)
    # FIX: the annotation now tells the truth (str | None) instead of claiming
    # str and silencing the checker with `type: ignore`.
    return value
66
+
67
+
68
def get_env_int(key: str, default: int) -> int:
    """Read an environment variable as an integer, falling back to *default*."""
    raw = os.environ.get(key)
    return default if raw is None else int(raw)
71
+
72
+
73
def get_env_float(key: str, default: float) -> float:
    """Read an environment variable as a float, falling back to *default*."""
    raw = os.environ.get(key)
    return default if raw is None else float(raw)
76
+
77
+
78
# =============================================================================
# CONFIGURATION FROM ENVIRONMENT
# =============================================================================
# All values are resolved once at import time.

# Required secrets -- the process exits at import time if these are unset.
HF_TOKEN = get_env("HF_TOKEN", required=True)
WANDB_API_KEY = get_env("WANDB_API_KEY", required=True)

# Model configuration
MODEL_NAME = get_env("MODEL_NAME", "unsloth/DeepSeek-R1-0528-Qwen3-8B")
MAX_SEQ_LENGTH = get_env_int("MAX_SEQ_LENGTH", 2048)
LORA_RANK = get_env_int("LORA_RANK", 32)

# Training configuration
MAX_STEPS = get_env_int("MAX_STEPS", 500)
SAVE_STEPS = get_env_int("SAVE_STEPS", 50)
LEARNING_RATE = get_env_float("LEARNING_RATE", 5e-6)
WARMUP_RATIO = get_env_float("WARMUP_RATIO", 0.1)
BATCH_SIZE = get_env_int("BATCH_SIZE", 2)
GRADIENT_ACCUMULATION = get_env_int("GRADIENT_ACCUMULATION", 2)
NUM_GENERATIONS = get_env_int("NUM_GENERATIONS", 4)
GPU_MEMORY_UTILIZATION = get_env_float("GPU_MEMORY_UTILIZATION", 0.6)

# Sequence lengths (tokens)
MAX_PROMPT_LENGTH = get_env_int("MAX_PROMPT_LENGTH", 512)
MAX_COMPLETION_LENGTH = get_env_int("MAX_COMPLETION_LENGTH", 1500)

# Paths (container-local defaults under /workspace)
DATASET_PATH = Path(get_env("DATASET_PATH", "/workspace/dataset.jsonl"))
CHECKPOINT_DIR = Path(get_env("CHECKPOINT_DIR", "/workspace/checkpoints"))
LORA_OUTPUT = Path(get_env("LORA_OUTPUT", "/workspace/lora-output"))
OUTPUT_DIR = Path(get_env("OUTPUT_DIR", "/workspace/outputs"))

# Upload configuration
HF_REPO = get_env("HF_REPO", "prolewiki/marxist-grpo-lora")

# Reward mode: FULL, ROBUST, or LEGACY (any other value falls back to LEGACY
# in main()'s selection logic).
REWARD_MODE = get_env("REWARD_MODE", "FULL").upper()

# Pod management (None when not running on RunPod)
RUNPOD_POD_ID = get_env("RUNPOD_POD_ID")
119
+
120
+
121
def find_latest_checkpoint(checkpoint_dir: Path) -> Path | None:
    """Find the latest checkpoint directory if resuming training.

    Scans *checkpoint_dir* for subdirectories named ``checkpoint-<step>`` and
    returns the one with the highest step number.

    Args:
        checkpoint_dir: Directory that may contain ``checkpoint-N`` subdirs.

    Returns:
        The highest-numbered checkpoint directory, or None when the directory
        does not exist or contains no numeric checkpoints.
    """
    if not checkpoint_dir.exists():
        return None

    def _step(candidate: Path) -> int:
        return int(candidate.name.split("-")[1])

    # FIX: skip non-numeric suffixes (e.g. "checkpoint-final") instead of
    # crashing on int() when sorting.
    candidates = [
        d
        for d in checkpoint_dir.iterdir()
        if d.is_dir() and d.name.startswith("checkpoint-") and d.name.split("-")[1].isdigit()
    ]
    return max(candidates, key=_step, default=None)
134
+
135
+
136
def upload_to_hub(model_path: Path, repo_id: str, token: str) -> None:
    """Push the trained LoRA adapter directory to a private HuggingFace repo."""
    # Imported lazily so the module loads even without huggingface_hub installed.
    from huggingface_hub import HfApi

    print(f"\nUploading model to HuggingFace Hub: {repo_id}")

    hub = HfApi(token=token)

    # Best-effort repo creation; an already-existing repo is not an error.
    try:
        hub.create_repo(repo_id, exist_ok=True, private=True)
    except Exception as e:
        print(f"Note: Could not create repo (may already exist): {e}")

    # Push the whole adapter directory in a single commit.
    hub.upload_folder(
        folder_path=str(model_path),
        repo_id=repo_id,
        commit_message="Headless GRPO training run",
    )

    print(f"Model uploaded to: https://huggingface.co/{repo_id}")
158
+
159
+
160
def main() -> int:
    """Run headless GRPO training.

    Returns:
        Process exit code: 0 on success, 1 on missing CUDA or missing dataset.

    Raises:
        Exception: Training errors are re-raised after W&B is marked failed.
    """
    # Heavy ML imports are deferred to function scope so they execute only
    # after the torch.compile-disabling env vars at module top took effect.
    import torch
    import wandb
    from datasets import Dataset
    from trl import GRPOConfig, GRPOTrainer
    from unsloth import FastLanguageModel
    from vllm import SamplingParams

    from prolewiki_llm.grpo_rewards import (
        completeness_reward,
        debug_print_reward,
        full_coherence_reward,
        match_format_approximately,
        match_format_exactly,
        robust_coherence_reward,
        semantic_similarity_reward,
        terminology_reward,
    )
    from prolewiki_llm.wandb_logging import (
        WandbSampleLogger,
        create_logging_reward,
        finish_wandb_logging,
        init_wandb_logging,
    )

    print("=" * 70)
    print("HEADLESS GRPO TRAINING - RUNPOD DEPLOYMENT")
    print("=" * 70)

    # =========================================================================
    # System Info
    # =========================================================================
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name()
        gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"GPU: {gpu_name}")
        print(f"VRAM: {gpu_mem:.1f} GB")
    else:
        print("ERROR: CUDA not available!", file=sys.stderr)
        return 1

    print(f"\nConfiguration:")
    print(f"  Model: {MODEL_NAME}")
    print(f"  Max Steps: {MAX_STEPS}")
    print(f"  Batch Size: {BATCH_SIZE} x {GRADIENT_ACCUMULATION} = {BATCH_SIZE * GRADIENT_ACCUMULATION}")
    print(f"  Learning Rate: {LEARNING_RATE}")
    print(f"  Reward Mode: {REWARD_MODE}")
    print(f"  Dataset: {DATASET_PATH}")
    print(f"  Output: {LORA_OUTPUT}")
    print(f"  HF Repo: {HF_REPO}")

    # =========================================================================
    # Initialize W&B
    # =========================================================================
    print("\nInitializing Weights & Biases...")
    wandb.login(key=WANDB_API_KEY)

    # NOTE(review): wandb_run is never referenced afterwards; the run is
    # closed via finish_wandb_logging() below.
    wandb_run = init_wandb_logging(
        project="marxist-grpo-headless",
        config={
            "model": MODEL_NAME,
            "learning_rate": LEARNING_RATE,
            "batch_size": BATCH_SIZE,
            "gradient_accumulation": GRADIENT_ACCUMULATION,
            "num_generations": NUM_GENERATIONS,
            "max_steps": MAX_STEPS,
            "reward_mode": REWARD_MODE,
            "lora_rank": LORA_RANK,
        },
        tags=["grpo", "marxist-leninist", "headless", "runpod"],
    )

    # Periodically logs sample completions to W&B via the logging reward below.
    sample_logger = WandbSampleLogger(log_every_n_steps=10, max_samples_per_log=4)
    logging_reward = create_logging_reward(sample_logger, compute_all_rewards=True)

    # =========================================================================
    # Load Dataset
    # =========================================================================
    print(f"\nLoading dataset from: {DATASET_PATH}")

    if not DATASET_PATH.exists():
        print(f"ERROR: Dataset not found: {DATASET_PATH}", file=sys.stderr)
        return 1

    dataset = Dataset.from_json(str(DATASET_PATH))
    print(f"Loaded {len(dataset):,} examples")

    # =========================================================================
    # Load Model
    # =========================================================================
    print(f"\nLoading model: {MODEL_NAME}")

    # GRPO requires 16-bit LoRA adapters (load_in_4bit=False)
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=MAX_SEQ_LENGTH,
        load_in_4bit=False,  # Must be False for GRPO
        fast_inference=True,
        max_lora_rank=LORA_RANK,
        gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
    )

    print(f"Model type: {model.config.model_type}")

    # =========================================================================
    # Apply LoRA
    # =========================================================================
    print("\nApplying LoRA adapters...")

    # Use gradient_checkpointing=True (not "unsloth") for stability on RunPod
    model = FastLanguageModel.get_peft_model(
        model,
        r=LORA_RANK,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_alpha=LORA_RANK,  # Same as r for GRPO (not r*2)
        use_gradient_checkpointing=True,  # Stable on RunPod (not "unsloth")
        random_state=3407,
    )

    # Report the trainable-parameter fraction introduced by the adapters.
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"Trainable: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")

    # =========================================================================
    # Configure vLLM Sampling
    # =========================================================================
    vllm_sampling_params = SamplingParams(
        min_p=0.1,
        top_p=1.0,
        top_k=-1,
        max_tokens=MAX_COMPLETION_LENGTH,
        stop=[tokenizer.eos_token],
        include_stop_str_in_output=True,
        seed=3407,
    )

    # =========================================================================
    # Configure Training
    # =========================================================================
    print("\nConfiguring GRPO trainer...")

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

    training_args = GRPOConfig(
        # vLLM
        vllm_sampling_params=vllm_sampling_params,
        temperature=1.0,
        # Optimization
        learning_rate=LEARNING_RATE,
        weight_decay=0.001,
        warmup_ratio=WARMUP_RATIO,
        lr_scheduler_type="linear",
        optim="adamw_8bit",
        # Batch settings
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION,
        num_generations=NUM_GENERATIONS,
        # Sequence lengths
        max_prompt_length=MAX_PROMPT_LENGTH,
        max_completion_length=MAX_COMPLETION_LENGTH,
        # Training duration
        max_steps=MAX_STEPS,
        save_steps=SAVE_STEPS,
        # Logging
        logging_steps=1,
        report_to="wandb",
        # Output (checkpoints land here; resumption scans the same dir)
        output_dir=str(CHECKPOINT_DIR),
    )

    # =========================================================================
    # Select Reward Functions
    # =========================================================================
    # NOTE(review): the printed lists omit debug_print_reward, which is
    # nevertheless included in every mode below.
    if REWARD_MODE == "FULL":
        print("\nUsing FULL reward mode (recommended):")
        print("  - match_format_exactly, match_format_approximately")
        print("  - full_coherence_reward (NLI + structure + topic + depth)")
        print("  - completeness_reward, logging_reward")
        reward_funcs = [
            match_format_exactly,
            match_format_approximately,
            full_coherence_reward,
            completeness_reward,
            debug_print_reward,
            logging_reward,
        ]
    elif REWARD_MODE == "ROBUST":
        print("\nUsing ROBUST reward mode:")
        print("  - match_format_exactly, match_format_approximately")
        print("  - robust_coherence_reward (NLI + self-consistency + structure)")
        print("  - completeness_reward, logging_reward")
        reward_funcs = [
            match_format_exactly,
            match_format_approximately,
            robust_coherence_reward,
            completeness_reward,
            debug_print_reward,
            logging_reward,
        ]
    else:  # LEGACY (also the fallback for unrecognized REWARD_MODE values)
        print("\nUsing LEGACY reward mode (faster, less robust):")
        print("  - match_format_exactly, match_format_approximately")
        print("  - semantic_similarity_reward, terminology_reward")
        print("  - completeness_reward, logging_reward")
        reward_funcs = [
            match_format_exactly,
            match_format_approximately,
            semantic_similarity_reward,
            terminology_reward,
            completeness_reward,
            debug_print_reward,
            logging_reward,
        ]

    # =========================================================================
    # Create Trainer
    # =========================================================================
    trainer = GRPOTrainer(
        model=model,
        processing_class=tokenizer,
        reward_funcs=reward_funcs,
        args=training_args,
        train_dataset=dataset,
    )

    # =========================================================================
    # Check for Checkpoint Resume
    # =========================================================================
    resume_from = find_latest_checkpoint(CHECKPOINT_DIR)
    if resume_from:
        print(f"\nResuming from checkpoint: {resume_from}")

    # =========================================================================
    # Train!
    # =========================================================================
    print("\n" + "=" * 70)
    print("STARTING TRAINING")
    print("=" * 70)
    print(f"Steps: {MAX_STEPS}")
    print(f"Batch: {BATCH_SIZE} x {GRADIENT_ACCUMULATION} x {NUM_GENERATIONS}")
    print(f"Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION * NUM_GENERATIONS}")
    print()

    try:
        if resume_from:
            trainer.train(resume_from_checkpoint=str(resume_from))
        else:
            trainer.train()
    except KeyboardInterrupt:
        # NOTE(review): no trainer checkpoint is explicitly written here; we
        # fall through to the LoRA save below with whatever state exists.
        print("\nTraining interrupted. Saving checkpoint...")
    except Exception as e:
        # Mark the W&B run as failed before propagating the error.
        print(f"\nTraining error: {e}", file=sys.stderr)
        finish_wandb_logging({"status": "error", "error": str(e)})
        raise

    # =========================================================================
    # Save LoRA
    # =========================================================================
    print("\n" + "=" * 70)
    print("SAVING MODEL")
    print("=" * 70)

    LORA_OUTPUT.mkdir(parents=True, exist_ok=True)
    model.save_lora(str(LORA_OUTPUT))
    print(f"LoRA saved to: {LORA_OUTPUT}")

    # =========================================================================
    # Upload to HuggingFace Hub (best-effort: failure does not fail the run)
    # =========================================================================
    try:
        upload_to_hub(LORA_OUTPUT, HF_REPO, HF_TOKEN)
    except Exception as e:
        print(f"Warning: Failed to upload to HuggingFace Hub: {e}", file=sys.stderr)

    # =========================================================================
    # Finish W&B
    # =========================================================================
    finish_wandb_logging({
        "status": "completed",
        "final_step": MAX_STEPS,
        "reward_mode": REWARD_MODE,
        "dataset_size": len(dataset),
        "hf_repo": HF_REPO,
    })

    print("\n" + "=" * 70)
    print("TRAINING COMPLETE!")
    print("=" * 70)
    print(f"LoRA saved to: {LORA_OUTPUT}")
    print(f"Model uploaded to: https://huggingface.co/{HF_REPO}")

    return 0
457
+
458
+
459
+ if __name__ == "__main__":
460
+ sys.exit(main())
src/prolewiki_llm/train_marxist.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Fine-tune DeepSeek-R1-Distill-Qwen-7B-abliterated on ProleWiki Marxist-Leninist corpus.
4
+
5
+ Usage on RunPod:
6
+ 1. Upload this script and train_qwen.jsonl to /workspace/data/
7
+ 2. pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
8
+ 3. pip install trl>=0.7.0 datasets accelerate bitsandbytes peft
9
+ 4. python train_marxist.py
10
+
11
+ Expected: ~30 min training, ~$0.30-0.50 on RTX 4090
12
+ """
13
+
14
+ from pathlib import Path
15
+
16
+ import torch
17
+ from datasets import Dataset
18
+ from trl import SFTConfig, SFTTrainer
19
+ from unsloth import FastLanguageModel
20
+
21
+ # =============================================================================
22
+ # CONFIGURATION
23
+ # =============================================================================
24
+
25
+ MODEL_NAME = "huihui-ai/DeepSeek-R1-Distill-Qwen-7B-abliterated"
26
+ MAX_SEQ_LENGTH = 2048 # Qwen can go higher but 2048 is sufficient
27
+ LOAD_IN_4BIT = True
28
+
29
+ # LoRA config
30
+ LORA_R = 16
31
+ LORA_ALPHA = 32
32
+ LORA_DROPOUT = 0.05
33
+
34
+ # Training config
35
+ EPOCHS = 3
36
+ BATCH_SIZE = 2
37
+ GRADIENT_ACCUMULATION = 4 # Effective batch = 8
38
+ LEARNING_RATE = 2e-4
39
+ WARMUP_RATIO = 0.1
40
+
41
+ # Paths - adjust for your environment
42
+ DATA_PATH = Path("/workspace/data/train_qwen.jsonl") # RunPod
43
+ OUTPUT_DIR = Path("/workspace/outputs/marxist-deepseek")
44
+ CHECKPOINT_DIR = Path("/workspace/checkpoints")
45
+
46
# Fallback for local testing
# If the RunPod workspace dataset is absent, rebase all paths onto
# repo-relative directories so the script also runs outside the pod.
if not DATA_PATH.exists():
    DATA_PATH = Path("training_data/formatted/train_qwen.jsonl")
    OUTPUT_DIR = Path("outputs/marxist-deepseek")
    CHECKPOINT_DIR = Path("checkpoints")
51
+
52
+
53
def load_dataset(path: Path) -> Dataset:
    """Load a pre-formatted Qwen-template dataset from a JSONL file.

    Args:
        path: JSONL file with one JSON training example per line.

    Returns:
        A ``datasets.Dataset`` built from the parsed examples.
    """
    import json

    with open(path) as f:
        # Skip blank lines (e.g. a trailing newline) instead of letting
        # json.loads raise on an empty string.
        examples = [json.loads(line) for line in f if line.strip()]

    print(f"Loaded {len(examples)} training examples")
    return Dataset.from_list(examples)
64
+
65
+
66
def main() -> None:
    """Run end-to-end fine-tuning: load model, attach LoRA, train, export.

    Requires a CUDA GPU (raises RuntimeError otherwise). Side effects:
    creates OUTPUT_DIR and CHECKPOINT_DIR, writes trainer checkpoints,
    a LoRA adapter, and a q4_k_m GGUF export.
    """
    print("=" * 60)
    print("Marxist-Leninist LLM Fine-Tuning")
    print("=" * 60)

    # Check CUDA — training is GPU-only, so fail fast with a clear message.
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name()}")
        print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    else:
        raise RuntimeError("CUDA not available - need GPU for training!")

    # Load model (4-bit quantized base via Unsloth)
    print(f"\nLoading model: {MODEL_NAME}")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=MAX_SEQ_LENGTH,
        load_in_4bit=LOAD_IN_4BIT,
        dtype=None,  # Auto-detect (bf16 if available)
    )
    print(f"Model type: {model.config.model_type}")

    # Apply LoRA adapters to all attention and MLP projections
    print("\nApplying LoRA adapters...")
    model = FastLanguageModel.get_peft_model(
        model,
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        bias="none",
        use_gradient_checkpointing="unsloth",  # 30% less VRAM
        random_state=3407,
        max_seq_length=MAX_SEQ_LENGTH,
    )

    # Report the fraction of parameters actually being trained (LoRA only).
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")

    # Load dataset
    print(f"\nLoading dataset: {DATA_PATH}")
    dataset = load_dataset(DATA_PATH)

    # Create output directories
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

    # Configure trainer
    print("\nConfiguring trainer...")
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",  # Pre-formatted Qwen template
        max_seq_length=MAX_SEQ_LENGTH,
        args=SFTConfig(
            # Batch settings
            per_device_train_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION,
            # Learning rate
            learning_rate=LEARNING_RATE,
            lr_scheduler_type="cosine",
            warmup_ratio=WARMUP_RATIO,
            # Training duration
            num_train_epochs=EPOCHS,
            # Memory optimization: bf16 where supported, fp16 otherwise
            fp16=not torch.cuda.is_bf16_supported(),
            bf16=torch.cuda.is_bf16_supported(),
            optim="adamw_8bit",
            # Logging
            logging_steps=10,
            save_strategy="epoch",
            save_total_limit=2,
            # Output
            output_dir=str(OUTPUT_DIR),
            seed=3407,
            report_to="none",  # or "wandb" if configured
        ),
    )

    # Train!
    print("\n" + "=" * 60)
    print("STARTING TRAINING")
    print("=" * 60)
    print(f"Epochs: {EPOCHS}")
    print(
        f"Batch size: {BATCH_SIZE} x {GRADIENT_ACCUMULATION} = {BATCH_SIZE * GRADIENT_ACCUMULATION}"
    )
    print(f"Learning rate: {LEARNING_RATE}")
    print()

    trainer.train()

    # Save final model (adapter weights + tokenizer side by side)
    print("\n" + "=" * 60)
    print("SAVING MODEL")
    print("=" * 60)

    lora_path = CHECKPOINT_DIR / "marxist-lora-adapter"
    model.save_pretrained(str(lora_path))
    tokenizer.save_pretrained(str(lora_path))
    print(f"LoRA adapter saved to: {lora_path}")

    # Export to GGUF for local inference (e.g. Ollama)
    print("\nExporting to GGUF (q4_k_m)...")
    gguf_path = CHECKPOINT_DIR / "marxist-deepseek-q4_k_m"
    model.save_pretrained_gguf(
        str(gguf_path),
        tokenizer,
        quantization_method="q4_k_m",
    )
    print(f"GGUF exported to: {gguf_path}")

    print("\n" + "=" * 60)
    print("TRAINING COMPLETE!")
    print("=" * 60)
    print("\nNext steps:")
    print(f"1. Download: {gguf_path}/*.gguf")
    print("2. Create Ollama Modelfile (see ai-docs/finetune.yaml)")
    print("3. ollama create marxist-deepseek -f Modelfile")
    print("4. ollama run marxist-deepseek")
    print("\nDON'T FORGET TO STOP YOUR RUNPOD!")
198
+
199
+
200
+ if __name__ == "__main__":
201
+ main()
src/prolewiki_llm/transform_to_grpo.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Transform curated_qa.jsonl to GRPO training format.
4
+
5
+ Input format: {"instruction": "...", "response": "..."}
6
+ Output format: {"prompt": [...], "answer": "..."}
7
+
8
+ Usage:
9
+ python transform_to_grpo.py
10
+ """
11
+
12
+ import json
13
+ from pathlib import Path
14
+
15
SYSTEM_PROMPT = """You are a Marxist-Leninist assistant trained on ProleWiki and critical theory.
Think through political theory questions using dialectical materialist analysis.
Show your reasoning in <think> tags, then provide a clear, well-sourced answer."""

INPUT_PATH = Path("training_data/curated_qa.jsonl")
OUTPUT_PATH = Path("training_data/grpo_dataset.jsonl")


def transform_qa_to_grpo(input_path: Path, output_path: Path) -> int:
    """Transform instruction/response pairs to GRPO chat format.

    Each non-blank input line must be a JSON object with "instruction" and
    "response" keys. Blank lines (e.g. a trailing newline) are skipped
    instead of crashing json.loads.

    Args:
        input_path: JSONL file of {"instruction", "response"} records.
        output_path: Destination JSONL of {"prompt": [...], "answer": ...}.

    Returns:
        Number of examples written.
    """
    count = 0

    with open(input_path) as infile, open(output_path, "w") as outfile:
        for line in infile:
            if not line.strip():
                continue  # tolerate blank/trailing lines
            item = json.loads(line)

            transformed = {
                "prompt": [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": item["instruction"]},
                ],
                "answer": item["response"],
            }

            outfile.write(json.dumps(transformed) + "\n")
            count += 1

    return count
43
+
44
+
45
def main() -> None:
    """Run the transformation and print a preview of the first record."""
    print(f"Transforming {INPUT_PATH} to GRPO format...")

    n_written = transform_qa_to_grpo(INPUT_PATH, OUTPUT_PATH)

    print(f"Transformed {n_written} examples")
    print(f"Output written to: {OUTPUT_PATH}")

    # Preview the first transformed record so the format is easy to eyeball.
    print("\nSample output:")
    with open(OUTPUT_PATH) as f:
        sample = json.loads(f.readline())
    system_msg, user_msg = sample["prompt"]
    print(f" System: {system_msg['content'][:60]}...")
    print(f" User: {user_msg['content'][:60]}...")
    print(f" Answer: {sample['answer'][:60]}...")
61
+
62
+
63
+ if __name__ == "__main__":
64
+ main()
src/prolewiki_llm/wandb_logging.py ADDED
@@ -0,0 +1,529 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Weights & Biases Logging for GRPO Training.
4
+
5
+ Provides comprehensive logging for debugging and monitoring GRPO fine-tuning:
6
+ - Per-step reward metrics (each reward function's mean)
7
+ - Sample tables showing question → response → reward breakdown
8
+ - Run configuration and hyperparameters
9
+ - Summary statistics at training end
10
+
11
+ Usage:
12
+ from prolewiki_llm.wandb_logging import (
13
+ init_wandb_logging,
14
+ WandbSampleLogger,
15
+ create_logging_reward,
16
+ )
17
+
18
+ # Initialize
19
+ run = init_wandb_logging(project="marxist-grpo", config={...})
20
+
21
+ # Create logger and reward function
22
+ sample_logger = WandbSampleLogger(log_every_n_steps=10)
23
+ logging_reward = create_logging_reward(sample_logger)
24
+
25
+ # Use in GRPOTrainer
26
+ trainer = GRPOTrainer(
27
+ reward_funcs=[..., logging_reward],
28
+ ...
29
+ )
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ from dataclasses import dataclass, field
35
+ from typing import TYPE_CHECKING, Any
36
+
37
+ if TYPE_CHECKING:
38
+ from collections.abc import Callable, Sequence
39
+
40
+ # Global flag to track if wandb is available
41
+ _WANDB_AVAILABLE: bool | None = None
42
+ _wandb_module: Any = None
43
+
44
+
45
def _get_wandb() -> Any:
    """Import wandb on first use and memoize the result.

    Returns the wandb module, or None when wandb is not installed. The
    outcome is cached in the module-level _WANDB_AVAILABLE/_wandb_module
    pair so the import is attempted at most once per process.
    """
    global _WANDB_AVAILABLE, _wandb_module

    if _WANDB_AVAILABLE is not None:
        return _wandb_module

    try:
        import wandb
    except ImportError:
        _WANDB_AVAILABLE = False
        _wandb_module = None
    else:
        _WANDB_AVAILABLE = True
        _wandb_module = wandb

    return _wandb_module
60
+
61
+
62
def is_wandb_available() -> bool:
    """Return True when the wandb package can be imported."""
    # _get_wandb() returns the module on success and None on failure,
    # so a non-None result is exactly the "available" case.
    return _get_wandb() is not None
66
+
67
+
68
+ # =============================================================================
69
+ # INITIALIZATION
70
+ # =============================================================================
71
+
72
+
73
def init_wandb_logging(
    project: str,
    config: dict[str, Any],
    name: str | None = None,
    tags: list[str] | None = None,
    notes: str | None = None,
    mode: str = "online",
) -> Any:
    """
    Start a Weights & Biases run for GRPO training.

    Args:
        project: W&B project name (e.g., "marxist-grpo").
        config: Hyperparameters and settings to attach to the run.
        name: Optional run name (W&B auto-generates one if None).
        tags: Tags for filtering runs; defaults to ["grpo", "marxist-leninist"].
        notes: Free-form notes about this run.
        mode: "online", "offline", or "disabled".

    Returns:
        The wandb.Run object, or None when wandb is not installed.
    """
    wandb = _get_wandb()
    if wandb is None:
        print("[WandbLogging] wandb not installed. Install with: pip install wandb")
        return None

    run = wandb.init(
        project=project,
        config=config,
        name=name,
        tags=tags or ["grpo", "marxist-leninist"],
        notes=notes,
        mode=mode,
    )

    # Register reward metrics so W&B aggregates min/max/mean summaries.
    _define_reward_metrics(run)

    print(f"[WandbLogging] Initialized run: {run.name}")
    print(f"[WandbLogging] View at: {run.url}")

    return run
129
+
130
+
131
+ def _define_reward_metrics(run: Any) -> None:
132
+ """Define reward metrics with min/max/mean summaries."""
133
+ reward_metrics = [
134
+ "rewards/format_exact",
135
+ "rewards/format_approx",
136
+ "rewards/semantic_similarity",
137
+ "rewards/terminology",
138
+ "rewards/nli_coherence",
139
+ "rewards/self_consistency",
140
+ "rewards/structural_coherence",
141
+ "rewards/topic_relevance",
142
+ "rewards/interconnection_depth",
143
+ "rewards/completeness",
144
+ "rewards/total",
145
+ ]
146
+
147
+ for metric in reward_metrics:
148
+ # Track min, max, and mean for each reward
149
+ run.define_metric(metric, summary="mean")
150
+ run.define_metric(f"{metric}_min", summary="min")
151
+ run.define_metric(f"{metric}_max", summary="max")
152
+
153
+
154
+ # =============================================================================
155
+ # SAMPLE LOGGER
156
+ # =============================================================================
157
+
158
+
159
@dataclass
class RewardSample:
    """One logged sample together with its per-reward scores."""

    step: int
    question: str
    response: str
    ground_truth: str
    rewards: dict[str, float]

    @property
    def total_reward(self) -> float:
        """Total reward: the sum over every individual reward score."""
        return sum(self.rewards.values())


@dataclass
class WandbSampleLogger:
    """
    Buffers samples during training and periodically logs them to W&B.

    Samples accumulate via add_sample(); every log_every_n_steps steps the
    most recent ones are emitted as a wandb.Table so individual model
    outputs and their reward breakdowns can be inspected in the W&B UI.
    """

    log_every_n_steps: int = 10
    max_samples_per_log: int = 4
    _samples: list[RewardSample] = field(default_factory=list)
    _step_counter: int = field(default=0)
    _table_columns: list[str] = field(
        default_factory=lambda: [
            "step",
            "question",
            "response",
            "ground_truth",
            "format_exact",
            "format_approx",
            "nli_coherence",
            "topic_relevance",
            "depth",
            "completeness",
            "total",
        ]
    )

    def add_sample(
        self,
        step: int,
        question: str,
        response: str,
        ground_truth: str,
        rewards: dict[str, float],
    ) -> None:
        """Buffer one sample, truncating long text for table display."""
        self._samples.append(
            RewardSample(
                step=step,
                question=question[:500],
                response=response[:500],
                ground_truth=ground_truth[:300],
                rewards=rewards,
            )
        )

        # Bound the buffer to a few logging batches worth of samples.
        limit = self.max_samples_per_log * 3
        if len(self._samples) > limit:
            self._samples = self._samples[-limit:]

    def should_log(self, step: int) -> bool:
        """True on every log_every_n_steps-th positive step."""
        if step <= 0:
            return False
        return step % self.log_every_n_steps == 0

    def log_table(self, step: int) -> None:
        """Emit the most recent buffered samples as a wandb.Table."""
        wandb = _get_wandb()
        if wandb is None or not self._samples:
            return

        recent = self._samples[-self.max_samples_per_log :]
        table = wandb.Table(columns=self._table_columns)

        for s in recent:
            get = s.rewards.get
            table.add_data(
                s.step,
                s.question,
                s.response,
                s.ground_truth,
                get("format_exact", 0.0),
                get("format_approx", 0.0),
                get("nli_coherence", 0.0),
                get("topic_relevance", 0.0),
                get("interconnection_depth", 0.0),
                get("completeness", 0.0),
                s.total_reward,
            )

        wandb.log({"samples": table}, step=step)
        print(f"[WandbLogging] Logged {len(recent)} samples at step {step}")

    def clear(self) -> None:
        """Drop all buffered samples."""
        self._samples.clear()
272
+
273
+
274
+ # =============================================================================
275
+ # REWARD METRICS LOGGING
276
+ # =============================================================================
277
+
278
+
279
def log_reward_metrics(
    step: int,
    reward_scores: dict[str, list[float]],
) -> None:
    """
    Log per-reward mean/min/max metrics to wandb for one training step.

    Args:
        step: Training step number.
        reward_scores: Mapping of reward name to per-sample scores,
            e.g., {"format_exact": [3.0, 3.0, 0.0, 3.0]}.
    """
    wandb = _get_wandb()
    if wandb is None:
        return

    payload: dict[str, float] = {}

    for name, scores in reward_scores.items():
        if not scores:
            continue
        payload[f"rewards/{name}"] = sum(scores) / len(scores)
        payload[f"rewards/{name}_min"] = min(scores)
        payload[f"rewards/{name}_max"] = max(scores)

    # Per-sample totals across all reward functions. The sample count is
    # taken from the first list; shorter lists simply contribute nothing
    # for the missing indices (lists may be ragged).
    if reward_scores:
        num_samples = len(next(iter(reward_scores.values())))
        totals = [
            sum(scores[i] for scores in reward_scores.values() if i < len(scores))
            for i in range(num_samples)
        ]
        if totals:
            payload["rewards/total"] = sum(totals) / len(totals)
            payload["rewards/total_min"] = min(totals)
            payload["rewards/total_max"] = max(totals)

    wandb.log(payload, step=step)
323
+
324
+
325
+ # =============================================================================
326
+ # LOGGING REWARD FUNCTION
327
+ # =============================================================================
328
+
329
+ # Global step counter for the logging reward
330
+ _LOGGING_STEP = 0
331
+
332
+
333
def create_logging_reward(
    sample_logger: WandbSampleLogger | None = None,
    compute_all_rewards: bool = True,
) -> Callable[..., list[float]]:
    """
    Create a reward function that logs metrics and samples to wandb.

    The returned function computes every individual reward internally,
    logs the scores (and, periodically, sample tables) to wandb, and
    always returns [0.0] * len(completions) so it has no training effect.

    Args:
        sample_logger: Optional WandbSampleLogger for sample-table logging.
        compute_all_rewards: If True, compute and log all reward functions.

    Returns:
        A reward function compatible with GRPOTrainer.
    """
    # NOTE: the stray `global _LOGGING_STEP` that used to sit here was a
    # no-op (the outer function never assigns it) and has been removed;
    # only the closure below declares it, where the assignment happens.

    def logging_reward(
        prompts: Sequence[Sequence[dict[str, str]]],
        completions: Sequence[Sequence[dict[str, str]]],
        answer: Sequence[str],
        **kwargs: object,
    ) -> list[float]:
        """Log rewards and samples to wandb. Returns 0.0 (no training effect)."""
        global _LOGGING_STEP
        _LOGGING_STEP += 1
        step = _LOGGING_STEP

        wandb = _get_wandb()
        if wandb is None or wandb.run is None:
            # Fallback to print if wandb not initialized
            if step % 10 == 0:
                print(f"[Step {step}] Q: {prompts[0][-1]['content'][:80]}...")
            return [0.0] * len(completions)

        # Compute all reward scores if requested
        reward_scores: dict[str, list[float]] = {}
        if compute_all_rewards:
            reward_scores = _compute_all_reward_scores(prompts, completions, answer, **kwargs)
            log_reward_metrics(step, reward_scores)

        # Log sample tables periodically
        if sample_logger and sample_logger.should_log(step):
            for i in range(min(sample_logger.max_samples_per_log, len(prompts))):
                question = prompts[i][-1]["content"]
                response = completions[i][0]["content"]
                truth = answer[i] if i < len(answer) else ""

                # Slice out this sample's score from each reward list.
                sample_rewards = {
                    name: scores[i] if i < len(scores) else 0.0
                    for name, scores in reward_scores.items()
                }

                sample_logger.add_sample(
                    step=step,
                    question=question,
                    response=response,
                    ground_truth=truth,
                    rewards=sample_rewards,
                )

            sample_logger.log_table(step)

        return [0.0] * len(completions)

    return logging_reward
414
+
415
+
416
def _compute_all_reward_scores(
    prompts: Sequence[Sequence[dict[str, str]]],
    completions: Sequence[Sequence[dict[str, str]]],
    answer: Sequence[str],
    **kwargs: object,
) -> dict[str, list[float]]:
    """
    Compute every reward function's scores for logging.

    Each reward is evaluated independently: a failure in one reward is
    printed and replaced with zeros rather than aborting the whole batch.

    Returns:
        Dict mapping reward name to the per-completion scores.
    """
    # Import reward functions here to avoid circular imports
    from prolewiki_llm.grpo_rewards import (
        completeness_reward,
        interconnection_depth_reward,
        match_format_approximately,
        match_format_exactly,
        nli_coherence_reward,
        topic_relevance_reward,
    )

    # (name, callable, positional args); **kwargs are forwarded to each.
    # Order matches the historical evaluation order.
    specs = [
        ("format_exact", match_format_exactly, (completions,)),
        ("format_approx", match_format_approximately, (completions,)),
        ("nli_coherence", nli_coherence_reward, (completions, answer)),
        ("topic_relevance", topic_relevance_reward, (prompts, completions)),
        ("interconnection_depth", interconnection_depth_reward, (completions,)),
        ("completeness", completeness_reward, (completions, answer)),
    ]

    reward_scores: dict[str, list[float]] = {}
    for name, func, args in specs:
        try:
            reward_scores[name] = func(*args, **kwargs)
        except Exception as e:
            print(f"[WandbLogging] Error in {name}: {e}")
            reward_scores[name] = [0.0] * len(completions)

    return reward_scores
481
+
482
+
483
+ # =============================================================================
484
+ # UTILITY FUNCTIONS
485
+ # =============================================================================
486
+
487
+
488
def finish_wandb_logging(summary: dict[str, Any] | None = None) -> None:
    """
    Close the active wandb run, optionally recording summary statistics.

    Args:
        summary: Final metrics to store on the run summary before closing.
    """
    wandb = _get_wandb()
    if wandb is None or wandb.run is None:
        return

    for key, value in (summary or {}).items():
        wandb.run.summary[key] = value

    wandb.finish()
    print("[WandbLogging] Run finished.")
505
+
506
+
507
def log_model_checkpoint(
    checkpoint_path: str,
    metadata: dict[str, Any] | None = None,
) -> None:
    """
    Upload a model checkpoint directory to wandb as an artifact.

    Args:
        checkpoint_path: Path to the checkpoint directory.
        metadata: Extra metadata to attach to the artifact.
    """
    wandb = _get_wandb()
    if wandb is None or wandb.run is None:
        return

    if metadata is None:
        metadata = {}
    artifact = wandb.Artifact(
        name=f"checkpoint-{wandb.run.name}",
        type="model",
        metadata=metadata,
    )
    artifact.add_dir(checkpoint_path)
    wandb.log_artifact(artifact)
    print(f"[WandbLogging] Logged checkpoint: {checkpoint_path}")
tests/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # prolewiki-llm tests
tests/conftest.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pytest configuration and fixtures for prolewiki-llm tests."""
2
+
3
+ import os
4
+ from pathlib import Path
5
+
6
+ import pytest
7
+
8
+ # =============================================================================
9
+ # Module-Level Setup for train_headless.py Tests
10
+ # =============================================================================
11
+ # train_headless.py reads required env vars (HF_TOKEN, WANDB_API_KEY) at import time.
12
+ # We need to set dummy values BEFORE pytest collects tests that import this module.
13
+ # This must be done at module level, not in a fixture, because fixtures run
14
+ # after collection and the import happens during collection.
15
+
16
# Inject dummy credentials only when the real ones are absent, so importing
# train_headless (which reads these at import time) succeeds during collection.
os.environ.setdefault("HF_TOKEN", "test-token-for-unit-tests")
os.environ.setdefault("WANDB_API_KEY", "test-key-for-unit-tests")
20
+
21
+
22
@pytest.fixture
def sample_question() -> str:
    """A representative Marxist-theory question used across reward tests."""
    return "What is the difference between revisionism and Marxism-Leninism?"
26
+
27
+
28
@pytest.fixture
def sample_good_answer() -> str:
    """A well-structured, <think>-tagged answer expected to score highly on rewards."""
    return """<think>
The question asks about the distinction between revisionism and Marxism-Leninism,
which requires explaining both concepts and their historical relationship.
</think>

Revisionism refers to attempts to revise or water down the fundamental principles
of Marxism, particularly the necessity of class struggle and proletarian
dictatorship. It emerged historically with Eduard Bernstein who rejected
revolutionary change in favor of gradual reform within capitalism.

Marxism-Leninism, in contrast, upholds the core tenets of scientific socialism:
the class nature of the state, the necessity of socialist revolution, and the
dictatorship of the proletariat as the transitional form to communism.

The key distinction lies in their approach to the capitalist state - revisionists
seek to reform it, while Marxist-Leninists understand it must be replaced by a
workers' state through revolutionary action."""
48
+
49
+
50
@pytest.fixture
def sample_bad_answer() -> str:
    """A buzzword-heavy answer with no <think> block, expected to score poorly."""
    return """Revisionism is bad and Marxism-Leninism is good. We must fight
imperialism and support the working class. Solidarity forever! The revolution
will triumph against capitalism and fascism. Workers of the world unite!"""
56
+
57
+
58
+ # =============================================================================
59
+ # Integration Test Fixtures
60
+ # =============================================================================
61
+
62
+
63
@pytest.fixture
def project_root() -> Path:
    """The repository root directory (two levels above this file)."""
    return Path(__file__).parent.parent
67
+
68
+
69
@pytest.fixture
def mock_bin_dir(tmp_path: Path) -> Path:
    """Create mock bin directory with executable stubs for shell script testing.

    Every regular file under tests/fixtures/mock_bin/ is copied into a
    temporary directory and the copies are marked executable.
    """
    mock_bin = tmp_path / "mock_bin"
    mock_bin.mkdir()

    fixtures_dir = Path(__file__).parent / "fixtures" / "mock_bin"
    if fixtures_dir.exists():
        stubs = (p for p in fixtures_dir.iterdir() if p.is_file())
        for stub in stubs:
            target = mock_bin / stub.name
            target.write_text(stub.read_text())
            target.chmod(0o755)  # make the stub runnable from PATH

    return mock_bin
89
+
90
+
91
@pytest.fixture
def start_sh_env(tmp_path: Path, mock_bin_dir: Path) -> dict[str, str]:
    """Base environment for start.sh integration tests.

    The environment puts the mock binaries first on PATH, captures mock
    invocations in a log directory, defaults all mock commands to success,
    and rebases every /workspace output path into tmp_path.
    """
    log_dir = tmp_path / "logs"
    log_dir.mkdir()

    env = {
        "PATH": f"{mock_bin_dir}:/usr/bin:/bin",
        "HOME": str(tmp_path),
        "MOCK_LOG_DIR": str(log_dir),
        "MOCK_CUDA_AVAILABLE": "1",
        "MOCK_TRAINING_EXIT_CODE": "0",
    }
    # Override /workspace paths to use tmp_path so tests never touch the host.
    env["CHECKPOINT_DIR"] = str(tmp_path / "checkpoints")
    env["LORA_OUTPUT"] = str(tmp_path / "lora-output")
    env["OUTPUT_DIR"] = str(tmp_path / "outputs")
    return env
tests/fixtures/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Test fixtures for prolewiki-llm tests."""
tests/fixtures/mock_bin/huggingface-cli ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Mock huggingface-cli: record the invocation, then exit with the
# configured code (default success) so auth paths can be exercised.

echo "huggingface-cli $*" >> "${MOCK_LOG_DIR:-/tmp}/mock_calls.log"

exit "${MOCK_HF_CLI_EXIT_CODE:-0}"
tests/fixtures/mock_bin/python ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Mock python interpreter for testing start.sh.
# Fakes the CUDA probes and the training-module invocation so the
# entrypoint can be exercised without a real GPU or Python environment.

args="$*"
echo "python $args" >> "${MOCK_LOG_DIR:-/tmp}/mock_calls.log"

case "$args" in
    *"torch.cuda.is_available"*)
        # CUDA availability probe: fail only when the test disables the mock GPU.
        if [ "${MOCK_CUDA_AVAILABLE:-1}" = "0" ]; then
            echo "AssertionError: CUDA not available" >&2
            exit 1
        fi
        exit 0
        ;;
    *"get_device_name"*)
        # GPU name query
        echo "Mock GPU RTX 4090"
        exit 0
        ;;
    *"get_device_properties"*)
        # VRAM query
        echo "24.0 GB"
        exit 0
        ;;
    *"prolewiki_llm.train_headless"*)
        # Training module invocation: exit code is test-controlled.
        exit "${MOCK_TRAINING_EXIT_CODE:-0}"
        ;;
esac

# Any other python invocation succeeds.
exit 0
tests/fixtures/mock_bin/runpodctl ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Mock runpodctl for testing start.sh auto-shutoff behavior.
# Logs the invocation; for `runpodctl stop pod <id>` it echoes the success
# message the real CLI prints, then exits with MOCK_RUNPODCTL_EXIT_CODE.

log_file="${MOCK_LOG_DIR:-/tmp}/mock_calls.log"
echo "runpodctl $*" >> "$log_file"

# Simulate a successful pod-stop operation.
case "$1 $2" in
    "stop pod")
        echo "pod \"$3\" stopped"
        ;;
esac

exit "${MOCK_RUNPODCTL_EXIT_CODE:-0}"
tests/fixtures/mock_bin/wandb ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Mock wandb CLI for testing authentication.
# Records the invocation in the shared mock-call log and exits with
# MOCK_WANDB_EXIT_CODE (default 0 = success).

log_file="${MOCK_LOG_DIR:-/tmp}/mock_calls.log"
echo "wandb $*" >> "$log_file"

exit "${MOCK_WANDB_EXIT_CODE:-0}"
tests/fixtures/mock_bin/wget ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Mock wget for testing dataset download.
# Logs the invocation and, when a `-O <file>` pair is present, writes a
# dummy JSONL dataset to <file> so downstream existence checks pass.

LOG_FILE="${MOCK_LOG_DIR:-/tmp}/mock_calls.log"
echo "wget $*" >> "$LOG_FILE"

# Scan the arguments for "-O <path>" and create a dummy dataset there.
# `prev` must start empty: tests invoke this stub with a fully custom
# environment, and an inherited variable named `prev` would otherwise
# make the very first argument look like an output path.
prev=""
for arg in "$@"; do
    if [[ "$prev" == "-O" ]]; then
        # Create dummy dataset file at the path following -O
        echo '{"prompt": "test", "answer": "test"}' > "$arg"
        break
    fi
    prev="$arg"
done

exit "${MOCK_WGET_EXIT_CODE:-0}"
tests/integration/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Integration tests for prolewiki-llm."""
tests/integration/test_start_sh.py ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Integration tests for the headless training entrypoint script.
3
+
4
+ Tests cover:
5
+ - Environment validation (required secrets, GPU check)
6
+ - Auto-shutoff behavior (FinOps: pod termination on success, no termination on failure)
7
+ - Data handling (dataset validation, download from URL)
8
+ - Authentication (HuggingFace, Weights & Biases)
9
+
10
+ These tests use subprocess to run start.sh with mock external commands.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import subprocess
16
+ from pathlib import Path
17
+
18
+ import pytest
19
+
20
+ # =============================================================================
21
+ # Environment Validation Tests
22
+ # =============================================================================
23
+
24
+
25
@pytest.mark.integration
class TestEnvironmentValidation:
    """Test environment variable validation in start.sh.

    Each case launches docker/start.sh in a subprocess under a controlled
    environment and asserts on its exit code and captured output.
    """

    @staticmethod
    def _write_dataset(tmp_path: Path) -> Path:
        """Create a minimal one-line JSONL dataset and return its path."""
        dataset = tmp_path / "dataset.jsonl"
        dataset.write_text('{"prompt":"test"}\n')
        return dataset

    @staticmethod
    def _run_start_sh(
        project_root: Path, env: dict[str, str]
    ) -> subprocess.CompletedProcess[str]:
        """Run docker/start.sh with *env*, capturing text output (30 s cap)."""
        return subprocess.run(
            ["bash", str(project_root / "docker" / "start.sh")],
            env=env,
            capture_output=True,
            text=True,
            timeout=30,
        )

    def test_exits_without_hf_token(
        self,
        tmp_path: Path,
        start_sh_env: dict[str, str],
        project_root: Path,
    ) -> None:
        """Script exits with error when HF_TOKEN is not set."""
        env = {
            **start_sh_env,
            # HF_TOKEN intentionally missing
            "WANDB_API_KEY": "test-key",
            "DATASET_PATH": str(self._write_dataset(tmp_path)),
        }

        result = self._run_start_sh(project_root, env)

        assert result.returncode == 1
        assert "HF_TOKEN" in result.stdout or "HF_TOKEN" in result.stderr

    def test_exits_without_wandb_key(
        self,
        tmp_path: Path,
        start_sh_env: dict[str, str],
        project_root: Path,
    ) -> None:
        """Script exits with error when WANDB_API_KEY is not set."""
        env = {
            **start_sh_env,
            "HF_TOKEN": "test-token",
            # WANDB_API_KEY intentionally missing
            "DATASET_PATH": str(self._write_dataset(tmp_path)),
        }

        result = self._run_start_sh(project_root, env)

        assert result.returncode == 1
        assert "WANDB_API_KEY" in result.stdout or "WANDB_API_KEY" in result.stderr

    def test_exits_without_cuda(
        self,
        tmp_path: Path,
        start_sh_env: dict[str, str],
        project_root: Path,
    ) -> None:
        """Script exits with error when CUDA is not available."""
        env = {
            **start_sh_env,
            "HF_TOKEN": "test-token",
            "WANDB_API_KEY": "test-key",
            "DATASET_PATH": str(self._write_dataset(tmp_path)),
            "MOCK_CUDA_AVAILABLE": "0",  # Simulate no GPU
        }

        result = self._run_start_sh(project_root, env)

        assert result.returncode == 1
        assert "CUDA" in result.stdout or "GPU" in result.stdout
113
+
114
+
115
+ # =============================================================================
116
+ # Auto-Shutoff Tests (Critical for FinOps)
117
+ # =============================================================================
118
+
119
+
120
@pytest.mark.integration
class TestAutoShutoff:
    """Test pod auto-termination per RunPod strategy document.

    These tests verify the FinOps strategy that prevents zombie pods.
    """

    @staticmethod
    def _write_dataset(tmp_path: Path) -> Path:
        """Create a minimal one-line JSONL dataset and return its path."""
        dataset = tmp_path / "dataset.jsonl"
        dataset.write_text('{"prompt":"test"}\n')
        return dataset

    @staticmethod
    def _run_start_sh(
        project_root: Path, env: dict[str, str]
    ) -> subprocess.CompletedProcess[str]:
        """Run docker/start.sh with *env*, capturing text output (30 s cap)."""
        return subprocess.run(
            ["bash", str(project_root / "docker" / "start.sh")],
            env=env,
            capture_output=True,
            text=True,
            timeout=30,
        )

    @staticmethod
    def _read_mock_calls(env: dict[str, str]) -> str:
        """Return the accumulated mock-command log, or '' if nothing ran."""
        log_file = Path(env["MOCK_LOG_DIR"]) / "mock_calls.log"
        return log_file.read_text() if log_file.exists() else ""

    def test_calls_runpodctl_stop_on_success(
        self,
        tmp_path: Path,
        start_sh_env: dict[str, str],
        project_root: Path,
    ) -> None:
        """Pod is terminated after successful training (FinOps)."""
        env = {
            **start_sh_env,
            "HF_TOKEN": "test-token",
            "WANDB_API_KEY": "test-key",
            "RUNPOD_POD_ID": "pod-abc123",
            "DATASET_PATH": str(self._write_dataset(tmp_path)),
            "MOCK_TRAINING_EXIT_CODE": "0",  # Training succeeds
        }

        result = self._run_start_sh(project_root, env)
        calls = self._read_mock_calls(env)

        assert result.returncode == 0
        assert "runpodctl stop pod pod-abc123" in calls

    def test_no_runpodctl_stop_without_pod_id(
        self,
        tmp_path: Path,
        start_sh_env: dict[str, str],
        project_root: Path,
    ) -> None:
        """No pod termination when RUNPOD_POD_ID is not set."""
        env = {
            **start_sh_env,
            "HF_TOKEN": "test-token",
            "WANDB_API_KEY": "test-key",
            # RUNPOD_POD_ID intentionally not set
            "DATASET_PATH": str(self._write_dataset(tmp_path)),
            "MOCK_TRAINING_EXIT_CODE": "0",
        }

        result = self._run_start_sh(project_root, env)
        calls = self._read_mock_calls(env)

        assert result.returncode == 0
        assert "runpodctl" not in calls
        assert "RUNPOD_POD_ID not set" in result.stdout

    def test_no_runpodctl_stop_on_failure(
        self,
        tmp_path: Path,
        start_sh_env: dict[str, str],
        project_root: Path,
    ) -> None:
        """Pod is NOT terminated on failure (allows debugging).

        This is critical: developers need SSH access to debug failed training.
        """
        env = {
            **start_sh_env,
            "HF_TOKEN": "test-token",
            "WANDB_API_KEY": "test-key",
            "RUNPOD_POD_ID": "pod-abc123",
            "DATASET_PATH": str(self._write_dataset(tmp_path)),
            "MOCK_TRAINING_EXIT_CODE": "1",  # Training fails
        }

        result = self._run_start_sh(project_root, env)
        calls = self._read_mock_calls(env)

        # Pod should NOT be stopped (for debugging)
        assert "runpodctl" not in calls
        assert result.returncode == 1
        assert "NOT be automatically terminated" in result.stdout

    def test_exit_code_propagates_from_training(
        self,
        tmp_path: Path,
        start_sh_env: dict[str, str],
        project_root: Path,
    ) -> None:
        """Script exits with the same code as the training process."""
        env = {
            **start_sh_env,
            "HF_TOKEN": "test-token",
            "WANDB_API_KEY": "test-key",
            "DATASET_PATH": str(self._write_dataset(tmp_path)),
            "MOCK_TRAINING_EXIT_CODE": "42",  # Custom exit code
        }

        result = self._run_start_sh(project_root, env)

        assert result.returncode == 42
260
+
261
+
262
+ # =============================================================================
263
+ # Data Handling Tests
264
+ # =============================================================================
265
+
266
+
267
@pytest.mark.integration
class TestDataHandling:
    """Test dataset validation and download logic."""

    @staticmethod
    def _run_start_sh(
        project_root: Path, env: dict[str, str]
    ) -> subprocess.CompletedProcess[str]:
        """Run docker/start.sh with *env*, capturing text output (30 s cap)."""
        return subprocess.run(
            ["bash", str(project_root / "docker" / "start.sh")],
            env=env,
            capture_output=True,
            text=True,
            timeout=30,
        )

    @staticmethod
    def _read_mock_calls(env: dict[str, str]) -> str:
        """Return the accumulated mock-command log, or '' if nothing ran."""
        log_file = Path(env["MOCK_LOG_DIR"]) / "mock_calls.log"
        return log_file.read_text() if log_file.exists() else ""

    def test_exits_when_dataset_missing_no_url(
        self,
        tmp_path: Path,
        start_sh_env: dict[str, str],
        project_root: Path,
    ) -> None:
        """Script exits when dataset is missing and no DATASET_URL provided."""
        env = {
            **start_sh_env,
            "HF_TOKEN": "test-token",
            "WANDB_API_KEY": "test-key",
            "DATASET_PATH": str(tmp_path / "nonexistent.jsonl"),
            # DATASET_URL not set
        }

        result = self._run_start_sh(project_root, env)

        assert result.returncode == 1
        assert "Dataset not found" in result.stdout

    def test_downloads_dataset_from_url(
        self,
        tmp_path: Path,
        start_sh_env: dict[str, str],
        project_root: Path,
    ) -> None:
        """Script downloads dataset when file missing but DATASET_URL set."""
        env = {
            **start_sh_env,
            "HF_TOKEN": "test-token",
            "WANDB_API_KEY": "test-key",
            "DATASET_PATH": str(tmp_path / "dataset.jsonl"),
            "DATASET_URL": "https://example.com/dataset.jsonl",
        }

        self._run_start_sh(project_root, env)
        calls = self._read_mock_calls(env)

        # wget should have been called with the configured URL
        assert "wget" in calls
        assert "https://example.com/dataset.jsonl" in calls

    def test_skips_download_when_dataset_exists(
        self,
        tmp_path: Path,
        start_sh_env: dict[str, str],
        project_root: Path,
    ) -> None:
        """Script does not download when dataset file already exists."""
        dataset = tmp_path / "dataset.jsonl"
        dataset.write_text('{"prompt":"test"}\n')

        env = {
            **start_sh_env,
            "HF_TOKEN": "test-token",
            "WANDB_API_KEY": "test-key",
            "DATASET_PATH": str(dataset),
            "DATASET_URL": "https://example.com/should-not-download.jsonl",
        }

        result = self._run_start_sh(project_root, env)
        calls = self._read_mock_calls(env)

        # wget should NOT have been called
        assert "wget" not in calls
        assert result.returncode == 0
361
+
362
+
363
+ # =============================================================================
364
+ # Authentication Tests
365
+ # =============================================================================
366
+
367
+
368
@pytest.mark.integration
class TestAuthentication:
    """Test authentication with external services."""

    @staticmethod
    def _write_dataset(tmp_path: Path) -> Path:
        """Create a minimal one-line JSONL dataset and return its path."""
        dataset = tmp_path / "dataset.jsonl"
        dataset.write_text('{"prompt":"test"}\n')
        return dataset

    @staticmethod
    def _run_start_sh(
        project_root: Path, env: dict[str, str]
    ) -> subprocess.CompletedProcess[str]:
        """Run docker/start.sh with *env*, capturing text output (30 s cap)."""
        return subprocess.run(
            ["bash", str(project_root / "docker" / "start.sh")],
            env=env,
            capture_output=True,
            text=True,
            timeout=30,
        )

    @staticmethod
    def _read_mock_calls(env: dict[str, str]) -> str:
        """Return the accumulated mock-command log, or '' if nothing ran."""
        log_file = Path(env["MOCK_LOG_DIR"]) / "mock_calls.log"
        return log_file.read_text() if log_file.exists() else ""

    def test_calls_huggingface_cli_login(
        self,
        tmp_path: Path,
        start_sh_env: dict[str, str],
        project_root: Path,
    ) -> None:
        """Script calls huggingface-cli login with the token."""
        env = {
            **start_sh_env,
            "HF_TOKEN": "hf_test_token_12345",
            "WANDB_API_KEY": "test-key",
            "DATASET_PATH": str(self._write_dataset(tmp_path)),
        }

        self._run_start_sh(project_root, env)
        calls = self._read_mock_calls(env)

        assert "huggingface-cli login" in calls
        assert "hf_test_token_12345" in calls

    def test_calls_wandb_login(
        self,
        tmp_path: Path,
        start_sh_env: dict[str, str],
        project_root: Path,
    ) -> None:
        """Script calls wandb login with the API key."""
        env = {
            **start_sh_env,
            "HF_TOKEN": "test-token",
            "WANDB_API_KEY": "wandb_key_67890",
            "DATASET_PATH": str(self._write_dataset(tmp_path)),
        }

        self._run_start_sh(project_root, env)
        calls = self._read_mock_calls(env)

        assert "wandb login" in calls
        assert "wandb_key_67890" in calls

    def test_auth_failures_dont_stop_script(
        self,
        tmp_path: Path,
        start_sh_env: dict[str, str],
        project_root: Path,
    ) -> None:
        """Script continues even if authentication commands fail (|| true)."""
        env = {
            **start_sh_env,
            "HF_TOKEN": "test-token",
            "WANDB_API_KEY": "test-key",
            "DATASET_PATH": str(self._write_dataset(tmp_path)),
            "MOCK_HF_CLI_EXIT_CODE": "1",  # HF login fails
            "MOCK_WANDB_EXIT_CODE": "1",  # wandb login fails
        }

        result = self._run_start_sh(project_root, env)

        # Script should still succeed (auth failures are non-fatal)
        assert result.returncode == 0
tests/unit/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Unit tests
tests/unit/test_grpo_rewards.py ADDED
The diff for this file is too large to render. See raw diff
 
tests/unit/test_train_headless.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unit tests for the headless training module.
3
+
4
+ Tests cover:
5
+ - Environment variable parsing (get_env, get_env_int, get_env_float)
6
+ - Checkpoint discovery (find_latest_checkpoint)
7
+ - Model upload to HuggingFace Hub (upload_to_hub)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from pathlib import Path
13
+ from unittest.mock import MagicMock, patch
14
+
15
+ import pytest
16
+
17
+ # =============================================================================
18
+ # Environment Variable Parsing Tests
19
+ # =============================================================================
20
+
21
+
22
class TestGetEnv:
    """Tests for the string environment-variable helper."""

    def test_get_env_returns_value(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """A set variable is returned verbatim."""
        monkeypatch.setenv("TEST_VAR", "test_value")

        # Imported inside the test so module-level env checks run after setenv.
        from prolewiki_llm.train_headless import get_env

        value = get_env("TEST_VAR")
        assert value == "test_value"

    def test_get_env_returns_default(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """An unset variable falls back to the supplied default."""
        monkeypatch.delenv("NONEXISTENT_VAR", raising=False)

        from prolewiki_llm.train_headless import get_env

        value = get_env("NONEXISTENT_VAR", "default_value")
        assert value == "default_value"

    def test_get_env_required_exits(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """A missing required variable terminates the process with code 1."""
        monkeypatch.delenv("REQUIRED_VAR", raising=False)

        from prolewiki_llm.train_headless import get_env

        with pytest.raises(SystemExit) as exc_info:
            get_env("REQUIRED_VAR", required=True)

        assert exc_info.value.code == 1
52
+
53
+
54
class TestGetEnvInt:
    """Tests for the integer environment-variable helper."""

    def test_get_env_int_parses_integer(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """A numeric string is parsed into an int."""
        monkeypatch.setenv("INT_VAR", "42")

        from prolewiki_llm.train_headless import get_env_int

        parsed = get_env_int("INT_VAR", 0)
        assert parsed == 42

    def test_get_env_int_returns_default(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """An unset variable falls back to the supplied default."""
        monkeypatch.delenv("NONEXISTENT_INT", raising=False)

        from prolewiki_llm.train_headless import get_env_int

        parsed = get_env_int("NONEXISTENT_INT", 100)
        assert parsed == 100
72
+
73
+
74
class TestGetEnvFloat:
    """Tests for the float environment-variable helper."""

    def test_get_env_float_parses_float(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """A decimal string is parsed into a float."""
        monkeypatch.setenv("FLOAT_VAR", "3.14")

        from prolewiki_llm.train_headless import get_env_float

        parsed = get_env_float("FLOAT_VAR", 0.0)
        assert parsed == pytest.approx(3.14)

    def test_get_env_float_parses_scientific(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """Scientific notation such as 5e-6 is accepted."""
        monkeypatch.setenv("FLOAT_VAR", "5e-6")

        from prolewiki_llm.train_headless import get_env_float

        parsed = get_env_float("FLOAT_VAR", 0.0)
        assert parsed == pytest.approx(5e-6)

    def test_get_env_float_returns_default(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """An unset variable falls back to the supplied default."""
        monkeypatch.delenv("NONEXISTENT_FLOAT", raising=False)

        from prolewiki_llm.train_headless import get_env_float

        parsed = get_env_float("NONEXISTENT_FLOAT", 1.5)
        assert parsed == 1.5
104
+
105
+
106
+ # =============================================================================
107
+ # Checkpoint Discovery Tests
108
+ # =============================================================================
109
+
110
+
111
class TestFindLatestCheckpoint:
    """Tests for checkpoint-discovery logic."""

    def test_returns_none_for_nonexistent_dir(self, tmp_path: Path) -> None:
        """A directory that does not exist yields None."""
        from prolewiki_llm.train_headless import find_latest_checkpoint

        missing = tmp_path / "nonexistent"
        assert find_latest_checkpoint(missing) is None

    def test_returns_none_for_empty_dir(self, tmp_path: Path) -> None:
        """An existing but empty directory yields None."""
        from prolewiki_llm.train_headless import find_latest_checkpoint

        root = tmp_path / "checkpoints"
        root.mkdir()

        assert find_latest_checkpoint(root) is None

    def test_returns_none_when_no_checkpoint_dirs(self, tmp_path: Path) -> None:
        """Entries that do not match checkpoint-* are not candidates."""
        from prolewiki_llm.train_headless import find_latest_checkpoint

        root = tmp_path / "checkpoints"
        root.mkdir()
        # Only non-checkpoint entries present
        (root / "random_dir").mkdir()
        (root / "other_file.txt").write_text("test")

        assert find_latest_checkpoint(root) is None

    def test_finds_single_checkpoint(self, tmp_path: Path) -> None:
        """A lone checkpoint directory is returned."""
        from prolewiki_llm.train_headless import find_latest_checkpoint

        root = tmp_path / "checkpoints"
        root.mkdir()
        only = root / "checkpoint-100"
        only.mkdir()

        assert find_latest_checkpoint(root) == only

    def test_finds_latest_checkpoint(self, tmp_path: Path) -> None:
        """The checkpoint with the highest step number wins."""
        from prolewiki_llm.train_headless import find_latest_checkpoint

        root = tmp_path / "checkpoints"
        root.mkdir()

        # Created deliberately out of order
        for step in (50, 200, 100, 150):
            (root / f"checkpoint-{step}").mkdir()

        assert find_latest_checkpoint(root) == root / "checkpoint-200"

    def test_ignores_non_checkpoint_dirs(self, tmp_path: Path) -> None:
        """Non-matching sibling directories are skipped."""
        from prolewiki_llm.train_headless import find_latest_checkpoint

        root = tmp_path / "checkpoints"
        root.mkdir()

        # Mix of checkpoint and non-checkpoint directories
        for name in ("checkpoint-50", "logs", "checkpoint-100", "outputs"):
            (root / name).mkdir()

        assert find_latest_checkpoint(root) == root / "checkpoint-100"
185
+
186
+
187
+ # =============================================================================
188
+ # HuggingFace Hub Upload Tests
189
+ # =============================================================================
190
+
191
+
192
class TestUploadToHub:
    """Test model upload to HuggingFace Hub."""

    @staticmethod
    def _make_adapter_dir(tmp_path: Path) -> Path:
        """Create a fake LoRA output directory containing one adapter file."""
        model_path = tmp_path / "lora-output"
        model_path.mkdir()
        (model_path / "adapter_model.safetensors").write_bytes(b"mock model")
        return model_path

    def test_creates_repo(self, tmp_path: Path) -> None:
        """upload_to_hub creates the repository if it doesn't exist."""
        from prolewiki_llm.train_headless import upload_to_hub

        model_path = self._make_adapter_dir(tmp_path)
        mock_api = MagicMock()

        # HfApi is imported inside upload_to_hub, so we patch at the source
        with patch("huggingface_hub.HfApi", return_value=mock_api):
            upload_to_hub(model_path, "test-org/test-model", "test-token")

        mock_api.create_repo.assert_called_once_with(
            "test-org/test-model", exist_ok=True, private=True
        )

    def test_uploads_folder(self, tmp_path: Path) -> None:
        """upload_to_hub uploads the model folder."""
        from prolewiki_llm.train_headless import upload_to_hub

        model_path = self._make_adapter_dir(tmp_path)
        mock_api = MagicMock()

        with patch("huggingface_hub.HfApi", return_value=mock_api):
            upload_to_hub(model_path, "test-org/test-model", "test-token")

        mock_api.upload_folder.assert_called_once_with(
            folder_path=str(model_path),
            repo_id="test-org/test-model",
            commit_message="Headless GRPO training run",
        )

    def test_handles_repo_creation_failure(self, tmp_path: Path) -> None:
        """upload_to_hub continues if repo already exists."""
        from prolewiki_llm.train_headless import upload_to_hub

        model_path = self._make_adapter_dir(tmp_path)
        mock_api = MagicMock()
        mock_api.create_repo.side_effect = Exception("Repo already exists")

        with patch("huggingface_hub.HfApi", return_value=mock_api):
            # Should not raise
            upload_to_hub(model_path, "test-org/test-model", "test-token")

        # Upload is still attempted even though repo creation failed
        mock_api.upload_folder.assert_called_once()
tests/unit/test_wandb_logging.py ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests for Weights & Biases logging module.
3
+
4
+ Tests cover:
5
+ - WandbSampleLogger accumulation and table creation
6
+ - Reward metrics logging
7
+ - Logging reward function signature compatibility
8
+ - Graceful handling when wandb is not available
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from unittest.mock import MagicMock, patch
14
+
15
+ import pytest
16
+
17
+ # =============================================================================
18
+ # FIXTURES
19
+ # =============================================================================
20
+
21
+
22
@pytest.fixture
def mock_wandb() -> MagicMock:
    """Build a stand-in for the wandb module with the attributes tests use."""
    fake = MagicMock()
    fake.run = MagicMock()
    fake.run.name = "test-run"
    fake.run.url = "https://wandb.ai/test/run"
    fake.Table = MagicMock(return_value=MagicMock())
    fake.init = MagicMock(return_value=fake.run)
    fake.log = MagicMock()
    fake.finish = MagicMock()
    return fake
34
+
35
+
36
@pytest.fixture
def sample_prompts() -> list[list[dict[str, str]]]:
    """Two chat-style prompts, each with a system and a user turn."""

    def _system() -> dict[str, str]:
        # Fresh dict per conversation so tests can mutate independently.
        return {"role": "system", "content": "You are a Marxist assistant."}

    return [
        [_system(), {"role": "user", "content": "What is revisionism?"}],
        [_system(), {"role": "user", "content": "Explain surplus value."}],
    ]
49
+
50
+
51
@pytest.fixture
def sample_completions() -> list[list[dict[str, str]]]:
    """Two single-turn assistant completions (content after </think>)."""
    texts = [
        "</think>Revisionism distorts Marxist theory.",
        "</think>Surplus value is unpaid labor.",
    ]
    return [[{"role": "assistant", "content": text}] for text in texts]
58
+
59
+
60
@pytest.fixture
def sample_answers() -> list[str]:
    """Ground-truth reference answers matching sample_prompts order."""
    revisionism_truth = "Revisionism is the distortion of Marxist-Leninist theory."
    surplus_truth = "Surplus value is the value produced by workers beyond their wages."
    return [revisionism_truth, surplus_truth]
67
+
68
+
69
+ # =============================================================================
70
+ # REWARD SAMPLE TESTS
71
+ # =============================================================================
72
+
73
+
74
class TestRewardSample:
    """Tests for the RewardSample dataclass."""

    def test_total_reward_calculation(self) -> None:
        """total_reward is the sum of every individual reward component."""
        from prolewiki_llm.wandb_logging import RewardSample

        reward_parts = {
            "format_exact": 3.0,
            "nli_coherence": 2.0,
            "topic_relevance": 1.5,
        }
        sample = RewardSample(
            step=10,
            question="What is X?",
            response="X is Y.",
            ground_truth="X is Y.",
            rewards=reward_parts,
        )

        assert sample.total_reward == 6.5

    def test_empty_rewards(self) -> None:
        """total_reward is 0.0 when no reward components are present."""
        from prolewiki_llm.wandb_logging import RewardSample

        sample = RewardSample(
            step=10,
            question="Q",
            response="R",
            ground_truth="T",
            rewards={},
        )

        assert sample.total_reward == 0.0
108
+
109
+
110
+ # =============================================================================
111
+ # SAMPLE LOGGER TESTS
112
+ # =============================================================================
113
+
114
+
115
class TestWandbSampleLogger:
    """Unit tests for the WandbSampleLogger buffering behaviour."""

    def test_add_sample(self) -> None:
        """An added sample lands in the buffer with its reward sum computed."""
        from prolewiki_llm.wandb_logging import WandbSampleLogger

        logger = WandbSampleLogger(log_every_n_steps=10)
        logger.add_sample(
            step=5,
            question="What is revisionism?",
            response="Revisionism distorts theory.",
            ground_truth="Revisionism is distortion of Marxism.",
            rewards={"format": 3.0, "nli": 2.0},
        )

        buffered = logger._samples
        assert len(buffered) == 1
        assert buffered[0].step == 5
        assert buffered[0].total_reward == 5.0

    def test_sample_buffer_limit(self) -> None:
        """The internal buffer must stay bounded however many samples arrive."""
        from prolewiki_llm.wandb_logging import WandbSampleLogger

        logger = WandbSampleLogger(log_every_n_steps=10, max_samples_per_log=2)

        for idx in range(20):
            logger.add_sample(
                step=idx,
                question=f"Q{idx}",
                response=f"R{idx}",
                ground_truth=f"T{idx}",
                rewards={"x": float(idx)},
            )

        # Cap is max_samples_per_log * 3 == 6.
        assert len(logger._samples) <= 6

    def test_should_log(self) -> None:
        """should_log fires only on positive multiples of the interval."""
        from prolewiki_llm.wandb_logging import WandbSampleLogger

        logger = WandbSampleLogger(log_every_n_steps=10)

        # Step 0 never logs; 10 and 20 do; 5 and 15 do not.
        for step, expected in ((0, False), (5, False), (10, True), (15, False), (20, True)):
            assert bool(logger.should_log(step)) == expected

    def test_clear(self) -> None:
        """clear() empties the sample buffer completely."""
        from prolewiki_llm.wandb_logging import WandbSampleLogger

        logger = WandbSampleLogger()
        for step, value in ((1, 1.0), (2, 2.0)):
            logger.add_sample(step, "Q", "R", "T", {"x": value})

        assert len(logger._samples) == 2

        logger.clear()
        assert len(logger._samples) == 0

    def test_truncation(self) -> None:
        """Over-long strings are clipped to their per-field limits."""
        from prolewiki_llm.wandb_logging import WandbSampleLogger

        logger = WandbSampleLogger()

        long_text = "x" * 1000  # well past every truncation limit
        logger.add_sample(
            step=1,
            question=long_text,
            response=long_text,
            ground_truth=long_text,
            rewards={},
        )

        stored = logger._samples[0]
        assert len(stored.question) == 500
        assert len(stored.response) == 500
        assert len(stored.ground_truth) == 300
198
+
199
+
200
+ # =============================================================================
201
+ # LOG TABLE TESTS
202
+ # =============================================================================
203
+
204
+
205
class TestLogTable:
    """Tests covering WandbSampleLogger.log_table."""

    def test_log_table_creates_table(self, mock_wandb: MagicMock) -> None:
        """log_table should build a wandb Table and log it under 'samples'."""
        from prolewiki_llm.wandb_logging import WandbSampleLogger

        with patch("prolewiki_llm.wandb_logging._get_wandb", return_value=mock_wandb):
            logger = WandbSampleLogger(max_samples_per_log=2)

            # Buffer two samples, then flush them as a table.
            logger.add_sample(1, "Q1", "R1", "T1", {"format": 3.0})
            logger.add_sample(2, "Q2", "R2", "T2", {"format": 2.0})
            logger.log_table(step=10)

            # Exactly one wandb.Table must have been constructed...
            mock_wandb.Table.assert_called_once()

            # ...and logged once, keyed as "samples" at the requested step.
            mock_wandb.log.assert_called_once()
            positional, keyword = mock_wandb.log.call_args
            assert "samples" in positional[0]
            assert keyword["step"] == 10
230
+
231
+
232
+ # =============================================================================
233
+ # REWARD METRICS LOGGING TESTS
234
+ # =============================================================================
235
+
236
+
237
class TestLogRewardMetrics:
    """Tests for the log_reward_metrics helper."""

    def test_log_reward_metrics(self, mock_wandb: MagicMock) -> None:
        """Per-function mean/min/max statistics are pushed to wandb."""
        from prolewiki_llm.wandb_logging import log_reward_metrics

        with patch("prolewiki_llm.wandb_logging._get_wandb", return_value=mock_wandb):
            log_reward_metrics(
                step=50,
                reward_scores={
                    "format_exact": [3.0, 3.0, 0.0],
                    "nli_coherence": [2.0, -1.0, 3.0],
                },
            )

            mock_wandb.log.assert_called_once()
            metrics = mock_wandb.log.call_args[0][0]

            # Means: (3+3+0)/3 == 2.0 and (2-1+3)/3 == 4/3.
            assert metrics["rewards/format_exact"] == 2.0
            assert metrics["rewards/nli_coherence"] == pytest.approx(4 / 3)

            # Extremes for the first reward function.
            assert metrics["rewards/format_exact_min"] == 0.0
            assert metrics["rewards/format_exact_max"] == 3.0

    def test_log_reward_metrics_computes_total(self, mock_wandb: MagicMock) -> None:
        """The mean per-sample total reward is reported under rewards/total."""
        from prolewiki_llm.wandb_logging import log_reward_metrics

        with patch("prolewiki_llm.wandb_logging._get_wandb", return_value=mock_wandb):
            scores = {
                "format": [3.0, 2.0],
                "nli": [1.0, 2.0],
            }
            log_reward_metrics(step=10, reward_scores=scores)

            metrics = mock_wandb.log.call_args[0][0]

            # Per-sample totals are 3+1=4 and 2+2=4, so the mean is 4.0.
            assert metrics["rewards/total"] == 4.0
282
+
283
+
284
+ # =============================================================================
285
+ # LOGGING REWARD FUNCTION TESTS
286
+ # =============================================================================
287
+
288
+
289
class TestCreateLoggingReward:
    """Tests for create_logging_reward and its GRPOTrainer-compatible wrapper."""

    def test_returns_zeros(
        self,
        sample_prompts: list[list[dict[str, str]]],
        sample_completions: list[list[dict[str, str]]],
        sample_answers: list[str],
    ) -> None:
        """The logging reward must be a training no-op: every score is zero."""
        from prolewiki_llm.wandb_logging import (
            WandbSampleLogger,
            create_logging_reward,
        )

        # With wandb unavailable the logger falls back to printing.
        with patch("prolewiki_llm.wandb_logging._get_wandb", return_value=None):
            sample_logger = WandbSampleLogger()
            reward_fn = create_logging_reward(sample_logger, compute_all_rewards=False)

            scores = reward_fn(
                prompts=sample_prompts,
                completions=sample_completions,
                answer=sample_answers,
            )

            # Zero reward for every sample means no gradient influence.
            assert scores == [0.0, 0.0]

    def test_function_signature_compatibility(
        self,
        sample_prompts: list[list[dict[str, str]]],
        sample_completions: list[list[dict[str, str]]],
        sample_answers: list[str],
    ) -> None:
        """The reward accepts the GRPOTrainer call shape, extra kwargs included."""
        from prolewiki_llm.wandb_logging import create_logging_reward

        with patch("prolewiki_llm.wandb_logging._get_wandb", return_value=None):
            reward_fn = create_logging_reward(compute_all_rewards=False)

            # Must tolerate prompts/completions/answer plus arbitrary kwargs.
            result = reward_fn(
                prompts=sample_prompts,
                completions=sample_completions,
                answer=sample_answers,
                extra_kwarg="ignored",
            )

            assert isinstance(result, list)
            assert len(result) == len(sample_completions)

    def test_logs_samples_at_interval(
        self,
        mock_wandb: MagicMock,
        sample_prompts: list[list[dict[str, str]]],
        sample_completions: list[list[dict[str, str]]],
        sample_answers: list[str],
    ) -> None:
        """A sample table is emitted each time the step counter hits the interval."""
        import prolewiki_llm.wandb_logging as wl
        from prolewiki_llm.wandb_logging import (
            WandbSampleLogger,
            create_logging_reward,
        )

        # The module tracks a global step counter; reset it for test isolation.
        wl._LOGGING_STEP = 0

        with patch("prolewiki_llm.wandb_logging._get_wandb", return_value=mock_wandb):
            sample_logger = WandbSampleLogger(log_every_n_steps=5, max_samples_per_log=2)
            reward_fn = create_logging_reward(sample_logger, compute_all_rewards=False)

            for _call in range(10):
                reward_fn(
                    prompts=sample_prompts,
                    completions=sample_completions,
                    answer=sample_answers,
                )

            # Steps 5 and 10 should each have produced one "samples" table log.
            table_logs = [
                call for call in mock_wandb.log.call_args_list if "samples" in call[0][0]
            ]
            assert len(table_logs) == 2
373
+
374
+
375
+ # =============================================================================
376
+ # WANDB AVAILABILITY TESTS
377
+ # =============================================================================
378
+
379
+
380
class TestWandbAvailability:
    """Tests around optional-wandb handling and graceful degradation."""

    def test_is_wandb_available_true(self, mock_wandb: MagicMock) -> None:
        """is_wandb_available must not crash when a wandb module is importable."""
        from prolewiki_llm import wandb_logging as wl

        # Drop any cached availability state before the check.
        wl._WANDB_AVAILABLE = None
        wl._wandb_module = None

        with patch.dict("sys.modules", {"wandb": mock_wandb}):
            wl._WANDB_AVAILABLE = None  # force the import check to re-run
            _result = wl.is_wandb_available()
            # Import mechanics may still yield False here; this test only
            # requires that the call completes without raising.

    def test_graceful_degradation_without_wandb(
        self,
        sample_prompts: list[list[dict[str, str]]],
        sample_completions: list[list[dict[str, str]]],
        sample_answers: list[str],
    ) -> None:
        """Every logging entry point must degrade gracefully when wandb is absent."""
        from prolewiki_llm.wandb_logging import (
            WandbSampleLogger,
            create_logging_reward,
            log_reward_metrics,
        )

        with patch("prolewiki_llm.wandb_logging._get_wandb", return_value=None):
            # None of the following calls may raise.
            sample_logger = WandbSampleLogger()
            reward_fn = create_logging_reward(sample_logger, compute_all_rewards=False)

            # The reward still produces valid zero scores without wandb.
            result = reward_fn(
                prompts=sample_prompts,
                completions=sample_completions,
                answer=sample_answers,
            )
            assert result == [0.0, 0.0]

            # Metrics logging must not crash.
            log_reward_metrics(step=1, reward_scores={"x": [1.0]})

            # Table logging must not crash either.
            sample_logger.log_table(step=10)
429
+
430
+
431
+ # =============================================================================
432
+ # INIT AND FINISH TESTS
433
+ # =============================================================================
434
+
435
+
436
class TestInitAndFinish:
    """Tests for init_wandb_logging and finish_wandb_logging."""

    def test_init_wandb_logging(self, mock_wandb: MagicMock) -> None:
        """init_wandb_logging forwards project and config to wandb.init."""
        from prolewiki_llm.wandb_logging import init_wandb_logging

        with patch("prolewiki_llm.wandb_logging._get_wandb", return_value=mock_wandb):
            _run = init_wandb_logging(
                project="test-project",
                config={"lr": 1e-5, "batch_size": 4},
                name="test-run",
                tags=["test"],
            )

            mock_wandb.init.assert_called_once()
            init_kwargs = mock_wandb.init.call_args[1]
            assert init_kwargs["project"] == "test-project"
            assert init_kwargs["config"] == {"lr": 1e-5, "batch_size": 4}

    def test_finish_wandb_logging(self, mock_wandb: MagicMock) -> None:
        """finish_wandb_logging writes the summary and closes the run."""
        from prolewiki_llm.wandb_logging import finish_wandb_logging

        with patch("prolewiki_llm.wandb_logging._get_wandb", return_value=mock_wandb):
            finish_wandb_logging(summary={"final_loss": 0.5})

            # Summary entries are written item by item onto run.summary...
            mock_wandb.run.summary.__setitem__.assert_called_with("final_loss", 0.5)

            # ...and the run is finished exactly once.
            mock_wandb.finish.assert_called_once()
training_data/entity_whitelist.json ADDED
The diff for this file is too large to render. See raw diff
 
training_data/entity_whitelist_clean.json ADDED
The diff for this file is too large to render. See raw diff