{
  "model_type": "adaptive_repetition_controller",
  "version": "1.0.0",
  "architecture": {
    "d_model": 4096,
    "n_layers": 32,
    "d_fiber": 16,
    "d_control": 64,
    "rep_window": 32,
    "total_params": 50000
  },
  "training": {
    "dataset": "wikitext-2",
    "loss": "BCEWithLogitsLoss",
    "pos_weight": "dynamic",
    "lr_predictor": 1e-4,
    "lr_lora": 2e-5,
    "batch_size": 4,
    "gradient_accumulation": 8,
    "optimal_steps": 5000
  },
  "performance": {
    "f1_score": 0.99,
    "risk_at_repeats": 0.998,
    "risk_at_non_repeats": 0.008,
    "separation": "125x",
    "repetition_reduction": "48.4%",
    "distinct2_improvement": "16.7%"
  },
  "inference": {
    "penalty_scale_default": 3.0,
    "temperature_default": 0.8,
    "threshold_default": 0.1,
    "rep_window": 32
  },
  "base_model_compatibility": [
    "llama-3.1-8b",
    "llama-3-8b",
    "mistral-7b"
  ],
  "notes": "This is a decode-time intervention system, not an attention modification. The geometric CF-HoT theory remains unvalidated; this is the working practical implementation."
}