{
  "model_type": "adaptive_repetition_controller",
  "version": "1.0.0",
  "architecture": {
    "d_model": 4096,
    "n_layers": 32,
    "d_fiber": 16,
    "d_control": 64,
    "rep_window": 32,
    "total_params": 50000
  },
  "training": {
    "dataset": "wikitext-2",
    "loss": "BCEWithLogitsLoss",
    "pos_weight": "dynamic",
    "lr_predictor": 1e-4,
    "lr_lora": 2e-5,
    "batch_size": 4,
    "gradient_accumulation": 8,
    "optimal_steps": 5000
  },
  "performance": {
    "f1_score": 0.99,
    "risk_at_repeats": 0.998,
    "risk_at_non_repeats": 0.008,
    "separation": "125x",
    "repetition_reduction": "48.4%",
    "distinct2_improvement": "16.7%"
  },
  "inference": {
    "penalty_scale_default": 3.0,
    "temperature_default": 0.8,
    "threshold_default": 0.1,
    "rep_window": 32
  },
  "base_model_compatibility": [
    "llama-3.1-8b",
    "llama-3-8b",
    "mistral-7b"
  ],
  "notes": "This is a decode-time intervention system, not an attention modification. The geometric CF-HoT theory remains unvalidated; this is the working practical implementation."
}