# okto_version: "1.2"

# Example project exercising every field of the MONITOR block.
PROJECT "MonitorFullExample"
DESCRIPTION "Demonstrates complete MONITOR block with all metrics"

# Hardware / runtime requirements for this run.
ENV {
    accelerator: "gpu"
    min_memory: "16GB"
    precision: "fp16"
}

# NOTE(review): `validation` points at the same file as `train`, so
# val_loss / val_accuracy are measured on training data. Presumably
# acceptable for a demo; confirm before reusing this config.
DATASET {
    train: "examples/datasets/demo_train.jsonl"
    validation: "examples/datasets/demo_train.jsonl"
    format: "jsonl"
    type: "chat"
}

MODEL {
    name: "monitor-full-model"
    base: "oktoseek/base-mini"
    device: "cuda"
}

TRAIN {
    epochs: 10
    batch_size: 32
    learning_rate: 0.0001
    device: "cuda"
}

MONITOR {
    # Training and evaluation metrics to track.
    metrics: [
        "loss",
        "val_loss",
        "accuracy",
        "val_accuracy",
        "precision",
        "recall",
        "f1_score",
        "perplexity",
        "confidence",
        "hallucination_score"
    ]

    # Threshold conditions, one per line.
    notify_if {
        loss > 2.0
        gpu_usage > 90%
        gpu_temperature > 85
        val_loss > 2.5
        hallucination_score > 0.5
    }

    # Host and accelerator readings to log.
    log_system: [
        "gpu_usage",
        "gpu_memory_used",
        "gpu_memory_free",
        "gpu_temperature",
        "cpu_usage",
        "ram_usage"
    ]

    # Throughput and timing readings to log.
    log_speed: [
        "tokens_per_second",
        "samples_per_second",
        "throughput",
        "latency",
        "step_time"
    ]

    refresh_interval: 2s
    export_to: "runs/logs/system.json"
    dashboard: true
    log_to: "logs/training.log"
}

# NOTE(review): the 85 threshold below repeats the gpu_temperature
# condition in MONITOR.notify_if; keep the two values in sync.
CONTROL {
    on_epoch_end {
        IF gpu_temperature > 85 {
            SET batch_size = 16
            LOG "GPU temperature high, reducing batch size"
        }
    }
}

EXPORT {
    format: ["okm"]
    path: "export/"
}