# File size: 1,496 Bytes
# 5fc8c9d
# okto_version: "1.2"
# Project header: the run's name and a one-line summary of what this example demonstrates.
PROJECT "MonitorFullExample"
DESCRIPTION "Demonstrates complete MONITOR block with all metrics"
# ENV: execution-environment requirements for this run.
ENV {
# Hardware accelerator; pairs with the "cuda" device selections below.
accelerator: "gpu"
# Minimum memory required — presumably GPU memory given accelerator "gpu"; TODO confirm.
min_memory: "16GB"
# Numeric precision for training (half precision).
precision: "fp16"
}
# DATASET: input data locations and format (JSON Lines, chat-style records).
DATASET {
train: "examples/datasets/demo_train.jsonl"
# NOTE(review): validation reuses the training file, so val_* metrics will not
# measure generalization — acceptable for a demo, but use a held-out set in practice.
validation: "examples/datasets/demo_train.jsonl"
format: "jsonl"
type: "chat"
}
# MODEL: identity of the model being trained.
MODEL {
# Name given to the fine-tuned output model.
name: "monitor-full-model"
# Base checkpoint to start from.
base: "oktoseek/base-mini"
device: "cuda"
}
# TRAIN: core training hyperparameters.
TRAIN {
epochs: 10
batch_size: 32
learning_rate: 0.0001
# NOTE(review): device is also declared in MODEL — confirm which setting takes
# precedence, or remove the duplicate.
device: "cuda"
}
# MONITOR: metrics to track, alert thresholds, system/speed logging, and output sinks.
MONITOR {
# Training/evaluation metrics to record.
metrics: [
"loss",
"val_loss",
"accuracy",
"val_accuracy",
"precision",
"recall",
"f1_score",
"perplexity",
"confidence",
"hallucination_score"
]
# Alert conditions: a notification fires when any expression becomes true.
notify_if {
loss > 2.0
# NOTE(review): gpu_usage carries a '%' suffix while gpu_temperature is a bare
# number — confirm the DSL accepts both unit styles here.
gpu_usage > 90%
gpu_temperature > 85
val_loss > 2.5
hallucination_score > 0.5
}
# Host/GPU utilization figures logged alongside training metrics.
log_system: [
"gpu_usage",
"gpu_memory_used",
"gpu_memory_free",
"gpu_temperature",
"cpu_usage",
"ram_usage"
]
# Throughput and latency figures logged per step.
log_speed: [
"tokens_per_second",
"samples_per_second",
"throughput",
"latency",
"step_time"
]
# How often monitoring output is refreshed.
refresh_interval: 2s
# NOTE(review): export_to is under "runs/logs/" but log_to is under "logs/" —
# confirm the differing path prefixes are intentional.
export_to: "runs/logs/system.json"
dashboard: true
log_to: "logs/training.log"
}
# CONTROL: runtime automation hooks evaluated during training.
CONTROL {
# At the end of each epoch, halve the batch size (32 -> 16) if the GPU runs hot.
# The 85-degree threshold matches the notify_if gpu_temperature alert above.
on_epoch_end {
IF gpu_temperature > 85 {
SET batch_size = 16
LOG "GPU temperature high, reducing batch size"
}
}
}
# EXPORT: output artifact format(s) and destination directory.
EXPORT {
format: ["okm"]
path: "export/"
}