File size: 15,365 Bytes
0f07ba7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
package model

import (
	"sort"
	"sync"
	"time"

	"github.com/mudler/LocalAI/pkg/xsysinfo"
	process "github.com/mudler/go-processmanager"
	"github.com/mudler/xlog"
)

// WatchDog tracks all the requests from GRPC clients.
// All GRPC Clients created by ModelLoader should have an associated injected
// watchdog that will keep track of the state of each backend (busy or not)
// and for how much time it has been busy.
// If a backend is busy for too long, the watchdog will kill the process and
// force a reload of the model.
// The watchdog also supports LRU (Least Recently Used) eviction when a maximum
// number of active backends is configured.
// The watchdog also supports memory threshold monitoring - when memory usage
// (GPU VRAM if available, otherwise system RAM) exceeds the threshold,
// it will evict backends using the LRU strategy.
// The watchdog runs as a separate go routine,
// and the GRPC client talks to it via a channel to send status updates
type WatchDog struct {
	sync.Mutex                                // guards every map and setting below
	busyTime             map[string]time.Time // address -> when the backend was last marked busy (Mark)
	idleTime             map[string]time.Time // address -> when the backend became idle (UnMark)
	lastUsed             map[string]time.Time // LRU tracking: when each model was last used
	timeout, idletimeout time.Duration        // busy timeout and idle timeout respectively
	addressMap           map[string]*process.Process // address -> process handle (registered via Add)
	addressModelMap      map[string]string           // address -> model name (registered via AddAddressModelMap)
	pm                   ProcessManager              // used to shut models down by name
	stop                 chan bool                   // buffered (cap 1) stop signal consumed by Run

	busyCheck, idleCheck bool
	lruLimit             int // Maximum number of active backends (0 = unlimited)

	// Memory reclaimer settings (works with GPU if available, otherwise RAM)
	memoryReclaimerEnabled   bool    // Enable memory threshold monitoring
	memoryReclaimerThreshold float64 // Threshold 0.0-1.0 (e.g., 0.95 = 95%)
	watchdogInterval         time.Duration

	// Eviction settings
	forceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
}

// ProcessManager is the minimal interface the watchdog needs to stop a
// backend: given a model name, terminate the process serving it.
type ProcessManager interface {
	// ShutdownModel stops the backend serving modelName, returning any
	// error encountered while doing so.
	ShutdownModel(modelName string) error
}

// NewWatchDog builds a WatchDog configured through functional options.
// Example usage:
//
//	wd := NewWatchDog(
//	    WithProcessManager(pm),
//	    WithBusyTimeout(5*time.Minute),
//	    WithIdleTimeout(15*time.Minute),
//	    WithBusyCheck(true),
//	    WithIdleCheck(true),
//	    WithLRULimit(3),
//	    WithMemoryReclaimer(true, 0.95),
//	)
func NewWatchDog(opts ...WatchDogOption) *WatchDog {
	options := NewWatchDogOptions(opts...)

	wd := &WatchDog{
		// Bookkeeping state.
		busyTime:        make(map[string]time.Time),
		idleTime:        make(map[string]time.Time),
		lastUsed:        make(map[string]time.Time),
		addressMap:      make(map[string]*process.Process),
		addressModelMap: make(map[string]string),
		stop:            make(chan bool, 1),

		// Configuration copied from the resolved options.
		timeout:                  options.busyTimeout,
		idletimeout:              options.idleTimeout,
		pm:                       options.processManager,
		busyCheck:                options.busyCheck,
		idleCheck:                options.idleCheck,
		lruLimit:                 options.lruLimit,
		memoryReclaimerEnabled:   options.memoryReclaimerEnabled,
		memoryReclaimerThreshold: options.memoryReclaimerThreshold,
		watchdogInterval:         options.watchdogInterval,
		forceEvictionWhenBusy:    options.forceEvictionWhenBusy,
	}
	return wd
}

// SetLRULimit updates the LRU limit dynamically.
func (wd *WatchDog) SetLRULimit(limit int) {
	wd.Lock()
	wd.lruLimit = limit
	wd.Unlock()
}

// GetLRULimit returns the current LRU limit.
func (wd *WatchDog) GetLRULimit() int {
	wd.Lock()
	limit := wd.lruLimit
	wd.Unlock()
	return limit
}

// SetMemoryReclaimer updates the memory reclaimer settings dynamically.
func (wd *WatchDog) SetMemoryReclaimer(enabled bool, threshold float64) {
	wd.Lock()
	wd.memoryReclaimerEnabled = enabled
	wd.memoryReclaimerThreshold = threshold
	wd.Unlock()
}

// GetMemoryReclaimerSettings returns the current memory reclaimer settings.
func (wd *WatchDog) GetMemoryReclaimerSettings() (enabled bool, threshold float64) {
	wd.Lock()
	enabled = wd.memoryReclaimerEnabled
	threshold = wd.memoryReclaimerThreshold
	wd.Unlock()
	return enabled, threshold
}

// SetForceEvictionWhenBusy updates the force eviction when busy setting dynamically.
func (wd *WatchDog) SetForceEvictionWhenBusy(force bool) {
	wd.Lock()
	wd.forceEvictionWhenBusy = force
	wd.Unlock()
}

// Shutdown signals the watchdog loop (Run) to stop.
//
// The send is non-blocking: wd.stop has capacity 1, so the first call
// queues the signal, while any subsequent call before Run drains it is
// a no-op. Previously a second call would block forever on the full
// channel while holding the watchdog mutex, deadlocking every other
// WatchDog method.
func (wd *WatchDog) Shutdown() {
	xlog.Info("[WatchDog] Shutting down watchdog")
	select {
	case wd.stop <- true:
	default:
		// A stop signal is already queued; nothing more to do.
	}
}

// AddAddressModelMap associates a backend address with the name of the
// model it serves, so timeouts and evictions can be resolved to a model.
func (wd *WatchDog) AddAddressModelMap(address string, model string) {
	wd.Lock()
	wd.addressModelMap[address] = model
	wd.Unlock()
}
// Add registers the process handle serving the given backend address.
func (wd *WatchDog) Add(address string, p *process.Process) {
	wd.Lock()
	wd.addressMap[address] = p
	wd.Unlock()
}

// Mark flags the backend at address as busy, refreshes its LRU
// timestamp, and clears any pending idle timer for it.
func (wd *WatchDog) Mark(address string) {
	wd.Lock()
	now := time.Now()
	wd.busyTime[address] = now
	wd.lastUsed[address] = now // refresh LRU position
	// A busy backend is no longer idle.
	delete(wd.idleTime, address)
	wd.Unlock()
}

// UnMark flags the backend at address as no longer busy: it is removed
// from the busy set, its idle timer starts, and its LRU timestamp is
// refreshed so a just-finished model is not the first eviction victim.
//
// Note: the parameter was renamed from the non-idiomatic exported-style
// "ModelAddress" to "address", matching the sibling methods; Go callers
// are unaffected by parameter names.
func (wd *WatchDog) UnMark(address string) {
	wd.Lock()
	defer wd.Unlock()
	now := time.Now()
	delete(wd.busyTime, address)
	wd.idleTime[address] = now
	wd.lastUsed[address] = now // Update LRU tracking
}

// UpdateLastUsed refreshes the LRU timestamp for a model address.
// Call it whenever the model is accessed (e.g. when checking if it is
// already loaded) so LRU eviction order stays accurate.
func (wd *WatchDog) UpdateLastUsed(address string) {
	wd.Lock()
	wd.lastUsed[address] = time.Now()
	wd.Unlock()
}

// GetLoadedModelCount reports how many loaded models the watchdog is
// currently tracking.
func (wd *WatchDog) GetLoadedModelCount() int {
	wd.Lock()
	count := len(wd.addressModelMap)
	wd.Unlock()
	return count
}

// modelUsageInfo holds information about a model's usage for LRU sorting.
type modelUsageInfo struct {
	address  string    // backend address the model is served from
	model    string    // model name (used to shut it down via ProcessManager)
	lastUsed time.Time // last access time; zero value sorts before any real timestamp
}

// EnforceLRULimitResult contains the result of LRU enforcement.
type EnforceLRULimitResult struct {
	EvictedCount int  // Number of models successfully evicted
	NeedMore     bool // True if more evictions are needed but couldn't be done (e.g., all models are busy)
}

// EnforceLRULimit ensures we're under the LRU limit by evicting least recently
// used models. This should be called before loading a new model.
// pendingLoads is the number of models currently being loaded (to account for
// concurrent loads).
// Returns the result containing evicted count and whether more evictions are
// needed.
func (wd *WatchDog) EnforceLRULimit(pendingLoads int) EnforceLRULimitResult {
	wd.Lock()

	// lruLimit must be read under the lock: SetLRULimit can change it
	// concurrently (previously this check ran before Lock — a data race).
	if wd.lruLimit <= 0 {
		wd.Unlock()
		return EnforceLRULimitResult{EvictedCount: 0, NeedMore: false} // LRU disabled
	}

	currentCount := len(wd.addressModelMap)
	// We need to evict enough to make room for the new model AND any pending loads
	// Total after loading = currentCount + pendingLoads + 1 (the new one we're about to load)
	// We need: currentCount + pendingLoads + 1 <= lruLimit
	// So evict: currentCount + pendingLoads + 1 - lruLimit = currentCount - lruLimit + pendingLoads + 1
	modelsToEvict := currentCount - wd.lruLimit + pendingLoads + 1
	forceEvictionWhenBusy := wd.forceEvictionWhenBusy
	if modelsToEvict <= 0 {
		wd.Unlock()
		return EnforceLRULimitResult{EvictedCount: 0, NeedMore: false}
	}

	xlog.Debug("[WatchDog] LRU enforcement triggered", "current", currentCount, "pendingLoads", pendingLoads, "limit", wd.lruLimit, "toEvict", modelsToEvict)

	// Build a list of models sorted by last used time (oldest first).
	// A missing lastUsed entry yields the zero time.Time, which naturally
	// sorts before any real timestamp — no special-casing needed.
	models := make([]modelUsageInfo, 0, currentCount)
	for address, model := range wd.addressModelMap {
		models = append(models, modelUsageInfo{
			address:  address,
			model:    model,
			lastUsed: wd.lastUsed[address],
		})
	}

	// Sort by lastUsed time (oldest first)
	sort.Slice(models, func(i, j int) bool {
		return models[i].lastUsed.Before(models[j].lastUsed)
	})

	// Collect models to evict (the oldest ones)
	var modelsToShutdown []string
	evictedCount := 0
	skippedBusyCount := 0
	for i := 0; evictedCount < modelsToEvict && i < len(models); i++ {
		m := models[i]
		// Check if model is busy
		_, isBusy := wd.busyTime[m.address]
		if isBusy && !forceEvictionWhenBusy {
			// Skip eviction for busy models when forceEvictionWhenBusy is false
			xlog.Warn("[WatchDog] Skipping LRU eviction for busy model", "model", m.model, "reason", "model has active API calls")
			skippedBusyCount++
			continue
		}
		xlog.Info("[WatchDog] LRU evicting model", "model", m.model, "lastUsed", m.lastUsed, "busy", isBusy)
		modelsToShutdown = append(modelsToShutdown, m.model)
		// Clean up the maps while we have the lock
		wd.untrack(m.address)
		evictedCount++
	}
	needMore := evictedCount < modelsToEvict && skippedBusyCount > 0
	wd.Unlock()

	// Now shutdown models without holding the watchdog lock to prevent deadlock
	for _, model := range modelsToShutdown {
		if err := wd.pm.ShutdownModel(model); err != nil {
			xlog.Error("[WatchDog] error shutting down model during LRU eviction", "error", err, "model", model)
		}
		xlog.Debug("[WatchDog] LRU eviction complete", "model", model)
	}

	if needMore {
		xlog.Warn("[WatchDog] LRU eviction incomplete", "evicted", evictedCount, "needed", modelsToEvict, "skippedBusy", skippedBusyCount, "reason", "some models are busy with active API calls")
	}

	return EnforceLRULimitResult{
		EvictedCount: len(modelsToShutdown),
		NeedMore:     needMore,
	}
}

// Run is the watchdog's main loop. Every watchdogInterval it snapshots
// the enabled checks and runs them; it exits when Shutdown signals the
// stop channel or when no check is enabled anymore.
func (wd *WatchDog) Run() {
	xlog.Info("[WatchDog] starting watchdog")

	// A single reusable ticker avoids allocating a fresh timer on every
	// iteration, which time.After would do. watchdogInterval is written
	// only at construction time, so reading it here is race-free.
	ticker := time.NewTicker(wd.watchdogInterval)
	defer ticker.Stop()

	for {
		select {
		case <-wd.stop:
			xlog.Info("[WatchDog] Stopping watchdog")
			return
		case <-ticker.C:
			// Snapshot the settings under the lock; they can be changed
			// concurrently through the Set* methods.
			wd.Lock()
			busyCheck := wd.busyCheck
			idleCheck := wd.idleCheck
			memoryCheck := wd.memoryReclaimerEnabled
			wd.Unlock()

			if !busyCheck && !idleCheck && !memoryCheck {
				xlog.Info("[WatchDog] No checks enabled, stopping watchdog")
				return
			}
			if busyCheck {
				wd.checkBusy()
			}
			if idleCheck {
				wd.checkIdle()
			}
			if memoryCheck {
				wd.checkMemory()
			}
		}
	}
}

// checkIdle kills backends that have been idle for longer than the
// configured idle timeout. The actual shutdown calls run after the
// watchdog lock is released to prevent deadlock.
func (wd *WatchDog) checkIdle() {
	wd.Lock()
	xlog.Debug("[WatchDog] Watchdog checks for idle connections")

	// Phase 1 (locked): find expired backends and untrack them.
	var expired []string
	for address, since := range wd.idleTime {
		xlog.Debug("[WatchDog] idle connection", "address", address)
		if time.Since(since) <= wd.idletimeout {
			continue
		}
		xlog.Warn("[WatchDog] Address is idle for too long, killing it", "address", address)
		if model, ok := wd.addressModelMap[address]; ok {
			expired = append(expired, model)
		} else {
			xlog.Warn("[WatchDog] Address unresolvable", "address", address)
		}
		wd.untrack(address)
	}
	wd.Unlock()

	// Phase 2 (unlocked): shut the expired models down.
	for _, model := range expired {
		if err := wd.pm.ShutdownModel(model); err != nil {
			xlog.Error("[watchdog] error shutting down model", "error", err, "model", model)
		}
		xlog.Debug("[WatchDog] model shut down", "model", model)
	}
}

// checkBusy kills backends that have been busy for longer than the
// configured busy timeout. The actual shutdown calls run after the
// watchdog lock is released to prevent deadlock.
func (wd *WatchDog) checkBusy() {
	wd.Lock()
	xlog.Debug("[WatchDog] Watchdog checks for busy connections")

	// Phase 1 (locked): find stuck backends and untrack them.
	var stuck []string
	for address, since := range wd.busyTime {
		xlog.Debug("[WatchDog] active connection", "address", address)
		if time.Since(since) <= wd.timeout {
			continue
		}
		if model, ok := wd.addressModelMap[address]; ok {
			xlog.Warn("[WatchDog] Model is busy for too long, killing it", "model", model)
			stuck = append(stuck, model)
		} else {
			xlog.Warn("[WatchDog] Address unresolvable", "address", address)
		}
		wd.untrack(address)
	}
	wd.Unlock()

	// Phase 2 (unlocked): shut the stuck models down.
	for _, model := range stuck {
		if err := wd.pm.ShutdownModel(model); err != nil {
			xlog.Error("[watchdog] error shutting down model", "error", err, "model", model)
		}
		xlog.Debug("[WatchDog] model shut down", "model", model)
	}
}

// checkMemory monitors memory usage (GPU VRAM if available, otherwise RAM)
// and evicts the least recently used backend when usage exceeds the
// configured threshold.
func (wd *WatchDog) checkMemory() {
	// Snapshot settings under the lock; they may change concurrently.
	wd.Lock()
	enabled := wd.memoryReclaimerEnabled
	threshold := wd.memoryReclaimerThreshold
	loaded := len(wd.addressModelMap)
	wd.Unlock()

	if !enabled || threshold <= 0 || loaded == 0 {
		return
	}

	// Get current memory usage (GPU if available, otherwise RAM).
	aggregate := xsysinfo.GetResourceAggregateInfo()
	if aggregate.TotalMemory == 0 {
		xlog.Debug("[WatchDog] No memory information available for memory reclaimer")
		return
	}

	// The threshold is configured as a 0.0-1.0 fraction while usage is
	// reported in percent, so scale accordingly.
	thresholdPercent := threshold * 100

	memoryType := "GPU"
	if aggregate.GPUCount == 0 {
		memoryType = "RAM"
	}

	xlog.Debug("[WatchDog] Memory check", "type", memoryType, "usage_percent", aggregate.UsagePercent, "threshold_percent", thresholdPercent, "loaded_models", loaded)

	if aggregate.UsagePercent <= thresholdPercent {
		return
	}

	xlog.Warn("[WatchDog] Memory usage exceeds threshold, evicting LRU backend", "type", memoryType, "usage_percent", aggregate.UsagePercent, "threshold_percent", thresholdPercent)

	// Evict the least recently used model.
	wd.evictLRUModel()
}

// evictLRUModel evicts the least recently used model that is eligible
// for eviction (busy models are skipped unless forceEvictionWhenBusy is
// set). The shutdown call happens after the watchdog lock is released
// to prevent deadlock.
func (wd *WatchDog) evictLRUModel() {
	wd.Lock()

	if len(wd.addressModelMap) == 0 {
		wd.Unlock()
		return
	}

	forceEvictionWhenBusy := wd.forceEvictionWhenBusy

	// Build a list of models sorted by last used time (oldest first).
	// A missing lastUsed entry yields the zero time.Time, which sorts
	// before any real timestamp — the previous IsZero special case was a
	// no-op and has been removed, as has the redundant emptiness check
	// (the map was already verified non-empty under this same lock).
	models := make([]modelUsageInfo, 0, len(wd.addressModelMap))
	for address, model := range wd.addressModelMap {
		models = append(models, modelUsageInfo{
			address:  address,
			model:    model,
			lastUsed: wd.lastUsed[address],
		})
	}

	sort.Slice(models, func(i, j int) bool {
		return models[i].lastUsed.Before(models[j].lastUsed)
	})

	// Find the first non-busy model (or first model if forceEvictionWhenBusy is true)
	victim := -1
	for i := range models {
		_, isBusy := wd.busyTime[models[i].address]
		if isBusy && !forceEvictionWhenBusy {
			// Skip busy models when forceEvictionWhenBusy is false
			xlog.Warn("[WatchDog] Skipping memory reclaimer eviction for busy model", "model", models[i].model, "reason", "model has active API calls")
			continue
		}
		victim = i
		break
	}

	if victim < 0 {
		// All models are busy and forceEvictionWhenBusy is false
		wd.Unlock()
		xlog.Warn("[WatchDog] Memory reclaimer cannot evict: all models are busy with active API calls")
		return
	}

	lru := models[victim]
	xlog.Info("[WatchDog] Memory reclaimer evicting LRU model", "model", lru.model, "lastUsed", lru.lastUsed)

	// Untrack the model while we still hold the lock.
	wd.untrack(lru.address)
	wd.Unlock()

	// Shutdown the model outside the lock.
	if err := wd.pm.ShutdownModel(lru.model); err != nil {
		xlog.Error("[WatchDog] error shutting down model during memory reclamation", "error", err, "model", lru.model)
	} else {
		xlog.Info("[WatchDog] Memory reclaimer eviction complete", "model", lru.model)
	}
}

// untrack removes every trace of the backend at address from the
// watchdog's bookkeeping maps (busy/idle timers, LRU timestamp,
// model-name mapping, and process handle).
//
// The caller MUST hold wd's mutex: untrack takes no lock itself and is
// only called from code paths that already hold it.
func (wd *WatchDog) untrack(address string) {
	delete(wd.busyTime, address)
	delete(wd.idleTime, address)
	delete(wd.lastUsed, address)
	delete(wd.addressModelMap, address)
	delete(wd.addressMap, address)
}