File size: 3,368 Bytes
88e3f4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import pytest

from omniff.scaling.batch import BatchInference
from omniff.scaling.device_map import DeviceMap, GPUInfo
from omniff.scaling.quantization import detect_quantization, estimate_quantized_size_gb
from omniff.scaling.queue import Priority, RequestQueue


def test_device_map_empty():
    """A new DeviceMap with nothing assigned reports "auto" and no assignments."""
    device_map = DeviceMap()

    device = device_map.get_device("llm")
    assert device == "auto"

    assignments = device_map.get_assignments()
    assert assignments == {}


def test_device_map_assign():
    """Assigning "llm" to index 0 yields device "cuda:0" and records {"llm": 0}."""
    device_map = DeviceMap()
    # Overwrite the private GPU list with a fixed two-entry inventory for this test.
    first_gpu = GPUInfo(0, "A10", 22.0, 20.0)
    second_gpu = GPUInfo(1, "A10", 22.0, 18.0)
    device_map._gpus = [first_gpu, second_gpu]

    device_map.assign("llm", 0)

    device = device_map.get_device("llm")
    assert device == "cuda:0"
    assignments = device_map.get_assignments()
    assert assignments == {"llm": 0}


def test_device_map_auto_assign():
    """auto_assign returns index 1 for "vlm" and get_device then reports "cuda:1".

    NOTE(review): the fourth GPUInfo argument (10.0 vs 20.0) is presumably free
    memory in GB, which would explain why index 1 is chosen — confirm.
    """
    device_map = DeviceMap()
    first_gpu = GPUInfo(0, "A10", 22.0, 10.0)
    second_gpu = GPUInfo(1, "A10", 22.0, 20.0)
    device_map._gpus = [first_gpu, second_gpu]

    chosen_index = device_map.auto_assign("vlm", required_gb=5.0)

    assert chosen_index == 1
    device = device_map.get_device("vlm")
    assert device == "cuda:1"


def test_device_map_auto_assign_insufficient():
    """auto_assign returns -1 when required_gb=10.0 and the only GPU is (…, 22.0, 2.0)."""
    device_map = DeviceMap()
    only_gpu = GPUInfo(0, "A10", 22.0, 2.0)
    device_map._gpus = [only_gpu]

    chosen_index = device_map.auto_assign("big_model", required_gb=10.0)

    assert chosen_index == -1


def test_queue_enqueue_dequeue():
    """One enqueued payload is returned by dequeue, leaving the queue empty."""
    queue = RequestQueue()
    payload = {"input": "hello"}

    queue.enqueue(payload)
    current_size = queue.size()
    assert current_size == 1

    request = queue.dequeue()
    assert request.payload["input"] == "hello"
    assert queue.is_empty()


def test_queue_priority_ordering():
    """The CRITICAL request is dequeued first although it was enqueued second."""
    queue = RequestQueue()
    submissions = [
        ({"n": 1}, Priority.LOW),
        ({"n": 2}, Priority.CRITICAL),
        ({"n": 3}, Priority.NORMAL),
    ]
    for payload, priority in submissions:
        queue.enqueue(payload, priority)

    head = queue.dequeue()

    assert head.payload["n"] == 2


def test_queue_max_size():
    """A third enqueue on a max_size=2 queue raises RuntimeError matching "Queue full"."""
    queue = RequestQueue(max_size=2)
    # Fill the queue up to its configured capacity.
    for n in (1, 2):
        queue.enqueue({"n": n})

    overflow_payload = {"n": 3}
    with pytest.raises(RuntimeError, match="Queue full"):
        queue.enqueue(overflow_payload)


def test_queue_stats():
    """After one enqueue/dequeue cycle, stats show processed == 1 and queue_size == 0."""
    queue = RequestQueue()
    queue.enqueue({"n": 1})
    queue.dequeue()

    snapshot = queue.stats()

    processed_count = snapshot["processed"]
    assert processed_count == 1
    remaining = snapshot["queue_size"]
    assert remaining == 0


def test_batch_add_and_size():
    """Two items added to a max_batch_size=4 batch give size 2 and a not-full batch."""
    pending = BatchInference(max_batch_size=4)
    for prompt in ("a", "b"):
        pending.add({"prompt": prompt})

    item_count = pending.size()
    assert item_count == 2
    assert not pending.is_full()


def test_batch_is_full():
    """A max_batch_size=2 batch reports full once two items have been added."""
    pending = BatchInference(max_batch_size=2)
    for prompt in ("a", "b"):
        pending.add({"prompt": prompt})

    assert pending.is_full()


def test_batch_flush():
    """flush() returns one result per added item and leaves the batch empty."""

    class UppercasingModel:
        """Stand-in model whose infer() upper-cases the "prompt" value."""

        def infer(self, inputs):
            shouted = inputs["prompt"].upper()
            return {"text": shouted}

    pending = BatchInference(max_batch_size=4)
    for prompt in ("hello", "world"):
        pending.add({"prompt": prompt})

    model = UppercasingModel()
    outputs = pending.flush(model)

    assert len(outputs) == 2
    first_output = outputs[0]
    assert first_output["text"] == "HELLO"
    assert pending.size() == 0


def test_batch_flush_empty():
    """Flushing a batch with no items returns [] even when the model is None."""
    pending = BatchInference()
    model = None

    outputs = pending.flush(model)

    assert outputs == []


def test_detect_quantization_gptq():
    """The name "model-gptq-4bit" is detected as "gptq"."""
    detected = detect_quantization("model-gptq-4bit")
    assert detected == "gptq"


def test_detect_quantization_awq():
    """The upper-case name "model-AWQ" is detected as lower-case "awq"."""
    detected = detect_quantization("model-AWQ")
    assert detected == "awq"


def test_detect_quantization_gguf():
    """The name "model.gguf" is detected as "gguf"."""
    detected = detect_quantization("model.gguf")
    assert detected == "gguf"


def test_detect_quantization_none():
    """The name "Qwen/Qwen3-4B" yields None from detect_quantization."""
    detected = detect_quantization("Qwen/Qwen3-4B")
    assert detected is None


def test_estimate_quantized_size():
    """An input of 8.0 estimates to 2.0 for "gptq", 4.0 for "int8", and 8.0 for None."""
    original_gb = 8.0
    expected_by_method = (
        ("gptq", 2.0),
        ("int8", 4.0),
        (None, 8.0),
    )
    for method, expected_gb in expected_by_method:
        assert estimate_quantized_size_gb(original_gb, method) == expected_gb