# omniff/tests/python/unit/test_scaling.py
# Uploaded by stukenov — "Initial upload: OmniFF — FFmpeg for AI" (commit 88e3f4a, verified)
import pytest
from omniff.scaling.batch import BatchInference
from omniff.scaling.device_map import DeviceMap, GPUInfo
from omniff.scaling.quantization import detect_quantization, estimate_quantized_size_gb
from omniff.scaling.queue import Priority, RequestQueue
def test_device_map_empty():
    """Check the defaults of a freshly constructed ``DeviceMap``.

    With nothing assigned, looking up "llm" must yield "auto" and the
    assignment table must be an empty dict.
    """
    device_map = DeviceMap()

    resolved = device_map.get_device("llm")
    assert resolved == "auto"

    assignments = device_map.get_assignments()
    assert assignments == {}
def test_device_map_assign():
    """Pinning "llm" to GPU index 0 is reflected by both lookup methods.

    After ``assign("llm", 0)`` the device string must be "cuda:0" and the
    assignment table must be ``{"llm": 0}``.
    """
    device_map = DeviceMap()
    # Populate the private GPU list by hand so the test controls exactly
    # which GPUs the map sees.
    # NOTE(review): GPUInfo fields are presumably (index, name, total GB,
    # free GB) — confirm against the GPUInfo definition.
    device_map._gpus = [
        GPUInfo(0, "A10", 22.0, 20.0),
        GPUInfo(1, "A10", 22.0, 18.0),
    ]

    device_map.assign("llm", 0)

    device_name = device_map.get_device("llm")
    assert device_name == "cuda:0"
    assert device_map.get_assignments() == {"llm": 0}
def test_device_map_auto_assign():
    """``auto_assign("vlm", required_gb=5.0)`` must select GPU index 1.

    The two GPUs differ only in index and in their last field (10.0 vs
    20.0); the expected winner is the one with the larger value —
    presumably the GPU with more free memory (confirm against ``GPUInfo``).
    The chosen index must also show up as "cuda:1" in ``get_device``.
    """
    device_map = DeviceMap()
    device_map._gpus = [
        GPUInfo(0, "A10", 22.0, 10.0),
        GPUInfo(1, "A10", 22.0, 20.0),
    ]

    chosen_index = device_map.auto_assign("vlm", required_gb=5.0)

    assert chosen_index == 1
    assert device_map.get_device("vlm") == "cuda:1"
def test_device_map_auto_assign_insufficient():
    """``auto_assign`` must return -1 when the request cannot be placed.

    The only GPU listed has 2.0 as its last field (presumably free GB —
    confirm against ``GPUInfo``) while the request asks for 10.0 GB.
    """
    device_map = DeviceMap()
    only_gpu = GPUInfo(0, "A10", 22.0, 2.0)
    device_map._gpus = [only_gpu]

    chosen_index = device_map.auto_assign("big_model", required_gb=10.0)

    assert chosen_index == -1
def test_queue_enqueue_dequeue():
    """Round-trip a single payload through a ``RequestQueue``.

    The size must be 1 after enqueueing; the dequeued item must expose the
    original dict via ``.payload``; the queue must then report empty.
    """
    queue = RequestQueue()
    payload = {"input": "hello"}

    queue.enqueue(payload)
    assert queue.size() == 1

    request = queue.dequeue()
    assert request.payload["input"] == "hello"
    assert queue.is_empty()
def test_queue_priority_ordering():
    """The ``Priority.CRITICAL`` request must be dequeued first.

    Three requests are enqueued in the order LOW, CRITICAL, NORMAL; only
    the first dequeued item is checked, and it must be the CRITICAL one.
    """
    queue = RequestQueue()
    submissions = (
        ({"n": 1}, Priority.LOW),
        ({"n": 2}, Priority.CRITICAL),
        ({"n": 3}, Priority.NORMAL),
    )
    for payload, priority in submissions:
        queue.enqueue(payload, priority)

    head = queue.dequeue()

    assert head.payload["n"] == 2
def test_queue_max_size():
    """Enqueueing beyond ``max_size`` must raise ``RuntimeError``.

    With ``max_size=2`` the first two enqueues succeed and the third must
    raise with a message matching "Queue full".
    """
    queue = RequestQueue(max_size=2)
    for n in (1, 2):
        queue.enqueue({"n": n})

    overflow = {"n": 3}
    with pytest.raises(RuntimeError, match="Queue full"):
        queue.enqueue(overflow)
def test_queue_stats():
    """``stats()`` must report one processed request and an empty queue.

    One request is enqueued and immediately dequeued before the counters
    "processed" and "queue_size" are read.
    """
    queue = RequestQueue()
    queue.enqueue({"n": 1})
    queue.dequeue()

    snapshot = queue.stats()

    processed_count = snapshot["processed"]
    assert processed_count == 1
    remaining = snapshot["queue_size"]
    assert remaining == 0
def test_batch_add_and_size():
    """Adding two items to a batch of capacity 4 gives size 2, not full."""
    pending = BatchInference(max_batch_size=4)
    for prompt in ("a", "b"):
        pending.add({"prompt": prompt})

    assert pending.size() == 2
    assert not pending.is_full()
def test_batch_is_full():
    """A batch with ``max_batch_size=2`` reports full after two adds."""
    pending = BatchInference(max_batch_size=2)
    for prompt in ("a", "b"):
        pending.add({"prompt": prompt})

    assert pending.is_full()
def test_batch_flush():
    """``flush`` must run the model over queued inputs and empty the batch.

    A stand-in model upper-cases each input's "prompt". After flushing two
    inputs there must be two results, the first one carrying "HELLO", and
    the batch size must be back to 0.
    """

    class UppercaseModel:
        """Minimal stand-in exposing the ``infer`` method used by ``flush``."""

        def infer(self, inputs):
            prompt = inputs["prompt"]
            return {"text": prompt.upper()}

    pending = BatchInference(max_batch_size=4)
    for prompt in ("hello", "world"):
        pending.add({"prompt": prompt})

    outputs = pending.flush(UppercaseModel())

    assert len(outputs) == 2
    assert outputs[0]["text"] == "HELLO"
    assert pending.size() == 0
def test_batch_flush_empty():
    """Flushing a batch with nothing queued must return an empty list.

    ``None`` is passed as the model, so the test also requires that
    ``flush`` does not touch the model when there is nothing to process.
    """
    empty_batch = BatchInference()

    outputs = empty_batch.flush(None)

    assert outputs == []
def test_detect_quantization_gptq():
    """The name "model-gptq-4bit" must be detected as "gptq"."""
    detected = detect_quantization("model-gptq-4bit")
    assert detected == "gptq"
def test_detect_quantization_awq():
    """The upper-case name "model-AWQ" must be detected as lower-case "awq"."""
    detected = detect_quantization("model-AWQ")
    assert detected == "awq"
def test_detect_quantization_gguf():
    """The name "model.gguf" must be detected as "gguf"."""
    detected = detect_quantization("model.gguf")
    assert detected == "gguf"
def test_detect_quantization_none():
    """A name with no quantization marker ("Qwen/Qwen3-4B") must yield None."""
    detected = detect_quantization("Qwen/Qwen3-4B")
    assert detected is None
def test_estimate_quantized_size():
    """Check the estimated size of an 8.0 GB model under each scheme.

    Expected results: "gptq" -> 2.0 GB, "int8" -> 4.0 GB, and ``None``
    (no quantization) -> the unchanged 8.0 GB.
    """
    base_size_gb = 8.0
    expectations = (
        ("gptq", 2.0),
        ("int8", 4.0),
        (None, 8.0),
    )
    for scheme, expected_gb in expectations:
        estimated_gb = estimate_quantized_size_gb(base_size_gb, scheme)
        # Compare with a tolerance: the estimate is a computed float, and
        # exact equality would break on harmless rounding differences.
        assert estimated_gb == pytest.approx(expected_gb), (
            f"unexpected size for scheme {scheme!r}"
        )