File size: 2,338 Bytes
c28358e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
#!/usr/bin/env python3
"""
Performance monitor for GPT-OSS-120B
"""
import time
from mlx_lm import load, generate
import psutil
import logging
# Configure the root logger once at import time; every message below flows
# through this timestamp/level format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger per the stdlib convention (one logger per module).
logger = logging.getLogger(__name__)
def monitor_performance():
    """Benchmark GPT-OSS-120B: time the model load, then run a fixed set of
    short prompts and log per-prompt and aggregate generation speed.

    Side effects: loads the model via ``mlx_lm.load`` (may download weights)
    and writes all results through the module logger. Returns ``None``.
    """
    logger.info("GPT-OSS-120B Performance Monitor")
    logger.info("=" * 50)

    # Snapshot available system memory before the (very large) model load.
    ram = psutil.virtual_memory()
    logger.info("System RAM: %.1fGB available", ram.available / (1024 ** 3))

    # Time the model load separately from generation.
    load_start = time.time()
    model, tokenizer = load("mlx-community/gpt-oss-120b-MXFP4-Q4")
    load_time = time.time() - load_start
    logger.info("Model load time: %.2fs", load_time)

    # Short, varied prompts to exercise generation.
    test_prompts = [
        "Hello, how are you?",
        "Explain machine learning",
        "What is the meaning of life?",
        "Write a haiku about technology",
        "Describe quantum physics",
    ]

    total_tokens = 0.0
    total_time = 0.0
    for i, prompt in enumerate(test_prompts, start=1):
        logger.info("Test %d: %s", i, prompt)

        gen_start = time.time()
        response = generate(
            model, tokenizer,
            prompt=prompt,
            max_tokens=50,
            verbose=False,
        )
        gen_time = time.time() - gen_start

        # Rough estimate: ~1.3 tokens per whitespace-separated word.
        # TODO(review): len(tokenizer.encode(response)) would give an exact count.
        tokens = len(response.split()) * 1.3
        total_tokens += tokens
        total_time += gen_time

        # Guard against a pathological zero-duration generation.
        tokens_per_sec = tokens / gen_time if gen_time > 0 else 0
        logger.info("  Time: %.2fs", gen_time)
        logger.info("  Speed: %.1f tokens/sec", tokens_per_sec)
        logger.info("  Response: %s...", response[:100])

    # Aggregate results; measure actual process memory instead of a
    # hard-coded estimate.
    avg_speed = total_tokens / total_time if total_time > 0 else 0
    rss_gb = psutil.Process().memory_info().rss / (1024 ** 3)
    logger.info("Summary:")
    logger.info("  Total tokens generated: %.0f", total_tokens)
    logger.info("  Total time: %.2fs", total_time)
    logger.info("  Average speed: %.1f tokens/sec", avg_speed)
    logger.info("  Process RSS: %.1fGB", rss_gb)
# Script entry point; removed a stray trailing "|" artifact that made the
# file a syntax error.
if __name__ == "__main__":
    monitor_performance()