File size: 2,338 Bytes
c28358e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python3
"""
Performance monitor for GPT-OSS-120B
"""

import time
from mlx_lm import load, generate
import psutil
import logging

# Configure root logging once at import time; use a module-level logger
# (named after this module) per the standard logging convention.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def monitor_performance(model_name="mlx-community/gpt-oss-120b-MXFP4-Q4", max_tokens=50):
    """Benchmark model load time and generation throughput.

    Loads the model, times generation over a fixed set of test prompts,
    and logs per-prompt and aggregate tokens/sec plus RAM usage.

    Args:
        model_name: MLX-community model identifier to load (defaults to
            the GPT-OSS-120B MXFP4-Q4 quantization).
        max_tokens: Maximum number of tokens to generate per test prompt.
    """
    logger.info("📊 GPT-OSS-120B Performance Monitor")
    logger.info("=" * 50)

    # Snapshot available RAM before the (very large) model load.
    ram = psutil.virtual_memory()
    logger.info("💾 System RAM: %.1fGB available", ram.available / (1024 ** 3))

    # Time the model load. perf_counter is monotonic and high-resolution,
    # unlike time.time (which can jump with wall-clock adjustments).
    load_start = time.perf_counter()
    model, tokenizer = load(model_name)
    load_time = time.perf_counter() - load_start
    logger.info("⏱️  Model load time: %.2fs", load_time)

    # Small prompt set covering short Q&A, explanation, and creative tasks.
    test_prompts = [
        "Hello, how are you?",
        "Explain machine learning",
        "What is the meaning of life?",
        "Write a haiku about technology",
        "Describe quantum physics"
    ]

    total_tokens = 0
    total_time = 0.0

    for i, prompt in enumerate(test_prompts, start=1):
        logger.info("\n🧪 Test %d: %s", i, prompt)

        # Time a single generation pass.
        gen_start = time.perf_counter()
        response = generate(
            model, tokenizer,
            prompt=prompt,
            max_tokens=max_tokens,
            verbose=False
        )
        gen_time = time.perf_counter() - gen_start

        # Count tokens exactly with the tokenizer when possible; fall back
        # to a rough ~1.3-tokens-per-word estimate otherwise.
        try:
            tokens = len(tokenizer.encode(response))
        except AttributeError:
            tokens = len(response.split()) * 1.3
        total_tokens += tokens
        total_time += gen_time

        # Guard against a zero elapsed time (e.g. empty generation).
        tokens_per_sec = tokens / gen_time if gen_time > 0 else 0

        logger.info("   ⏱️  Time: %.2fs", gen_time)
        logger.info("   📈 Speed: %.1f tokens/sec", tokens_per_sec)
        logger.info("   📝 Response: %s...", response[:100])

    # Summary across all prompts.
    avg_speed = total_tokens / total_time if total_time > 0 else 0
    logger.info("\n📊 Summary:")
    logger.info("   Total tokens generated: %.0f", total_tokens)
    logger.info("   Total time: %.2fs", total_time)
    logger.info("   Average speed: %.1f tokens/sec", avg_speed)
    # Report the RAM actually in use now, rather than a hard-coded estimate.
    logger.info("   RAM in use: %.1fGB",
                psutil.virtual_memory().used / (1024 ** 3))

if __name__ == "__main__":
    # Run the benchmark only when executed as a script, not on import.
    monitor_performance()