|
|
|
|
|
""" |
|
|
GPU Metrics JSON Server |
|
|
|
|
|
This script provides a simple HTTP server that serves NVIDIA GPU metrics in JSON format. |
|
|
It runs on the remote machine and is accessed via an SSH tunnel. |
|
|
""" |
|
|
|
|
|
import json
import logging
import os
import re
import subprocess
from datetime import datetime

from flask import Flask, jsonify
|
|
|
|
|
|
|
|
# Basic logging to stderr at INFO level so startup messages and GPU query
# failures are visible when the server runs under SSH or a supervisor.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-wide logger used by the route handlers and get_gpu_info().
logger = logging.getLogger('gpu_server')
|
|
|
|
|
# Flask application object; routes below attach to it.
app = Flask(__name__)
|
|
|
|
|
def get_gpu_info():
    """
    Get NVIDIA GPU information and parse it into a structured format.

    Runs ``nvidia-smi`` twice: once for per-GPU metrics and once for the
    list of compute processes, parsing the headerless CSV output.

    Returns:
        dict: On success ``{'timestamp', 'gpus', 'processes', 'success': True}``;
        on any failure ``{'timestamp', 'error', 'success': False}``.
        Never raises — all exceptions are logged and reported in the payload.
    """
    try:
        # Per-GPU metrics: one CSV line per GPU, columns in the exact order
        # of the --query-gpu list below (nounits strips "MiB"/"W"/"%").
        nvidia_smi_output = subprocess.check_output(
            [
                'nvidia-smi',
                '--query-gpu=index,name,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.used,memory.free,power.draw,power.limit',
                '--format=csv,noheader,nounits'
            ],
            universal_newlines=True
        )

        gpus = []
        for line in nvidia_smi_output.strip().split('\n'):
            if not line:
                # Skip blank lines (e.g. empty output); previously only the
                # process loop below had this guard.
                continue
            values = [v.strip() for v in line.split(',')]
            if len(values) >= 10:
                # NOTE(review): some GPUs report '[N/A]' for power fields,
                # which would make float() raise and push the whole request
                # into the error path — confirm against the target hardware.
                gpus.append({
                    'index': int(values[0]),
                    'name': values[1],
                    'temperature': float(values[2]),
                    'gpu_utilization': float(values[3]),
                    'memory_utilization': float(values[4]),
                    'memory_total': float(values[5]),
                    'memory_used': float(values[6]),
                    'memory_free': float(values[7]),
                    'power_draw': float(values[8]),
                    'power_limit': float(values[9]),
                })

        # Compute processes currently holding GPU memory.
        process_output = subprocess.check_output(
            ['nvidia-smi', '--query-compute-apps=pid,process_name,used_memory', '--format=csv,noheader,nounits'],
            universal_newlines=True
        )

        processes = []
        for line in process_output.strip().split('\n'):
            if not line:
                continue
            values = [v.strip() for v in line.split(',')]
            if len(values) >= 3:
                processes.append({
                    'pid': int(values[0]),
                    'name': values[1],
                    'memory_used': float(values[2]),
                })

        return {
            # Timestamp generated in-process; the previous code shelled out
            # to `date`, which spawned an extra subprocess per request and is
            # not portable. Format string is unchanged.
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'gpus': gpus,
            'processes': processes,
            'success': True
        }

    except Exception as e:
        logger.error(f"Error getting GPU information: {str(e)}")
        return {
            # In-process timestamp here also fixes a latent bug: the old code
            # ran `date` via subprocess inside the handler for subprocess
            # failures, which could itself fail for the same reason.
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'error': str(e),
            'success': False
        }
|
|
|
|
|
@app.route('/gpu/json')
def gpu_json():
    """Serve the structured GPU metrics payload as a JSON response."""
    metrics = get_gpu_info()
    return jsonify(metrics)
|
|
|
|
|
@app.route('/gpu/txt')
def gpu_txt():
    """
    API endpoint for traditional nvidia-smi text output (for backward compatibility).

    Returns:
        The raw ``nvidia-smi`` text on success; on failure an error string
        with HTTP status 500 so clients can detect the failure without
        parsing the body (previously errors were returned with status 200).
    """
    try:
        return subprocess.check_output(['nvidia-smi'], universal_newlines=True)
    except Exception as e:
        # logger.exception records the full traceback, unlike error(str(e)).
        logger.exception("Error getting nvidia-smi output")
        return f"Error: {str(e)}", 500
|
|
|
|
|
@app.route('/health')
def health_check():
    """Liveness probe: always answers with ``{"status": "ok"}``."""
    payload = {'status': 'ok'}
    return jsonify(payload)
|
|
|
|
|
if __name__ == '__main__':
    # Host/port are overridable via environment variables; defaults keep the
    # original behavior (all interfaces, port 5000).
    # NOTE(review): 0.0.0.0 exposes the server on every interface — the module
    # docstring says access is via an SSH tunnel, so 127.0.0.1 may be safer;
    # confirm with the deployment setup.
    host = os.environ.get('GPU_SERVER_HOST', '0.0.0.0')
    port = int(os.environ.get('GPU_SERVER_PORT', '5000'))
    app.run(host=host, port=port)