import os
import sys
import asyncio
import json

from flask import Blueprint, request, jsonify
from flask_cors import cross_origin

# Add the virtual GPU path to sys.path
vgpu_path = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'virtual_gpu_setup', 'virtual_gpu')
sys.path.insert(0, vgpu_path)

from vgpu import VirtualGPU
from vram import VRAM
from driver import GPUDriver
from render import Renderer
from ai import AIAccelerator
import numpy as np

# Import the Hugging Face GPT model from the same directory
from .huggingface_gpt_model import HuggingFaceModelManager

ai_chat_bp = Blueprint('ai_chat', __name__)

# Global variables to store GPU components
vgpu_instance = None
ai_accelerator = None
driver = None
hf_model_manager = None


def initialize_vgpu():
    """Initialize the virtual GPU components."""
    global vgpu_instance, ai_accelerator, driver, hf_model_manager

    if vgpu_instance is None:
        print("Initializing Virtual GPU with 500GB VRAM...")

        # Create VRAM (500GB - full virtual GPU capacity)
        vram = VRAM(memory_size_gb=500)

        # Create renderer
        renderer = Renderer(vram)

        # Create AI accelerator
        ai_accelerator = AIAccelerator(vram)

        # Create vGPU with 800 SMs and 50,000 cores
        vgpu_instance = VirtualGPU(num_sms=800, total_cores=50000)
        vgpu_instance.set_modules(vram, renderer, ai_accelerator, None)

        # Create driver
        driver = GPUDriver(vgpu_instance)
        vgpu_instance.driver = driver

        print("Virtual GPU initialized successfully!")
        print(f"VRAM: {vram.get_stats()['total_memory_gb']} GB")
        print(f"Cores: {vgpu_instance.total_cores:,}")
        print(f"SMs: {vgpu_instance.num_sms}")

        # Initialize the Hugging Face model manager
        print("Loading Hugging Face pre-trained model onto virtual GPU...")
        hf_model_manager = HuggingFaceModelManager(ai_accelerator)
        print("Hugging Face model loaded successfully!")


@ai_chat_bp.route('/chat', methods=['POST'])
@cross_origin()
def chat():
    """Handle chat requests using the Hugging Face pre-trained model."""
    global hf_model_manager

    try:
        # Initialize vGPU if not already done
        initialize_vgpu()

        # Get the message from the request body
        data = request.get_json()
        if not data or 'message' not in data:
            return jsonify({'error': 'No message provided'}), 400

        user_message = data['message']

        # Generate response using the Hugging Face model on the virtual GPU
        response = hf_model_manager.chat(user_message)

        # Get GPU stats
        vgpu_stats = vgpu_instance.get_stats()
        ai_stats = ai_accelerator.get_stats()
        vram_stats = vgpu_instance.vram.get_stats()

        # Get model info
        model_info = hf_model_manager.get_model_info()

        return jsonify({
            'response': response,
            'gpu_stats': {
                'clock_cycles': vgpu_stats['clock_cycle'],
                'tasks_processed': vgpu_stats['total_tasks_processed'],
                'busy_sms': vgpu_stats['busy_sms'],
                'total_sms': vgpu_stats['total_sms'],
                'ai_operations': ai_stats['operations_performed'],
                'flops_performed': ai_stats['flops_performed'],
                'vram_utilization': vram_stats['utilization_percent'],
                'matrices_in_memory': ai_stats['matrices_in_memory']
            },
            'model_info': model_info
        })

    except Exception as e:
        return jsonify({'error': f'Hugging Face model error: {str(e)}'}), 500


@ai_chat_bp.route('/gpu-status', methods=['GET'])
@cross_origin()
def gpu_status():
    """Get current GPU status."""
    try:
        initialize_vgpu()

        vgpu_stats = vgpu_instance.get_stats()
        ai_stats = ai_accelerator.get_stats()
        vram_stats = vgpu_instance.vram.get_stats()

        return jsonify({
            'vgpu': vgpu_stats,
            'ai_accelerator': ai_stats,
            'vram': vram_stats,
            'status': 'online'
        })

    except Exception as e:
        return jsonify({'error': f'Failed to get GPU status: {str(e)}'}), 500
@ai_chat_bp.route('/reset-gpu', methods=['POST'])
@cross_origin()
def reset_gpu():
    """Reset the virtual GPU by clearing all module-level components."""
    # Note: the original declared 'ai_model' here, which is never defined in this
    # module; the Hugging Face manager global is 'hf_model_manager'.
    global vgpu_instance, ai_accelerator, driver, hf_model_manager

    try:
        vgpu_instance = None
        ai_accelerator = None
        driver = None
        hf_model_manager = None

        return jsonify({'message': 'Virtual GPU reset successfully'})

    except Exception as e:
        return jsonify({'error': f'Failed to reset GPU: {str(e)}'}), 500
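

# Usage sketch (an assumption, not taken from the repo): this blueprint is meant to
# be registered by the main Flask application, e.g. under an '/api' prefix. The
# guard below only runs when the module is launched directly and starts a throwaway
# dev server so /chat, /gpu-status and /reset-gpu can be exercised with curl.
# Because of the relative import of huggingface_gpt_model, it must be run as a
# module (python -m <package>.ai_chat); the package path, URL prefix, and port are
# illustrative placeholders.
if __name__ == '__main__':
    from flask import Flask

    app = Flask(__name__)
    app.register_blueprint(ai_chat_bp, url_prefix='/api')

    # Example request once the dev server is up:
    #   curl -X POST http://localhost:5001/api/chat \
    #        -H "Content-Type: application/json" \
    #        -d '{"message": "Hello, virtual GPU!"}'
    app.run(port=5001, debug=True)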