Factor Studios committed
Commit 8aea612 · verified · 1 Parent(s): 5cdc76b

Update test_ai_integration_http.py

Files changed (1)
  1. test_ai_integration_http.py +198 -198
test_ai_integration_http.py CHANGED
@@ -1,199 +1,199 @@
- import logging
- import os
- import time
- from contextlib import contextmanager
- from typing import Any, Optional
-
- import torch
- from transformers import pipeline
- from virtual_vram import VirtualVRAM
- from http_storage import HTTPGPUStorage
- from torch_vgpu import VGPUDevice, to_vgpu
-
- def setup_vgpu():
-     """Setup vGPU device"""
-     try:
-         # Initialize the backend first
-         from torch_vgpu import init_vgpu_backend, VGPUDevice
-         if not init_vgpu_backend():
-             raise RuntimeError("Failed to initialize vGPU backend")
-
-         # Create and register vGPU device
-         vgpu = VGPUDevice()
-         device = vgpu.device()
-
-         # Set as default device for tensor operations
-         return device
-
-     except Exception as e:
-         logging.error(f"vGPU setup failed: {str(e)}")
-         raise
-
- # Configure logging
- logging.basicConfig(
-     level=logging.INFO,
-     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
- )
- logger = logging.getLogger(__name__)
-
- @contextmanager
- def gpu_context():
-     """Context manager for vGPU resources"""
-     storage = None
-     try:
-         storage = HTTPGPUStorage()
-         yield storage
-     finally:
-         if storage:
-             storage.close()
-             logger.info("vGPU resources cleaned up")
-
- def get_model_size(model):
-     """Calculate model size in parameters and memory footprint"""
-     param_size = 0
-     for param in model.parameters():
-         param_size += param.nelement() * param.element_size()
-     buffer_size = 0
-     for buffer in model.buffers():
-         buffer_size += buffer.nelement() * buffer.element_size()
-     return param_size + buffer_size
-
- def prepare_prompt(instruction: str) -> str:
-     """Prepare a prompt for Llama-2 using its chat format."""
-     # Format: <s>[INST] instruction [/INST] assistant response </s>[INST] ...
-     return f"<s>[INST] {instruction} [/INST]"
-
- def test_ai_integration_http():
-     """Test GPT OSS model on vGPU with text generation"""
-     logger.info("Starting vGPU text generation test")
-
-     status = {
-         'pipeline_loaded': False,
-         'model_on_vgpu': False,
-         'generation_complete': False,
-         'cleanup_success': False
-     }
-
-     with gpu_context() as storage:
-         try:
-             # Initialize vRAM with monitoring
-             initial_mem = storage.get_used_memory() if hasattr(storage, 'get_used_memory') else 0
-             vram = VirtualVRAM(size_gb=None, storage=storage)
-
-             # Initialize vGPU device
-             device = setup_vgpu()
-             logger.info(f"vGPU initialized with device {device}")
-
-             # Load model using pipeline
-             model_id = "openai/gpt-oss-20b"
-             logger.info(f"Loading {model_id}")
-
-             try:
-                 # Disable transformers logging temporarily
-                 transformers_logger = logging.getLogger("transformers")
-                 original_level = transformers_logger.level
-                 transformers_logger.setLevel(logging.ERROR)
-
-                 try:
-                     # Create pipeline with direct vGPU device mapping
-                     pipe = pipeline(
-                         "text-generation",
-                         model=model_id,
-                         torch_dtype=torch.float32,  # Use full precision
-                         device=device,  # Load directly to vGPU
-                         use_safetensors=True,
-                         trust_remote_code=True,
-                         model_kwargs={
-                             "device_map": device  # Ensure all model parts go to vGPU
-                         }
-                     )
-                     status["pipeline_loaded"] = True
-                     status['model_on_vgpu'] = True
-                     pipe.model.eval()
-
-                     # Log model details
-                     logger.info(f"Pipeline created with model: {model_id}")
-
-                     # Log model size
-                     model_size = get_model_size(pipe.model)
-                     logger.info(f"Model loaded: {model_size/1e9:.2f} GB in parameters")
-                     logger.info(f"Model architecture: {pipe.model.__class__.__name__}")
-
-                     # Verify model location
-                     with torch.device(device):
-                         current_mem = storage.get_used_memory() if hasattr(storage, 'get_used_memory') else 0
-                         logger.info(f"Model memory usage: {(current_mem - initial_mem)/1e9:.2f} GB")
-
-                 finally:
-                     # Restore original logging level
-                     transformers_logger.setLevel(original_level)
-
-             except Exception as e:
-                 logger.error(f"Model loading failed: {str(e)}")
-                 raise
-             except Exception as e:
-                 logger.error(f"Model transfer to vGPU failed: {str(e)}")
-                 raise
-
-             # Run text generation
-             logger.info("Running text generation...")
-             start = time.time()
-             peak_mem = initial_mem
-
-             try:
-                 # Prepare input prompt
-                 prompt = "Explain how virtual GPUs work in simple terms."
-
-                 with torch.no_grad():
-                     outputs = pipe(
-                         prompt,
-                         max_new_tokens=256,
-                         temperature=0.7,
-                         top_p=0.95,
-                         top_k=40,
-                         num_beams=1,
-                         do_sample=True,
-                         return_full_text=True
-                     )
-
-                 if hasattr(storage, 'get_used_memory'):
-                     peak_mem = max(peak_mem, storage.get_used_memory())
-
-                 inference_time = time.time() - start
-                 status['generation_complete'] = True
-
-                 # Log performance metrics
-                 logger.info(f"\nGeneration stats:")
-                 logger.info(f"- Time: {inference_time:.4f}s")
-                 logger.info(f"- Memory peak: {(peak_mem - initial_mem)/1e9:.2f} GB")
-                 logger.info(f"- Generated text: {outputs[0]['generated_text']}")
-
-             except Exception as e:
-                 logger.error(f"Text generation failed: {str(e)}")
-                 raise
-
-         except Exception as e:
-             logger.error(f"Test failed: {str(e)}")
-             raise
-         finally:
-             # Cleanup and status report
-             try:
-                 if 'pipe' in locals():
-                     del pipe
-                 if 'outputs' in locals():
-                     del outputs
-                 torch.cuda.empty_cache() if hasattr(torch, 'cuda') else None
-                 status['cleanup_success'] = True
-             except Exception as e:
-                 logger.error(f"Cleanup error: {str(e)}")
-
-             logger.info("\nTest Summary:")
-             for key, value in status.items():
-                 logger.info(f"- {key}: {'✓' if value else '✗'}")
-
-             final_mem = storage.get_used_memory() if hasattr(storage, 'get_used_memory') else 0
-             if final_mem > initial_mem:
-                 logger.warning(f"Memory leak detected: {(final_mem - initial_mem)/1e6:.2f} MB")
-
- if __name__ == "__main__":
      test_ai_integration_http()

+ import logging
+ import os
+ import time
+ from contextlib import contextmanager
+ from typing import Any, Optional
+
+ import torch
+ from transformers import pipeline
+ from virtual_vram import VirtualVRAM
+ from http_storage import HTTPGPUStorage
+ from torch_vgpu import VGPUDevice, to_vgpu
+
+ def setup_vgpu():
+     """Setup vGPU device"""
+     try:
+         # Initialize the backend first
+         from torch_vgpu import init_vgpu_backend, VGPUDevice
+         if not init_vgpu_backend():
+             raise RuntimeError("Failed to initialize vGPU backend")
+
+         # Create and register vGPU device
+         vgpu = VGPUDevice()
+         device = vgpu.device()
+
+         # Set as default device for tensor operations
+         return device
+
+     except Exception as e:
+         logging.error(f"vGPU setup failed: {str(e)}")
+         raise
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ @contextmanager
+ def gpu_context():
+     """Context manager for vGPU resources"""
+     storage = None
+     try:
+         storage = HTTPGPUStorage()
+         yield storage
+     finally:
+         if storage:
+             storage.close()
+             logger.info("vGPU resources cleaned up")
+
+ def get_model_size(model):
+     """Calculate model size in parameters and memory footprint"""
+     param_size = 0
+     for param in model.parameters():
+         param_size += param.nelement() * param.element_size()
+     buffer_size = 0
+     for buffer in model.buffers():
+         buffer_size += buffer.nelement() * buffer.element_size()
+     return param_size + buffer_size
+
+ def prepare_prompt(instruction: str) -> str:
+     """Prepare a prompt for Llama-2 using its chat format."""
+     # Format: <s>[INST] instruction [/INST] assistant response </s>[INST] ...
+     return f"<s>[INST] {instruction} [/INST]"
+
+ def test_ai_integration_http():
+     """Test GPT OSS model on vGPU with text generation"""
+     logger.info("Starting vGPU text generation test")
+
+     status = {
+         'pipeline_loaded': False,
+         'model_on_vgpu': False,
+         'generation_complete': False,
+         'cleanup_success': False
+     }
+
+     with gpu_context() as storage:
+         try:
+             # Initialize vRAM with monitoring
+             initial_mem = storage.get_used_memory() if hasattr(storage, 'get_used_memory') else 0
+             vram = VirtualVRAM(size_gb=None, storage=storage)
+
+             # Initialize vGPU device
+             device = setup_vgpu()
+             logger.info(f"vGPU initialized with device {device}")
+
+             # Load model using pipeline
+             model_id = "openai/gpt-oss-20b"
+             logger.info(f"Loading {model_id}")
+
+             try:
+                 # Disable transformers logging temporarily
+                 transformers_logger = logging.getLogger("transformers")
+                 original_level = transformers_logger.level
+                 transformers_logger.setLevel(logging.ERROR)
+
+                 try:
+                     # Create pipeline on the vGPU device
+                     pipe = pipeline(
+                         "text-generation",
+                         model=model_id,
+                         torch_dtype=torch.float32,  # Use full precision
+                         use_safetensors=True,
+                         trust_remote_code=True,
+                         device=device
+                     )
+                     pipe.model.eval()
+                     status['pipeline_loaded'] = True
+                     status['model_on_vgpu'] = True
+
+                     # Log model details
+                     logger.info(f"Pipeline created with model: {model_id}")
+
+                     # Log model size
+                     model_size = get_model_size(pipe.model)
+                     logger.info(f"Model loaded: {model_size/1e9:.2f} GB in parameters")
+                     logger.info(f"Model architecture: {pipe.model.__class__.__name__}")
+
+                     # Verify model location
+                     with torch.device(device):
+                         current_mem = storage.get_used_memory() if hasattr(storage, 'get_used_memory') else 0
+                         logger.info(f"Model memory usage: {(current_mem - initial_mem)/1e9:.2f} GB")
+
+                 finally:
+                     # Restore original logging level
+                     transformers_logger.setLevel(original_level)
+
+             except Exception as e:
+                 logger.error(f"Model loading failed: {str(e)}")
+                 raise
+             except Exception as e:
+                 logger.error(f"Model transfer to vGPU failed: {str(e)}")
+                 raise
+
+             # Run text generation
+             logger.info("Running text generation...")
+             start = time.time()
+             peak_mem = initial_mem
+
+             try:
+                 # Prepare input prompt
+                 prompt = "Explain how virtual GPUs work in simple terms."
+
+                 with torch.no_grad():
+                     outputs = pipe(
+                         prompt,
+                         max_new_tokens=256,
+                         temperature=0.7,
+                         top_p=0.95,
+                         top_k=40,
+                         num_beams=1,
+                         do_sample=True,
+                         return_full_text=True
+                     )
+
+                 if hasattr(storage, 'get_used_memory'):
+                     peak_mem = max(peak_mem, storage.get_used_memory())
+
+                 inference_time = time.time() - start
+                 status['generation_complete'] = True
+
+                 # Log performance metrics
+                 logger.info(f"\nGeneration stats:")
+                 logger.info(f"- Time: {inference_time:.4f}s")
+                 logger.info(f"- Memory peak: {(peak_mem - initial_mem)/1e9:.2f} GB")
+                 logger.info(f"- Generated text: {outputs[0]['generated_text']}")
+
+             except Exception as e:
+                 logger.error(f"Text generation failed: {str(e)}")
+                 raise
+
+         except Exception as e:
+             logger.error(f"Test failed: {str(e)}")
+             raise
+         finally:
+             # Cleanup and status report
+             try:
+                 if 'pipe' in locals():
+                     del pipe
+                 if 'outputs' in locals():
+                     del outputs
+                 torch.cuda.empty_cache() if hasattr(torch, 'cuda') else None
+                 status['cleanup_success'] = True
+             except Exception as e:
+                 logger.error(f"Cleanup error: {str(e)}")
+
+             logger.info("\nTest Summary:")
+             for key, value in status.items():
+                 logger.info(f"- {key}: {'✓' if value else '✗'}")
+
+             final_mem = storage.get_used_memory() if hasattr(storage, 'get_used_memory') else 0
+             if final_mem > initial_mem:
+                 logger.warning(f"Memory leak detected: {(final_mem - initial_mem)/1e6:.2f} MB")
+
+ if __name__ == "__main__":
      test_ai_integration_http()
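
For reference, a minimal sketch of exercising the storage layer this test depends on, without downloading the 20B model. It uses only calls that appear in the file above (HTTPGPUStorage(), storage.close(), the hasattr-guarded get_used_memory(), and VirtualVRAM(size_gb=None, storage=...)); http_storage and virtual_vram are repo-local modules, so this assumes they are importable from the same project and is not part of the commit itself.

import logging

from http_storage import HTTPGPUStorage
from virtual_vram import VirtualVRAM

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def main():
    # Mirror the acquire/release lifecycle of the test's gpu_context() manager
    storage = HTTPGPUStorage()
    try:
        # size_gb=None as in the test: capacity is left to the HTTP backend
        vram = VirtualVRAM(size_gb=None, storage=storage)
        logger.info("VirtualVRAM ready: %r", vram)
        # get_used_memory() is probed with hasattr, exactly as the test does
        if hasattr(storage, 'get_used_memory'):
            logger.info("Used memory: %.2f MB", storage.get_used_memory() / 1e6)
    finally:
        storage.close()
        logger.info("vGPU resources cleaned up")

if __name__ == "__main__":
    main()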