Factor Studios committed on
Commit
bebbd46
·
verified ·
1 Parent(s): 4023c98

Update test_ai_integration_http.py

Browse files
Files changed (1) hide show
  1. test_ai_integration_http.py +202 -208
test_ai_integration_http.py CHANGED
@@ -1,208 +1,202 @@
1
- """
2
- Test Llama-2-7b-instruct model integration with vGPU.
3
- Configure PyTorch to use vGPU as device for text generation.
4
- """
5
- import logging
6
- import os
7
- import time
8
- from contextlib import contextmanager
9
- from typing import Any, Optional
10
-
11
- import torch
12
- from transformers import pipeline
13
- from virtual_vram import VirtualVRAM
14
- from http_storage import HTTPGPUStorage
15
- from torch_vgpu import VGPUDevice, to_vgpu
16
-
17
- def setup_vgpu():
18
- """Setup vGPU device"""
19
- try:
20
- # Initialize the backend first
21
- from torch_vgpu import init_vgpu_backend, VGPUDevice
22
- if not init_vgpu_backend():
23
- raise RuntimeError("Failed to initialize vGPU backend")
24
-
25
- # Create and register vGPU device
26
- vgpu = VGPUDevice()
27
- device = vgpu.device()
28
-
29
- # Set as default device for tensor operations
30
- torch.set_default_device(device)
31
-
32
- return device
33
-
34
- except Exception as e:
35
- logging.error(f"vGPU setup failed: {str(e)}")
36
- raise
37
-
38
- # Configure logging
39
- logging.basicConfig(
40
- level=logging.INFO,
41
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
42
- )
43
- logger = logging.getLogger(__name__)
44
-
45
- @contextmanager
46
- def gpu_context():
47
- """Context manager for vGPU resources"""
48
- storage = None
49
- try:
50
- storage = HTTPGPUStorage()
51
- yield storage
52
- finally:
53
- if storage:
54
- storage.close()
55
- logger.info("vGPU resources cleaned up")
56
-
57
- def get_model_size(model):
58
- """Calculate model size in parameters and memory footprint"""
59
- param_size = 0
60
- for param in model.parameters():
61
- param_size += param.nelement() * param.element_size()
62
- buffer_size = 0
63
- for buffer in model.buffers():
64
- buffer_size += buffer.nelement() * buffer.element_size()
65
- return param_size + buffer_size
66
-
67
- def prepare_prompt(instruction: str) -> str:
68
- """Prepare a prompt for Llama-2 using its chat format."""
69
- # Format: <s>[INST] instruction [/INST] assistant response </s>[INST] ...
70
- return f"<s>[INST] {instruction} [/INST]"
71
-
72
- def test_ai_integration_http():
73
- """Test GPT OSS model on vGPU with text generation"""
74
- logger.info("Starting vGPU text generation test")
75
-
76
- status = {
77
- 'pipeline_loaded': False,
78
- 'model_on_vgpu': False,
79
- 'generation_complete': False,
80
- 'cleanup_success': False
81
- }
82
-
83
- with gpu_context() as storage:
84
- try:
85
- # Initialize vRAM with monitoring
86
- initial_mem = storage.get_used_memory() if hasattr(storage, 'get_used_memory') else 0
87
- vram = VirtualVRAM(size_gb=None, storage=storage)
88
-
89
- # Initialize vGPU device
90
- device = setup_vgpu()
91
- logger.info(f"vGPU initialized with device {device}")
92
-
93
- # Load model using pipeline
94
- model_id = "openai/gpt-oss-20b"
95
- logger.info(f"Loading {model_id}")
96
-
97
- try:
98
- # Disable transformers logging temporarily
99
- transformers_logger = logging.getLogger("transformers")
100
- original_level = transformers_logger.level
101
- transformers_logger.setLevel(logging.ERROR)
102
-
103
- try:
104
- # Create pipeline
105
- # Create pipeline with vGPU device
106
- pipe = pipeline(
107
- "text-generation",
108
- model=model_id,
109
- torch_dtype=torch.float32, # Use full precision
110
- device=device, # Use our vGPU device
111
- use_safetensors=True,
112
- trust_remote_code=True
113
- )
114
- status['pipeline_loaded'] = True
115
-
116
- # Move pipeline model to vGPU
117
- pipe.model = to_vgpu(pipe.model, vram=vram)
118
- pipe.model.eval()
119
- status['model_on_vgpu'] = True
120
-
121
- # Log model details
122
- logger.info(f"Pipeline created with model: {model_id}")
123
-
124
- # Log model size
125
- model_size = get_model_size(pipe.model)
126
- logger.info(f"Model loaded: {model_size/1e9:.2f} GB in parameters")
127
- logger.info(f"Model architecture: {pipe.model.__class__.__name__}")
128
-
129
- # Verify model location
130
- with torch.device(device):
131
- current_mem = storage.get_used_memory() if hasattr(storage, 'get_used_memory') else 0
132
- logger.info(f"Model memory usage: {(current_mem - initial_mem)/1e9:.2f} GB")
133
-
134
- finally:
135
- # Restore original logging level
136
- transformers_logger.setLevel(original_level)
137
-
138
- except Exception as e:
139
- logger.error(f"Model loading failed: {str(e)}")
140
- raise
141
- except Exception as e:
142
- logger.error(f"Model transfer to vGPU failed: {str(e)}")
143
- raise
144
-
145
- # Run text generation
146
- logger.info("Running text generation...")
147
- start = time.time()
148
- peak_mem = initial_mem
149
-
150
- try:
151
- # Prepare input prompt
152
- prompt = "Explain how virtual GPUs work in simple terms."
153
-
154
- with torch.no_grad():
155
- # Generate text
156
- outputs = pipe(
157
- prompt,
158
- max_new_tokens=256,
159
- temperature=0.7,
160
- top_p=0.95,
161
- top_k=40,
162
- num_beams=1,
163
- do_sample=True,
164
- return_full_text=True
165
- )
166
-
167
- if hasattr(storage, 'get_used_memory'):
168
- peak_mem = max(peak_mem, storage.get_used_memory())
169
-
170
- inference_time = time.time() - start
171
- status['generation_complete'] = True
172
-
173
- # Log performance metrics
174
- logger.info(f"\nGeneration stats:")
175
- logger.info(f"- Time: {inference_time:.4f}s")
176
- logger.info(f"- Memory peak: {(peak_mem - initial_mem)/1e9:.2f} GB")
177
- logger.info(f"- Generated text: {outputs[0]['generated_text']}")
178
-
179
- except Exception as e:
180
- logger.error(f"Text generation failed: {str(e)}")
181
- raise
182
-
183
- except Exception as e:
184
- logger.error(f"Test failed: {str(e)}")
185
- raise
186
- finally:
187
- # Cleanup and status report
188
- try:
189
- if 'pipe' in locals():
190
- del pipe
191
- if 'outputs' in locals():
192
- del outputs
193
- torch.cuda.empty_cache() if hasattr(torch, 'cuda') else None
194
- status['cleanup_success'] = True
195
- except Exception as e:
196
- logger.error(f"Cleanup error: {str(e)}")
197
-
198
- logger.info("\nTest Summary:")
199
- for key, value in status.items():
200
- logger.info(f"- {key}: {'✓' if value else '✗'}")
201
-
202
- final_mem = storage.get_used_memory() if hasattr(storage, 'get_used_memory') else 0
203
- if final_mem > initial_mem:
204
- logger.warning(f"Memory leak detected: {(final_mem - initial_mem)/1e6:.2f} MB")
205
-
206
- if __name__ == "__main__":
207
- test_ai_integration_http()
208
-
 
1
"""
Test text-generation model integration with vGPU.
Configure PyTorch to use vGPU as device for text generation.
"""
import logging
import os
import time
from contextlib import contextmanager
from typing import Any, Optional

import torch
from transformers import pipeline
from virtual_vram import VirtualVRAM
from http_storage import HTTPGPUStorage
from torch_vgpu import VGPUDevice, to_vgpu
15
+
16
def setup_vgpu():
    """Initialize the vGPU backend and return its torch device.

    Returns:
        The torch device object exposed by the vGPU.

    Raises:
        RuntimeError: if the vGPU backend fails to initialize.
        Exception: any error raised during device creation is logged and
            re-raised unchanged.
    """
    try:
        # The backend must be up before any device object is created.
        # (init_vgpu_backend is imported lazily so importing this module
        # does not force backend initialization.)
        from torch_vgpu import init_vgpu_backend
        if not init_vgpu_backend():
            raise RuntimeError("Failed to initialize vGPU backend")

        # Create the vGPU and hand back its torch device. Callers pass the
        # device explicitly; the global default device is deliberately NOT
        # changed here (a stale comment previously claimed otherwise).
        vgpu = VGPUDevice()
        return vgpu.device()

    except Exception as e:
        logging.error(f"vGPU setup failed: {str(e)}")
        raise
34
+
35
# Module-wide logging: timestamped, name- and level-tagged records at
# INFO and above, configured once at import time.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
logger = logging.getLogger(__name__)
41
+
42
@contextmanager
def gpu_context():
    """Yield an HTTPGPUStorage handle, closing it on exit.

    The handle is closed (and the cleanup logged) whether the body
    completes or raises; nothing is closed if construction itself failed.
    """
    handle = None
    try:
        handle = HTTPGPUStorage()
        yield handle
    finally:
        if handle:
            handle.close()
            logger.info("vGPU resources cleaned up")
53
+
54
def get_model_size(model):
    """Return the total memory footprint of *model* in bytes.

    Counts every parameter tensor and every registered buffer, each
    contributing nelement() * element_size().
    """
    param_bytes = sum(p.nelement() * p.element_size() for p in model.parameters())
    buffer_bytes = sum(b.nelement() * b.element_size() for b in model.buffers())
    return param_bytes + buffer_bytes
63
+
64
def prepare_prompt(instruction: str) -> str:
    """Wrap *instruction* in the Llama-2 chat template.

    Llama-2 chat models expect "<s>[INST] instruction [/INST]" with the
    assistant's reply following the closing tag.
    """
    return "<s>[INST] " + instruction + " [/INST]"
68
+
69
def test_ai_integration_http() -> None:
    """End-to-end vGPU test: load "openai/gpt-oss-20b" through a
    transformers pipeline, move the model onto the vGPU, run one text
    generation, and log timing/memory statistics plus a per-stage
    pass/fail summary.
    """
    logger.info("Starting vGPU text generation test")

    # Per-stage outcome flags, reported in the summary at the end even
    # when an earlier stage raised.
    status = {
        'pipeline_loaded': False,
        'model_on_vgpu': False,
        'generation_complete': False,
        'cleanup_success': False
    }

    with gpu_context() as storage:
        try:
            # Baseline memory reading so later deltas measure only what
            # this test allocated. get_used_memory is optional on the
            # storage backend, hence the hasattr guards throughout.
            initial_mem = storage.get_used_memory() if hasattr(storage, 'get_used_memory') else 0
            # size_gb=None — presumably "unbounded"; verify against VirtualVRAM.
            vram = VirtualVRAM(size_gb=None, storage=storage)

            # Initialize the vGPU device
            device = setup_vgpu()
            logger.info(f"vGPU initialized with device {device}")

            model_id = "openai/gpt-oss-20b"
            logger.info(f"Loading {model_id}")

            try:
                # Silence transformers' own logging while the (noisy)
                # model download/load runs; restored in the finally below.
                transformers_logger = logging.getLogger("transformers")
                original_level = transformers_logger.level
                transformers_logger.setLevel(logging.ERROR)

                try:
                    # Build the text-generation pipeline on the default
                    # device; the model is moved to the vGPU afterwards.
                    pipe = pipeline(
                        "text-generation",
                        model=model_id,
                        torch_dtype=torch.float32,  # full precision
                        use_safetensors=True,
                        trust_remote_code=True
                    )
                    status["pipeline_loaded"] = True

                    # Relocate the model's weights into virtual VRAM and
                    # switch to inference mode.
                    pipe.model = to_vgpu(pipe.model, vram=vram)
                    pipe.model.eval()
                    status['model_on_vgpu'] = True

                    logger.info(f"Pipeline created with model: {model_id}")

                    # Report the model's parameter+buffer footprint.
                    model_size = get_model_size(pipe.model)
                    logger.info(f"Model loaded: {model_size/1e9:.2f} GB in parameters")
                    logger.info(f"Model architecture: {pipe.model.__class__.__name__}")

                    # Sample storage-side memory to confirm the weights
                    # actually landed on the vGPU backend.
                    with torch.device(device):
                        current_mem = storage.get_used_memory() if hasattr(storage, 'get_used_memory') else 0
                        logger.info(f"Model memory usage: {(current_mem - initial_mem)/1e9:.2f} GB")

                finally:
                    # Always restore transformers' original log level.
                    transformers_logger.setLevel(original_level)

            except Exception as e:
                logger.error(f"Model loading failed: {str(e)}")
                raise
            # NOTE(review): this second clause is unreachable — the
            # previous `except Exception` already catches everything.
            except Exception as e:
                logger.error(f"Model transfer to vGPU failed: {str(e)}")
                raise

            # Run a single text generation and time it.
            logger.info("Running text generation...")
            start = time.time()
            peak_mem = initial_mem

            try:
                prompt = "Explain how virtual GPUs work in simple terms."

                # Inference only — no autograd bookkeeping needed.
                with torch.no_grad():
                    outputs = pipe(
                        prompt,
                        max_new_tokens=256,
                        temperature=0.7,
                        top_p=0.95,
                        top_k=40,
                        num_beams=1,
                        do_sample=True,
                        return_full_text=True
                    )

                # Track the high-water mark of storage-side memory.
                if hasattr(storage, 'get_used_memory'):
                    peak_mem = max(peak_mem, storage.get_used_memory())

                inference_time = time.time() - start
                status['generation_complete'] = True

                # Performance metrics and the generated sample.
                logger.info(f"\nGeneration stats:")
                logger.info(f"- Time: {inference_time:.4f}s")
                logger.info(f"- Memory peak: {(peak_mem - initial_mem)/1e9:.2f} GB")
                logger.info(f"- Generated text: {outputs[0]['generated_text']}")

            except Exception as e:
                logger.error(f"Text generation failed: {str(e)}")
                raise

        except Exception as e:
            logger.error(f"Test failed: {str(e)}")
            raise
        finally:
            # Best-effort cleanup + status report; runs even on failure.
            try:
                if 'pipe' in locals():
                    del pipe
                if 'outputs' in locals():
                    del outputs
                # NOTE(review): hasattr(torch, 'cuda') is always True;
                # empty_cache() may raise without CUDA, but any error is
                # caught and logged just below.
                torch.cuda.empty_cache() if hasattr(torch, 'cuda') else None
                status['cleanup_success'] = True
            except Exception as e:
                logger.error(f"Cleanup error: {str(e)}")

            logger.info("\nTest Summary:")
            for key, value in status.items():
                logger.info(f"- {key}: {'✓' if value else '✗'}")

            # Anything still allocated beyond the baseline is a leak.
            final_mem = storage.get_used_memory() if hasattr(storage, 'get_used_memory') else 0
            if final_mem > initial_mem:
                logger.warning(f"Memory leak detected: {(final_mem - initial_mem)/1e6:.2f} MB")
200
+
201
+ if __name__ == "__main__":
202
+ test_ai_integration_http()