songhieng commited on
Commit
9aa4daf
·
verified ·
1 Parent(s): 168a930

Update src/mlops/system_check.py

Browse files
Files changed (1) hide show
  1. src/mlops/system_check.py +245 -245
src/mlops/system_check.py CHANGED
@@ -1,245 +1,245 @@
1
- """
2
- System Prerequisites Checker Module
3
-
4
- This module provides functionality to check system prerequisites including:
5
- - CUDA/GPU availability
6
- - Environment dependencies
7
- - Model download with progress tracking
8
- """
9
-
10
- import os
11
- import sys
12
- import torch
13
- import platform
14
- from typing import Dict, Tuple, Optional
15
- from pathlib import Path
16
- import importlib.metadata
17
- from huggingface_hub import hf_hub_download, snapshot_download
18
- from tqdm import tqdm
19
-
20
-
21
- class SystemChecker:
22
- """Check system prerequisites for MLOps platform."""
23
-
24
- def __init__(self, models_dir: str = "models"):
25
- """
26
- Initialize system checker.
27
-
28
- Args:
29
- models_dir: Directory to store downloaded models
30
- """
31
- self.models_dir = Path(models_dir)
32
- self.models_dir.mkdir(parents=True, exist_ok=True)
33
-
34
- def check_cuda(self) -> Dict[str, any]:
35
- """
36
- Check CUDA/GPU availability and information.
37
-
38
- Returns:
39
- Dict with CUDA status, device info, and specifications
40
- """
41
- result = {
42
- "available": torch.cuda.is_available(),
43
- "device_count": 0,
44
- "devices": [],
45
- "cuda_version": None,
46
- "cudnn_version": None
47
- }
48
-
49
- if result["available"]:
50
- result["device_count"] = torch.cuda.device_count()
51
- result["cuda_version"] = torch.version.cuda
52
- result["cudnn_version"] = torch.backends.cudnn.version()
53
-
54
- for i in range(result["device_count"]):
55
- device_props = {
56
- "id": i,
57
- "name": torch.cuda.get_device_name(i),
58
- "memory_total": torch.cuda.get_device_properties(i).total_memory / 1024**3, # GB
59
- "compute_capability": f"{torch.cuda.get_device_properties(i).major}.{torch.cuda.get_device_properties(i).minor}"
60
- }
61
- result["devices"].append(device_props)
62
-
63
- return result
64
-
65
- def check_environment(self) -> Dict[str, any]:
66
- """
67
- Check Python environment and required dependencies.
68
-
69
- Returns:
70
- Dict with Python version, package versions, and system info
71
- """
72
- result = {
73
- "python_version": sys.version,
74
- "platform": platform.platform(),
75
- "architecture": platform.machine(),
76
- "packages": {},
77
- "missing_packages": [],
78
- "all_satisfied": True
79
- }
80
-
81
- # Required packages with minimum versions
82
- required_packages = {
83
- "torch": "2.0.0",
84
- "transformers": "4.36.0",
85
- "streamlit": "1.28.0",
86
- "pandas": "2.0.0",
87
- "numpy": "1.24.0",
88
- "plotly": "5.18.0",
89
- "scikit-learn": "1.3.0"
90
- }
91
-
92
- for package, min_version in required_packages.items():
93
- try:
94
- version = importlib.metadata.version(package)
95
- result["packages"][package] = {
96
- "installed": version,
97
- "required": f">={min_version}",
98
- "satisfied": True # Simple check, could add version comparison
99
- }
100
- except importlib.metadata.PackageNotFoundError:
101
- result["packages"][package] = {
102
- "installed": None,
103
- "required": f">={min_version}",
104
- "satisfied": False
105
- }
106
- result["missing_packages"].append(package)
107
- result["all_satisfied"] = False
108
-
109
- return result
110
-
111
- def download_model(
112
- self,
113
- model_name: str,
114
- progress_callback: Optional[callable] = None
115
- ) -> Tuple[bool, str, str]:
116
- """
117
- Download model from HuggingFace Hub to local cache.
118
-
119
- Args:
120
- model_name: HuggingFace model identifier (e.g., "roberta-base")
121
- progress_callback: Optional callback function for progress updates
122
-
123
- Returns:
124
- Tuple of (success: bool, model_path: str, message: str)
125
- """
126
- try:
127
- model_cache_path = self.models_dir / model_name.replace("/", "_")
128
-
129
- # Check if model already exists
130
- if model_cache_path.exists() and any(model_cache_path.iterdir()):
131
- return True, str(model_cache_path), f"Model '{model_name}' already exists in cache"
132
-
133
- # Download model
134
- if progress_callback:
135
- progress_callback(f"Downloading {model_name}...", 0.1)
136
-
137
- # Use snapshot_download to get all model files
138
- cache_dir = snapshot_download(
139
- repo_id=model_name,
140
- cache_dir=str(self.models_dir),
141
- local_dir=str(model_cache_path),
142
- local_dir_use_symlinks=False
143
- )
144
-
145
- if progress_callback:
146
- progress_callback(f"Downloaded {model_name} successfully", 1.0)
147
-
148
- return True, str(model_cache_path), f"Model '{model_name}' downloaded successfully"
149
-
150
- except Exception as e:
151
- error_msg = f"Failed to download model '{model_name}': {str(e)}"
152
- if progress_callback:
153
- progress_callback(error_msg, 0.0)
154
- return False, "", error_msg
155
-
156
- def get_model_info(self, model_name: str) -> Dict[str, any]:
157
- """
158
- Get information about a model (local or remote).
159
-
160
- Args:
161
- model_name: Model identifier
162
-
163
- Returns:
164
- Dict with model information
165
- """
166
- model_cache_path = self.models_dir / model_name.replace("/", "_")
167
-
168
- info = {
169
- "name": model_name,
170
- "local_path": str(model_cache_path),
171
- "exists_locally": model_cache_path.exists() and any(model_cache_path.iterdir()),
172
- "size_mb": 0
173
- }
174
-
175
- if info["exists_locally"]:
176
- # Calculate total size
177
- total_size = sum(
178
- f.stat().st_size
179
- for f in model_cache_path.rglob('*')
180
- if f.is_file()
181
- )
182
- info["size_mb"] = total_size / (1024 * 1024)
183
-
184
- return info
185
-
186
-
187
- def format_bytes(bytes_size: float) -> str:
188
- """Format bytes to human-readable string."""
189
- for unit in ['B', 'KB', 'MB', 'GB']:
190
- if bytes_size < 1024.0:
191
- return f"{bytes_size:.2f} {unit}"
192
- bytes_size /= 1024.0
193
- return f"{bytes_size:.2f} TB"
194
-
195
-
196
- def get_system_summary() -> str:
197
- """Get a formatted summary of system capabilities."""
198
- checker = SystemChecker()
199
-
200
- cuda_info = checker.check_cuda()
201
- env_info = checker.check_environment()
202
-
203
- summary = []
204
- summary.append("=" * 60)
205
- summary.append("SYSTEM SUMMARY")
206
- summary.append("=" * 60)
207
-
208
- # Python & Platform
209
- summary.append(f"\n🐍 Python: {env_info['python_version'].split()[0]}")
210
- summary.append(f"💻 Platform: {env_info['platform']}")
211
- summary.append(f"🏗️ Architecture: {env_info['architecture']}")
212
-
213
- # CUDA
214
- summary.append(f"\n🎮 CUDA Available: {'Yes' if cuda_info['available'] else 'No (CPU only)'}")
215
- if cuda_info['available']:
216
- summary.append(f"📊 CUDA Version: {cuda_info['cuda_version']}")
217
- summary.append(f"🔢 Number of GPUs: {cuda_info['device_count']}")
218
- for device in cuda_info['devices']:
219
- summary.append(f" - GPU {device['id']}: {device['name']}")
220
- summary.append(f" Memory: {device['memory_total']:.2f} GB")
221
- summary.append(f" Compute: {device['compute_capability']}")
222
-
223
- # Packages
224
- summary.append(f"\n📦 Required Packages: {'✅ All Satisfied' if env_info['all_satisfied'] else '⚠️ Missing Packages'}")
225
- if env_info['missing_packages']:
226
- summary.append(f" Missing: {', '.join(env_info['missing_packages'])}")
227
-
228
- summary.append("=" * 60)
229
-
230
- return "\n".join(summary)
231
-
232
-
233
- if __name__ == "__main__":
234
- # Test the system checker
235
- print(get_system_summary())
236
-
237
- checker = SystemChecker()
238
-
239
- # Test model download (small model for testing)
240
- print("\n\nTesting model download...")
241
- success, path, msg = checker.download_model(
242
- "distilbert-base-uncased",
243
- progress_callback=lambda msg, progress: print(f"Progress: {progress*100:.0f}% - {msg}")
244
- )
245
- print(f"Result: {msg}")
 
1
+ """
2
+ System Prerequisites Checker Module
3
+
4
+ This module provides functionality to check system prerequisites including:
5
+ - CUDA/GPU availability
6
+ - Environment dependencies
7
+ - Model download with progress tracking
8
+ """
9
+
10
import importlib.metadata
import os
import platform
import sys
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

import torch
from huggingface_hub import hf_hub_download, snapshot_download
from tqdm import tqdm
19
+
20
+
21
class SystemChecker:
    """Check system prerequisites for the MLOps platform.

    Provides CUDA/GPU detection, Python environment verification, and
    HuggingFace Hub model download/caching helpers.
    """

    def __init__(self, models_dir: str = "models"):
        """
        Initialize system checker.

        Args:
            models_dir: Directory to store downloaded models.
        """
        self.models_dir = Path(models_dir)
        # Create the cache directory up front so later downloads never fail
        # on a missing parent directory.
        self.models_dir.mkdir(parents=True, exist_ok=True)

    def check_cuda(self) -> Dict[str, Any]:
        """
        Check CUDA/GPU availability and information.

        Returns:
            Dict with keys "available", "device_count", "devices",
            "cuda_version", "cudnn_version". Each entry in "devices" carries
            the device id, name, total memory in GB, and compute capability.
        """
        result: Dict[str, Any] = {
            "available": torch.cuda.is_available(),
            "device_count": 0,
            "devices": [],
            "cuda_version": None,
            "cudnn_version": None,
        }

        if result["available"]:
            result["device_count"] = torch.cuda.device_count()
            result["cuda_version"] = torch.version.cuda
            result["cudnn_version"] = torch.backends.cudnn.version()

            for i in range(result["device_count"]):
                # Fetch the properties struct once per device instead of
                # re-querying it for every field.
                props = torch.cuda.get_device_properties(i)
                result["devices"].append({
                    "id": i,
                    "name": torch.cuda.get_device_name(i),
                    "memory_total": props.total_memory / 1024**3,  # bytes -> GB
                    "compute_capability": f"{props.major}.{props.minor}",
                })

        return result

    @staticmethod
    def _meets_minimum(installed: str, minimum: str) -> bool:
        """Return True if *installed* >= *minimum*, comparing dotted versions numerically.

        Non-numeric segments (e.g. the "0rc1" in "2.1.0rc1") end the comparison
        tuple; if either version yields no numeric parts at all, the check
        conservatively passes.
        """
        def parse(version: str) -> tuple:
            parts = []
            for piece in version.split("."):
                if piece.isdigit():
                    parts.append(int(piece))
                else:
                    # Stop at the first non-numeric segment ("2.1.0rc1" -> (2, 1)).
                    break
            return tuple(parts)

        inst, req = parse(installed), parse(minimum)
        if not inst or not req:
            return True  # unparseable version string: assume satisfied
        # Zero-pad so "2.0" compares equal to "2.0.0" rather than less-than.
        width = max(len(inst), len(req))
        inst += (0,) * (width - len(inst))
        req += (0,) * (width - len(req))
        return inst >= req

    def check_environment(self) -> Dict[str, Any]:
        """
        Check Python environment and required dependencies.

        Returns:
            Dict with Python version, platform info, per-package status,
            the list of missing packages, and an "all_satisfied" flag that is
            False when any package is missing or older than required.
        """
        result: Dict[str, Any] = {
            "python_version": sys.version,
            "platform": platform.platform(),
            "architecture": platform.machine(),
            "packages": {},
            "missing_packages": [],
            "all_satisfied": True,
        }

        # Required packages with minimum versions.
        required_packages = {
            "torch": "2.0.0",
            "transformers": "4.36.0",
            "streamlit": "1.28.0",
            "pandas": "2.0.0",
            "numpy": "1.24.0",
            "plotly": "5.18.0",
            "scikit-learn": "1.3.0",
        }

        for package, min_version in required_packages.items():
            try:
                version = importlib.metadata.version(package)
            except importlib.metadata.PackageNotFoundError:
                result["packages"][package] = {
                    "installed": None,
                    "required": f">={min_version}",
                    "satisfied": False,
                }
                result["missing_packages"].append(package)
                result["all_satisfied"] = False
                continue

            # Actual minimum-version comparison (previously hard-coded True).
            satisfied = self._meets_minimum(version, min_version)
            result["packages"][package] = {
                "installed": version,
                "required": f">={min_version}",
                "satisfied": satisfied,
            }
            if not satisfied:
                result["all_satisfied"] = False

        return result

    def download_model(
        self,
        model_name: str,
        progress_callback: Optional[callable] = None
    ) -> Tuple[bool, str, str]:
        """
        Download model from HuggingFace Hub to local cache.

        Args:
            model_name: HuggingFace model identifier (e.g., "roberta-base")
            progress_callback: Optional callback taking (message: str,
                progress: float in [0, 1]) for progress updates.

        Returns:
            Tuple of (success: bool, model_path: str, message: str). On
            failure the path is "" and the message holds the error text.
        """
        try:
            model_cache_path = self.models_dir / model_name.replace("/", "_")

            # Skip the download entirely if the model is already cached
            # (directory exists and is non-empty).
            if model_cache_path.exists() and any(model_cache_path.iterdir()):
                return True, str(model_cache_path), f"Model '{model_name}' already exists in cache"

            if progress_callback:
                progress_callback(f"Downloading {model_name}...", 0.1)

            # snapshot_download fetches all files of the repo snapshot.
            # NOTE(review): local_dir_use_symlinks is deprecated in newer
            # huggingface_hub releases (ignored with a warning) — confirm the
            # pinned hub version before removing it.
            snapshot_download(
                repo_id=model_name,
                cache_dir=str(self.models_dir),
                local_dir=str(model_cache_path),
                local_dir_use_symlinks=False
            )

            if progress_callback:
                progress_callback(f"Downloaded {model_name} successfully", 1.0)

            return True, str(model_cache_path), f"Model '{model_name}' downloaded successfully"

        except Exception as e:
            # Broad catch is deliberate: this is a user-facing boundary and
            # any hub/network/filesystem error should become a message.
            error_msg = f"Failed to download model '{model_name}': {str(e)}"
            if progress_callback:
                progress_callback(error_msg, 0.0)
            return False, "", error_msg

    def get_model_info(self, model_name: str) -> Dict[str, Any]:
        """
        Get information about a model (local or remote).

        Args:
            model_name: Model identifier.

        Returns:
            Dict with the model name, its expected local cache path, whether
            it exists locally (non-empty directory), and its on-disk size
            in MB (0 when not cached).
        """
        model_cache_path = self.models_dir / model_name.replace("/", "_")

        info: Dict[str, Any] = {
            "name": model_name,
            "local_path": str(model_cache_path),
            "exists_locally": model_cache_path.exists() and any(model_cache_path.iterdir()),
            "size_mb": 0,
        }

        if info["exists_locally"]:
            # Sum every regular file under the cache directory, recursively.
            total_size = sum(
                f.stat().st_size
                for f in model_cache_path.rglob('*')
                if f.is_file()
            )
            info["size_mb"] = total_size / (1024 * 1024)

        return info
185
+
186
+
187
def format_bytes(bytes_size: float) -> str:
    """Convert a raw byte count into a human-readable string (e.g. "2.00 KB")."""
    size = bytes_size
    # Walk up the unit ladder, dividing by 1024 until the value fits.
    for unit in ('B', 'KB', 'MB', 'GB'):
        if size < 1024.0:
            return f"{size:.2f} {unit}"
        size /= 1024.0
    # Anything past GB is reported in terabytes.
    return f"{size:.2f} TB"
194
+
195
+
196
def get_system_summary() -> str:
    """Build a multi-line, human-readable summary of system capabilities."""
    checker = SystemChecker()
    cuda_info = checker.check_cuda()
    env_info = checker.check_environment()

    divider = "=" * 60
    lines = [divider, "SYSTEM SUMMARY", divider]

    # Python & Platform
    lines.append(f"\nPython: {env_info['python_version'].split()[0]}")
    lines.append(f"Platform: {env_info['platform']}")
    lines.append(f"Architecture: {env_info['architecture']}")

    # CUDA
    cuda_status = 'Yes' if cuda_info['available'] else 'No (CPU only)'
    lines.append(f"\nCUDA Available: {cuda_status}")
    if cuda_info['available']:
        lines.append(f"CUDA Version: {cuda_info['cuda_version']}")
        lines.append(f"Number of GPUs: {cuda_info['device_count']}")
        for device in cuda_info['devices']:
            lines.append(f" - GPU {device['id']}: {device['name']}")
            lines.append(f" Memory: {device['memory_total']:.2f} GB")
            lines.append(f" Compute: {device['compute_capability']}")

    # Packages
    pkg_status = '✅ All Satisfied' if env_info['all_satisfied'] else '⚠️ Missing Packages'
    lines.append(f"\n📦 Required Packages: {pkg_status}")
    if env_info['missing_packages']:
        lines.append(f" Missing: {', '.join(env_info['missing_packages'])}")

    lines.append(divider)
    return "\n".join(lines)
231
+
232
+
233
if __name__ == "__main__":
    # Command-line smoke test: print the environment summary, then exercise
    # the model-download path with a small, well-known model.
    print(get_system_summary())

    checker = SystemChecker()

    print("\n\nTesting model download...")
    ok, model_path, msg = checker.download_model(
        "distilbert-base-uncased",
        progress_callback=lambda msg, progress: print(f"Progress: {progress*100:.0f}% - {msg}"),
    )
    print(f"Result: {msg}")