🔧 Complete Fix Guide for the Spaces GPU Issue
🎯 Problem Diagnosis: You're Exactly Right!
Root Cause Analysis
```python
# event_handlers.py — runs in the main process
class EventHandlers:
    def __init__(self):
        self.model_inference = ModelInference()  # ❌ instance created in the main process

# model_inference.py
class ModelInference:
    def __init__(self):
        self.model = None  # ❌ instance variable; sharing state across processes is problematic

    def initialize_model(self, device):
        if self.model is None:
            self.model = load_model()  # first call: loads in the subprocess
        else:
            self.model = self.model.to(device)  # second call: 💥 CUDA operation in the main process!
```
Why does the second call fail?

First call:
- `@spaces.GPU` runs the function in a subprocess
- `self.model is None` → the model is loaded
- `self.model` is stored on the instance
- On return, `prediction.gaussians` contains CUDA tensors
- Pickle tries to rebuild those CUDA tensors in the main process → 💥 (a minimal repro is sketched after this list)

Second call (even if the first one succeeded):
- A new subprocess is spawned, or the state is inconsistent
- The state of `self.model` is undefined
- A `.to(device)` call is attempted → 💥
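To make the pickle failure concrete, here is a minimal standalone repro (a sketch of the mechanism, not code from the app):

```python
# Minimal repro sketch: what happens to a @spaces.GPU return value.
# Assumes a CUDA device is available where dumps() runs.
import pickle

import torch

t = torch.zeros(3, device="cuda")  # created inside the GPU subprocess
payload = pickle.dumps(t)          # serialized for the trip back

# In the ZeroGPU main process, which must never initialize CUDA, this is
# where the crash happens: unpickling tries to place the tensor back on
# CUDA and triggers the initialization error.
restored = pickle.loads(payload)
```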
✅ Solution: Two Key Changes
Change 1: Cache the Model in a Global Variable (Avoid Instance State)
Why a global variable?
- `@spaces.GPU` runs in a fresh, isolated subprocess each time
- A global variable is safe inside that subprocess (see the sketch below)
- Nothing leaks back into the main process
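The isolation claim is easy to verify outside Spaces. Here is a standalone illustration using plain `multiprocessing` (my own sketch; `@spaces.GPU` manages its subprocess differently, but the per-process global namespace behaves the same way):

```python
# Illustration: module globals are per-process, so a model cached in a
# child process never leaks into the parent. Plain multiprocessing
# stands in for the subprocess that @spaces.GPU spawns.
import multiprocessing as mp

_MODEL_CACHE = None  # same pattern as the fix below

def load_in_child():
    global _MODEL_CACHE
    _MODEL_CACHE = "big model"  # stand-in for the real model load
    print("child sees:", _MODEL_CACHE)

if __name__ == "__main__":
    ctx = mp.get_context("spawn")  # fresh interpreter, as on Spaces
    p = ctx.Process(target=load_in_child)
    p.start()
    p.join()
    print("parent sees:", _MODEL_CACHE)  # still None: nothing leaked
```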
Change 2: Move All CUDA Tensors to CPU Before Returning
Why is this needed?
- When pickle serializes the return value, it tries to rebuild any CUDA tensors
- All returned data must therefore already live on the CPU
📝 Complete Fixed Code
File: depth_anything_3/app/modules/model_inference.py
"""
Model inference module for Depth Anything 3 Gradio app.
Modified for HF Spaces GPU compatibility.
"""
import gc
import glob
import os
from typing import Any, Dict, Optional, Tuple
import numpy as np
import torch
from depth_anything_3.api import DepthAnything3
from depth_anything_3.utils.export.glb import export_to_glb
from depth_anything_3.utils.export.gs import export_to_gs_video
# ========================================
# 🔑 关键修改 1:使用全局变量缓存模型
# ========================================
# Global cache for model (used in GPU subprocess)
# This is SAFE because @spaces.GPU runs in isolated subprocess
# Each subprocess gets its own copy of this global variable
_MODEL_CACHE = None
class ModelInference:
"""
Handles model inference and data processing for Depth Anything 3.
Modified for HF Spaces GPU compatibility - does NOT store state
in instance variables to avoid cross-process issues.
"""
def __init__(self):
"""Initialize the model inference handler.
Note: Do NOT store model in instance variable to avoid
state sharing issues with @spaces.GPU decorator.
"""
# No instance variables! All state in global or local variables
pass
def initialize_model(self, device: str = "cuda"):
"""
Initialize the DepthAnything3 model using global cache.
This uses a global variable which is safe because:
1. @spaces.GPU runs in isolated subprocess
2. Each subprocess has its own global namespace
3. No state leaks to main process
Args:
device: Device to load the model on
Returns:
Model instance ready for inference
"""
global _MODEL_CACHE
if _MODEL_CACHE is None:
# First time loading in this subprocess
model_dir = os.environ.get(
"DA3_MODEL_DIR", "depth-anything/DA3NESTED-GIANT-LARGE"
)
print(f"🔄 Loading model from {model_dir}...")
_MODEL_CACHE = DepthAnything3.from_pretrained(model_dir)
_MODEL_CACHE = _MODEL_CACHE.to(device)
_MODEL_CACHE.eval()
print("✅ Model loaded and ready on GPU")
else:
# Model already cached in this subprocess
print("✅ Using cached model")
# Ensure it's on the correct device (defensive programming)
_MODEL_CACHE = _MODEL_CACHE.to(device)
return _MODEL_CACHE
def run_inference(
self,
target_dir: str,
filter_black_bg: bool = False,
filter_white_bg: bool = False,
process_res_method: str = "upper_bound_resize",
show_camera: bool = True,
selected_first_frame: Optional[str] = None,
save_percentage: float = 30.0,
num_max_points: int = 1_000_000,
infer_gs: bool = False,
gs_trj_mode: str = "extend",
gs_video_quality: str = "high",
) -> Tuple[Any, Dict[int, Dict[str, Any]]]:
"""
Run DepthAnything3 model inference on images.
This method is wrapped with @spaces.GPU in app.py.
Args:
target_dir: Directory containing images
filter_black_bg: Whether to filter black background
filter_white_bg: Whether to filter white background
process_res_method: Method for resizing input images
show_camera: Whether to show camera in 3D view
selected_first_frame: Selected first frame filename
save_percentage: Percentage of points to save (0-100)
num_max_points: Maximum number of points
infer_gs: Whether to infer 3D Gaussian Splatting
gs_trj_mode: Trajectory mode for GS
gs_video_quality: Video quality for GS
Returns:
Tuple of (prediction, processed_data)
"""
print(f"Processing images from {target_dir}")
# Device check
device = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device)
print(f"Using device: {device}")
# 🔑 使用返回值,而不是 self.model
model = self.initialize_model(device)
# Get image paths
print("Loading images...")
image_folder_path = os.path.join(target_dir, "images")
all_image_paths = sorted(glob.glob(os.path.join(image_folder_path, "*")))
# Filter for image files
image_extensions = [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]
all_image_paths = [
path
for path in all_image_paths
if any(path.lower().endswith(ext) for ext in image_extensions)
]
print(f"Found {len(all_image_paths)} images")
# Apply first frame selection logic
if selected_first_frame:
selected_path = None
for path in all_image_paths:
if os.path.basename(path) == selected_first_frame:
selected_path = path
break
if selected_path:
image_paths = [selected_path] + [
path for path in all_image_paths if path != selected_path
]
print(f"User selected first frame: {selected_first_frame}")
else:
image_paths = all_image_paths
print(f"Selected frame not found, using default order")
else:
image_paths = all_image_paths
if len(image_paths) == 0:
raise ValueError("No images found. Check your upload.")
# Map UI options to actual method names
method_mapping = {"high_res": "lower_bound_resize", "low_res": "upper_bound_resize"}
actual_method = method_mapping.get(process_res_method, "upper_bound_crop")
# Run model inference
print(f"Running inference with method: {actual_method}")
with torch.no_grad():
# 🔑 使用局部变量 model,不是 self.model
prediction = model.inference(
image_paths, export_dir=None, process_res_method=actual_method, infer_gs=infer_gs
)
# Export to GLB
export_to_glb(
prediction,
filter_black_bg=filter_black_bg,
filter_white_bg=filter_white_bg,
export_dir=target_dir,
show_cameras=show_camera,
conf_thresh_percentile=save_percentage,
num_max_points=int(num_max_points),
)
# Export to GS video if needed
if infer_gs:
mode_mapping = {"extend": "extend", "smooth": "interpolate_smooth"}
print(f"GS mode: {gs_trj_mode}; Backend mode: {mode_mapping[gs_trj_mode]}")
export_to_gs_video(
prediction,
export_dir=target_dir,
chunk_size=4,
trj_mode=mode_mapping.get(gs_trj_mode, "extend"),
enable_tqdm=True,
vis_depth="hcat",
video_quality=gs_video_quality,
)
# Save predictions cache
self._save_predictions_cache(target_dir, prediction)
# Process results
processed_data = self._process_results(target_dir, prediction, image_paths)
# ========================================
# 🔑 关键修改 2:返回前移动所有 CUDA 张量到 CPU
# ========================================
print("Moving all tensors to CPU for safe return...")
prediction = self._move_prediction_to_cpu(prediction)
# Clean up GPU memory
torch.cuda.empty_cache()
return prediction, processed_data
def _move_prediction_to_cpu(self, prediction: Any) -> Any:
"""
Move all CUDA tensors in prediction to CPU for safe pickling.
This is CRITICAL for HF Spaces with @spaces.GPU decorator.
Without this, pickle will try to reconstruct CUDA tensors in
the main process, causing CUDA initialization error.
Args:
prediction: Prediction object that may contain CUDA tensors
Returns:
Prediction object with all tensors moved to CPU
"""
# Move gaussians tensors to CPU
if hasattr(prediction, 'gaussians') and prediction.gaussians is not None:
gaussians = prediction.gaussians
# Move each tensor attribute to CPU
tensor_attrs = ['means', 'scales', 'rotations', 'harmonics', 'opacities']
for attr in tensor_attrs:
if hasattr(gaussians, attr):
tensor = getattr(gaussians, attr)
if isinstance(tensor, torch.Tensor) and tensor.is_cuda:
setattr(gaussians, attr, tensor.cpu())
print(f" ✓ Moved gaussians.{attr} to CPU")
# Move any tensors in aux dict to CPU
if hasattr(prediction, 'aux') and prediction.aux is not None:
for key, value in list(prediction.aux.items()):
if isinstance(value, torch.Tensor) and value.is_cuda:
prediction.aux[key] = value.cpu()
print(f" ✓ Moved aux['{key}'] to CPU")
elif isinstance(value, dict):
# Recursively handle nested dicts
for k, v in list(value.items()):
if isinstance(v, torch.Tensor) and v.is_cuda:
value[k] = v.cpu()
print(f" ✓ Moved aux['{key}']['{k}'] to CPU")
print("✅ All tensors moved to CPU")
return prediction
def _save_predictions_cache(self, target_dir: str, prediction: Any) -> None:
"""Save predictions data to predictions.npz for caching."""
try:
output_file = os.path.join(target_dir, "predictions.npz")
save_dict = {}
if prediction.processed_images is not None:
save_dict["images"] = prediction.processed_images
if prediction.depth is not None:
save_dict["depths"] = np.round(prediction.depth, 6)
if prediction.conf is not None:
save_dict["conf"] = np.round(prediction.conf, 2)
if prediction.extrinsics is not None:
save_dict["extrinsics"] = prediction.extrinsics
if prediction.intrinsics is not None:
save_dict["intrinsics"] = prediction.intrinsics
np.savez_compressed(output_file, **save_dict)
print(f"Saved predictions cache to: {output_file}")
except Exception as e:
print(f"Warning: Failed to save predictions cache: {e}")
def _process_results(
self, target_dir: str, prediction: Any, image_paths: list
) -> Dict[int, Dict[str, Any]]:
"""Process model results into structured data."""
processed_data = {}
depth_vis_dir = os.path.join(target_dir, "depth_vis")
if os.path.exists(depth_vis_dir):
depth_files = sorted(glob.glob(os.path.join(depth_vis_dir, "*.jpg")))
for i, depth_file in enumerate(depth_files):
processed_image = None
if prediction.processed_images is not None and i < len(
prediction.processed_images
):
processed_image = prediction.processed_images[i]
processed_data[i] = {
"depth_image": depth_file,
"image": processed_image,
"original_image_path": image_paths[i] if i < len(image_paths) else None,
"depth": prediction.depth[i] if i < len(prediction.depth) else None,
"intrinsics": (
prediction.intrinsics[i]
if prediction.intrinsics is not None and i < len(prediction.intrinsics)
else None
),
"mask": None,
}
return processed_data
def cleanup(self) -> None:
"""Clean up GPU memory."""
if torch.cuda.is_available():
torch.cuda.empty_cache()
gc.collect()
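The docstring above notes that `run_inference` is wrapped with `@spaces.GPU` in app.py; that wiring is not shown in this guide. A minimal sketch of what it might look like (the function name and `duration` value are assumptions, not the project's actual code):

```python
# app.py — hypothetical wiring sketch, not the project's actual code.
import spaces

from depth_anything_3.app.modules.model_inference import ModelInference

inference = ModelInference()  # safe to create here: it holds no state

@spaces.GPU(duration=120)  # body runs in an isolated GPU subprocess
def run_inference_gpu(target_dir: str, **kwargs):
    # The return value is pickled back to the main process, which is why
    # run_inference moves every tensor to CPU before returning.
    return inference.run_inference(target_dir, **kwargs)
```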
🔍 Summary of Key Changes
Before (broken):
```python
class ModelInference:
    def __init__(self):
        self.model = None  # ❌ instance variable

    def initialize_model(self, device):
        if self.model is None:
            self.model = load_model()  # ❌ stored on the instance
        else:
            self.model = self.model.to(device)  # ❌ cross-process operation

    def run_inference(self):
        self.initialize_model(device)  # ❌ relies on instance state
        prediction = self.model.inference(...)  # ❌ uses the instance variable
        return prediction  # ❌ contains CUDA tensors
```
After (correct):
```python
_MODEL_CACHE = None  # ✅ global variable (subprocess-safe)

class ModelInference:
    def __init__(self):
        pass  # ✅ no instance variables

    def initialize_model(self, device):
        global _MODEL_CACHE
        if _MODEL_CACHE is None:
            _MODEL_CACHE = load_model()  # ✅ stored in the global
        return _MODEL_CACHE  # ✅ returned, not stored on the instance

    def run_inference(self):
        model = self.initialize_model(device)  # ✅ local variable
        prediction = model.inference(...)  # ✅ uses the local variable
        prediction = self._move_prediction_to_cpu(prediction)  # ✅ moved to CPU
        return prediction  # ✅ safe to return
```
🎯 Why These Changes Work
1. Global Variable vs. Instance Variable

| Approach | Outcome | Reason |
|---|---|---|
| `self.model` | ❌ state confusion across processes | the instance is created in the main process |
| `_MODEL_CACHE` | ✅ safe within the subprocess | each subprocess gets its own copy |
2. Return CPU Tensors

```python
# ❌ Returning directly raises the error
return prediction  # prediction.gaussians.means is on CUDA

# ✅ Move to CPU, then return
prediction = move_to_cpu(prediction)
return prediction  # All tensors are on CPU, pickle safe
```
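If the prediction object ever grows new tensor fields, the attribute list in `_move_prediction_to_cpu` can silently go stale. A fully recursive mover is a more defensive alternative (a sketch; the `to_cpu` name is illustrative, not part of the fix above):

```python
# Defensive alternative: recursively move every tensor in a nested
# structure of dicts, lists, and tuples to CPU.
import torch

def to_cpu(obj):
    if isinstance(obj, torch.Tensor):
        return obj.cpu() if obj.is_cuda else obj
    if isinstance(obj, dict):
        return {k: to_cpu(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [to_cpu(v) for v in obj]
    if isinstance(obj, tuple):
        return tuple(to_cpu(v) for v in obj)  # namedtuples become plain tuples
    return obj  # non-container, non-tensor values pass through unchanged
```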
🧪 Testing the Fix

```bash
# 1. Apply the changes
#    (copy the full code above into model_inference.py)

# 2. Push to Spaces
git add depth_anything_3/app/modules/model_inference.py
git commit -m "Fix: Spaces GPU CUDA initialization error"
git push

# 3. Test repeated runs: run inference 2-3 times in a row in the Space;
#    the CUDA error should no longer appear.
```
📊 Effect of the Fix

| Scenario | Before | After |
|---|---|---|
| First inference | ❌ CUDA error | ✅ works |
| Second inference | ❌ CUDA error | ✅ works |
| Repeated inference | ❌ fails | ✅ stable |
| Model loading | reloaded every time | cached and reused |
💡 Best Practices
For functions decorated with `@spaces.GPU`:
- ✅ Cache the model in a global variable (subprocess-safe)
- ✅ Never store the model in instance variables
- ✅ Move all tensors to CPU before returning
- ✅ Clean up GPU memory with `torch.cuda.empty_cache()`
- ❌ Never initialize CUDA in the main process (a quick check is sketched below)
- ❌ Never return CUDA tensors
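A quick way to catch accidental main-process CUDA use (my own sketch, using the standard `torch.cuda.is_initialized` check):

```python
# Sanity-check sketch: run in the main process after all imports.
import torch

# is_initialized() stays False as long as no CUDA context exists; if
# this fires, some import or module-level code touched the GPU outside
# the @spaces.GPU subprocess.
assert not torch.cuda.is_initialized(), "CUDA initialized in main process!"
```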