ricklon Claude Sonnet 4.6 committed on
Commit
19cbeef
·
1 Parent(s): b9d5e1c

Fix attn_impl fallback: use eager not sdpa on ZeroGPU

Browse files

DeepseekOCR2ForCausalLM only supports flash_attention_2 and eager;
sdpa raises ValueError. Fall back to eager when CUDA is unavailable
at module load time (ZeroGPU cold start).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +4 -4
app.py CHANGED
@@ -29,10 +29,10 @@ MODEL_NAME = 'deepseek-ai/DeepSeek-OCR-2'
29
 
30
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
31
  # flash_attention_2 requires a CUDA device at init time — not available on ZeroGPU at
32
- # module load. Use sdpa (PyTorch scaled dot product attention) as the fallback; it works
33
- # on CPU at load time and on GPU at inference time. Locally with CUDA present, use
34
- # flash_attention_2 for maximum throughput.
35
- _attn_impl = 'flash_attention_2' if torch.cuda.is_available() else 'sdpa'
36
  model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation=_attn_impl, torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True).eval()
37
  # .cuda() is NOT called here — on ZeroGPU, GPU is only available inside @spaces.GPU
38
  # functions. Locally, model.cuda() is called inside process_image on first run.
 
29
 
30
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
31
  # flash_attention_2 requires a CUDA device at init time — not available on ZeroGPU at
32
+ # module load. DeepseekOCR2 only supports 'flash_attention_2' and 'eager'; sdpa is not
33
+ # implemented for this model class. Fall back to 'eager' when no GPU is present.
34
+ # Locally with CUDA, flash_attention_2 is used for maximum throughput.
35
+ _attn_impl = 'flash_attention_2' if torch.cuda.is_available() else 'eager'
36
  model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation=_attn_impl, torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True).eval()
37
  # .cuda() is NOT called here — on ZeroGPU, GPU is only available inside @spaces.GPU
38
  # functions. Locally, model.cuda() is called inside process_image on first run.