DeepSeek-OCR-2

@@ -901,41 +901,32 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
-        if not eval_mode:
-            streamer = NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
-            with torch.autocast("cuda", dtype=torch.bfloat16):
-                with torch.no_grad():
-                    output_ids = self.generate(
-                        input_ids.unsqueeze(0).cuda(),
-                        images=[(images_crop.cuda(), images_ori.cuda())],
-                        images_seq_mask = images_seq_mask.unsqueeze(0).cuda(),
-                        images_spatial_crop = images_spatial_crop,
-                        # do_sample=False,
-                        # num_beams = 1,
-                        temperature=0.0,
-                        eos_token_id=tokenizer.eos_token_id,
-                        streamer=streamer,
-                        max_new_tokens=8192,
-                        no_repeat_ngram_size = 20,
-                        use_cache = True
-                        )
         else:
-            with torch.autocast("cuda", dtype=torch.bfloat16):
-                with torch.no_grad():
-                    output_ids = self.generate(
-                        input_ids.unsqueeze(0).cuda(),
-                        images=[(images_crop.cuda(), images_ori.cuda())],
-                        images_seq_mask = images_seq_mask.unsqueeze(0).cuda(),
-                        images_spatial_crop = images_spatial_crop,
-                        # do_sample=False,
-                        # num_beams = 1,
-                        temperature=0.0,
-                        eos_token_id=tokenizer.eos_token_id,
-                        max_new_tokens=8192,
-                        no_repeat_ngram_size = 35,
-                        use_cache = True
-                        )
         if '<image>' in conversation[0]['content'] and eval_mode:

+        # Select the device (MPS, then CUDA, then CPU); image dtype is float16 on MPS, float32 otherwise
+        if torch.backends.mps.is_available():
+            device = torch.device("mps")
+        elif torch.cuda.is_available():
+            device = torch.device("cuda")
         else:
+            device = torch.device("cpu")
+        dtype = torch.float16 if device.type == "mps" else torch.float32
+        # Generate with gradients disabled; a streamer is attached only when eval_mode is false
+        streamer = NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False) if not eval_mode else None
+        with torch.no_grad():
+            output_ids = self.generate(
+                input_ids.unsqueeze(0).to(device=device, dtype=torch.long),
+                images=[(images_crop.to(device=device, dtype=dtype),
+                         images_ori.to(device=device, dtype=dtype))],
+                images_seq_mask=images_seq_mask.unsqueeze(0).to(device=device, dtype=dtype),
+                images_spatial_crop=images_spatial_crop,
+                temperature=0.0,
+                eos_token_id=tokenizer.eos_token_id,
+                streamer=streamer,
+                max_new_tokens=8192,
+                no_repeat_ngram_size=20 if not eval_mode else 35,
+                use_cache=True
+            )
         if '<image>' in conversation[0]['content'] and eval_mode: