DocUA committed on
Commit
4f43939
·
1 Parent(s): 002d07a

feat: Implement CUDA BF16 error handling with automatic fallback to CPU for model inference and generation.

Browse files
Files changed (1) hide show
  1. app_hf.py +73 -29
app_hf.py CHANGED
@@ -186,11 +186,16 @@ def run_ocr(input_image, input_file, model_choice, custom_prompt):
186
  else:
187
  return "Будь ласка, завантажте зображення або PDF файл."
188
 
 
 
 
 
189
  try:
190
  model, processor_or_tokenizer = manager.get_model(model_choice)
191
  # Move to GPU only inside the decorated function
192
  print(f"Moving {model_choice} to GPU...")
193
  model.to(device="cuda", dtype=torch.float16)
 
194
  except Exception as e:
195
  return f"Помилка завантаження чи переміщення моделі: {str(e)}\nЯкщо це MedGemma, переконайтеся, що ви надали HF_TOKEN."
196
 
@@ -200,11 +205,10 @@ def run_ocr(input_image, input_file, model_choice, custom_prompt):
200
  all_results = []
201
 
202
  try:
203
- _autocast_ctx = (
204
- torch.autocast(device_type="cuda", dtype=torch.float16)
205
- if torch.cuda.is_available()
206
- else contextlib.nullcontext()
207
- )
208
 
209
  for i, img in enumerate(images_to_process):
210
  img = img.convert("RGB")
@@ -216,18 +220,38 @@ def run_ocr(input_image, input_file, model_choice, custom_prompt):
216
  tmp_path = tmp.name
217
 
218
  try:
219
- with torch.no_grad(), _autocast_ctx:
220
- res = model.infer(
221
- processor_or_tokenizer,
222
- prompt=custom_prompt if custom_prompt else "<image>\nFree OCR. ",
223
- image_file=tmp_path,
224
- output_path=output_dir,
225
- base_size=1024,
226
- image_size=768,
227
- crop_mode=True,
228
- eval_mode=True
229
- )
230
- all_results.append(f"--- Page/Image {i+1} ---\n{res}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  finally:
232
  if os.path.exists(tmp_path):
233
  os.remove(tmp_path)
@@ -250,22 +274,42 @@ def run_ocr(input_image, input_file, model_choice, custom_prompt):
250
  tokenize=True,
251
  return_dict=True,
252
  return_tensors="pt"
253
- ).to("cuda") # Ensure inputs are on cuda
254
 
255
  if "attention_mask" not in inputs:
256
  inputs["attention_mask"] = torch.ones_like(inputs["input_ids"], dtype=torch.long)
257
 
258
- with torch.no_grad(), _autocast_ctx:
259
- output = model.generate(
260
- **inputs,
261
- max_new_tokens=4096,
262
- do_sample=False,
263
- pad_token_id=processor_or_tokenizer.tokenizer.pad_token_id,
264
- )
265
-
266
- input_len = inputs["input_ids"].shape[-1]
267
- res = processor_or_tokenizer.decode(output[0][input_len:], skip_special_tokens=True)
268
- all_results.append(f"--- Page/Image {i+1} ---\n{res}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
 
270
  except Exception as e:
271
  all_results.append(f"--- Page/Image {i+1} ---\nПомилка: {str(e)}")
 
186
  else:
187
  return "Будь ласка, завантажте зображення або PDF файл."
188
 
189
+ def _is_cuda_bf16_error(err):
190
+ msg = str(err)
191
+ return "CUBLAS_STATUS_INVALID_VALUE" in msg and "CUDA_R_16BF" in msg
192
+
193
  try:
194
  model, processor_or_tokenizer = manager.get_model(model_choice)
195
  # Move to GPU only inside the decorated function
196
  print(f"Moving {model_choice} to GPU...")
197
  model.to(device="cuda", dtype=torch.float16)
198
+ run_device = "cuda"
199
  except Exception as e:
200
  return f"Помилка завантаження чи переміщення моделі: {str(e)}\nЯкщо це MedGemma, переконайтеся, що ви надали HF_TOKEN."
201
 
 
205
  all_results = []
206
 
207
  try:
208
+ def _autocast_for(device_str):
209
+ if device_str == "cuda" and torch.cuda.is_available():
210
+ return torch.autocast(device_type="cuda", dtype=torch.float16)
211
+ return contextlib.nullcontext()
 
212
 
213
  for i, img in enumerate(images_to_process):
214
  img = img.convert("RGB")
 
220
  tmp_path = tmp.name
221
 
222
  try:
223
+ try:
224
+ with torch.no_grad(), _autocast_for(run_device):
225
+ res = model.infer(
226
+ processor_or_tokenizer,
227
+ prompt=custom_prompt if custom_prompt else "<image>\nFree OCR. ",
228
+ image_file=tmp_path,
229
+ output_path=output_dir,
230
+ base_size=1024,
231
+ image_size=768,
232
+ crop_mode=True,
233
+ eval_mode=True
234
+ )
235
+ all_results.append(f"--- Page/Image {i+1} ---\n{res}")
236
+ except Exception as e:
237
+ if run_device == "cuda" and _is_cuda_bf16_error(e):
238
+ print("CUDA BF16 error detected, retrying on CPU...")
239
+ model.to(device="cpu", dtype=torch.float32)
240
+ run_device = "cpu"
241
+ with torch.no_grad(), _autocast_for(run_device):
242
+ res = model.infer(
243
+ processor_or_tokenizer,
244
+ prompt=custom_prompt if custom_prompt else "<image>\nFree OCR. ",
245
+ image_file=tmp_path,
246
+ output_path=output_dir,
247
+ base_size=1024,
248
+ image_size=768,
249
+ crop_mode=True,
250
+ eval_mode=True
251
+ )
252
+ all_results.append(f"--- Page/Image {i+1} ---\n{res}")
253
+ else:
254
+ raise
255
  finally:
256
  if os.path.exists(tmp_path):
257
  os.remove(tmp_path)
 
274
  tokenize=True,
275
  return_dict=True,
276
  return_tensors="pt"
277
+ ).to(run_device)
278
 
279
  if "attention_mask" not in inputs:
280
  inputs["attention_mask"] = torch.ones_like(inputs["input_ids"], dtype=torch.long)
281
 
282
+ try:
283
+ with torch.no_grad(), _autocast_for(run_device):
284
+ output = model.generate(
285
+ **inputs,
286
+ max_new_tokens=4096,
287
+ do_sample=False,
288
+ pad_token_id=processor_or_tokenizer.tokenizer.pad_token_id,
289
+ )
290
+
291
+ input_len = inputs["input_ids"].shape[-1]
292
+ res = processor_or_tokenizer.decode(output[0][input_len:], skip_special_tokens=True)
293
+ all_results.append(f"--- Page/Image {i+1} ---\n{res}")
294
+ except Exception as e:
295
+ if run_device == "cuda" and _is_cuda_bf16_error(e):
296
+ print("CUDA BF16 error detected, retrying on CPU...")
297
+ model.to(device="cpu", dtype=torch.float32)
298
+ run_device = "cpu"
299
+ inputs = inputs.to(run_device)
300
+ with torch.no_grad(), _autocast_for(run_device):
301
+ output = model.generate(
302
+ **inputs,
303
+ max_new_tokens=4096,
304
+ do_sample=False,
305
+ pad_token_id=processor_or_tokenizer.tokenizer.pad_token_id,
306
+ )
307
+
308
+ input_len = inputs["input_ids"].shape[-1]
309
+ res = processor_or_tokenizer.decode(output[0][input_len:], skip_special_tokens=True)
310
+ all_results.append(f"--- Page/Image {i+1} ---\n{res}")
311
+ else:
312
+ raise
313
 
314
  except Exception as e:
315
  all_results.append(f"--- Page/Image {i+1} ---\nПомилка: {str(e)}")