feat: Improve Hugging Face cache management and enable mixed-precision inference for GPU models.
app_hf.py CHANGED

@@ -19,6 +19,8 @@ import datetime
 import fitz  # PyMuPDF
 import io
 import gc
+import threading
+import contextlib
 
 try:
     from transformers.models.llama import modeling_llama as _modeling_llama
@@ -52,6 +54,29 @@ warnings.filterwarnings("ignore", message="You are using a model of type .* to i
 DEEPSEEK_MODEL = 'deepseek-ai/DeepSeek-OCR-2'
 MEDGEMMA_MODEL = 'google/medgemma-1.5-4b-it'
 
+_default_hf_home = "/data/.huggingface" if os.path.isdir("/data") else os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
+os.environ.setdefault("HF_HOME", _default_hf_home)
+_hf_cache_dir = os.environ.get("HF_HUB_CACHE") or os.path.join(os.environ["HF_HOME"], "hub")
+os.environ.setdefault("HF_HUB_CACHE", _hf_cache_dir)
+os.environ.setdefault("TRANSFORMERS_CACHE", _hf_cache_dir)
+
+
+def _warmup_hf_cache():
+    try:
+        from huggingface_hub import snapshot_download
+    except Exception as e:
+        print(f"Warmup cache failed: {e}")
+        return
+
+    for _repo_id in (DEEPSEEK_MODEL, MEDGEMMA_MODEL):
+        try:
+            snapshot_download(repo_id=_repo_id, cache_dir=_hf_cache_dir)
+        except Exception as e:
+            print(f"Warmup cache failed for {_repo_id}: {e}")
+
+
+threading.Thread(target=_warmup_hf_cache, daemon=True).start()
+
 # --- Device Setup ---
 # For HF Spaces with ZeroGPU, we'll use cuda if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -66,12 +91,13 @@ class ModelManager:
         if model_name not in self.models:
             print(f"Loading {model_name} to CPU...")
             if model_name == DEEPSEEK_MODEL:
-                tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+                tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, cache_dir=_hf_cache_dir)
                 model = AutoModel.from_pretrained(
                     model_name,
                     trust_remote_code=True,
                     use_safetensors=True,
                     attn_implementation="eager",
+                    cache_dir=_hf_cache_dir,
                     torch_dtype=dtype
                 )
                 model.eval()
@@ -79,10 +105,11 @@ class ModelManager:
                 self.processors[model_name] = tokenizer
 
             elif model_name == MEDGEMMA_MODEL:
-                processor = AutoProcessor.from_pretrained(model_name)
+                processor = AutoProcessor.from_pretrained(model_name, cache_dir=_hf_cache_dir)
                 model = AutoModelForImageTextToText.from_pretrained(
                     model_name,
                     trust_remote_code=True,
+                    cache_dir=_hf_cache_dir,
                     torch_dtype=dtype
                 )
                 model.eval()
@@ -134,7 +161,7 @@ def run_ocr(input_image, input_file, model_choice, custom_prompt):
         model, processor_or_tokenizer = manager.get_model(model_choice)
         # Move to GPU only inside the decorated function
         print(f"Moving {model_choice} to GPU...")
-        model.to("cuda")
+        model.to(device="cuda", dtype=torch.float16)
     except Exception as e:
return f"Помилка завантаження чи переміщення моделі: {str(e)}\nЯкщо це MedGemma, переконайтеся, що ви надали HF_TOKEN."
 
@@ -144,6 +171,12 @@ def run_ocr(input_image, input_file, model_choice, custom_prompt):
     all_results = []
 
     try:
+        _autocast_ctx = (
+            torch.autocast(device_type="cuda", dtype=torch.float16)
+            if torch.cuda.is_available()
+            else contextlib.nullcontext()
+        )
+
         for i, img in enumerate(images_to_process):
             img = img.convert("RGB")
             try:
@@ -154,7 +187,7 @@ def run_ocr(input_image, input_file, model_choice, custom_prompt):
                     tmp_path = tmp.name
 
                 try:
-                    with torch.no_grad():
+                    with torch.no_grad(), _autocast_ctx:
                         res = model.infer(
                             processor_or_tokenizer,
                             prompt=custom_prompt if custom_prompt else "<image>\nFree OCR. ",
@@ -190,7 +223,7 @@ def run_ocr(input_image, input_file, model_choice, custom_prompt):
                     return_tensors="pt"
                 ).to("cuda")  # Ensure inputs are on cuda
 
-                with torch.no_grad():
+                with torch.no_grad(), _autocast_ctx:
                     output = model.generate(**inputs, max_new_tokens=4096, do_sample=False)
 
                 input_len = inputs["input_ids"].shape[-1]
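
A note on the cache setup above: os.environ.setdefault is used so that values already configured in the environment (for example, variables set in the Space settings) win over the computed defaults, and /data/.huggingface is only chosen when the Space has persistent storage mounted at /data. A minimal illustration of that precedence, separate from the app code:

import os

os.environ["HF_HOME"] = "/data/.huggingface"       # pre-set by the platform
os.environ.setdefault("HF_HOME", "/tmp/fallback")  # no effect: key already exists
assert os.environ["HF_HOME"] == "/data/.huggingface"

os.environ.pop("HF_HUB_CACHE", None)                             # not set
os.environ.setdefault("HF_HUB_CACHE", "/data/.huggingface/hub")  # default applied
assert os.environ["HF_HUB_CACHE"] == "/data/.huggingface/hub"
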
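The warmup thread pre-populates the same cache directory the loaders later read from (every from_pretrained call passes cache_dir=_hf_cache_dir), so by the time a request arrives, loading should resolve to local files instead of blocking on a download. To check whether a snapshot is already fully cached, one option is the sketch below; is_snapshot_cached is a hypothetical helper, not part of this commit, and relies on snapshot_download raising when local_files_only=True and files are missing:

from huggingface_hub import snapshot_download

def is_snapshot_cached(repo_id: str, cache_dir: str) -> bool:
    # local_files_only=True never touches the network; it raises if any
    # file of the snapshot is absent from cache_dir.
    try:
        snapshot_download(repo_id=repo_id, cache_dir=cache_dir, local_files_only=True)
        return True
    except Exception:
        return False

Note that the gated MedGemma download in the warmup will still fail without a valid HF_TOKEN; the except branch only logs and continues, so a failed warmup never blocks the app.
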
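On the mixed-precision side, the weights are cast to float16 as they move to the GPU (model.to(device="cuda", dtype=torch.float16)) and the same fp16 autocast context wraps both inference paths, while contextlib.nullcontext() leaves the CPU path in full precision. A self-contained sketch of the pattern, with a toy nn.Linear standing in for the actual OCR models (an assumption for illustration only):

import contextlib
import torch

model = torch.nn.Linear(16, 4).eval()
if torch.cuda.is_available():
    model = model.to(device="cuda", dtype=torch.float16)  # halve parameter memory

autocast_ctx = (
    torch.autocast(device_type="cuda", dtype=torch.float16)
    if torch.cuda.is_available()
    else contextlib.nullcontext()  # CPU path: plain fp32, no autocast
)

x = torch.randn(2, 16, device="cuda" if torch.cuda.is_available() else "cpu")
with torch.no_grad(), autocast_ctx:
    y = model(x)  # on CUDA, autocast casts the fp32 input to fp16 for the matmul

print(y.dtype)  # torch.float16 on CUDA, torch.float32 on CPU

Reusing one _autocast_ctx object across loop iterations, as the diff does, is fine: torch.autocast and nullcontext can be entered repeatedly, unlike generator-based context managers. Casting the weights themselves, rather than relying on autocast alone, also halves GPU memory for the parameters, which matters under ZeroGPU's per-request GPU allocation.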