Update app.py

app.py CHANGED
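This commit replaces import-time model initialization with a lazy `ensure_model_loaded()` helper: `model` and `processor` start as `None`, the checkpoint is fetched with `snapshot_download`, and loading uses eager attention instead of FlashAttention. Inference entry points are decorated with `@spaces.GPU()` for ZeroGPU, call `ensure_model_loaded()` before touching the model, and move inputs to the model's primary device so `device_map="auto"` keeps working. The UI's `max_tokens` value is now threaded through `process_image()` into `inference()` as `max_new_tokens`, and the results `gr.Dataframe` keeps `wrap=True` while dropping `height=500`.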
@@ -320,25 +320,51 @@ def layoutjson2md(image: Image.Image, layout_data: List[Dict], text_key: str = '
 
     return "\n".join(markdown_lines)
 
-# Initialize model
+# Initialize model/processor lazily inside GPU context
 model_id = "rednote-hilab/dots.ocr"
 model_path = "./models/dots-ocr-local"
-
-
-
-
-)
-model
-
-
-
-
-
-
-
-
-
-
+model = None
+processor = None
+
+def ensure_model_loaded():
+    """Lazily download and load model/processor using eager attention (no FlashAttention)."""
+    global model, processor
+    if model is not None and processor is not None:
+        return
+
+    # Always use eager attention
+    attn_impl = "eager"
+    # Use GPU if available, otherwise CPU
+    if torch.cuda.is_available():
+        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+        device_map = "auto"
+    else:
+        dtype = torch.float32
+        device_map = "cpu"
+
+    # Download snapshot locally (idempotent)
+    snapshot_download(
+        repo_id=model_id,
+        local_dir=model_path,
+        local_dir_use_symlinks=False,
+    )
+
+    # Load model/processor
+    loaded_model = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        attn_implementation=attn_impl,
+        torch_dtype=dtype,
+        device_map=device_map,
+        trust_remote_code=True,
+        low_cpu_mem_usage=True,
+    )
+    loaded_processor = AutoProcessor.from_pretrained(
+        model_path,
+        trust_remote_code=True,
+    )
+
+    model = loaded_model
+    processor = loaded_processor
 
 # Global state variables
 device = "cuda" if torch.cuda.is_available() else "cpu"
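On ZeroGPU Spaces, CUDA is not available at import time; a GPU is attached only while a function decorated with `spaces.GPU` runs, which is why the hunk above defers all heavy work into `ensure_model_loaded()`. A condensed, self-contained sketch of the same pattern follows — the constant names are mine, and the diff's redundant inner `torch.cuda.is_available()` check is folded into one branch:

# Minimal sketch of lazy, idempotent model loading; not the Space's exact code.
import torch
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, AutoProcessor

MODEL_ID = "rednote-hilab/dots.ocr"
LOCAL_DIR = "./models/dots-ocr-local"

model = None
processor = None

def ensure_model_loaded():
    """Load on first use; subsequent calls return immediately."""
    global model, processor
    if model is not None and processor is not None:
        return
    use_cuda = torch.cuda.is_available()
    # Idempotent: re-running only verifies the local copy.
    snapshot_download(repo_id=MODEL_ID, local_dir=LOCAL_DIR)
    model = AutoModelForCausalLM.from_pretrained(
        LOCAL_DIR,
        attn_implementation="eager",  # avoids the FlashAttention dependency
        torch_dtype=torch.float16 if use_cuda else torch.float32,
        device_map="auto" if use_cuda else "cpu",
        trust_remote_code=True,
    )
    processor = AutoProcessor.from_pretrained(LOCAL_DIR, trust_remote_code=True)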
@@ -356,6 +382,7 @@ pdf_cache = {
 def inference(image: Image.Image, prompt: str, max_new_tokens: int = 24000) -> str:
     """Run inference on an image with the given prompt"""
     try:
+        ensure_model_loaded()
         if model is None or processor is None:
             raise RuntimeError("Model not loaded. Please check model initialization.")
 
@@ -392,8 +419,9 @@ def inference(image: Image.Image, prompt: str, max_new_tokens: int = 24000) -> s
             return_tensors="pt",
         )
 
-        # Move to device
-        inputs = inputs.to(device)
+        # Move to the model's primary device (works with device_map as well)
+        primary_device = next(model.parameters()).device
+        inputs = inputs.to(primary_device)
 
         # Generate output
         with torch.no_grad():
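`next(model.parameters()).device` returns the device of the model's first parameter; under `device_map="auto"` that is the shard holding the input embeddings, so it is the right landing spot for batch tensors. A small sketch of the idea, using a hypothetical helper name (the real code calls `.to()` on the processor's batch object directly):

import torch
import torch.nn as nn

def move_to_model_device(batch: dict, model: nn.Module) -> dict:
    """Send every tensor in the batch to the device of the model's first parameter."""
    primary_device = next(model.parameters()).device
    return {k: v.to(primary_device) if torch.is_tensor(v) else v for k, v in batch.items()}

# Toy usage with a CPU stand-in model; with a GPU-sharded model the same call
# lands the batch on whichever device holds the embeddings.
net = nn.Linear(4, 2)
batch = {"input_ids": torch.randint(0, 100, (1, 4)), "attention_mask": torch.ones(1, 4)}
print(move_to_model_device(batch, net)["input_ids"].device)  # cpu here, cuda:0 on GPU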
@@ -423,6 +451,7 @@ def inference(image: Image.Image, prompt: str, max_new_tokens: int = 24000) -> s
         return f"Error during inference: {str(e)}"
 
 
+@spaces.GPU()
 def _generate_text_and_confidence_for_crop(
     image: Image.Image,
     max_new_tokens: int = 128,
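`@spaces.GPU()` is the ZeroGPU entry point: the process runs on CPU until a decorated function is invoked, at which point a GPU is attached for the duration of the call and released afterwards. A minimal sketch — it only runs inside a Hugging Face Space where the `spaces` package is installed, and the function body is illustrative:

import spaces
import torch

@spaces.GPU()  # a GPU is allocated for this call and released when it returns
def gpu_task(x: float) -> float:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return (torch.tensor([x], device=device) * 2).item()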
@@ -433,6 +462,7 @@ def _generate_text_and_confidence_for_crop(
     Returns (generated_text, average_confidence_percent).
     """
     try:
+        ensure_model_loaded()
         # Prepare a concise extraction prompt for the crop
         messages = [
             {
@@ -463,7 +493,8 @@
             padding=True,
             return_tensors="pt",
         )
-        inputs = inputs.to(device)
+        primary_device = next(model.parameters()).device
+        inputs = inputs.to(primary_device)
 
         # Generate with scores
         with torch.no_grad():
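The hunks around `_generate_text_and_confidence_for_crop` show only the device move, not how the advertised average confidence is computed. With transformers' `generate(..., output_scores=True, return_dict_in_generate=True)` the usual recipe is the mean probability of each sampled token; the helper below is an assumption about that computation, not the Space's code:

import torch

def average_confidence_percent(scores, sequences, prompt_len: int) -> float:
    """Mean softmax probability of each generated token, as a percent.

    scores: per-step logits tuple from generate(..., output_scores=True,
    return_dict_in_generate=True); sequences: prompt plus generated token ids.
    """
    token_probs = []
    for step, logits in enumerate(scores):
        token_id = sequences[0, prompt_len + step]
        token_probs.append(torch.softmax(logits[0], dim=-1)[token_id].item())
    return 100.0 * sum(token_probs) / len(token_probs) if token_probs else 0.0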
@@ -506,9 +537,10 @@
 
 
 def process_image(
-    image: Image.Image,
+    image: Image.Image,
     min_pixels: Optional[int] = None,
-    max_pixels: Optional[int] = None
+    max_pixels: Optional[int] = None,
+    max_new_tokens: int = 24000,
 ) -> Dict[str, Any]:
     """Process a single image with the specified prompt mode"""
     try:
@@ -517,7 +549,7 @@
         image = fetch_image(image, min_pixels=min_pixels, max_pixels=max_pixels)
 
         # Run inference with the default prompt
-        raw_output = inference(image, prompt)
+        raw_output = inference(image, prompt, max_new_tokens=max_new_tokens)
 
         # Process results based on prompt mode
         result = {
@@ -876,8 +908,7 @@ def create_gradio_interface():
                datatype=["html", "str", "str"],
                label="OCR Results",
                interactive=True,
-               wrap=True
-               height=500
+               wrap=True
            )
            # Markdown output tab
            with gr.Tab("π Extracted Content"):
@@ -950,11 +981,14 @@ def create_gradio_interface():
            return table_data
 
        # Event handlers
+       @spaces.GPU()
        def process_document(file_path, max_tokens, min_pix, max_pix):
            """Process the uploaded document"""
            global pdf_cache
 
            try:
+               # Ensure model/processor are loaded within GPU context
+               ensure_model_loaded()
                if not file_path:
                    return None, [], "Please upload a file first.", None
 
@@ -974,9 +1008,10 @@
 
                for i, img in enumerate(pdf_cache["images"]):
                    result = process_image(
-                       img,
+                       img,
                        min_pixels=int(min_pix) if min_pix else None,
-                       max_pixels=int(max_pix) if max_pix else None
+                       max_pixels=int(max_pix) if max_pix else None,
+                       max_new_tokens=int(max_tokens) if max_tokens else 24000,
                    )
                    all_results.append(result)
                    if result.get('markdown_content'):
@@ -1014,7 +1049,8 @@
                    result = process_image(
                        image,
                        min_pixels=int(min_pix) if min_pix else None,
-                       max_pixels=int(max_pix) if max_pix else None
+                       max_pixels=int(max_pix) if max_pix else None,
+                       max_new_tokens=int(max_tokens) if max_tokens else 24000,
                    )
 
                    pdf_cache["results"] = [result]
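The `int(max_tokens) if max_tokens else 24000` idiom in the last two hunks treats every falsy UI value (None, empty string, explicit 0) as a request for the default. A tiny sketch of that coercion pulled into a helper — my naming; the diff inlines the expression:

def coerce_max_tokens(value, default: int = 24000) -> int:
    """Mirror the diff's fallback: any falsy value selects the default."""
    return int(value) if value else default

assert coerce_max_tokens("512") == 512
assert coerce_max_tokens(None) == 24000
assert coerce_max_tokens(0) == 24000  # an explicit 0 also falls back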