Spaces:

prithivMLmods
/

Multimodal-OCR3

Running on Zero

App Files Community

prithivMLmods committed on Oct 20

Commit

476fc05

verified ·

1 Parent(s): b737007

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -18

app.py CHANGED Viewed

@@ -18,7 +18,6 @@ from transformers import (
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
-# --- Theme and CSS Definition ---
 colors.steel_blue = colors.Color(
     name="steel_blue",
@@ -27,7 +26,7 @@ colors.steel_blue = colors.Color(
     c200="#A8CCE1",
     c300="#7DB3D2",
     c400="#529AC3",
-    c500="#4682B4",  # SteelBlue base color
     c600="#3E72A0",
     c700="#36638C",
     c800="#2E5378",
@@ -91,22 +90,20 @@ css = """
 }
 """
-# --- Fix for Dots.OCR Processor Loading ---
-# Define a local directory to cache the model
 CACHE_PATH = "./model_cache"
 if not os.path.exists(CACHE_PATH):
     os.makedirs(CACHE_PATH)
-# Download the model files locally
 model_path_d_local = snapshot_download(
     repo_id='rednote-hilab/dots.ocr',
-    local_dir=os.path.join(CACHE_PATH, 'dots.ocr'), # Create a dedicated subfolder
     max_workers=20,
     local_dir_use_symlinks=False
 )
-# Modify the configuration file to fix the processor loading issue
 config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
 if os.path.exists(config_file_path):
@@ -119,28 +116,24 @@ if os.path.exists(config_file_path):
         for line in lines:
             output_lines.append(line)
             if line.strip().startswith("class DotsVLProcessor"):
-                # Insert the attributes line to specify which processors to load
                 output_lines.append("    attributes = [\"image_processor\", \"tokenizer\"]")
-        # Write the modified content back to the file
         with open(config_file_path, 'w') as f:
             f.write('\n'.join(output_lines))
         print("Patched configuration_dots.py successfully.")
-# Add the local model path to sys.path so transformers can use the modified code
 sys.path.append(model_path_d_local)
-# --- Model Loading ---
-# Constants for text generation
 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 2048
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-# Load Nanonets-OCR2-3B
 MODEL_ID_M = "nanonets/Nanonets-OCR2-3B"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -149,7 +142,7 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
-# Load Dots.OCR from the local, patched directory
 MODEL_PATH_D = model_path_d_local
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
 model_d = AutoModelForCausalLM.from_pretrained(
@@ -160,7 +153,7 @@ model_d = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 ).eval()
-# Load PaddleOCR
 MODEL_ID_P = "strangervisionhf/paddle"
 processor_p = AutoProcessor.from_pretrained(MODEL_ID_P, trust_remote_code=True)
 model_p = AutoModelForCausalLM.from_pretrained(
@@ -222,16 +215,16 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         buffer += new_text.replace("<|im_end|>", "").replace("<end_of_utterance>", "")
         yield buffer, buffer
-# Define examples for image inference
 image_examples = [
     ["Reconstruct the doc [table] as it is.", "images/0.png"],
     ["Describe the image!", "images/8.png"],
     ["OCR the image", "images/2.jpg"],
 ]
-# Create the Gradio Interface
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
-    gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
     with gr.Row():
         with gr.Column(scale=2):
             image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")

 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 colors.steel_blue = colors.Color(
     name="steel_blue",
     c200="#A8CCE1",
     c300="#7DB3D2",
     c400="#529AC3",
+    c500="#4682B4",
     c600="#3E72A0",
     c700="#36638C",
     c800="#2E5378",
 }
 """
 CACHE_PATH = "./model_cache"
 if not os.path.exists(CACHE_PATH):
     os.makedirs(CACHE_PATH)
 model_path_d_local = snapshot_download(
     repo_id='rednote-hilab/dots.ocr',
+    local_dir=os.path.join(CACHE_PATH, 'dots.ocr'),
     max_workers=20,
     local_dir_use_symlinks=False
 )
 config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
 if os.path.exists(config_file_path):
         for line in lines:
             output_lines.append(line)
             if line.strip().startswith("class DotsVLProcessor"):
                 output_lines.append("    attributes = [\"image_processor\", \"tokenizer\"]")
         with open(config_file_path, 'w') as f:
             f.write('\n'.join(output_lines))
         print("Patched configuration_dots.py successfully.")
 sys.path.append(model_path_d_local)
 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 2048
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 MODEL_ID_M = "nanonets/Nanonets-OCR2-3B"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 MODEL_PATH_D = model_path_d_local
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
 model_d = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 ).eval()
 MODEL_ID_P = "strangervisionhf/paddle"
 processor_p = AutoProcessor.from_pretrained(MODEL_ID_P, trust_remote_code=True)
 model_p = AutoModelForCausalLM.from_pretrained(
         buffer += new_text.replace("<|im_end|>", "").replace("<end_of_utterance>", "")
         yield buffer, buffer
 image_examples = [
     ["Reconstruct the doc [table] as it is.", "images/0.png"],
     ["Describe the image!", "images/8.png"],
     ["OCR the image", "images/2.jpg"],
 ]
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
+    gr.Markdown("# **Multimodal OCR3**", elem_id="main-title")
     with gr.Row():
         with gr.Column(scale=2):
             image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")