Spaces:

prithivMLmods
/

Multimodal-OCR3

Running on Zero

App Files Community

prithivMLmods committed on Oct 18

Commit

f75e630

verified ·

1 Parent(s): f50453e

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -70

app.py CHANGED Viewed

@@ -13,7 +13,6 @@ from transformers import (
     AutoModelForCausalLM,
     AutoProcessor,
     TextIteratorStreamer,
-    VisionEncoderDecoderModel,
 )
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
@@ -92,20 +91,27 @@ css = """
 """
 # --- Fix for Dots.OCR Processor Loading ---
 CACHE_PATH = "./model_cache"
 if not os.path.exists(CACHE_PATH):
     os.makedirs(CACHE_PATH)
 model_path_d_local = snapshot_download(
     repo_id='rednote-hilab/dots.ocr',
-    local_dir=CACHE_PATH,
     max_workers=20,
     local_dir_use_symlinks=False
 )
 config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
 if os.path.exists(config_file_path):
     with open(config_file_path, 'r') as f:
         input_code = f.read()
     lines = input_code.splitlines()
     if "class DotsVLProcessor" in input_code and not any("attributes = " in line for line in lines):
         output_lines = []
@@ -116,12 +122,18 @@ if os.path.exists(config_file_path):
         with open(config_file_path, 'w') as f:
             f.write('\n'.join(output_lines))
         print("Patched configuration_dots.py successfully.")
 sys.path.append(model_path_d_local)
 # --- Model Loading ---
 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 2048
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load Nanonets-OCR2-3B
@@ -133,7 +145,7 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
-# Load Dots.OCR
 MODEL_PATH_D = model_path_d_local
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
 model_d = AutoModelForCausalLM.from_pretrained(
@@ -144,10 +156,14 @@ model_d = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 ).eval()
-# Load ByteDance/Dolphin
-MODEL_ID_B = "ByteDance/Dolphin"
-processor_b = AutoProcessor.from_pretrained(MODEL_ID_B)
-model_b = VisionEncoderDecoderModel.from_pretrained(MODEL_ID_B, torch_dtype=torch.float16).to(device).eval()
 @spaces.GPU
@@ -158,76 +174,48 @@ def generate_image(model_name: str, text: str, image: Image.Image,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for image input using the selected model."""
-    if image is None:
-        yield "Please upload an image.", "Please upload an image."
-        return
-    images = [image.convert("RGB")]
     if model_name == "Nanonets-OCR2-3B":
         processor, model = processor_m, model_m
-        messages = [{"role": "user", "content": [{"type": "image"}] + [{"type": "text", "text": text}]}]
-        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k, "repetition_penalty": repetition_penalty, "do_sample": True}
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        for new_text in streamer:
-            buffer += new_text.replace("<|im_end|>", "").replace("<end_of_utterance>", "")
-            yield buffer, buffer
     elif model_name == "Dots.OCR":
         processor, model = processor_d, model_d
-        messages = [{"role": "user", "content": [{"type": "image"}] + [{"type": "text", "text": text}]}]
-        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k, "repetition_penalty": repetition_penalty, "do_sample": True}
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        for new_text in streamer:
-            buffer += new_text.replace("<|im_end|>", "").replace("<end_of_utterance>", "")
-            yield buffer, buffer
-    elif model_name == "ByteDance/Dolphin":
-        processor, model = processor_b, model_b
-        pixel_values = processor(images=images, return_tensors="pt").pixel_values.to(device, torch.float16)
-        prompt_template = f"<s>{text} <Answer/>"
-        prompt_inputs = processor.tokenizer(
-            [prompt_template],
-            add_special_tokens=False,
-            return_tensors="pt"
-        )
-        prompt_ids = prompt_inputs.input_ids.to(device)
-        attention_mask = prompt_inputs.attention_mask.to(device)
-        outputs = model.generate(
-            pixel_values=pixel_values,
-            decoder_input_ids=prompt_ids,
-            decoder_attention_mask=attention_mask,
-            max_length=max_new_tokens,
-            pad_token_id=processor.tokenizer.pad_token_id,
-            eos_token_id=processor.tokenizer.eos_token_id,
-            use_cache=True,
-            bad_words_ids=[[processor.tokenizer.unk_token_id]],
-            return_dict_in_generate=True,
-            do_sample=False,  # Dolphin works best with greedy decoding
-            num_beams=1,
-            repetition_penalty=repetition_penalty
-        )
-        sequence = processor.tokenizer.decode(outputs.sequences[0], skip_special_tokens=False)
-        cleaned_output = sequence.replace(prompt_template, "").replace("<pad>", "").replace("</s>", "").strip()
-        yield cleaned_output, cleaned_output
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
 # Define examples for image inference
 image_examples = [
     ["Reconstruct the doc [table] as it is.", "images/0.png"],
@@ -259,7 +247,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
                 formatted_output = gr.Markdown(label="Formatted Result")
             model_choice = gr.Radio(
-                choices=["Nanonets-OCR2-3B", "Dots.OCR", "ByteDance/Dolphin"],
                 label="Select Model",
                 value="Nanonets-OCR2-3B"
             )

     AutoModelForCausalLM,
     AutoProcessor,
     TextIteratorStreamer,
 )
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 """
 # --- Fix for Dots.OCR Processor Loading ---
+# Define a local directory to cache the model
 CACHE_PATH = "./model_cache"
 if not os.path.exists(CACHE_PATH):
     os.makedirs(CACHE_PATH)
+# Download the model files locally
 model_path_d_local = snapshot_download(
     repo_id='rednote-hilab/dots.ocr',
+    local_dir=os.path.join(CACHE_PATH, 'dots.ocr'),
     max_workers=20,
     local_dir_use_symlinks=False
 )
+# Modify the configuration file to fix the processor loading issue
 config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
 if os.path.exists(config_file_path):
     with open(config_file_path, 'r') as f:
         input_code = f.read()
     lines = input_code.splitlines()
     if "class DotsVLProcessor" in input_code and not any("attributes = " in line for line in lines):
         output_lines = []
         with open(config_file_path, 'w') as f:
             f.write('\n'.join(output_lines))
         print("Patched configuration_dots.py successfully.")
+# Add the local model path to sys.path so transformers can use the modified code
 sys.path.append(model_path_d_local)
 # --- Model Loading ---
+# Constants for text generation
 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 2048
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load Nanonets-OCR2-3B
     torch_dtype=torch.float16
 ).to(device).eval()
+# Load Dots.OCR from the local, patched directory
 MODEL_PATH_D = model_path_d_local
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
 model_d = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 ).eval()
+# Load PaddleOCR
+MODEL_ID_P = "strangervisionhf/paddle"
+processor_p = AutoProcessor.from_pretrained(MODEL_ID_P, trust_remote_code=True)
+model_p = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID_P,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16
+).to(device).eval()
 @spaces.GPU
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for image input using the selected model."""
     if model_name == "Nanonets-OCR2-3B":
         processor, model = processor_m, model_m
     elif model_name == "Dots.OCR":
         processor, model = processor_d, model_d
+    elif model_name == "PaddleOCR":
+        processor, model = processor_p, model_p
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
+    if image is None:
+        yield "Please upload an image.", "Please upload an image."
+        return
+    images = [image.convert("RGB")]
+    messages = [
+        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}
+    ]
+    prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repetition_penalty": repetition_penalty,
+        "do_sample": True
+    }
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text.replace("<|im_end|>", "").replace("<end_of_utterance>", "")
+        yield buffer, buffer
 # Define examples for image inference
 image_examples = [
     ["Reconstruct the doc [table] as it is.", "images/0.png"],
                 formatted_output = gr.Markdown(label="Formatted Result")
             model_choice = gr.Radio(
+                choices=["Nanonets-OCR2-3B", "Dots.OCR", "PaddleOCR"],
                 label="Select Model",
                 value="Nanonets-OCR2-3B"
             )