prithivMLmods committed
Commit f50453e · verified · 1 Parent(s): 9efae34

Update app.py

Files changed (1)
  1. app.py +58 -64
app.py CHANGED
@@ -92,52 +92,36 @@ css = """
 """
 
 # --- Fix for Dots.OCR Processor Loading ---
-
-# Define a local directory to cache the model
 CACHE_PATH = "./model_cache"
 if not os.path.exists(CACHE_PATH):
     os.makedirs(CACHE_PATH)
 
-# Download the model files locally
 model_path_d_local = snapshot_download(
     repo_id='rednote-hilab/dots.ocr',
-    local_dir=os.path.join(CACHE_PATH, 'dots.ocr'),
+    local_dir=CACHE_PATH,
     max_workers=20,
     local_dir_use_symlinks=False
 )
-
-# Modify the configuration file to fix the processor loading issue
 config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
-
 if os.path.exists(config_file_path):
     with open(config_file_path, 'r') as f:
         input_code = f.read()
-
     lines = input_code.splitlines()
     if "class DotsVLProcessor" in input_code and not any("attributes = " in line for line in lines):
         output_lines = []
         for line in lines:
             output_lines.append(line)
             if line.strip().startswith("class DotsVLProcessor"):
-                # Insert the attributes line to specify which processors to load
                 output_lines.append("    attributes = [\"image_processor\", \"tokenizer\"]")
-
-        # Write the modified content back to the file
         with open(config_file_path, 'w') as f:
             f.write('\n'.join(output_lines))
         print("Patched configuration_dots.py successfully.")
-
-# Add the local model path to sys.path so transformers can use the modified code
 sys.path.append(model_path_d_local)
 
-
 # --- Model Loading ---
-
-# Constants for text generation
 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 2048
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 # Load Nanonets-OCR2-3B
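
Side note on the hunk above: the runtime patch adds a class-level `attributes` list to `DotsVLProcessor`. In `transformers`, `ProcessorMixin` reads that list to know which sub-components (`image_processor`, `tokenizer`) to instantiate when the processor is loaded, which is why its absence breaks processor loading. A minimal standalone sketch of the same patch step; the file and class names come from the diff, the function name is illustrative:

    def patch_dots_processor(config_path: str) -> None:
        """Insert an `attributes` declaration right after the class definition."""
        with open(config_path, "r") as f:
            source = f.read()
        lines = source.splitlines()
        # Nothing to do if the class is absent or the attribute already exists
        if "class DotsVLProcessor" not in source or any("attributes = " in ln for ln in lines):
            return
        patched = []
        for ln in lines:
            patched.append(ln)
            if ln.strip().startswith("class DotsVLProcessor"):
                # ProcessorMixin uses this list to pick which components to load
                patched.append('    attributes = ["image_processor", "tokenizer"]')
        with open(config_path, "w") as f:
            f.write("\n".join(patched))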
@@ -149,7 +133,7 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load Dots.OCR from the local, patched directory
+# Load Dots.OCR
 MODEL_PATH_D = model_path_d_local
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
 model_d = AutoModelForCausalLM.from_pretrained(
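
Because the snapshot was patched in place and added to `sys.path`, the Dots.OCR processor must be loaded from that same local directory; `trust_remote_code=True` tells `transformers` to execute the repo's custom Python files, which is how the patched `configuration_dots.py` takes effect. A minimal sketch, reusing `model_path_d_local` from the first hunk:

    from transformers import AutoProcessor

    # Executes the locally patched configuration_dots.py; the `attributes`
    # fix above is what this call relies on to assemble both components.
    processor_d = AutoProcessor.from_pretrained(model_path_d_local, trust_remote_code=True)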
@@ -163,8 +147,7 @@ model_d = AutoModelForCausalLM.from_pretrained(
 # Load ByteDance/Dolphin
 MODEL_ID_B = "ByteDance/Dolphin"
 processor_b = AutoProcessor.from_pretrained(MODEL_ID_B)
-model_b = VisionEncoderDecoderModel.from_pretrained(MODEL_ID_B)
-model_b.to(device).eval().half()
+model_b = VisionEncoderDecoderModel.from_pretrained(MODEL_ID_B, torch_dtype=torch.float16).to(device).eval()
 
 
 @spaces.GPU
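
The Dolphin loading change swaps a post-hoc `.half()` for `torch_dtype=torch.float16` at load time, so the checkpoint is materialized in fp16 directly instead of being loaded in fp32 and converted afterwards (the latter briefly holds the larger fp32 copy in memory). Before and after, as a sketch:

    import torch
    from transformers import VisionEncoderDecoderModel

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Before: fp32 load, then in-place conversion
    # model_b = VisionEncoderDecoderModel.from_pretrained("ByteDance/Dolphin")
    # model_b.to(device).eval().half()

    # After: weights land in fp16 from the start
    model_b = VisionEncoderDecoderModel.from_pretrained(
        "ByteDance/Dolphin", torch_dtype=torch.float16
    ).to(device).eval()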
@@ -175,64 +158,75 @@ def generate_image(model_name: str, text: str, image: Image.Image,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for image input using the selected model."""
-    is_streaming = True
-    if model_name == "Nanonets-OCR2-3B":
-        processor, model = processor_m, model_m
-    elif model_name == "Dots.OCR":
-        processor, model = processor_d, model_d
-    elif model_name == "Dolphin":
-        processor, model = processor_b, model_b
-        is_streaming = False
-    else:
-        yield "Invalid model selected.", "Invalid model selected."
-        return
-
     if image is None:
         yield "Please upload an image.", "Please upload an image."
         return
 
-    image_rgb = image.convert("RGB")
+    images = [image.convert("RGB")]
 
-    if is_streaming:
-        messages = [
-            {
-                "role": "user",
-                "content": [{"type": "image"}] + [{"type": "text", "text": text}]
-            }
-        ]
+    if model_name == "Nanonets-OCR2-3B":
+        processor, model = processor_m, model_m
+        messages = [{"role": "user", "content": [{"type": "image"}] + [{"type": "text", "text": text}]}]
         prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-        inputs = processor(text=prompt, images=[image_rgb], return_tensors="pt").to(device)
-
+        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {
-            **inputs,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repetition_penalty": repetition_penalty,
-            "do_sample": True
-        }
+        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k, "repetition_penalty": repetition_penalty, "do_sample": True}
         thread = Thread(target=model.generate, kwargs=generation_kwargs)
         thread.start()
-
         buffer = ""
         for new_text in streamer:
             buffer += new_text.replace("<|im_end|>", "").replace("<end_of_utterance>", "")
             yield buffer, buffer
-    else:
-        # Handle non-streaming generation for ByteDance/Dolphin
-        pixel_values = processor(images=[image_rgb], return_tensors="pt").pixel_values.to(device).half()
-
-        # Note: The user's text query is not explicitly used here as the VisionEncoderDecoderModel
-        # pipeline primarily generates captions from images directly.
-        generated_ids = model.generate(pixel_values, max_new_tokens=max_new_tokens)
-        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-        # For this model, the output appears at once.
-        yield generated_text, generated_text
-
+
+    elif model_name == "Dots.OCR":
+        processor, model = processor_d, model_d
+        messages = [{"role": "user", "content": [{"type": "image"}] + [{"type": "text", "text": text}]}]
+        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k, "repetition_penalty": repetition_penalty, "do_sample": True}
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text.replace("<|im_end|>", "").replace("<end_of_utterance>", "")
+            yield buffer, buffer
+
+    elif model_name == "ByteDance/Dolphin":
+        processor, model = processor_b, model_b
+        pixel_values = processor(images=images, return_tensors="pt").pixel_values.to(device, torch.float16)
+
+        prompt_template = f"<s>{text} <Answer/>"
+        prompt_inputs = processor.tokenizer(
+            [prompt_template],
+            add_special_tokens=False,
+            return_tensors="pt"
+        )
+        prompt_ids = prompt_inputs.input_ids.to(device)
+        attention_mask = prompt_inputs.attention_mask.to(device)
+
+        outputs = model.generate(
+            pixel_values=pixel_values,
+            decoder_input_ids=prompt_ids,
+            decoder_attention_mask=attention_mask,
+            max_length=max_new_tokens,
+            pad_token_id=processor.tokenizer.pad_token_id,
+            eos_token_id=processor.tokenizer.eos_token_id,
+            use_cache=True,
+            bad_words_ids=[[processor.tokenizer.unk_token_id]],
+            return_dict_in_generate=True,
+            do_sample=False,  # Dolphin works best with greedy decoding
+            num_beams=1,
+            repetition_penalty=repetition_penalty
+        )
+
+        sequence = processor.tokenizer.decode(outputs.sequences[0], skip_special_tokens=False)
+        cleaned_output = sequence.replace(prompt_template, "").replace("<pad>", "").replace("</s>", "").strip()
+        yield cleaned_output, cleaned_output
+
+    else:
+        yield "Invalid model selected.", "Invalid model selected."
+        return
 
 # Define examples for image inference
 image_examples = [
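
Both streaming branches above follow the same pattern: `model.generate` runs on a worker thread while the Gradio generator drains a `TextIteratorStreamer` chunk by chunk. Stripped of model-specific details, a sketch of that pattern (function and argument names are illustrative):

    from threading import Thread
    from transformers import TextIteratorStreamer

    def stream_chat(model, processor, inputs, max_new_tokens=2048):
        # skip_prompt=True drops the echoed prompt from the stream
        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
        kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens, "do_sample": True}
        Thread(target=model.generate, kwargs=kwargs).start()  # generation runs off the main thread
        buffer = ""
        for chunk in streamer:  # blocks until the next decoded piece is ready
            buffer += chunk
            yield buffer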
@@ -265,7 +259,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     formatted_output = gr.Markdown(label="Formatted Result")
 
     model_choice = gr.Radio(
-        choices=["Nanonets-OCR2-3B", "Dots.OCR", "Dolphin"],
+        choices=["Nanonets-OCR2-3B", "Dots.OCR", "ByteDance/Dolphin"],
         label="Select Model",
         value="Nanonets-OCR2-3B"
     )
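
One coupling to keep in mind: the `gr.Radio` choice string is passed verbatim to `generate_image` as `model_name`, so renaming the option to "ByteDance/Dolphin" works only because the new `elif model_name == "ByteDance/Dolphin":` branch matches it exactly; any mismatch falls through to the "Invalid model selected." path. The Dolphin branch also encodes that model's prompt convention: the query is wrapped as `<s>{text} <Answer/>` and supplied via `decoder_input_ids`, since a `VisionEncoderDecoderModel` conditions its text decoder on the encoded image rather than on a chat template.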
 