Spaces:

prithivMLmods
/

Multimodal-VLM-Thinking

Running on Zero

App Files Files Community

prithivMLmods committed on Jun 20, 2025

Commit

c97526c

verified ·

1 Parent(s): 5cdeb4d

Update app.py

Browse files

Files changed (1) hide show

app.py +228 -280

app.py CHANGED Viewed

@@ -20,9 +20,9 @@ from transformers import (
     AutoModelForVision2Seq,
     AutoProcessor,
     TextIteratorStreamer,
-    EncoderDecoderCache  # Added to handle the new caching mechanism
 )
 from transformers.image_utils import load_image
 from docling_core.types.doc import DoclingDocument, DocTagsDocument
@@ -80,151 +80,126 @@ model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 ).to(device).eval()
 # Preprocessing functions for SmolDocling-256M
 def add_random_padding(image, min_percent=0.1, max_percent=0.10):
     """Add random padding to an image based on its size."""
     image = image.convert("RGB")
     width, height = image.size
-    pad_w_percent = random.uniform(min_percent, max_percent)
-    pad_h_percent = random.uniform(min_percent, max_percent)
-    pad_w = int(width * pad_w_percent)
-    pad_h = int(height * pad_h_percent)
-    corner_pixel = image.getpixel((0, 0))  # Top-left corner
-    padded_image = ImageOps.expand(image, border=(pad_w, pad_h, pad_w, pad_h), fill=corner_pixel)
-    return padded_image
 def normalize_values(text, target_max=500):
-    """Normalize numerical values in text to a target maximum."""
-    def normalize_list(values):
-        max_value = max(values) if values else 1
-        return [round((v / max_value) * target_max) for v in values]
-    def process_match(match):
-        num_list = ast.literal_eval(match.group(0))
-        normalized = normalize_list(num_list)
-        return "".join([f"<loc_{num}>" for num in normalized])
-    pattern = r"\[([\d\.\s,]+)\]"
-    normalized_text = re.sub(pattern, process_match, text)
-    return normalized_text
 def downsample_video(video_path):
-    """Downsample a video to evenly spaced frames, returning PIL images with timestamps."""
-    vidcap = cv2.VideoCapture(video_path)
-    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-    fps = vidcap.get(cv2.CAP_PROP_FPS)
-    frames = []
-    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
-    for i in frame_indices:
-        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-        success, image = vidcap.read()
-        if success:
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(image)
-            timestamp = round(i / fps, 2)
-            frames.append((pil_image, timestamp))
-    vidcap.release()
     return frames
-# Dolphin-specific functions
 def model_chat(prompt, image):
-    """Use Dolphin model for inference."""
-    processor = processor_k
-    model = model_k
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    inputs = processor(image, return_tensors="pt").to(device)
-    pixel_values = inputs.pixel_values.half()
-    prompt_inputs = processor.tokenizer(
-        f"<s>{prompt} <Answer/>",
-        add_special_tokens=False,
-        return_tensors="pt"
-    ).to(device)
-    # Explicitly set past_key_values to None to align with new caching mechanism and avoid deprecated tuple warning
-    outputs = model.generate(
-        pixel_values=pixel_values,
-        decoder_input_ids=prompt_inputs.input_ids,
-        decoder_attention_mask=prompt_inputs.attention_mask,
-        min_length=1,
-        max_length=4096,
-        pad_token_id=processor.tokenizer.pad_token_id,
-        eos_token_id=processor.tokenizer.eos_token_id,
-        use_cache=True,
-        bad_words_ids=[[processor.tokenizer.unk_token_id]],
         return_dict_in_generate=True,
-        do_sample=False,
-        num_beams=1,
-        repetition_penalty=1.1,
-        past_key_values=None  # Added to prevent deprecated tuple handling
     )
-    sequence = processor.tokenizer.batch_decode(outputs.sequences, skip_special_tokens=False)[0]
-    cleaned = sequence.replace(f"<s>{prompt} <Answer/>", "").replace("<pad>", "").replace("</s>", "").strip()
-    return cleaned
-def process_elements(layout_results, image):
-    """Parse layout results and extract elements from the image."""
-    # Placeholder parsing logic based on expected Dolphin output
-    # Assuming layout_results is a string like "[(x1,y1,x2,y2,label), ...]"
     try:
-        elements = ast.literal_eval(layout_results)
     except:
-        elements = []  # Fallback if parsing fails
-    recognition_results = []
-    reading_order = 0
     for bbox, label in elements:
-        try:
-            x1, y1, x2, y2 = map(int, bbox)
-            cropped = image.crop((x1, y1, x2, y2))
-            if cropped.size[0] > 0 and cropped.size[1] > 0:
-                if label == "text":
-                    text = model_chat("Read text in the image.", cropped)
-                    recognition_results.append({
-                        "label": label,
-                        "bbox": [x1, y1, x2, y2],
-                        "text": text.strip(),
-                        "reading_order": reading_order
-                    })
-                elif label == "table":
-                    table_text = model_chat("Parse the table in the image.", cropped)
-                    recognition_results.append({
-                        "label": label,
-                        "bbox": [x1, y1, x2, y2],
-                        "text": table_text.strip(),
-                        "reading_order": reading_order
-                    })
-                elif label == "figure":
-                    recognition_results.append({
-                        "label": label,
-                        "bbox": [x1, y1, x2, y2],
-                        "text": "[Figure]",  # Placeholder for figure content
-                        "reading_order": reading_order
-                    })
-            reading_order += 1
-        except Exception as e:
-            print(f"Error processing element: {e}")
             continue
-    return recognition_results
-def generate_markdown(recognition_results):
-    """Generate markdown from extracted elements."""
-    markdown = ""
-    for element in sorted(recognition_results, key=lambda x: x["reading_order"]):
-        if element["label"] == "text":
-            markdown += f"{element['text']}\n\n"
-        elif element["label"] == "table":
-            markdown += f"**Table:**\n{element['text']}\n\n"
-        elif element["label"] == "figure":
-            markdown += f"{element['text']}\n\n"
-    return markdown.strip()
 def process_image_with_dolphin(image):
-    """Process a single image with Dolphin model."""
-    layout_output = model_chat("Parse the reading order of this document.", image)
-    elements = process_elements(layout_output, image)
-    markdown_content = generate_markdown(elements)
-    return markdown_content
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
@@ -233,83 +208,78 @@ def generate_image(model_name: str, text: str, image: Image.Image,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
-    """Generate responses for image input using the selected model."""
     if model_name == "ByteDance-s-Dolphin":
         if image is None:
             yield "Please upload an image."
-            return
-        markdown_content = process_image_with_dolphin(image)
-        yield markdown_content
-    else:
-        # Existing logic for other models
-        if model_name == "Nanonets-OCR-s":
-            processor = processor_m
-            model = model_m
-        elif model_name == "MonkeyOCR-Recognition":
-            processor = processor_g
-            model = model_g
-        elif model_name == "SmolDocling-256M-preview":
-            processor = processor_x
-            model = model_x
         else:
-            yield "Invalid model selected."
-            return
-        if image is None:
-            yield "Please upload an image."
-            return
-        images = [image]
-        if model_name == "SmolDocling-256M-preview":
-            if "OTSL" in text or "code" in text:
-                images = [add_random_padding(img) for img in images]
-            if "OCR at text at" in text or "Identify element" in text or "formula" in text:
-                text = normalize_values(text, target_max=500)
-        messages = [
-            {
-                "role": "user",
-                "content": [{"type": "image"} for _ in images] + [
-                    {"type": "text", "text": text}
-                ]
-            }
-        ]
-        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {
-            **inputs,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repetition_penalty": repetition_penalty,
         }
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        full_output = ""
-        for new_text in streamer:
-            full_output += new_text
-            buffer += new_text.replace("<|im_end|>", "")
-            yield buffer
-        if model_name == "SmolDocling-256M-preview":
-            cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
-            if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
-                if "<chart>" in cleaned_output:
-                    cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
-                    cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
-                doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
-                doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
-                markdown_output = doc.export_to_markdown()
-                yield f"**MD Output:**\n\n{markdown_output}"
-            else:
-                yield cleaned_output
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
@@ -318,97 +288,77 @@ def generate_video(model_name: str, text: str, video_path: str,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
-    """Generate responses for video input using the selected model."""
     if model_name == "ByteDance-s-Dolphin":
-        if video_path is None:
             yield "Please upload a video."
             return
-        frames = downsample_video(video_path)
-        markdown_contents = []
-        for frame, _ in frames:
-            markdown_content = process_image_with_dolphin(frame)
-            markdown_contents.append(markdown_content)
-        combined_markdown = "\n\n".join(markdown_contents)
-        yield combined_markdown
     else:
-        # Existing logic for other models
-        if model_name == "Nanonets-OCR-s":
-            processor = processor_m
-            model = model_m
-        elif model_name == "MonkeyOCR-Recognition":
-            processor = processor_g
-            model = model_g
-        elif model_name == "SmolDocling-256M-preview":
-            processor = processor_x
-            model = model_x
-        else:
-            yield "Invalid model selected."
-            return
-        if video_path is None:
-            yield "Please upload a video."
-            return
-        frames = downsample_video(video_path)
-        images = [frame for frame, _ in frames]
-        if model_name == "SmolDocling-256M-preview":
-            if "OTSL" in text or "code" in text:
-                images = [add_random_padding(img) for img in images]
-            if "OCR at text at" in text or "Identify element" in text or "formula" in text:
-                text = normalize_values(text, target_max=500)
-        messages = [
-            {
-                "role": "user",
-                "content": [{"type": "image"} for _ in images] + [
-                    {"type": "text", "text": text}
-                ]
-            }
-        ]
-        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {
-            **inputs,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repetition_penalty": repetition_penalty,
         }
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        full_output = ""
-        for new_text in streamer:
-            full_output += new_text
-            buffer += new_text.replace("<|im_end|>", "")
-            yield buffer
-        if model_name == "SmolDocling-256M-preview":
-            cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
-            if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
-                if "<chart>" in cleaned_output:
-                    cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
-                    cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
-                doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
-                doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
-                markdown_output = doc.export_to_markdown()
-                yield f"**MD Output:**\n\n{markdown_output}"
-            else:
-                yield cleaned_output
-# Define examples for image and video inference
 image_examples = [
     ["Convert this page to docling", "images/1.png"],
     ["OCR the image", "images/2.jpg"],
     ["Convert this page to docling", "images/3.png"],
 ]
 video_examples = [
     ["Explain the ad in detail", "example/1.mp4"],
     ["Identify the main actions in the coca cola ad...", "example/2.mp4"]
@@ -424,7 +374,6 @@ css = """
 }
 """
-# Create the Gradio Interface
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("# **[Core OCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
     with gr.Row():
@@ -459,7 +408,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                 label="Select Model",
                 value="Nanonets-OCR-s"
             )
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],

     AutoModelForVision2Seq,
     AutoProcessor,
     TextIteratorStreamer,
 )
 from transformers.image_utils import load_image
+from transformers.generation import GenerationConfig
 from docling_core.types.doc import DoclingDocument, DocTagsDocument
 ).to(device).eval()
 # Preprocessing functions for SmolDocling-256M
 def add_random_padding(image, min_percent=0.1, max_percent=0.10):
     """Add random padding to an image based on its size."""
     image = image.convert("RGB")
     width, height = image.size
+    pad_w = int(width * random.uniform(min_percent, max_percent))
+    pad_h = int(height * random.uniform(min_percent, max_percent))
+    corner_pixel = image.getpixel((0, 0))
+    padded = ImageOps.expand(image, border=(pad_w, pad_h, pad_w, pad_h), fill=corner_pixel)
+    return padded
 def normalize_values(text, target_max=500):
+    """Normalize numerical lists in text to a target maximum."""
+    def norm_list(vals):
+        m = max(vals) if vals else 1
+        return [round(v / m * target_max) for v in vals]
+    def repl(m):
+        lst = ast.literal_eval(m.group(0))
+        return "".join(f"<loc_{n}>" for n in norm_list(lst))
+    return re.sub(r"\[([\d\.\s,]+)\]", repl, text)
 def downsample_video(video_path):
+    """Extract 10 evenly spaced frames (with timestamps) from a video."""
+    cap = cv2.VideoCapture(video_path)
+    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    frames, indices = [], np.linspace(0, total - 1, 10, dtype=int)
+    for idx in indices:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
+        ok, img = cap.read()
+        if not ok:
+            continue
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        frames.append((Image.fromarray(img), round(idx / fps, 2)))
+    cap.release()
     return frames
+# Dolphin-specific inference
 def model_chat(prompt, image):
+    proc = processor_k
+    mdl = model_k
+    device_str = "cuda" if torch.cuda.is_available() else "cpu"
+    # encode image
+    inputs = proc(image, return_tensors="pt").to(device_str).pixel_values.half()
+    # encode prompt
+    pi = proc.tokenizer(f"<s>{prompt} <Answer/>", add_special_tokens=False, return_tensors="pt").to(device_str)
+    # build generation config
+    gen_cfg = GenerationConfig.from_model_config(mdl.config)
+    gen_cfg.max_length = 4096
+    gen_cfg.min_length = 1
+    gen_cfg.use_cache = True
+    gen_cfg.bad_words_ids = [[proc.tokenizer.unk_token_id]]
+    gen_cfg.num_beams = 1
+    gen_cfg.do_sample = False
+    gen_cfg.repetition_penalty = 1.1
+    out = mdl.generate(
+        pixel_values=inputs,
+        decoder_input_ids=pi.input_ids,
+        decoder_attention_mask=pi.attention_mask,
+        generation_config=gen_cfg,
         return_dict_in_generate=True,
     )
+    seq = proc.tokenizer.batch_decode(out.sequences, skip_special_tokens=False)[0]
+    return seq.replace(f"<s>{prompt} <Answer/>", "").replace("<pad>", "").replace("</s>", "").strip()
+def process_elements(layout_result, image):
     try:
+        elements = ast.literal_eval(layout_result)
     except:
+        elements = []
+    results, order = [], 0
     for bbox, label in elements:
+        x1, y1, x2, y2 = map(int, bbox)
+        crop = image.crop((x1, y1, x2, y2))
+        if crop.width == 0 or crop.height == 0:
             continue
+        if label == "text":
+            txt = model_chat("Read text in the image.", crop)
+        elif label == "table":
+            txt = model_chat("Parse the table in the image.", crop)
+        else:
+            txt = "[Figure]"
+        results.append({
+            "label": label,
+            "bbox": [x1, y1, x2, y2],
+            "text": txt.strip(),
+            "reading_order": order
+        })
+        order += 1
+    return results
+def generate_markdown(recog):
+    md = ""
+    for el in sorted(recog, key=lambda x: x["reading_order"]):
+        if el["label"] == "text":
+            md += el["text"] + "\n\n"
+        elif el["label"] == "table":
+            md += f"**Table:**\n{el['text']}\n\n"
+        else:
+            md += el["text"] + "\n\n"
+    return md.strip()
 def process_image_with_dolphin(image):
+    layout = model_chat("Parse the reading order of this document.", image)
+    elems  = process_elements(layout, image)
+    return generate_markdown(elems)
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     if model_name == "ByteDance-s-Dolphin":
         if image is None:
             yield "Please upload an image."
         else:
+            yield process_image_with_dolphin(image)
+        return
+    if model_name == "Nanonets-OCR-s":
+        proc, mdl = processor_m, model_m
+    elif model_name == "SmolDocling-256M-preview":
+        proc, mdl = processor_x, model_x
+    elif model_name == "MonkeyOCR-Recognition":
+        proc, mdl = processor_g, model_g
+    else:
+        yield "Invalid model selected."
+        return
+    if image is None:
+        yield "Please upload an image."
+        return
+    imgs = [image]
+    if model_name == "SmolDocling-256M-preview":
+        if any(tok in text for tok in ["OTSL", "code"]):
+            imgs = [add_random_padding(img) for img in imgs]
+        if any(tok in text for tok in ["OCR at text", "Identify element", "formula"]):
+            text = normalize_values(text, target_max=500)
+    messages = [
+        {"role":"user",
+         "content":[{"type":"image"} for _ in imgs] + [{"type":"text","text":text}]
         }
+    ]
+    prompt = proc.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = proc(text=prompt, images=imgs, return_tensors="pt").to(device)
+    gen_cfg = GenerationConfig.from_model_config(mdl.config)
+    gen_cfg.max_new_tokens      = max_new_tokens
+    gen_cfg.temperature         = temperature
+    gen_cfg.top_p               = top_p
+    gen_cfg.top_k               = top_k
+    gen_cfg.repetition_penalty  = repetition_penalty
+    gen_cfg.use_cache           = True
+    streamer = TextIteratorStreamer(proc, skip_prompt=True, skip_special_tokens=True)
+    gen_kwargs = {
+        **inputs,
+        "streamer": streamer,
+        "generation_config": gen_cfg,
+    }
+    thread = Thread(target=mdl.generate, kwargs=gen_kwargs)
+    thread.start()
+    buffer = ""
+    full_output = ""
+    for new_text in streamer:
+        full_output += new_text
+        buffer     += new_text.replace("<|im_end|>", "")
+        yield buffer
+    if model_name == "SmolDocling-256M-preview":
+        cleaned = full_output.replace("<end_of_utterance>", "").strip()
+        if any(tag in cleaned for tag in ["<doctag>","<otsl>","<code>","<chart>","<formula>"]):
+            if "<chart>" in cleaned:
+                cleaned = cleaned.replace("<chart>","<otsl>").replace("</chart>","</otsl>")
+                cleaned = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned)
+            tags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned], imgs)
+            doc = DoclingDocument.load_from_doctags(tags_doc, document_name="Document")
+            yield f"**MD Output:**\n\n{doc.export_to_markdown()}"
+        else:
+            yield cleaned
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     if model_name == "ByteDance-s-Dolphin":
+        if not video_path:
             yield "Please upload a video."
             return
+        md_list = []
+        for frame, _ in downsample_video(video_path):
+            md_list.append(process_image_with_dolphin(frame))
+        yield "\n\n".join(md_list)
+        return
+    if model_name == "Nanonets-OCR-s":
+        proc, mdl = processor_m, model_m
+    elif model_name == "SmolDocling-256M-preview":
+        proc, mdl = processor_x, model_x
+    elif model_name == "MonkeyOCR-Recognition":
+        proc, mdl = processor_g, model_g
     else:
+        yield "Invalid model selected."
+        return
+    if not video_path:
+        yield "Please upload a video."
+        return
+    frames = [f for f, _ in downsample_video(video_path)]
+    imgs = frames
+    if model_name == "SmolDocling-256M-preview":
+        if any(tok in text for tok in ["OTSL", "code"]):
+            imgs = [add_random_padding(img) for img in imgs]
+        if any(tok in text for tok in ["OCR at text", "Identify element", "formula"]):
+            text = normalize_values(text, target_max=500)
+    messages = [
+        {"role":"user",
+         "content":[{"type":"image"} for _ in imgs] + [{"type":"text","text":text}]
         }
+    ]
+    prompt = proc.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = proc(text=prompt, images=imgs, return_tensors="pt").to(device)
+    gen_cfg = GenerationConfig.from_model_config(mdl.config)
+    gen_cfg.max_new_tokens      = max_new_tokens
+    gen_cfg.temperature         = temperature
+    gen_cfg.top_p               = top_p
+    gen_cfg.top_k               = top_k
+    gen_cfg.repetition_penalty  = repetition_penalty
+    gen_cfg.use_cache           = True
+    streamer = TextIteratorStreamer(proc, skip_prompt=True, skip_special_tokens=True)
+    gen_kwargs = {
+        **inputs,
+        "streamer": streamer,
+        "generation_config": gen_cfg,
+    }
+    thread = Thread(target=mdl.generate, kwargs=gen_kwargs)
+    thread.start()
+    buff = ""
+    full = ""
+    for nt in streamer:
+        full += nt
+        buff += nt.replace("<|im_end|>", "")
+        yield buff
+# Gradio UI
 image_examples = [
     ["Convert this page to docling", "images/1.png"],
     ["OCR the image", "images/2.jpg"],
     ["Convert this page to docling", "images/3.png"],
 ]
 video_examples = [
     ["Explain the ad in detail", "example/1.mp4"],
     ["Identify the main actions in the coca cola ad...", "example/2.mp4"]
 }
 """
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("# **[Core OCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
     with gr.Row():
                 label="Select Model",
                 value="Nanonets-OCR-s"
             )
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],