Update app.py

app.py
CHANGED
@@ -27,6 +27,7 @@ from docling_core.types.doc import DoclingDocument, DocTagsDocument
 import re
 import ast
 import html
+import urllib.parse
 
 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 2048
@@ -44,7 +45,6 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-#-----------------------------subfolder-----------------------------#
 # Load MonkeyOCR
 MODEL_ID_G = "echo840/MonkeyOCR"
 SUBFOLDER = "Recognition"
@@ -59,7 +59,6 @@ model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     subfolder=SUBFOLDER,
     torch_dtype=torch.float16
 ).to(device).eval()
-#-----------------------------subfolder-----------------------------#
 
 # Load Typhoon-OCR-7B
 MODEL_ID_L = "scb10x/typhoon-ocr-7b"
@@ -133,7 +132,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for image input using the selected model."""
-    # Model selection
     if model_name == "Nanonets-OCR-s":
         processor = processor_m
         model = model_m
@@ -154,17 +152,14 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         yield "Please upload an image.", "Please upload an image."
         return
 
-    # Prepare images as a list (single image for image inference)
     images = [image]
 
-    # SmolDocling-256M specific preprocessing
     if model_name == "SmolDocling-256M-preview":
         if "OTSL" in text or "code" in text:
             images = [add_random_padding(img) for img in images]
         if "OCR at text at" in text or "Identify element" in text or "formula" in text:
             text = normalize_values(text, target_max=500)
 
-    # Unified message structure for all models
     messages = [
         {
             "role": "user",
@@ -176,7 +171,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
 
-    # Generation with streaming
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
@@ -190,13 +184,11 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
-    # Stream output
     buffer = ""
     for new_text in streamer:
         buffer += new_text.replace("<|im_end|>", "")
         yield buffer, buffer
 
-    # SmolDocling-256M specific postprocessing
     if model_name == "SmolDocling-256M-preview":
         cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
         if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
@@ -218,7 +210,6 @@ def generate_video(model_name: str, text: str, video_path: str,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for video input using the selected model."""
-    # Model selection
     if model_name == "Nanonets-OCR-s":
         processor = processor_m
         model = model_m
@@ -239,18 +230,15 @@ def generate_video(model_name: str, text: str, video_path: str,
         yield "Please upload a video.", "Please upload a video."
         return
 
-    # Extract frames from video
     frames = downsample_video(video_path)
     images = [frame for frame, _ in frames]
 
-    # SmolDocling-256M specific preprocessing
     if model_name == "SmolDocling-256M-preview":
         if "OTSL" in text or "code" in text:
             images = [add_random_padding(img) for img in images]
         if "OCR at text at" in text or "Identify element" in text or "formula" in text:
             text = normalize_values(text, target_max=500)
 
-    # Unified message structure for all models
     messages = [
         {
             "role": "user",
@@ -262,7 +250,6 @@ def generate_video(model_name: str, text: str, video_path: str,
     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
 
-    # Generation with streaming
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
@@ -276,13 +263,11 @@ def generate_video(model_name: str, text: str, video_path: str,
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
-    # Stream output
     buffer = ""
     for new_text in streamer:
         buffer += new_text.replace("<|im_end|>", "")
         yield buffer, buffer
 
-    # SmolDocling-256M specific postprocessing
     if model_name == "SmolDocling-256M-preview":
         cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
         if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
@@ -313,20 +298,84 @@ video_examples = [
     ["Explain the video in detail.", "videos/2.mp4"]
 ]
 
-#
-
-.
-
-
-
-
-
-
-
-
-
-
-
+# SVG data URL for the button icon
+svg_code = '''
+<svg fill="none" stroke="currentColor" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg" stroke-linecap="round" stroke-linejoin="round" stroke-width="2.5">
+    <polyline points="13.18 1.37 13.18 9.64 21.45 9.64 10.82 22.63 10.82 14.36 2.55 14.36 13.18 1.37"></polyline>
+</svg>
+'''
+svg_data_url = 'data:image/svg+xml,' + urllib.parse.quote(svg_code)
+
+# Updated CSS with fancy-button styles
+css = f"""
+.fancy-button {{
+    --round: 0.75rem;
+    cursor: pointer;
+    position: relative;
+    display: inline-flex;
+    align-items: center;
+    justify-content: center;
+    overflow: hidden;
+    transition: all 0.25s ease;
+    background: radial-gradient(
+        65.28% 65.28% at 50% 100%,
+        rgba(223, 113, 255, 0.8) 0%,
+        rgba(223, 113, 255, 0) 100%
+    ),
+    linear-gradient(0deg, #7a5af8, #7a5af8);
+    border-radius: var(--round);
+    border: none;
+    outline: none;
+    padding: 12px 18px 12px 40px;
+    color: white;
+    font-size: 16px;
+    font-weight: 500;
+}}
+
+.fancy-button::before {{
+    content: '';
+    position: absolute;
+    left: 10px;
+    top: 50%;
+    transform: translateY(-50%);
+    width: 18px;
+    height: 18px;
+    background: url('{svg_data_url}') no-repeat center;
+    background-size: contain;
+}}
+
+.fancy-button::after {{
+    content: '';
+    position: absolute;
+    top: 0;
+    right: 0;
+    width: 1rem;
+    height: 1rem;
+    background: radial-gradient(
+        100% 75% at 55%,
+        rgba(223, 113, 255, 0.8) 0%,
+        rgba(223, 113, 255, 0) 100%
+    );
+    box-shadow: 0 0 3px black;
+    border-bottom-left-radius: 0.5rem;
+    border-top-right-radius: var(--round);
+    transition: all 0.5s ease-in-out;
+}}
+
+.fancy-button:hover::after {{
+    margin-top: -1rem;
+    margin-right: -1rem;
+}}
+
+.fancy-button:active {{
+    transform: scale(0.95);
+}}
+
+.canvas-output {{
+    border: 2px solid #4682B4;
+    border-radius: 10px;
+    padding: 20px;
+}}
 """
 
 # Create the Gradio Interface
@@ -338,7 +387,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
             with gr.TabItem("Image Inference"):
                 image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                 image_upload = gr.Image(type="pil", label="Image")
-                image_submit = gr.Button("Submit", elem_classes="
+                image_submit = gr.Button("Submit", elem_classes="fancy-button")
                 gr.Examples(
                     examples=image_examples,
                     inputs=[image_query, image_upload]
@@ -346,7 +395,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
             with gr.TabItem("Video Inference"):
                 video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                 video_upload = gr.Video(label="Video")
-                video_submit = gr.Button("Submit", elem_classes="
+                video_submit = gr.Button("Submit", elem_classes="fancy-button")
                 gr.Examples(
                     examples=video_examples,
                     inputs=[video_query, video_upload]
@@ -354,12 +403,11 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         with gr.Accordion("Advanced options", open=False):
             max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
             temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
-            top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.
+            top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.
             top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
             repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
 
         with gr.Column():
-            # Result Canvas with raw and formatted outputs
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
                 raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
@@ -380,7 +428,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
     gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
 
-    # Connect submit buttons to generation functions with both outputs
     image_submit.click(
        fn=generate_image,
        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
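
A note on the new svg_data_url line: percent-encoding an inline SVG with urllib.parse.quote() is the standard way to embed an icon in CSS as a data: URL, so the button icon needs no extra file or HTTP request. Below is a minimal, self-contained sketch of just that step; the print line and its output comment are illustrative, while the SVG is the commit's own icon verbatim.

import urllib.parse

# The commit's lightning-bolt icon, verbatim.
svg_code = '''
<svg fill="none" stroke="currentColor" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg" stroke-linecap="round" stroke-linejoin="round" stroke-width="2.5">
    <polyline points="13.18 1.37 13.18 9.64 21.45 9.64 10.82 22.63 10.82 14.36 2.55 14.36 13.18 1.37"></polyline>
</svg>
'''

# quote() percent-encodes '<', '>', '"', '=', and newlines, any of which
# would otherwise terminate or corrupt the surrounding CSS url(...) value.
svg_data_url = 'data:image/svg+xml,' + urllib.parse.quote(svg_code)

print(svg_data_url[:48])  # data:image/svg+xml,%0A%3Csvg%20fill%3D%22none%22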
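The fancy-button class only takes effect through two hooks already present in this file: gr.Blocks(css=css, ...) injects the stylesheet into the page, and elem_classes="fancy-button" attaches the class to the rendered button. The old truncated line, gr.Button("Submit", elem_classes=", was an unterminated string literal and a likely cause of the Space's runtime error. A minimal sketch of the wiring, with placeholder CSS and a placeholder click handler rather than the Space's models:

import gradio as gr

# Placeholder stylesheet; the Space's css string carries the full fancy-button rules.
css = """
.fancy-button { background: #7a5af8; color: white; border-radius: 0.75rem; }
"""

with gr.Blocks(css=css) as demo:
    query = gr.Textbox(label="Query Input")
    # elem_classes adds the CSS class to the component's HTML element.
    submit = gr.Button("Submit", elem_classes="fancy-button")
    output = gr.Textbox(label="Output")
    submit.click(fn=lambda q: q.upper(), inputs=query, outputs=output)

demo.launch()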
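Unchanged context in both generate functions shows the streaming pattern the app relies on: model.generate() blocks, so it runs on a background Thread while a TextIteratorStreamer yields decoded text on the caller's thread. A minimal text-only sketch of the same pattern; the model ID is illustrative (the Space itself pairs the streamer with vision-language processors):

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # illustrative; not one of the Space's models
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("Describe OCR in one sentence.", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() runs in the background; the streamer feeds this thread as tokens arrive.
thread = Thread(target=model.generate,
                kwargs={**inputs, "streamer": streamer, "max_new_tokens": 64})
thread.start()

buffer = ""
for new_text in streamer:  # each item is a decoded chunk of new text
    buffer += new_text
    print(buffer)
thread.join()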