Multimodal-OCR2

Build error

App Files Community

prithivMLmods committed on Sep 25

Commit

66c74a2

verified ·

1 Parent(s): 2acc319

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -30

app.py CHANGED Viewed

@@ -188,21 +188,25 @@ def generate_response(
 ):
     """Unified generation function for both image and video."""
     if media_file is None:
-        yield "Please upload an image or video file first.", "Please upload an image or video file first."
         return
     processor, model = get_model_and_processor(model_name)
     if not processor or not model:
-        yield "Invalid model selected.", "Invalid model selected."
         return
     media_type = "video" if is_video_file(media_file) else "image"
-    if media_type == "video":
-        frames = downsample_video(media_file)
-        images = [frame for frame, _ in frames]
-    else: # image
-        images = [Image.open(media_file)]
     if model_name == "SmolDocling-256M-preview":
         if "OTSL" in query or "code" in query:
@@ -273,17 +277,24 @@ body, .gradio-container { font-family: 'Inter', sans-serif; }
 def handle_file_upload(file):
     if file is None:
-        return None, gr.update(visible=False)
     if is_video_file(file.name):
-        return gr.update(value=file.name, visible=False), gr.update(value=file.name, visible=True)
     else:
-        return gr.update(value=file.name, visible=True), gr.update(value=file.name, visible=False)
 def clear_all():
-    return None, None, None, ""
 with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
-    # Hidden state to store the path to the uploaded file
     media_file_path = gr.State(None)
     with gr.Row(elem_classes="main-container"):
@@ -312,25 +323,29 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
         with gr.Column(scale=4):
             gr.Markdown("# Multimodal OCR")
-            # --- Media Display Area ---
             with gr.Column(elem_classes="media-display"):
                 image_display = gr.Image(type="filepath", label="Image Preview", visible=False)
                 video_display = gr.Video(label="Video Preview", visible=False)
                 gr.Markdown("Upload an image or video to begin.")
-            # --- Examples ---
             gr.Examples(
                 examples=all_examples,
-                inputs=[media_file_path, "query_input"],
                 label="Examples (Click to run)",
-                fn=handle_file_upload, # Custom function to update media display
-                outputs=[image_display, video_display]
             )
-            # --- Chat/Output Window ---
             output_display = gr.Markdown(elem_classes="chat-window", value="### Output will be shown here")
-            # --- Input Bar ---
             with gr.Row(elem_classes="input-bar", vertical=False):
                 upload_btn = gr.UploadButton("📁 Add Files", file_types=["image", "video"])
                 model_dropdown = gr.Dropdown(
@@ -338,22 +353,15 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
                     label="Select Model",
                     value="Nanonets-OCR-s"
                 )
-                query_input = gr.Textbox(
-                    placeholder="Enter your query here...",
-                    show_label=False,
-                    scale=4,
-                )
                 submit_btn = gr.Button("▶", elem_classes="submit-button")
     # --- Event Handlers ---
     upload_btn.upload(
         fn=handle_file_upload,
         inputs=[upload_btn],
-        outputs=[image_display, video_display]
     )
-    # When file is uploaded, also store its path in the state
-    upload_btn.upload(lambda f: f.name if f else None, upload_btn, media_file_path)
     submit_btn.click(
         fn=generate_response,
@@ -363,7 +371,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
     add_conv_btn.click(
         fn=clear_all,
-        outputs=[media_file_path, image_display, video_display, output_display]
     )

 ):
     """Unified generation function for both image and video."""
     if media_file is None:
+        yield "Please upload an image or video file first."
         return
     processor, model = get_model_and_processor(model_name)
     if not processor or not model:
+        yield "Invalid model selected."
         return
     media_type = "video" if is_video_file(media_file) else "image"
+    try:
+        if media_type == "video":
+            frames = downsample_video(media_file)
+            images = [frame for frame, _ in frames]
+        else: # image
+            images = [Image.open(media_file)]
+    except Exception as e:
+        yield f"Error processing file: {e}"
+        return
     if model_name == "SmolDocling-256M-preview":
         if "OTSL" in query or "code" in query:
 def handle_file_upload(file):
     if file is None:
+        return None, gr.update(visible=False), gr.update(visible=False)
     if is_video_file(file.name):
+        return file.name, gr.update(visible=False), gr.update(value=file.name, visible=True)
+    else:
+        return file.name, gr.update(value=file.name, visible=True), gr.update(visible=False)
+def handle_example_click(file_path, query):
+    if is_video_file(file_path):
+        # Update state, hide image, show video, update query
+        return file_path, gr.update(visible=False), gr.update(value=file_path, visible=True), query
     else:
+        # Update state, show image, hide video, update query
+        return file_path, gr.update(value=file_path, visible=True), gr.update(visible=False), query
 def clear_all():
+    return None, None, None, "### Output will be shown here", ""
 with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
     media_file_path = gr.State(None)
     with gr.Row(elem_classes="main-container"):
         with gr.Column(scale=4):
             gr.Markdown("# Multimodal OCR")
             with gr.Column(elem_classes="media-display"):
                 image_display = gr.Image(type="filepath", label="Image Preview", visible=False)
                 video_display = gr.Video(label="Video Preview", visible=False)
                 gr.Markdown("Upload an image or video to begin.")
+            # Define query_input here so gr.Examples can reference it
+            query_input = gr.Textbox(
+                placeholder="Enter your query here...",
+                show_label=False,
+                scale=4,
+            )
             gr.Examples(
                 examples=all_examples,
+                inputs=[media_file_path, query_input], # Pass component objects
+                outputs=[media_file_path, image_display, video_display, query_input],
+                fn=handle_example_click,
                 label="Examples (Click to run)",
+                cache_examples=True
             )
             output_display = gr.Markdown(elem_classes="chat-window", value="### Output will be shown here")
             with gr.Row(elem_classes="input-bar", vertical=False):
                 upload_btn = gr.UploadButton("📁 Add Files", file_types=["image", "video"])
                 model_dropdown = gr.Dropdown(
                     label="Select Model",
                     value="Nanonets-OCR-s"
                 )
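+                # NOTE: query_input is created above (before gr.Examples) so the examples can reference it; nothing in this row re-renders it, so it appears where it was created, not in this input bar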
                 submit_btn = gr.Button("▶", elem_classes="submit-button")
     # --- Event Handlers ---
     upload_btn.upload(
         fn=handle_file_upload,
         inputs=[upload_btn],
+        outputs=[media_file_path, image_display, video_display]
     )
     submit_btn.click(
         fn=generate_response,
     add_conv_btn.click(
         fn=clear_all,
+        outputs=[media_file_path, image_display, video_display, output_display, query_input]
     )