Spaces:

prithivMLmods
/

Multimodal-VLM-Thinking

Running on Zero

App Files Files Community

prithivMLmods committed on Mar 25

Commit

8de4827

verified ·

1 Parent(s): 808f211

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -16

app.py CHANGED Viewed

@@ -10,7 +10,6 @@ import io
 from threading import Thread
 from reportlab.lib.pagesizes import A4
 from reportlab.lib.styles import getSampleStyleSheet
-from reportlab.lib import colors
 from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer
 from reportlab.lib.units import inch
 from reportlab.pdfbase import pdfmetrics
@@ -64,6 +63,21 @@ def identify_and_save_blob(blob_path):
     except Exception as e:
         raise ValueError(f"An error occurred while processing the file: {e}")
 @spaces.GPU
 def qwen_inference(model_name, media_input, text_input=None):
     """Handles inference for the selected model."""
@@ -72,7 +86,6 @@ def qwen_inference(model_name, media_input, text_input=None):
     # Determine media type and obtain a file path if needed
     if isinstance(media_input, str):
-        # If the input is a file path, check extension
         media_path = media_input
         if media_path.endswith(tuple(image_extensions.keys())):
             media_type = "image"
@@ -83,13 +96,7 @@ def qwen_inference(model_name, media_input, text_input=None):
                 raise ValueError("Unsupported media type. Please upload a valid image.")
     else:
         # media_input is a PIL image (or numpy array) coming from gr.Image
-        if not isinstance(media_input, Image.Image):
-            # In case gr.Image returns a numpy array, convert it.
-            media_input = Image.fromarray(media_input)
-        # Save the image temporarily to disk
-        temp_filename = f"temp_{uuid.uuid4()}.png"
-        media_input.save(temp_filename)
-        media_path = temp_filename
         media_type = "image"
     messages = [
@@ -133,12 +140,13 @@ def qwen_inference(model_name, media_input, text_input=None):
 def format_plain_text(output_text):
     """Formats the output text as plain text without LaTeX delimiters."""
-    # Remove LaTeX delimiters and convert to plain text
     plain_text = output_text.replace("\\(", "").replace("\\)", "").replace("\\[", "").replace("\\]", "")
     return plain_text
-def generate_document(media_path, output_text, file_format, font_choice, font_size, line_spacing, alignment, image_size):
     """Generates a document with the input image and plain text output."""
     plain_text = format_plain_text(output_text)
     if file_format == "pdf":
         return generate_pdf(media_path, plain_text, font_choice, font_size, line_spacing, alignment, image_size)
@@ -248,7 +256,6 @@ with gr.Blocks(css=css) as demo:
     gr.Markdown("# Qwen2VL: Compact Vision & Language Processing")
     with gr.Tab(label="Image Input"):
         with gr.Row():
             with gr.Column():
                 model_choice = gr.Dropdown(
@@ -262,7 +269,6 @@ with gr.Blocks(css=css) as demo:
                 )
                 text_input = gr.Textbox(label="Question", placeholder="Ask a question about the image...")
                 submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
             with gr.Column():
                 output_text = gr.Textbox(label="Output Text", lines=10)
                 plain_text_output = gr.Textbox(label="Standardized Plain Text", lines=10)
@@ -347,12 +353,12 @@ with gr.Blocks(css=css) as demo:
                     label="Image Size"
                 )
                 file_format = gr.Radio(["pdf", "docx"], label="File Format", value="pdf")
         with gr.Row():
             get_document_btn = gr.Button(value="Get Document", elem_classes="download-btn")
         get_document_btn.click(
-            generate_document, [input_media, output_text, file_format, font_choice, font_size, line_spacing, alignment, image_size], gr.File(label="Download Document")
         )
 demo.launch(debug=True)

 from threading import Thread
 from reportlab.lib.pagesizes import A4
 from reportlab.lib.styles import getSampleStyleSheet
 from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer
 from reportlab.lib.units import inch
 from reportlab.pdfbase import pdfmetrics
     except Exception as e:
         raise ValueError(f"An error occurred while processing the file: {e}")
+def get_media_file(media_input):
+    """
+    Ensures that the media input is a file path.
+    If it is a PIL image, it saves it temporarily and returns the file path.
+    """
+    if isinstance(media_input, str):
+        return media_input  # Already a file path
+    else:
+        if not isinstance(media_input, Image.Image):
+            # Convert numpy array to PIL image if needed
+            media_input = Image.fromarray(media_input)
+        temp_filename = f"temp_{uuid.uuid4()}.png"
+        media_input.save(temp_filename)
+        return temp_filename
 @spaces.GPU
 def qwen_inference(model_name, media_input, text_input=None):
     """Handles inference for the selected model."""
     # Determine media type and obtain a file path if needed
     if isinstance(media_input, str):
         media_path = media_input
         if media_path.endswith(tuple(image_extensions.keys())):
             media_type = "image"
                 raise ValueError("Unsupported media type. Please upload a valid image.")
     else:
         # media_input is a PIL image (or numpy array) coming from gr.Image
+        media_path = get_media_file(media_input)
         media_type = "image"
     messages = [
 def format_plain_text(output_text):
     """Return ``output_text`` with LaTeX math delimiters stripped out.

     The inline delimiters (backslash + parenthesis) and display delimiters
     (backslash + square bracket) are deleted; all other characters,
     including the math content between the delimiters, are kept as-is.
     """
     stripped = output_text
     # Remove each opening/closing delimiter token in turn.
     for delimiter in ("\\(", "\\)", "\\[", "\\]"):
         stripped = stripped.replace(delimiter, "")
     return stripped
+def generate_document(media_input, output_text, file_format, font_choice, font_size, line_spacing, alignment, image_size):
     """Generates a document with the input image and plain text output."""
+    # Ensure media_input is a file path.
+    media_path = get_media_file(media_input)
     plain_text = format_plain_text(output_text)
     if file_format == "pdf":
         return generate_pdf(media_path, plain_text, font_choice, font_size, line_spacing, alignment, image_size)
     gr.Markdown("# Qwen2VL: Compact Vision & Language Processing")
     with gr.Tab(label="Image Input"):
         with gr.Row():
             with gr.Column():
                 model_choice = gr.Dropdown(
                 )
                 text_input = gr.Textbox(label="Question", placeholder="Ask a question about the image...")
                 submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
             with gr.Column():
                 output_text = gr.Textbox(label="Output Text", lines=10)
                 plain_text_output = gr.Textbox(label="Standardized Plain Text", lines=10)
                     label="Image Size"
                 )
                 file_format = gr.Radio(["pdf", "docx"], label="File Format", value="pdf")
         with gr.Row():
             get_document_btn = gr.Button(value="Get Document", elem_classes="download-btn")
         get_document_btn.click(
+            generate_document,
+            [input_media, output_text, file_format, font_choice, font_size, line_spacing, alignment, image_size],
+            gr.File(label="Download Document")
         )
 demo.launch(debug=True)