Spaces:

sflindrs
/

vlm_comparer

Running

App Files Community

sflindrs committed on Feb 11

Commit

6133d17

verified ·

1 Parent(s): d404fc4

Update app.py

Browse files

Files changed (1) hide show

app.py +1 -6

app.py CHANGED Viewed

@@ -30,7 +30,6 @@ def resolve_model(chosen, custom):
         return chosen
 # --- Main inference function ---
-# If you are using ZeroGPU on Hugging Face Spaces, make sure to set the environment variable USE_GPU=1.
 # The @spaces.GPU() decorator ensures that heavy inference runs on GPU in a ZeroGPU Space.
 @spaces.GPU()
 def compare_image_to_text_models(image, prompt, model1_choice, model1_custom, model2_choice, model2_custom):
@@ -42,18 +41,15 @@ def compare_image_to_text_models(image, prompt, model1_choice, model1_custom, mo
     device = 0 if os.environ.get("USE_GPU", "0") == "1" else -1
     # Create pipelines for image-to-text.
-    # Note: Many instruction-following image models (e.g. BLIP2) accept a text prompt along with an image.
-    # We use the "image-to-text" task here so that the prompt is taken into account.
     pipe1 = pipeline("image-to-text", model=model1_name, device=device)
     pipe2 = pipeline("image-to-text", model=model2_name, device=device)
     # Run inference on the image with the provided prompt.
-    # Depending on the model, the call signature may vary; here we assume a simple call with (image, prompt).
     output1 = pipe1(image, prompt)
     output2 = pipe2(image, prompt)
     # Extract the generated text.
-    # (Many pipelines return a list of dicts with key 'generated_text'; if not, we simply convert the output to a string.)
     def extract_text(output):
         if isinstance(output, list) and len(output) > 0 and isinstance(output[0], dict) and "generated_text" in output[0]:
             return output[0]["generated_text"]
@@ -70,7 +66,6 @@ def compare_image_to_text_models(image, prompt, model1_choice, model1_custom, mo
     return chat1, chat2
 # --- Build the Gradio interface ---
-# Pre-populated sample prompt.
 sample_prompt = "Describe the image in explicit detail. Return a nested JSON object in response."
 with gr.Blocks(title="Image Text-to-Text Comparison Tool") as demo:

         return chosen
 # --- Main inference function ---
 # The @spaces.GPU() decorator ensures that heavy inference runs on GPU in a ZeroGPU Space.
 @spaces.GPU()
 def compare_image_to_text_models(image, prompt, model1_choice, model1_custom, model2_choice, model2_custom):
     device = 0 if os.environ.get("USE_GPU", "0") == "1" else -1
     # Create pipelines for image-to-text.
+    # These models should support a call signature of (image, prompt)
     pipe1 = pipeline("image-to-text", model=model1_name, device=device)
     pipe2 = pipeline("image-to-text", model=model2_name, device=device)
     # Run inference on the image with the provided prompt.
     output1 = pipe1(image, prompt)
     output2 = pipe2(image, prompt)
     # Extract the generated text.
     def extract_text(output):
         if isinstance(output, list) and len(output) > 0 and isinstance(output[0], dict) and "generated_text" in output[0]:
             return output[0]["generated_text"]
     return chat1, chat2
 # --- Build the Gradio interface ---
 sample_prompt = "Describe the image in explicit detail. Return a nested JSON object in response."
 with gr.Blocks(title="Image Text-to-Text Comparison Tool") as demo: