Update app.py
app.py CHANGED
@@ -1,47 +1,31 @@
 import gradio as gr
 import torch
-from transformers import LlavaForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
-from
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
 
 # Configuration
-MODEL_ID = "llava-hf/llava-1.5-7b-hf"
+MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
 
-print(f"Loading {MODEL_ID}...")
-
-# 1. Load Model with Quantization (to save GPU memory)
-# We use 4-bit quantization so this can run on consumer GPUs (approx 6-8GB VRAM required)
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.float16
-)
+print(f"Loading {MODEL_ID}...")
 
+# 1. Load Model
+# We use bfloat16 (half precision) which is faster than 4-bit for small models
+# and fits easily in 16GB or even 8GB VRAM.
 try:
-
-
-
-
-        device_map="auto"
+    model = Qwen2VLForConditionalGeneration.from_pretrained(
+        MODEL_ID,
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
     )
+
+    # The min_pixels and max_pixels arguments help control resolution for speed
+    processor = AutoProcessor.from_pretrained(MODEL_ID, min_pixels=256*28*28, max_pixels=1280*28*28)
     print("Model loaded successfully!")
 except Exception as e:
     print(f"Error loading model: {e}")
-    print("Ensure you have a GPU available")
+    print("Ensure you have a GPU available.")
     exit()
 
-def format_prompt(image, history, message):
-    """
-    Formats the conversation history and new message into the template LLaVA expects.
-    Standard LLaVA 1.5 format: USER: <image>\n<prompt>\nASSISTANT:
-    """
-    prompt = ""
-
-    # Use the conversation history to build context (simplified for single-turn image focus)
-    # Note: Multi-turn chat with LLaVA can get heavy on context length,
-    # so we focus primarily on the current question + image.
-
-    prompt = f"USER: <image>\n{message}\nASSISTANT:"
-    return prompt
-
 def chat_response(message, history, image_input):
     """
     Main generation function called by Gradio.
@@ -49,42 +33,66 @@ def chat_response(message, history, image_input):
     if image_input is None:
         return "Please upload an image first to chat about it!"
 
-    #
-
+    # 2. Prepare the messages for Qwen2-VL
+    # Qwen expects a specific format: a list of messages with specific 'type' keys
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": image_input,  # Pass the PIL image directly
+                },
+                {"type": "text", "text": message},
+            ],
+        }
+    ]
 
-    #
-    #
-
-
-
-
+    # 3. Process inputs
+    # qwen_vl_utils helps process the image and text into tensors
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    image_inputs, video_inputs = process_vision_info(messages)
+
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
 
-    inputs
+    # Move inputs to the same device as the model
+    inputs = inputs.to(model.device)
 
-    #
-    #
-
-        **inputs,
+    # 4. Generate Response
+    # We limit max_new_tokens to 200 for speed
+    generated_ids = model.generate(
+        **inputs,
         max_new_tokens=200,
         do_sample=True,
         temperature=0.7,
         top_p=0.9
     )
 
-    #
-
-
-
-
-    response = decoded_output.split("ASSISTANT:")[-1].strip()
+    # 5. Decode output
+    # We trim the input tokens from the output to get only the new response
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
 
+    response = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
+
     return response
 
 # --- Gradio UI Setup ---
-with gr.Blocks(title="LLaVA Image Chat", theme=gr.themes.Soft()) as demo:
-
-    gr.Markdown("
-    gr.Markdown("Upload an image and ask questions about it using the LLaVA 1.5 Model.")
+with gr.Blocks(title="Qwen2-VL Chat", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🚀 Qwen2-VL-2B: Fast Image Chat")
+    gr.Markdown("Upload an image and ask questions. This 2B model is significantly faster than LLaVA-7B.")
 
     with gr.Row():
         with gr.Column(scale=1):
@@ -96,14 +104,12 @@ with gr.Blocks(title="LLaVA Image Chat", theme=gr.themes.Soft()) as demo:
             additional_inputs=[image_box],
             title="Chat",
             description="Ask about the uploaded image.",
-            # Examples must match the inputs: [text_message, image_input_value]
             examples=[
                 ["What is in this image?", None],
-                ["Describe the
-                ["
+                ["Describe the lighting.", None],
+                ["Read the text in the image.", None],
             ],
         )
 
 if __name__ == "__main__":
-    # queue() is required for generator/streaming interactions in some environments
     demo.queue().launch()
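Two pieces of the new inference path are easy to conflate: apply_chat_template renders the message list into the model's prompt string (with placeholder tokens where the vision features go), while process_vision_info pulls the actual PIL images back out of the messages for the processor. The sketch below exercises just that preprocessing step, without loading the 2B model. It assumes pip install qwen-vl-utils, a transformers release with Qwen2-VL support, and a placeholder image file example.jpg, none of which are part of this commit.

from PIL import Image
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# Same message shape chat_response builds: one user turn holding an image
# part and a text part. "example.jpg" is a placeholder, not a file in the repo.
messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": Image.open("example.jpg")},
        {"type": "text", "text": "What is in this image?"},
    ],
}]

# Render the chat template to a plain prompt string; tokenize=False keeps it
# inspectable so you can see where the vision placeholder tokens land.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)

# Pull the image(s) (and any videos) back out of the message list so the
# processor can pair them with the placeholders in `text`.
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(text=[text], images=image_inputs, videos=video_inputs,
                   padding=True, return_tensors="pt")
print(inputs.input_ids.shape)  # prompt length grows with image resolution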
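Two smaller details of the new version are worth a note. The min_pixels/max_pixels bounds are written as multiples of 28*28 because Qwen2-VL spends roughly one visual token per 28x28-pixel patch (a figure from the model card, not this commit), so the arguments effectively bound the image's token budget rather than its raw resolution. And the trimming step before decoding exists because generate returns the prompt and the completion concatenated; slicing each sequence at len(in_ids) strips the echoed prompt, which is what lets this version drop the old split("ASSISTANT:") hack. A quick check of the implied token budget:

# Visual-token budget implied by the processor settings.
PATCH_AREA = 28 * 28  # pixels covered by one visual token (model-card figure)

min_pixels = 256 * PATCH_AREA    # floor of the resize range
max_pixels = 1280 * PATCH_AREA   # ceiling of the resize range

print(min_pixels // PATCH_AREA, max_pixels // PATCH_AREA)  # -> 256 1280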