Create app.py

app.py ADDED

@@ -0,0 +1,101 @@
import gradio as gr
import torch
from transformers import LlavaForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from PIL import Image

# Configuration
MODEL_ID = "llava-hf/llava-1.5-7b-hf"

print(f"Loading {MODEL_ID}... This may take a few minutes depending on your internet connection.")

# 1. Load Model with Quantization (to save GPU memory)
# We use 4-bit quantization so this can run on consumer GPUs (approx. 6-8 GB VRAM required)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)
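# Note: 4-bit loading needs the 'bitsandbytes' and 'accelerate' packages and,
# in practice, a CUDA-capable GPU (bitsandbytes 4-bit generally does not run on CPU).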

try:
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    model = LlavaForConditionalGeneration.from_pretrained(
        MODEL_ID,
        quantization_config=quantization_config,
        device_map="auto"
    )
    print("Model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Ensure you have a GPU available and 'bitsandbytes' installed.")
    raise SystemExit(1)

def format_prompt(image, history, message):
    """
    Formats the conversation history and new message into the template LLaVA expects.
    Standard LLaVA 1.5 format: USER: <image>\n<prompt>\nASSISTANT:
    """
    # The conversation history could be used to build multi-turn context, but
    # multi-turn chat with LLaVA gets heavy on context length, so we focus
    # on the current question + image (single-turn).
    prompt = f"USER: <image>\n{message}\nASSISTANT:"
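    # e.g. for message "What is in this image?" the prompt becomes:
    # "USER: <image>\nWhat is in this image?\nASSISTANT:"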
    return prompt

def chat_response(message, history, image_input):
    """
    Main generation function called by Gradio.
    """
    if image_input is None:
        return "Please upload an image first to chat about it!"

    # 1. Prepare text prompt
    prompt_text = format_prompt(image_input, history, message)

    # 2. Process inputs (Image + Text)
    # Converting the image to RGB is important, as some PNGs have alpha channels
    image = image_input.convert("RGB")

    inputs = processor(text=prompt_text, images=image, return_tensors="pt").to(model.device)
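    # 'inputs' now holds input_ids, attention_mask, and pixel_values,
    # all moved to the same device as the model.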

    # 3. Generate Response
    # max_new_tokens determines how long the answer can be
    output = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )
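    # do_sample=True with temperature/top_p gives varied phrasing between runs;
    # set do_sample=False for deterministic (greedy) answers.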

    # 4. Decode output
    decoded_output = processor.batch_decode(output, skip_special_tokens=True)[0]

    # The raw output contains the prompt, so we strip it out to get just the assistant's reply.
    # The prompt format is "USER: ... ASSISTANT:", so we split on "ASSISTANT:".
    response = decoded_output.split("ASSISTANT:")[-1].strip()
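    # An alternative (assuming generate's usual behavior of returning the prompt
    # tokens followed by the new tokens) is to slice off the prompt before decoding:
    # response = processor.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()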

    return response

# --- Gradio UI Setup ---

with gr.Blocks(title="LLaVA Image Chat", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🌋 LLaVA: Chat with Images")
    gr.Markdown("Upload an image and ask questions about it using the LLaVA 1.5 model.")

    with gr.Row():
        with gr.Column(scale=1):
            image_box = gr.Image(type="pil", label="Upload Image")

        with gr.Column(scale=2):
            chatbot = gr.ChatInterface(
                fn=chat_response,
                additional_inputs=[image_box],
                title="Chat",
                description="Ask about the uploaded image.",
                examples=["What is in this image?", "Describe the colors.", "Can you read the text in the image?"],
            )

if __name__ == "__main__":
    # queue() is required for generator/streaming interactions in some environments
    demo.queue().launch()
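
A quick way to sanity-check the model without the UI is a one-off generation in the same session (for example, a Python REPL after running the loading code above). This is a minimal sketch, not part of app.py: the "test.jpg" path is a placeholder, and it assumes the dependencies are installed, e.g. pip install gradio torch transformers bitsandbytes accelerate.

# Illustrative smoke test; reuses the 'processor' and 'model' defined above.
from PIL import Image

image = Image.open("test.jpg").convert("RGB")  # placeholder path (assumption)
prompt = "USER: <image>\nWhat is in this image?\nASSISTANT:"
inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=100)
print(processor.batch_decode(output, skip_special_tokens=True)[0]
      .split("ASSISTANT:")[-1].strip())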