Spaces:

angkul07
/

Fashion-Caption-Generator

Runtime error

App Files Files Community

angkul07 commited on May 7, 2025

Commit

c7ec752

verified ·

1 Parent(s): 2fd3d87

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -103

app.py CHANGED Viewed

@@ -1,110 +1,24 @@
 import gradio as gr
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor
 from PIL import Image
-# Set device
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-print(f"Using device: {device}")
-# Replace with your model's Hugging Face repo ID
-MODEL_ID = "angkul07/fashion_finetuned_Llama-3.2-11B-Vision"
-def load_model():
-    """Load the fine-tuned vision language model from Hugging Face"""
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    processor = AutoProcessor.from_pretrained(MODEL_ID)
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID,
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-        device_map="auto" if torch.cuda.is_available() else None
-    )
-    return model, tokenizer, processor
-# Initialize model, tokenizer, and processor
-print("Loading model...")
-model, tokenizer, processor = load_model()
-print("Model loaded successfully!")
-def generate_response(image, prompt="What's in this image?", max_new_tokens=256, temperature=0.7):
-    """Generate a response based on the uploaded image and optional prompt"""
-    if image is None:
-        return "Please upload an image."
-    try:
-        # Process the image and text inputs
-        inputs = processor(
-            text=prompt,
-            images=image,
-            return_tensors="pt"
-        ).to(device)
-        # Generate response
-        with torch.no_grad():
-            outputs = model.generate(
-                **inputs,
-                max_new_tokens=max_new_tokens,
-                do_sample=True,
-                temperature=temperature
-            )
-        # Decode the generated tokens
-        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        # For some models, you might need to extract only the generated part,
-        # removing the input prompt from the response
-        if prompt in response:
-            response = response.split(prompt, 1)[1].strip()
-        return response
-    except Exception as e:
-        return f"Error generating response: {str(e)}"
-# Create Gradio interface
-with gr.Blocks(title="Llama-3.2-11B-Vision Interface") as demo:
-    gr.Markdown("# Llama-3.2-11B-Vision Fine-tuned Model")
-    gr.Markdown("Upload an image and get a description from the fine-tuned vision model.")
-    with gr.Row():
-        with gr.Column(scale=1):
-            image_input = gr.Image(type="pil", label="Upload Image")
-            prompt_input = gr.Textbox(label="Prompt (Optional)", value="What's in this image?", lines=2)
-            with gr.Row():
-                with gr.Column(scale=1):
-                    max_new_tokens = gr.Slider(minimum=10, maximum=512, value=256, step=1, label="Max New Tokens")
-                with gr.Column(scale=1):
-                    temperature = gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
-            submit_btn = gr.Button("Generate Response", variant="primary")
-        with gr.Column(scale=1):
-            output = gr.Textbox(label="Model Output", lines=10)
-    # Set up the button click event
-    submit_btn.click(
-        fn=generate_response,
-        inputs=[image_input, prompt_input, max_new_tokens, temperature],
-        outputs=output
-    )
-    gr.Examples(
-        examples=[
-            ["sample_images/cat.jpg", "Describe this animal in detail"],
-            ["sample_images/landscape.jpg", "What location might this be?"],
-        ],
-        inputs=[image_input, prompt_input]
-    )
-    gr.Markdown("### Instructions")
-    gr.Markdown("""
-    1. Upload an image using the file selector
-    2. (Optional) Edit the prompt to ask something specific about the image
-    3. Adjust the generation parameters if needed
-    4. Click 'Generate Response' to get the model's output
-    """)
-# Launch the app
 if __name__ == "__main__":
-    demo.launch(share=True)  # Set share=False if you don't want a public link

+from unsloth import FastVisionModel
 import gradio as gr
 from PIL import Image
+model, tokenizer = FastVisionModel.from_pretrained(
+    model_name = "angkul07/fashion_finetuned_Llama-3.2-11B-Vision",
+    load_in_4bit = True,
+)
+FastVisionModel.for_inference(model)
+def predict(image):
+    # You may need to adjust this depending on your model's expected input/output
+    prompt = "Generate caption"
+    output = model.generate(image, prompt=prompt)
+    return output
+iface = gr.Interface(
+    fn=predict,
+    inputs=gr.Image(type="pil"),
+    outputs="text"
+)
 if __name__ == "__main__":
+    iface.launch()