angkul07 committed on
Commit
c7ec752
·
verified ·
1 Parent(s): 2fd3d87

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -103
app.py CHANGED
@@ -1,110 +1,24 @@
 
1
  import gradio as gr
2
- import torch
3
- from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor
4
  from PIL import Image
5
 
6
- # Set device
7
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
8
- print(f"Using device: {device}")
 
 
9
 
10
- # Replace with your model's Hugging Face repo ID
11
- MODEL_ID = "angkul07/fashion_finetuned_Llama-3.2-11B-Vision"
 
 
 
12
 
13
- def load_model():
14
- """Load the fine-tuned vision language model from Hugging Face"""
15
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
16
- processor = AutoProcessor.from_pretrained(MODEL_ID)
17
- model = AutoModelForCausalLM.from_pretrained(
18
- MODEL_ID,
19
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
20
- device_map="auto" if torch.cuda.is_available() else None
21
- )
22
- return model, tokenizer, processor
23
 
24
- # Initialize model, tokenizer, and processor
25
- print("Loading model...")
26
- model, tokenizer, processor = load_model()
27
- print("Model loaded successfully!")
28
-
29
- def generate_response(image, prompt="What's in this image?", max_new_tokens=256, temperature=0.7):
30
- """Generate a response based on the uploaded image and optional prompt"""
31
- if image is None:
32
- return "Please upload an image."
33
-
34
- try:
35
- # Process the image and text inputs
36
- inputs = processor(
37
- text=prompt,
38
- images=image,
39
- return_tensors="pt"
40
- ).to(device)
41
-
42
- # Generate response
43
- with torch.no_grad():
44
- outputs = model.generate(
45
- **inputs,
46
- max_new_tokens=max_new_tokens,
47
- do_sample=True,
48
- temperature=temperature
49
- )
50
-
51
- # Decode the generated tokens
52
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
53
-
54
- # For some models, you might need to extract only the generated part,
55
- # removing the input prompt from the response
56
- if prompt in response:
57
- response = response.split(prompt, 1)[1].strip()
58
-
59
- return response
60
-
61
- except Exception as e:
62
- return f"Error generating response: {str(e)}"
63
-
64
- # Create Gradio interface
65
- with gr.Blocks(title="Llama-3.2-11B-Vision Interface") as demo:
66
- gr.Markdown("# Llama-3.2-11B-Vision Fine-tuned Model")
67
- gr.Markdown("Upload an image and get a description from the fine-tuned vision model.")
68
-
69
- with gr.Row():
70
- with gr.Column(scale=1):
71
- image_input = gr.Image(type="pil", label="Upload Image")
72
- prompt_input = gr.Textbox(label="Prompt (Optional)", value="What's in this image?", lines=2)
73
-
74
- with gr.Row():
75
- with gr.Column(scale=1):
76
- max_new_tokens = gr.Slider(minimum=10, maximum=512, value=256, step=1, label="Max New Tokens")
77
- with gr.Column(scale=1):
78
- temperature = gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
79
-
80
- submit_btn = gr.Button("Generate Response", variant="primary")
81
-
82
- with gr.Column(scale=1):
83
- output = gr.Textbox(label="Model Output", lines=10)
84
-
85
- # Set up the button click event
86
- submit_btn.click(
87
- fn=generate_response,
88
- inputs=[image_input, prompt_input, max_new_tokens, temperature],
89
- outputs=output
90
- )
91
-
92
- gr.Examples(
93
- examples=[
94
- ["sample_images/cat.jpg", "Describe this animal in detail"],
95
- ["sample_images/landscape.jpg", "What location might this be?"],
96
- ],
97
- inputs=[image_input, prompt_input]
98
- )
99
-
100
- gr.Markdown("### Instructions")
101
- gr.Markdown("""
102
- 1. Upload an image using the file selector
103
- 2. (Optional) Edit the prompt to ask something specific about the image
104
- 3. Adjust the generation parameters if needed
105
- 4. Click 'Generate Response' to get the model's output
106
- """)
107
-
108
- # Launch the app
109
  if __name__ == "__main__":
110
- demo.launch(share=True) # Set share=False if you don't want a public link
 
1
+ from unsloth import FastVisionModel
2
  import gradio as gr
 
 
3
  from PIL import Image
4
 
5
+ model, tokenizer = FastVisionModel.from_pretrained(
6
+ model_name = "angkul07/fashion_finetuned_Llama-3.2-11B-Vision",
7
+ load_in_4bit = True,
8
+ )
9
+ FastVisionModel.for_inference(model)
10
 
11
+ def predict(image):
12
+ # You may need to adjust this depending on your model's expected input/output
13
+ prompt = "Generate caption"
14
+ output = model.generate(image, prompt=prompt)
15
+ return output
16
 
17
+ iface = gr.Interface(
18
+ fn=predict,
19
+ inputs=gr.Image(type="pil"),
20
+ outputs="text"
21
+ )
 
 
 
 
 
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  if __name__ == "__main__":
24
+ iface.launch()