razaali10 committed on
Commit 035fa81 · verified · 1 Parent(s): b97295c

Upload 3 files

Files changed (3)
  1. ap.py +46 -0
  2. readme.md +16 -0
  3. requirements.txt +4 -0
ap.py ADDED
@@ -0,0 +1,46 @@
+ import streamlit as st
+ from PIL import Image
+ import torch
+ from transformers import MllamaForConditionalGeneration, AutoProcessor
+
+ # Load the model and processor once and cache them across Streamlit reruns
+ @st.cache_resource
+ def load_model_and_processor():
+     model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+     model = MllamaForConditionalGeneration.from_pretrained(
+         model_id, torch_dtype=torch.bfloat16, device_map="auto"
+     )
+     processor = AutoProcessor.from_pretrained(model_id)
+     return model, processor
+
+ model, processor = load_model_and_processor()
+
+ # Title and instructions
+ st.title("Llama 3.2 Vision-Instruct")
+ st.write("Upload an image and ask a question about it. The model will analyze the image and provide an answer.")
+
+ # File uploader for the image
+ uploaded_file = st.file_uploader("Upload an Image (JPG/PNG)", type=["jpg", "png", "jpeg"])
+
+ # Text input for the question
+ user_question = st.text_input("Enter your question about the image:")
+
+ # Process and respond
+ if uploaded_file and user_question:
+     # Display the uploaded image
+     image = Image.open(uploaded_file)
+     st.image(image, caption="Uploaded Image", use_column_width=True)
+
+     # Build a chat-style prompt that pairs the image with the question
+     messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": user_question}]}]
+     input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
+     inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to(model.device)
+
+     # Generate the response
+     with st.spinner("Processing..."):
+         output = model.generate(**inputs, max_new_tokens=30)
+         # Decode only the newly generated tokens, dropping the echoed prompt
+         response = processor.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
+
+     # Display the response
+     st.write(f"**Model's Response:** {response}")
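For sanity-checking the pipeline without the Streamlit UI, a minimal standalone sketch along these lines should behave the same; the image path and question are placeholders, and the gated model repo requires an accepted license on Hugging Face plus a GPU with enough memory for the 11B weights:

```python
# Minimal non-UI sketch of the same inference path (placeholder image and question).
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_id)

image = Image.open("example.jpg")  # placeholder path
messages = [{"role": "user", "content": [
    {"type": "image"},
    {"type": "text", "text": "What is in this image?"},
]}]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(image, prompt, add_special_tokens=False, return_tensors="pt").to(model.device)

output = model.generate(**inputs, max_new_tokens=30)
# Print only the newly generated tokens
print(processor.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))
```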
readme.md ADDED
@@ -0,0 +1,16 @@
+ # Llama 3.2 Vision-Instruct Space
+
+ This application uses the Llama-3.2-11B-Vision-Instruct model to answer questions about images.
+ Simply upload an image and type your question in the input field.
+
+ ## How It Works
+ - Upload an image in JPG or PNG format.
+ - Enter your question in the text input box.
+ - The model analyzes the image and returns a response.
+
+ ## Requirements
+ - Python 3.8 or later
+ - See `requirements.txt` for dependencies.
+
+ ## License
+ This application uses the Llama-3.2-11B-Vision-Instruct model under the Llama 3.2 Community License.
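To try the app locally with the three files above in one directory, the standard Streamlit workflow should work: install the dependencies with `pip install -r requirements.txt`, then launch the interface with `streamlit run ap.py`.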
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ torch
+ transformers>=4.45.0
+ Pillow
+ streamlit