yashsharmaa committed · verified
Commit 97708b6 · 1 Parent(s): 6a9740c

Upload 4 files

Files changed (4)
  1. app.py +31 -0
  2. models/caption.py +15 -0
  3. models/qna.py +27 -0
  4. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,31 @@
+ import streamlit as st
+ from PIL import Image
+ from models.caption import generate_caption
+ from models.qna import ask_question
+
+ st.set_page_config(page_title="🖼️ Image Caption & QnA", layout="centered")
+ st.title("🖼️ Visual Caption & 💬 QnA")
+
+ if "caption" not in st.session_state:
+     st.session_state.caption = ""
+
+ # Image upload and captioning section
+ uploaded_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
+
+ if uploaded_file:
+     image = Image.open(uploaded_file).convert("RGB")
+     st.image(image, caption="Uploaded Image", use_column_width=True)
+
+     with st.spinner("Generating caption..."):
+         st.session_state.caption = generate_caption(image)
+
+     st.success("Caption generated!")
+     st.markdown(f"**Caption:** _{st.session_state.caption}_")
+
+ # QnA section - persistent while caption exists
+ if st.session_state.caption:
+     question = st.text_input("Ask a question about the image caption")
+     if question:
+         with st.spinner("Thinking..."):
+             answer = ask_question(st.session_state.caption, question)
+         st.markdown(f"**Answer:** {answer}")
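Once the requirements are installed and HF_TOKEN is set, the app runs locally with `streamlit run app.py`. A minimal smoke test of the two model helpers without the Streamlit UI might look like the sketch below; the file name `sample.jpg` and the question string are placeholders, not part of this commit.

```python
# Hypothetical smoke test (not part of this commit). Assumes HF_TOKEN is set
# in the environment and a local image file named sample.jpg exists.
from PIL import Image

from models.caption import generate_caption
from models.qna import ask_question

image = Image.open("sample.jpg").convert("RGB")
caption = generate_caption(image)
print("Caption:", caption)
print("Answer:", ask_question(caption, "What is the main subject?"))
```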
models/caption.py ADDED
@@ -0,0 +1,15 @@
+ from transformers import BlipProcessor, BlipForConditionalGeneration
+ import torch
+
+ # Load BLIP model and processor once; fall back to CPU when no GPU is available
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+ model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
+
+ @torch.no_grad()
+ def generate_caption(image):
+     inputs = processor(images=image, return_tensors="pt").to(device)
+     output = model.generate(**inputs, max_new_tokens=50)
+     caption = processor.tokenizer.decode(output[0], skip_special_tokens=True)
+     return caption
+
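BLIP also supports conditional captioning, where a text prefix steers the generated caption. A sketch of a variant helper is below; it reuses the `processor`, `model`, and `device` globals from models/caption.py, and the default prefix string is only an example.

```python
# Hypothetical variant (not part of this commit): BLIP conditional captioning,
# where an optional text prefix steers the generated caption.
@torch.no_grad()
def generate_caption_with_prefix(image, prefix="a photography of"):
    inputs = processor(images=image, text=prefix, return_tensors="pt").to(device)
    output = model.generate(**inputs, max_new_tokens=50)
    return processor.tokenizer.decode(output[0], skip_special_tokens=True)
```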
models/qna.py ADDED
@@ -0,0 +1,27 @@
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import torch
+ import os
+
+ # ✅ Load Hugging Face token securely from environment
+ hf_token = os.getenv("HF_TOKEN")
+ if not hf_token:
+     raise ValueError("❌ HF_TOKEN environment variable not set. Please add it in Streamlit Cloud secrets.")
+
+ # LLaMA model ID
+ model_id = "meta-llama/Llama-3.2-1B-Instruct"
+
+ # Load tokenizer and model once; use CPU and float32 when no GPU is available
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
+ model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_token, torch_dtype=torch.float16 if device == "cuda" else torch.float32).to(device)
+
+ @torch.no_grad()
+ def ask_question(caption, question):
+     prompt = f"""Image Caption: {caption}
+
+ Question: {question}
+ Answer:"""
+     inputs = tokenizer(prompt, return_tensors="pt").to(device)
+     output = model.generate(**inputs, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id)
+     # Decode only the newly generated tokens so the prompt is not echoed back
+     return tokenizer.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True).strip()
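Llama-3.2-1B-Instruct is a chat-tuned model, so wrapping the request in the tokenizer's chat template usually yields cleaner answers than the raw completion prompt above. A sketch of an alternative `ask_question` body, reusing the same `tokenizer` and `model` globals (the function name is hypothetical):

```python
# Hypothetical alternative (not part of this commit): format the request with
# the model's chat template instead of a raw text-completion prompt.
@torch.no_grad()
def ask_question_chat(caption, question):
    messages = [{"role": "user",
                 "content": f"Image caption: {caption}\n\nQuestion: {question}"}]
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    output = model.generate(input_ids, max_new_tokens=50,
                            pad_token_id=tokenizer.eos_token_id)
    # Decode only the tokens generated after the prompt
    return tokenizer.decode(output[0][input_ids.shape[-1]:],
                            skip_special_tokens=True).strip()
```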
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ streamlit
+ transformers
+ torch
+ Pillow
+ accelerate
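None of these pins are versioned, and the Llama 3.2 checkpoints load only on fairly recent transformers releases (4.45 or newer, to the best of my knowledge), so a quick environment check before deploying can save a failed build. A minimal sketch:

```python
# Hypothetical pre-deploy check (not part of this commit): confirm package
# versions and GPU visibility before the app tries to load the models.
import torch
import transformers

print("transformers:", transformers.__version__)
print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
```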