yashsharmaa committed · verified
Commit 97708b6 · 1 Parent(s): 6a9740c

Upload 4 files

Files changed (4)
  1. app.py +31 -0
  2. models/caption.py +15 -0
  3. models/qna.py +27 -0
  4. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,31 @@
+ import streamlit as st
+ from PIL import Image
+ from models.caption import generate_caption
+ from models.qna import ask_question
+
+ st.set_page_config(page_title="🖼️ Image Caption & QnA", layout="centered")
+ st.title("🖼️ Visual Caption & 💬 QnA")
+
+ if "caption" not in st.session_state:
+     st.session_state.caption = ""
+
+ # Image upload and captioning section
+ uploaded_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
+
+ if uploaded_file:
+     image = Image.open(uploaded_file).convert("RGB")
+     st.image(image, caption="Uploaded Image", use_column_width=True)
+
+     with st.spinner("Generating caption..."):
+         st.session_state.caption = generate_caption(image)
+
+     st.success("Caption generated!")
+     st.markdown(f"**Caption:** _{st.session_state.caption}_")
+
+ # QnA section - persistent while caption exists
+ if st.session_state.caption:
+     question = st.text_input("Ask a question about the image caption")
+     if question:
+         with st.spinner("Thinking..."):
+             answer = ask_question(st.session_state.caption, question)
+         st.markdown(f"**Answer:** {answer}")
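Once the requirements are installed and HF_TOKEN is set, the app runs locally with `streamlit run app.py`. A minimal smoke test of the two model helpers without the Streamlit UI might look like the sketch below; the file name `sample.jpg` and the question string are placeholders, not part of this commit.

```python
# Hypothetical smoke test (not part of this commit). Assumes HF_TOKEN is set
# in the environment and a local image file named sample.jpg exists.
from PIL import Image

from models.caption import generate_caption
from models.qna import ask_question

image = Image.open("sample.jpg").convert("RGB")
caption = generate_caption(image)
print("Caption:", caption)
print("Answer:", ask_question(caption, "What is the main subject?"))
```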
models/caption.py ADDED
@@ -0,0 +1,15 @@
+ from transformers import BlipProcessor, BlipForConditionalGeneration
+ import torch
+
+ # Load BLIP model and processor once; fall back to CPU when no GPU is available
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+ model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
+
+ @torch.no_grad()
+ def generate_caption(image):
+     inputs = processor(images=image, return_tensors="pt").to(device)
+     output = model.generate(**inputs, max_new_tokens=50)
+     caption = processor.tokenizer.decode(output[0], skip_special_tokens=True)
+     return caption
+
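BLIP also supports conditional captioning, where a text prefix steers the generated caption. A sketch of a variant helper is below; it reuses the `processor`, `model`, and `device` globals from models/caption.py, and the default prefix string is only an example.

```python
# Hypothetical variant (not part of this commit): BLIP conditional captioning,
# where an optional text prefix steers the generated caption.
@torch.no_grad()
def generate_caption_with_prefix(image, prefix="a photography of"):
    inputs = processor(images=image, text=prefix, return_tensors="pt").to(device)
    output = model.generate(**inputs, max_new_tokens=50)
    return processor.tokenizer.decode(output[0], skip_special_tokens=True)
```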
models/qna.py ADDED
@@ -0,0 +1,27 @@
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import torch
+ import os
+
+ # ✅ Load Hugging Face token securely from environment
+ hf_token = os.getenv("HF_TOKEN")
+ if not hf_token:
+     raise ValueError("❌ HF_TOKEN environment variable not set. Please add it in Streamlit Cloud secrets.")
+
+ # LLaMA model ID
+ model_id = "meta-llama/Llama-3.2-1B-Instruct"
+
+ # Load tokenizer and model once; use CPU and float32 when no GPU is available
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
+ model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_token, torch_dtype=torch.float16 if device == "cuda" else torch.float32).to(device)
+
+ @torch.no_grad()
+ def ask_question(caption, question):
+     prompt = f"""Image Caption: {caption}
+
+ Question: {question}
+ Answer:"""
+     inputs = tokenizer(prompt, return_tensors="pt").to(device)
+     output = model.generate(**inputs, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id)
+     # Decode only the newly generated tokens so the prompt is not echoed back
+     return tokenizer.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True).strip()
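Llama-3.2-1B-Instruct is a chat-tuned model, so wrapping the request in the tokenizer's chat template usually yields cleaner answers than the raw completion prompt above. A sketch of an alternative `ask_question` body, reusing the same `tokenizer` and `model` globals (the function name is hypothetical):

```python
# Hypothetical alternative (not part of this commit): format the request with
# the model's chat template instead of a raw text-completion prompt.
@torch.no_grad()
def ask_question_chat(caption, question):
    messages = [{"role": "user",
                 "content": f"Image caption: {caption}\n\nQuestion: {question}"}]
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    output = model.generate(input_ids, max_new_tokens=50,
                            pad_token_id=tokenizer.eos_token_id)
    # Decode only the tokens generated after the prompt
    return tokenizer.decode(output[0][input_ids.shape[-1]:],
                            skip_special_tokens=True).strip()
```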
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ streamlit
+ transformers
+ torch
+ Pillow
+ accelerate
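None of these pins are versioned, and the Llama 3.2 checkpoints load only on fairly recent transformers releases (4.45 or newer, to the best of my knowledge), so a quick environment check before deploying can save a failed build. A minimal sketch:

```python
# Hypothetical pre-deploy check (not part of this commit): confirm package
# versions and GPU visibility before the app tries to load the models.
import torch
import transformers

print("transformers:", transformers.__version__)
print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
```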