# caption-gen / app.py — Streamlit image-captioning demo (Sher1988)
# Hugging Face Space revision: 49efe0e (verified)
import torch
import pandas as pd
import streamlit as st
from PIL import Image
from encoder import EncoderCNN
from decoder import DecoderRNN
from utils.vocab import Vocabulary
#from torchvision import transforms as T
from utils.helpers import VOCAB_PATH, CAPTIONS_PATH, IMAGE_DIR
from utils.transforms import transforms
from inference import sample_with_temp, sample
import sacrebleu
import os
from huggingface_hub import hf_hub_download
@st.cache_resource
def load_models():
    """Build the encoder/decoder pair, pull the trained weights from the
    Hugging Face Hub, and return everything the app needs for inference.

    Cached by Streamlit so the download and model construction happen once
    per server process.

    Returns:
        (encoder, decoder, vocab, device, captions) where `captions` is the
        ground-truth caption DataFrame loaded from CAPTIONS_PATH.
    """
    run_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Ground-truth captions plus the vocabulary built at training time.
    caption_df = pd.read_csv(CAPTIONS_PATH)
    vocabulary = Vocabulary(load_path=VOCAB_PATH)

    # Same sizes as training: 256-dim embeddings, 512 LSTM hidden units.
    cnn = EncoderCNN(256).to(run_device)
    rnn = DecoderRNN(len(vocabulary), 256, 512).to(run_device)

    # Fetch the trained checkpoints from the Hub.
    weights_repo = "Sher1988/image-classifier-weights"
    enc_ckpt = hf_hub_download(repo_id=weights_repo, filename="encoder.pth")
    dec_ckpt = hf_hub_download(repo_id=weights_repo, filename="decoder.pth")

    cnn.load_state_dict(torch.load(enc_ckpt, map_location=run_device))
    rnn.load_state_dict(torch.load(dec_ckpt, map_location=run_device))

    # Inference only — freeze dropout / batch-norm behaviour.
    cnn.eval()
    rnn.eval()
    return cnn, rnn, vocabulary, run_device, caption_df
# --- Sidebar: example-image picker ---
st.sidebar.header("Select an Example Image")
if os.path.exists(IMAGE_DIR):
    # Offer every image file found in the example directory.
    choices = [
        name
        for name in os.listdir(IMAGE_DIR)
        if name.lower().endswith(('.jpg', '.jpeg', '.png'))
    ]
    selected_img_name = st.sidebar.selectbox("Choose from Flickr8k:", ["None"] + choices)
    # Thumbnail preview of whatever was picked.
    if selected_img_name != "None":
        preview_path = os.path.join(IMAGE_DIR, selected_img_name)
        st.sidebar.image(
            Image.open(preview_path),
            caption="Sidebar Selection Preview",
            use_container_width=True,
        )
else:
    # No example directory — warn and fall back to "nothing selected".
    st.sidebar.warning("Image directory not found. Please check IMAGE_DIR path.")
    selected_img_name = "None"
# --- Main page: controls and image selection ---
encoder, decoder, vocab, device, captions = load_models()

act_caps = []
caption = ''

st.title("📸 AI Image Captioner")
temp = st.slider("Sampling Temperature", min_value=0.0, max_value=0.8, value=0.1, step=0.1)
st.info("Higher temperature = more creative/random. Lower temperature = more predictable.")
uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png", "jpeg"])

# Resolve which image to caption: an upload takes priority over the
# sidebar example; if neither is present both stay None.
img, img_name = None, None
if uploaded_file is not None:
    img = Image.open(uploaded_file).convert('RGB')
    img_name = uploaded_file.name
elif selected_img_name != "None":
    img = Image.open(os.path.join(IMAGE_DIR, selected_img_name)).convert('RGB')
    img_name = selected_img_name
# If we have an image (from either source), caption it and score the result.
if img is not None:
    st.image(img, caption=f'Selected: {img_name}', width=300)

    # Preprocess into the batched tensor shape the encoder expects (1, C, H, W).
    img_tensor = transforms(img).unsqueeze(0).to(device)

    # Ground-truth captions for this image (Flickr8k provides several per image).
    act_caps = captions[captions['image'] == img_name]['caption'].tolist()
    if act_caps:
        st.subheader("Actual Captions:")
        st.success(" \n".join(act_caps))
    else:
        st.info("No ground truth captions found for this image in the CSV.")

    with torch.no_grad():
        encoder_out = encoder(img_tensor)
        # Sample a caption with the slider-selected temperature.
        caption = sample_with_temp(encoder_out, decoder, vocab, temp=temp)

    st.subheader("Generated Caption:")
    st.success(caption)

    if act_caps:
        # sacrebleu.corpus_bleu takes hypotheses as a list of strings and
        # references as a list of reference *streams*, each stream the same
        # length as the hypothesis list. With one hypothesis and k references
        # the correct shape is [[ref_1], [ref_2], ..., [ref_k]]; passing
        # [act_caps] directly would be a single stream of k sentences and
        # mis-align hypothesis/reference lengths.
        hyps = [caption]  # renamed from `sys` to avoid shadowing the stdlib module
        refs = [[ref] for ref in act_caps]
        bleu = sacrebleu.corpus_bleu(hyps, refs)

        st.subheader("Evaluation Metrics:")
        st.metric(label="SacreBLEU Score", value=f"{bleu.score:.2f}")
        # Scale so a score of 50+ fills the bar.
        st.progress(min(bleu.score / 50, 1.0))

        # N-gram precision breakdown: bleu.precisions == [p1, p2, p3, p4].
        cols = st.columns(4)
        for i, p in enumerate(bleu.precisions):
            cols[i].markdown(f"{i+1}-gram")
            cols[i].write(f"{p:.1f}%")

        # Brief explanation of the metrics for non-expert users.
        with st.expander("What do these mean?"):
            st.write("""
- **1-gram**: Individual word accuracy (Vocabulary).
- **2-gram**: Fluency of word pairs.
- **4-gram**: Capturing longer phrases/sentence structure.
""")
    else:
        st.info("Upload an image from the Flickr8k set to see BLEU metrics.")
# --- About section: static project description ---
st.header('About this Project')
st.markdown("""
This AI model generates descriptive captions for uploaded images using a **ResNet50 + LSTM** architecture.
* **Encoder:** Pre-trained ResNet50 (Frozen) extracts high-level visual features.
* **Decoder:** A Long Short-Term Memory (LSTM) network trained for 10 epochs.
* **Dataset:** Trained on the **Flickr8k dataset** (8,000 images).
⚠️ **Note:** Because the model was trained on a specific, small-scale dataset with a frozen backbone, it performs satisfactorily on outdoor scenes, people, and animals. It may produce unexpected results for images significantly different from the Flickr8k distribution.
""")