# Hugging Face Space: Test006 / app.py (uploaded by AmandaPanda, commit 702152b verified)
# Caption Generator w/English-to-Spanish Translation
# A. Harper | ARIN 460 | December 2025
# Load into Hugging Face Space (using the Gradio Framework)
# Include requirements.txt file (list: gradio, pandas, torch, sentencepiece, tensorflow, Pillow, datasets, transformers)
# To run, navigate to the App tab. Click the red Generate button.
# The app will randomly select image, generate (English) caption,
# then generate Spanish translation.
# Import gradio - app framework
import gradio as gr
# Two image datasources are available.
# Minor adjustments (add/remove # to deactivate/activate) to switch between datasources.
# AA comments refer to images in the DataFrame / from Coco database
# BB comments refer to images stored in local Gradio app folder
# Import os and random to support random selection of image (from folder)
import os
import random
# Import pandas datasets, transformers, torch
import pandas as pd
from datasets import load_dataset
from transformers import (
BlipProcessor,
BlipForConditionalGeneration,
AutoTokenizer,
AutoModelForSeq2SeqLM,
MarianMTModel,
MarianTokenizer
)
from PIL import Image
import torch
# ---------------------------------------------------------------------------
# AA: COCO data source — load henryscheible/coco_val2014_tiny, keep a small
# 20-row sample, and hold it as a DataFrame (currently inactive in the
# captioning function; kept so the DF path can be re-enabled).
dataset = load_dataset("henryscheible/coco_val2014_tiny", split="validation")
samples = dataset.select(range(20))  # trim to a 20-row sample
df = pd.DataFrame(samples)           # DataFrame view of the sample

# BB: Local data source — scan the Photos folder for usable image files.
IMAGE_FOLDER = "Photos"
VALID_EXTS = (".jpg", ".jpeg", ".png")
image_paths = [
    os.path.join(IMAGE_FOLDER, name)
    for name in os.listdir(IMAGE_FOLDER)
    if name.lower().endswith(VALID_EXTS)
]

# BLIP model for English image captioning.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

# MarianMT model for translating the English caption to Spanish.
model_name = "Helsinki-NLP/opus-mt-en-es"
trans_tokenizer = MarianTokenizer.from_pretrained(model_name)
trans_model = MarianMTModel.from_pretrained(model_name)
#Configure captioning function
def caption_random_image():
    """Pick a random image, caption it in English with BLIP, then translate to Spanish.

    Returns:
        tuple: (PIL.Image.Image image, str English caption, str Spanish caption)

    Raises:
        RuntimeError: if no usable images were found in the local image folder.
    """
    # AA: pick random row - from DF (inactive; toggle per header notes)
    ##sample = df.sample(1).iloc[0]
    # BB: Pick a random image path - image from folder.
    # Guard first: random.choice on an empty list raises an opaque IndexError,
    # so fail with a message that names the folder instead.
    if not image_paths:
        raise RuntimeError(f"No images found in '{IMAGE_FOLDER}' folder.")
    img_path = random.choice(image_paths)
    # BB: Load into PIL; force RGB so the captioner always sees 3 channels
    image = Image.open(img_path).convert("RGB")
    # AA: Image - for DF (inactive)
    ##image = sample["image"]
    # Unconditional image captioning (no text prompt)
    inputs = processor(image, return_tensors="pt")
    out = model.generate(**inputs)
    caption_eng = processor.decode(out[0], skip_special_tokens=True)
    # Translate caption from English to Spanish with MarianMT
    trans_inputs = trans_tokenizer.encode(caption_eng, return_tensors="pt")
    trans_out = trans_model.generate(trans_inputs)
    caption_es = trans_tokenizer.decode(trans_out[0], skip_special_tokens=True)
    return image, caption_eng, caption_es
# Gradio UI: no inputs — clicking Generate runs caption_random_image and
# shows the chosen image plus both captions.
output_components = [
    gr.Image(type="pil", label="Random Image"),
    gr.Textbox(label="Caption (English)"),
    gr.Textbox(label="Caption (Spanish)"),
]
demo = gr.Interface(
    fn=caption_random_image,
    inputs=None,
    outputs=output_components,
    title="Image Captioning (with English to Spanish translation)",
    description="Selects a random image (from either the local folder or henryscheible/coco data subset); generates a BLIP caption; then translates the (English) caption to Spanish."
)
demo.launch()