# Caption Generator w/English-to-Spanish Translation # A. Harper | ARIN 460 | December 2025 # Load into Hugging Face Space (using the Gradio Framework) # Include requirements.txt file (list: gradio, pandas, torch, sentencepiece, tensorflow, Image, transformers) # To run, navigate to the App tab. Click the red Generate button. # The app will randomly select image, generate (English) caption, # then generate Spanish translation. # Import gradio - app framework import gradio as gr # Two image datasources are available. # Minor adjustments (add/remove # to deactivate/activate) to switch between datasources. # AA comments refer to images in the DataFrame / from Coco database # BB comments refer to images stored in local Gradio app folder # Import os and random to support random selection of image (from folder) import os import random # Import pandas datasets, transformers, torch import pandas as pd from datasets import load_dataset from transformers import ( BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, MarianMTModel, MarianTokenizer ) from PIL import Image import torch # AA: Load dataset. Initial image source. 
# ---------------------------------------------------------------------------
# Data sources
# ---------------------------------------------------------------------------
# AA: Load the henryscheible/coco_val2014_tiny validation split and keep a
# 20-row sample as a DataFrame (currently the inactive path inside
# caption_random_image -- toggle the AA/BB comment pairs to switch).
dataset = load_dataset("henryscheible/coco_val2014_tiny", split="validation")
samples = dataset.select(range(20))  # small sample keeps the Space responsive
df = pd.DataFrame(samples)

# BB: Collect image files from the local "Photos" folder (active path).
IMAGE_FOLDER = "Photos"
image_paths = [
    os.path.join(IMAGE_FOLDER, f)
    for f in os.listdir(IMAGE_FOLDER)
    if f.lower().endswith((".jpg", ".jpeg", ".png"))
]

# ---------------------------------------------------------------------------
# Models
# ---------------------------------------------------------------------------
# BLIP captioning model (produces English captions).
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

# MarianMT English -> Spanish translation model.
model_name = "Helsinki-NLP/opus-mt-en-es"
trans_tokenizer = MarianTokenizer.from_pretrained(model_name)
trans_model = MarianMTModel.from_pretrained(model_name)


def caption_random_image():
    """Pick a random local image, caption it in English, translate to Spanish.

    Returns:
        tuple: (PIL.Image.Image, str, str) -- the selected image, the
        English BLIP caption, and its Spanish translation.

    Raises:
        RuntimeError: if the Photos folder contains no usable image files.
    """
    # Fail with a clear message instead of random.choice's bare IndexError
    # when the folder is empty or has no .jpg/.jpeg/.png files.
    if not image_paths:
        raise RuntimeError(f"No images found in '{IMAGE_FOLDER}'.")

    # BB: pick a random image path from the local folder and load it as RGB.
    img_path = random.choice(image_paths)
    image = Image.open(img_path).convert("RGB")

    # AA: alternative source -- random row of the COCO DataFrame.
    ##sample = df.sample(1).iloc[0]
    ##image = sample["image"]

    # Inference only: no_grad avoids building autograd graphs (saves memory;
    # generated text is unchanged).
    with torch.no_grad():
        # Unconditional image captioning (English).
        inputs = processor(image, return_tensors="pt")
        out = model.generate(**inputs)
        caption_eng = processor.decode(out[0], skip_special_tokens=True)

        # Translate the English caption to Spanish.
        trans_inputs = trans_tokenizer.encode(caption_eng, return_tensors="pt")
        trans_out = trans_model.generate(trans_inputs)
        caption_es = trans_tokenizer.decode(trans_out[0], skip_special_tokens=True)

    return image, caption_eng, caption_es


# Gradio UI: no inputs; one click produces the image and both captions.
demo = gr.Interface(
    fn=caption_random_image,
    inputs=None,
    outputs=[
        gr.Image(type="pil", label="Random Image"),
        gr.Textbox(label="Caption (English)"),
        gr.Textbox(label="Caption (Spanish)"),
    ],
    title="Image Captioning (with English to Spanish translation)",
    description=(
        "Selects a random image (from either the local folder or "
        "henryscheible/coco data subset); generates a BLIP caption; then "
        "translates the (English) caption to Spanish."
    ),
)

demo.launch()