# (Hugging Face Space page metadata captured with this file — not code)
# Spaces: Sleeping | File size: 3,568 Bytes | commit 702152b
# Caption Generator w/English-to-Spanish Translation
# A. Harper | ARIN 460 | December 2025
# Load into Hugging Face Space (using the Gradio Framework)
# Include requirements.txt file (list: gradio, pandas, torch, sentencepiece, transformers, datasets, Pillow)
# To run, navigate to the App tab. Click the red Generate button.
# The app will randomly select image, generate (English) caption,
# then generate Spanish translation.
# Import gradio - app framework
import gradio as gr
# Two image datasources are available.
# Minor adjustments (add/remove # to deactivate/activate) to switch between datasources.
# AA comments refer to images in the DataFrame / from Coco database
# BB comments refer to images stored in local Gradio app folder
# Import os and random to support random selection of image (from folder)
import os
import random
# Import pandas datasets, transformers, torch
import pandas as pd
from datasets import load_dataset
from transformers import (
BlipProcessor,
BlipForConditionalGeneration,
AutoTokenizer,
AutoModelForSeq2SeqLM,
MarianMTModel,
MarianTokenizer
)
from PIL import Image
import torch
# AA datasource: COCO validation images from the Hugging Face Hub
# (henryscheible/coco_val2014_tiny, "validation" split).
dataset = load_dataset("henryscheible/coco_val2014_tiny", split="validation")
# Keep only the first 20 rows as a small working sample.
samples = dataset.select(range(20))
# Materialize the sample as a pandas DataFrame for row-wise random picks.
df = pd.DataFrame(samples)

# BB datasource: image files shipped alongside the app in the Photos folder.
IMAGE_FOLDER = "Photos"
ALLOWED_EXTENSIONS = (".jpg", ".jpeg", ".png")
image_paths = []
for filename in os.listdir(IMAGE_FOLDER):
    # Case-insensitive extension filter so e.g. "PHOTO.JPG" is still picked up.
    if filename.lower().endswith(ALLOWED_EXTENSIONS):
        image_paths.append(os.path.join(IMAGE_FOLDER, filename))
# Image-captioning model: BLIP large (Salesforce/blip-image-captioning-large).
# The repo id is defined once so the processor and the model cannot drift
# apart if the checkpoint is ever changed.
BLIP_MODEL_NAME = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(BLIP_MODEL_NAME)
model = BlipForConditionalGeneration.from_pretrained(BLIP_MODEL_NAME)

# Translation model: MarianMT English -> Spanish (Helsinki-NLP OPUS-MT).
model_name = "Helsinki-NLP/opus-mt-en-es"
trans_tokenizer = MarianTokenizer.from_pretrained(model_name)
trans_model = MarianMTModel.from_pretrained(model_name)
def caption_random_image():
    """Pick a random local image, caption it with BLIP, translate to Spanish.

    Returns:
        tuple: (PIL.Image.Image, str, str) — the selected image, its English
        BLIP caption, and the MarianMT English-to-Spanish translation.

    Raises:
        RuntimeError: if the Photos folder contains no usable image files.
    """
    # AA alternative (DataFrame / COCO subset): uncomment these two lines and
    # comment out the BB lines below to switch datasources.
    ##sample = df.sample(1).iloc[0]
    ##image = sample["image"]
    # BB: pick a random image file from the local Photos folder.
    # Guard against an empty folder — random.choice on an empty list would
    # otherwise raise an unhelpful IndexError.
    if not image_paths:
        raise RuntimeError("No .jpg/.jpeg/.png images found in the Photos folder.")
    img_path = random.choice(image_paths)
    # Force RGB so grayscale/RGBA files still match the model's expected input.
    image = Image.open(img_path).convert("RGB")
    # Unconditional captioning: no text prompt, BLIP describes the image freely.
    inputs = processor(image, return_tensors="pt")
    out = model.generate(**inputs)
    caption_eng = processor.decode(out[0], skip_special_tokens=True)
    # Translate the English caption to Spanish with MarianMT.
    trans_inputs = trans_tokenizer.encode(caption_eng, return_tensors="pt")
    trans_out = trans_model.generate(trans_inputs)
    caption_es = trans_tokenizer.decode(trans_out[0], skip_special_tokens=True)
    return image, caption_eng, caption_es
# Wire the captioning function into a Gradio interface: no input components
# (clicking Generate triggers a fresh random pick) and three outputs — the
# chosen image plus the English and Spanish captions.
output_components = [
    gr.Image(type="pil", label="Random Image"),
    gr.Textbox(label="Caption (English)"),
    gr.Textbox(label="Caption (Spanish)"),
]
demo = gr.Interface(
    fn=caption_random_image,
    inputs=None,
    outputs=output_components,
    title="Image Captioning (with English to Spanish translation)",
    description="Selects a random image (from either the local folder or henryscheible/coco data subset); generates a BLIP caption; then translates the (English) caption to Spanish.",
)
demo.launch()