File size: 3,568 Bytes
702152b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Caption Generator w/English-to-Spanish Translation
# A. Harper | ARIN 460 | December 2025

# Load into Hugging Face Space (using the Gradio Framework)
# Include requirements.txt file (list: gradio, pandas, torch, sentencepiece, tensorflow, Pillow, transformers)
# NOTE(review): "Pillow" is the pip package that provides the PIL import below; "Image" is not a pip package.

# To run, navigate to the App tab. Click the red Generate button. 
# The app will randomly select image, generate (English) caption, 
    # then generate Spanish translation. 


# Import gradio - app framework
import gradio as gr


# Two image datasources are available. 
# Minor adjustments (add/remove # to deactivate/activate) to switch between datasources.
# AA comments refer to images in the DataFrame / from Coco database
# BB comments refer to images stored in local Gradio app folder


# Import os and random to support random selection of image (from folder)
import os
import random


# Import pandas datasets, transformers, torch
import pandas as pd

from datasets import load_dataset

from transformers import (
    BlipProcessor, 
    BlipForConditionalGeneration,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    MarianMTModel, 
    MarianTokenizer
)

from PIL import Image
import torch


# AA: Load dataset. Initial image source. 
#Load dataset (henryscheible/coco_val2014_tiny) -- downloads from the
# Hugging Face Hub on first run; requires network access.
dataset = load_dataset("henryscheible/coco_val2014_tiny", split="validation")


# Reduce dataset to 20 rows, i.e., get sample (keeps the demo fast)
samples = dataset.select(range(20))


#Convert to dataframe so AA mode can pick a random row with df.sample(1)
df = pd.DataFrame(samples)


# BB: Local image source -- scan the app's Photos folder for usable files.
IMAGE_FOLDER = "Photos"

# Accepted image extensions (matched case-insensitively).
_VALID_EXTS = (".jpg", ".jpeg", ".png")

# Build the list of candidate image paths once at startup.
image_paths = []
for fname in os.listdir(IMAGE_FOLDER):
    if fname.lower().endswith(_VALID_EXTS):
        image_paths.append(os.path.join(IMAGE_FOLDER, fname))

#Load the image captioning model (Salesforce/blip-image-captioning-large)
# -- downloaded from the Hugging Face Hub on first run.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")


#Load transformer for translating captions from English to Spanish
# (MarianMT English->Spanish; needs sentencepiece for its tokenizer).
model_name = "Helsinki-NLP/opus-mt-en-es"
trans_tokenizer = MarianTokenizer.from_pretrained(model_name)
trans_model = MarianMTModel.from_pretrained(model_name)


#Configure captioning function

def caption_random_image():
    """Pick a random image, caption it in English, translate to Spanish.

    Returns:
        tuple: (PIL.Image.Image, str, str) -- the chosen image, the
        English BLIP caption, and its Spanish (MarianMT) translation.
    """

    # AA: pick random row / image from the DataFrame (Coco subset).
    # To switch sources, uncomment these two lines and comment out the
    # two BB lines below.
    ##sample = df.sample(1).iloc[0]
    ##image = sample["image"]


    # BB: pick a random image path from the local Photos folder
    img_path = random.choice(image_paths)

    # BB: load into PIL; convert to RGB so BLIP always gets 3 channels
    image = Image.open(img_path).convert("RGB")


    # Unconditional image captioning (no text prompt)
    inputs = processor(image, return_tensors="pt")

    # Inference only: disable autograd so generate() doesn't track
    # gradients (saves memory and time on every request).
    with torch.no_grad():
        out = model.generate(**inputs)
    caption_eng = processor.decode(out[0], skip_special_tokens=True)


    # Translate caption from English to Spanish
    trans_inputs = trans_tokenizer.encode(caption_eng, return_tensors="pt")
    with torch.no_grad():
        trans_out = trans_model.generate(trans_inputs)
    caption_es = trans_tokenizer.decode(trans_out[0], skip_special_tokens=True)


    return image, caption_eng, caption_es




# Build the Gradio UI: no inputs (the function picks its own image),
# three outputs (image, English caption, Spanish caption).
demo = gr.Interface(
    fn=caption_random_image,
    inputs=None,
    outputs=[
        gr.Image(type="pil", label="Random Image"),
        gr.Textbox(label="Caption (English)"),
        gr.Textbox(label="Caption (Spanish)")
    ],
    title="Image Captioning (with English to Spanish translation)",
    description="Selects a random image (from either the local folder or henryscheible/coco data subset); generates a BLIP caption; then translates the (English) caption to Spanish."
)


# Start the Gradio server (blocks until the app is stopped).
demo.launch()