File size: 3,568 Bytes
683a5b4
 
 
 
 
 
 
 
 
 
 
d3f61c4
ff9f4e9
1f447f3
ff9f4e9
1f0a8ac
 
 
 
 
 
 
 
 
 
 
d3f61c4
ef8e1c9
 
1f0a8ac
1792c67
607f6c3
 
 
 
0680b7e
 
 
607f6c3
 
d3f61c4
 
1ba53ba
1792c67
1f0a8ac
359647d
fa9c320
1ba53ba
1792c67
d3f61c4
33e5ff5
1ba53ba
1792c67
d3f61c4
 
1ba53ba
b68a7f1
1f0a8ac
 
 
 
 
 
 
 
1792c67
d3f61c4
1ba53ba
 
 
1792c67
607f6c3
157edf2
0680b7e
 
607f6c3
3f77bf2
1f0a8ac
1f447f3
1f0a8ac
1792c67
 
1f0a8ac
5da3a6f
1f0a8ac
 
 
5da3a6f
1792c67
1ba53ba
1f0a8ac
5da3a6f
b68a7f1
359647d
1f0a8ac
5da3a6f
1ba53ba
 
d3f61c4
 
1ba53ba
1792c67
d3f61c4
607f6c3
 
1792c67
607f6c3
1432dc1
 
 
1ba53ba
1792c67
0d0471f
1792c67
 
1ba53ba
 
d3f61c4
 
 
607f6c3
 
0d0471f
 
607f6c3
 
1f0a8ac
d3f61c4
1ba53ba
56ce2bf
1792c67
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Caption Generator w/English-to-Spanish Translation
# A. Harper | ARIN 460 | December 2025

# Load into Hugging Face Space (using the Gradio Framework)
# Include requirements.txt file (list: gradio, pandas, torch, sentencepiece, tensorflow, Pillow, datasets, transformers)

# To run, navigate to the App tab. Click the red Generate button. 
# The app will randomly select image, generate (English) caption, 
    # then generate Spanish translation. 


# Import gradio - app framework
import gradio as gr


# Two image datasources are available. 
# Minor adjustments (add/remove # to deactivate/activate) to switch between datasources.
# AA comments refer to images in the DataFrame / from Coco database
# BB comments refer to images stored in local Gradio app folder


# Import os and random to support random selection of image (from folder)
import os
import random


# Import pandas datasets, transformers, torch
import pandas as pd

from datasets import load_dataset

from transformers import (
    BlipProcessor, 
    BlipForConditionalGeneration,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    MarianMTModel, 
    MarianTokenizer
)

from PIL import Image
import torch


# AA: Initial image source — pull the tiny COCO validation split
# (henryscheible/coco_val2014_tiny) from the Hugging Face hub.
dataset = load_dataset("henryscheible/coco_val2014_tiny", split="validation")


# Work with a small sample: the first SAMPLE_SIZE rows of the split.
SAMPLE_SIZE = 20
samples = dataset.select(range(SAMPLE_SIZE))


# Materialize the sample as a pandas DataFrame for row-wise access.
df = pd.DataFrame(samples)


# BB: Local image source — collect every image file sitting in the
# Gradio app's "Photos" folder.
IMAGE_FOLDER = "Photos"

# Only these extensions count as images (case-insensitive match).
_ALLOWED_EXTS = (".jpg", ".jpeg", ".png")

image_paths = []
for fname in os.listdir(IMAGE_FOLDER):
    if fname.lower().endswith(_ALLOWED_EXTS):
        image_paths.append(os.path.join(IMAGE_FOLDER, fname))

# Load the image captioning model (Salesforce/blip-image-captioning-large).
# Name the checkpoint once so the processor and model always load the same
# weights (mirrors the model_name pattern used for the translation model).
caption_model_name = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(caption_model_name)
model = BlipForConditionalGeneration.from_pretrained(caption_model_name)


# Load the MarianMT checkpoint used to translate the English caption
# into Spanish (Helsinki-NLP/opus-mt-en-es).
model_name = "Helsinki-NLP/opus-mt-en-es"
trans_model = MarianMTModel.from_pretrained(model_name)
trans_tokenizer = MarianTokenizer.from_pretrained(model_name)


#Configure captioning function

def caption_random_image():
    """Pick a random local image, caption it in English, translate to Spanish.

    Returns:
        tuple: (PIL.Image.Image, str, str) — the selected image, the
        English caption produced by BLIP, and its Spanish translation.

    Raises:
        RuntimeError: if no images were found in the local Photos folder.
    """

    # AA: pick random row - from DF
    ##sample = df.sample(1).iloc[0]


    # BB: Guard against an empty Photos folder so the failure is an
    # explicit message rather than an opaque IndexError from random.choice.
    if not image_paths:
        raise RuntimeError("No images found in the local image folder.")

    # BB: Pick a random image path - image from folder
    img_path = random.choice(image_paths)


    # BB: Load into PIL, normalizing to 3-channel RGB
    image = Image.open(img_path).convert("RGB")


    # AA: Image - for DF
    ##image = sample["image"]


    # Unconditional image captioning: no text prompt is supplied, so the
    # model generates a caption from the image alone.
    inputs = processor(image, return_tensors="pt")

    out = model.generate(**inputs)
    caption_eng = processor.decode(out[0], skip_special_tokens=True)


    # Translate caption from English to Spanish
    trans_inputs = trans_tokenizer.encode(caption_eng, return_tensors="pt")
    trans_out = trans_model.generate(trans_inputs)
    caption_es = trans_tokenizer.decode(trans_out[0], skip_special_tokens=True)


    return image, caption_eng, caption_es




# Wire the captioning function into a simple Gradio interface:
# no inputs, three outputs (the selected image plus both captions).
output_components = [
    gr.Image(type="pil", label="Random Image"),
    gr.Textbox(label="Caption (English)"),
    gr.Textbox(label="Caption (Spanish)"),
]

demo = gr.Interface(
    fn=caption_random_image,
    inputs=None,
    outputs=output_components,
    title="Image Captioning (with English to Spanish translation)",
    description=(
        "Selects a random image (from either the local folder or "
        "henryscheible/coco data subset); generates a BLIP caption; "
        "then translates the (English) caption to Spanish."
    ),
)


demo.launch()