# Caption Generator w/ English-to-Spanish Translation
# A. Harper | ARIN 460 | December 2025
# Load into Hugging Face Space (using the Gradio Framework)
# Include requirements.txt file (list: gradio, pandas, torch, sentencepiece, tensorflow, Image, transformers)
# To run, navigate to the App tab. Click the red Generate button.
# The app will randomly select an image, generate an (English) caption,
# then generate a Spanish translation.
# Import gradio - app framework
import gradio as gr
# Two image datasources are available.
# Minor adjustments (add/remove # to deactivate/activate) to switch between datasources.
# AA comments refer to images in the DataFrame / from the COCO database
# BB comments refer to images stored in the local Gradio app folder
# Import os and random to support random selection of an image (from folder)
import os
import random
# Import pandas, datasets, transformers, torch
import pandas as pd
from datasets import load_dataset
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    MarianMTModel,
    MarianTokenizer,
)
from PIL import Image
import torch
# AA: Load dataset. Initial image source.
# Load the dataset (henryscheible/coco_val2014_tiny) from the Hugging Face Hub.
dataset = load_dataset("henryscheible/coco_val2014_tiny", split="validation")
# Reduce the dataset to a 20-row sample.
samples = dataset.select(range(20))
# Convert the sample to a pandas DataFrame (consumed by the AA code path).
df = pd.DataFrame(samples)
# BB: Point at the local "Photos" folder bundled with the Gradio app.
IMAGE_FOLDER = "Photos"
# Collect every JPEG/PNG file in the folder. sorted() makes the listing order
# deterministic across operating systems (os.listdir order is unspecified);
# random.choice still varies at runtime.
image_paths = [
    os.path.join(IMAGE_FOLDER, f)
    for f in sorted(os.listdir(IMAGE_FOLDER))
    if f.lower().endswith((".jpg", ".jpeg", ".png"))
]
# Load the BLIP image-captioning model (Salesforce/blip-image-captioning-large).
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
# Load the MarianMT transformer for translating captions from English to Spanish.
model_name = "Helsinki-NLP/opus-mt-en-es"
trans_tokenizer = MarianTokenizer.from_pretrained(model_name)
trans_model = MarianMTModel.from_pretrained(model_name)
def caption_random_image():
    """Pick a random image, caption it with BLIP, and translate the caption.

    Returns:
        tuple: (image, caption_eng, caption_es) — the selected PIL image,
        the English BLIP caption, and its Spanish MarianMT translation.

    Raises:
        RuntimeError: if no images were found in the local image folder.
    """
    # AA: pick random row - from DF (alternate datasource; toggle the ## lines)
    ##sample = df.sample(1).iloc[0]
    # BB: guard against an empty folder so random.choice doesn't raise a
    # bare IndexError inside the Gradio callback.
    if not image_paths:
        raise RuntimeError(f"No images found in folder '{IMAGE_FOLDER}'.")
    # BB: pick a random image path - image from folder
    img_path = random.choice(image_paths)
    # BB: load into PIL as 3-channel RGB (BLIP expects RGB input)
    image = Image.open(img_path).convert("RGB")
    # AA: image from the DataFrame row (alternate datasource)
    ##image = sample["image"]
    # Unconditional image captioning; no_grad avoids building autograd
    # state during pure inference.
    with torch.no_grad():
        inputs = processor(image, return_tensors="pt")
        out = model.generate(**inputs)
    caption_eng = processor.decode(out[0], skip_special_tokens=True)
    # Translate caption from English to Spanish
    trans_inputs = trans_tokenizer.encode(caption_eng, return_tensors="pt")
    with torch.no_grad():
        trans_out = trans_model.generate(trans_inputs)
    caption_es = trans_tokenizer.decode(trans_out[0], skip_special_tokens=True)
    return image, caption_eng, caption_es
# Build the Gradio UI: no inputs (the image is chosen randomly); outputs are
# the image plus the English and Spanish captions.
demo = gr.Interface(
    fn=caption_random_image,
    inputs=None,
    outputs=[
        gr.Image(type="pil", label="Random Image"),
        gr.Textbox(label="Caption (English)"),
        gr.Textbox(label="Caption (Spanish)"),
    ],
    title="Image Captioning (with English to Spanish translation)",
    description="Selects a random image (from either the local folder or henryscheible/coco data subset); generates a BLIP caption; then translates the (English) caption to Spanish.",
)
# Start the app server.
demo.launch()