# app.py — Gradio image-captioning demo for Hugging Face Spaces
"""
Hugging Face Spaces App - Image Captioning
Deploy this to HF Spaces for free hosting
"""
import time

import gradio as gr
import torch
from PIL import Image
def load_models():
    """Load the captioning models, recording (not raising) load failures.

    Returns:
        dict: Contains ``blip_processor``/``blip_model`` and
        ``git_processor``/``git_model`` for each model that loaded; if a
        model failed, the corresponding ``blip_error``/``git_error`` key
        holds the error message instead.
    """
    models = {}

    # BLIP (Salesforce) — best-effort: a failure is logged and recorded so
    # the app can still run with whichever models did load.
    try:
        from transformers import BlipProcessor, BlipForConditionalGeneration
        print("Loading BLIP model...")
        models['blip_processor'] = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        models['blip_model'] = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
        print("✅ BLIP loaded successfully")
    except Exception as e:  # broad on purpose: any import/download error is non-fatal
        print(f"❌ BLIP failed to load: {e}")
        models['blip_error'] = str(e)

    # GIT (Microsoft) — same best-effort pattern.
    try:
        from transformers import AutoProcessor, AutoModelForCausalLM
        print("Loading GIT model...")
        models['git_processor'] = AutoProcessor.from_pretrained("microsoft/git-base")
        models['git_model'] = AutoModelForCausalLM.from_pretrained("microsoft/git-base")
        print("✅ GIT loaded successfully")
    except Exception as e:
        print(f"❌ GIT failed to load: {e}")
        models['git_error'] = str(e)

    return models
# Load models once at import time so individual requests don't pay the
# (slow) download/initialization cost.
print("🚀 Loading AI models...")
models = load_models()
print("📦 Model loading completed")
def generate_captions(image, true_caption=""):
    """Run every available captioning model on *image* and format the results.

    Args:
        image: A ``PIL.Image.Image``, or ``None`` (returns an error message).
        true_caption: Optional ground-truth caption echoed back so the user
            can compare it against the model outputs.

    Returns:
        str: Markdown text with one section per model plus total timing.
    """
    if image is None:
        return "❌ Please upload an image first."

    # The captioning models expect 3-channel RGB input.
    if image.mode != 'RGB':
        image = image.convert('RGB')

    results = []
    start_time = time.time()

    # Echo the reference caption, if one was provided.
    if true_caption.strip():
        results.append("**🎯 True Caption:**")
        results.append(true_caption.strip())
        results.append("")

    # BLIP model (a startup load failure is reported, not raised).
    if 'blip_model' in models:
        try:
            blip_start = time.time()
            inputs = models['blip_processor'](image, return_tensors="pt")
            # Inference only: no_grad skips autograd bookkeeping and saves memory.
            with torch.no_grad():
                out = models['blip_model'].generate(**inputs, max_length=50, num_beams=5)
            blip_caption = models['blip_processor'].decode(out[0], skip_special_tokens=True)
            blip_time = time.time() - blip_start
            results.append(f"**🤖 BLIP Model:** ({blip_time:.2f}s)")
            results.append(blip_caption)
            results.append("")
        except Exception as e:  # keep the UI responsive if one model errors
            results.append(f"**🤖 BLIP Model:** Error - {str(e)}")
            results.append("")
    elif 'blip_error' in models:
        results.append(f"**🤖 BLIP Model:** Not available - {models['blip_error']}")
        results.append("")

    # GIT model (same best-effort handling as BLIP).
    if 'git_model' in models:
        try:
            git_start = time.time()
            inputs = models['git_processor'](images=image, return_tensors="pt")
            with torch.no_grad():
                generated_ids = models['git_model'].generate(
                    pixel_values=inputs.pixel_values,
                    max_length=50,
                    num_beams=5
                )
            git_caption = models['git_processor'].batch_decode(generated_ids, skip_special_tokens=True)[0]
            git_time = time.time() - git_start
            results.append(f"**🧠 GIT Model:** ({git_time:.2f}s)")
            results.append(git_caption)
            results.append("")
        except Exception as e:
            results.append(f"**🧠 GIT Model:** Error - {str(e)}")
            results.append("")
    elif 'git_error' in models:
        results.append(f"**🧠 GIT Model:** Not available - {models['git_error']}")
        results.append("")

    # Footer: overall timing and model attribution.
    total_time = time.time() - start_time
    results.append("---")
    results.append(f"**⏱️ Total Processing Time:** {total_time:.2f} seconds")
    results.append("")
    results.append("**📚 About the Models:**")
    results.append("• **BLIP**: Salesforce's Bootstrapping Language-Image Pre-training")
    results.append("• **GIT**: Microsoft's Generative Image-to-text Transformer")
    return "\n".join(results)
# --- Gradio interface --------------------------------------------------
with gr.Blocks(
    title="AI Image Captioning",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1200px !important;
    }
    """
) as demo:
    gr.Markdown("""
    # 🤖 AI Image Captioning

    Upload an image and get captions from multiple state-of-the-art AI models!

    **Available Models:**
    - 🤖 **BLIP** (Salesforce): Fast and accurate image captioning
    - 🧠 **GIT** (Microsoft): Advanced generative image-to-text model

    *Simply upload an image or try one of the examples below!*
    """)

    with gr.Row():
        with gr.Column(scale=1):
            # Inputs: image to caption plus an optional reference caption.
            image_input = gr.Image(
                type="pil",  # hand the callback a PIL.Image, not a numpy array
                label="📸 Upload Your Image",
                height=400
            )
            true_caption_input = gr.Textbox(
                label="🎯 True Caption (Optional)",
                placeholder="Enter the correct caption to compare with AI predictions...",
                lines=2
            )
            generate_btn = gr.Button(
                "✨ Generate Captions",
                variant="primary",
                size="lg"
            )
        with gr.Column(scale=1):
            output = gr.Textbox(
                label="🤖 AI Generated Captions",
                lines=20,
                max_lines=25,
                show_copy_button=True
            )

    # Clickable example images paired with reference captions.
    gr.Markdown("### 📋 Try These Examples:")
    example_images = [
        ["https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat.jpg", "A cat sitting on a surface"],
        ["https://huggingface.co/datasets/mishig/sample_images/resolve/main/dog.jpg", "A dog in a field"],
        ["https://images.unsplash.com/photo-1506905925346-21bda4d32df4?w=500", "A mountain landscape with snow"],
        ["https://images.unsplash.com/photo-1549298916-b41d501d3772?w=500", "A red sports car"],
        ["https://images.unsplash.com/photo-1551963831-b3b1ca40c98e?w=500", "A breakfast with coffee and pastries"],
    ]
    gr.Examples(
        examples=example_images,
        inputs=[image_input, true_caption_input],
        outputs=output,
        fn=generate_captions,
        cache_examples=False  # don't pre-run the models at build time
    )

    # Explicit button click...
    generate_btn.click(
        fn=generate_captions,
        inputs=[image_input, true_caption_input],
        outputs=output
    )
    # ...and auto-generate whenever the uploaded image changes.
    image_input.change(
        fn=generate_captions,
        inputs=[image_input, true_caption_input],
        outputs=output
    )

    gr.Markdown("""
    ---
    **🔧 Technical Details:**
    - Models run on Hugging Face's infrastructure
    - Processing time varies based on image size and complexity
    - All models are open-source and publicly available

    **💡 Tips:**
    - Try different types of images (people, objects, landscapes, etc.)
    - Compare the AI captions with your own description
    - Larger images may take longer to process
    """)
# Launch the app
if __name__ == "__main__":
    # Start the Gradio web server with default settings.
    demo.launch()