# app.py — renamed from main.py (commit 8cc196b)
from caption_history import CaptionHistory
from caption_generation import MultiModelCaptionGenerator
from caption_overlay import ImageCaptionOverlay
import io
import os
import cv2
import numpy as np
from PIL import Image
import streamlit as st
from dotenv import load_dotenv
# Pull API keys from a local .env file into the process environment.
load_dotenv()

# Provider keys; each is None when the corresponding variable is missing,
# which the UI below surfaces as a warning.
openai_key, gemini_key, groq_key = (
    os.getenv(var)
    for var in ("OPENAI_API_KEY_IC", "GEMINI_API_KEY_IC", "GROQ_API_KEY_IC")
)
def main():
    """Render the Multi-Model Image Caption Generator Streamlit page.

    Layout: a sidebar for API configuration, caption-overlay settings and
    history controls; two main columns (upload/model selection on the left,
    editable caption + rendered preview on the right); an optional history
    section; and a footer.

    Relies on module-level ``openai_key``/``gemini_key``/``groq_key`` loaded
    from the .env file, and persists ``caption_history``,
    ``caption_generator``, ``current_caption``, ``current_image``,
    ``current_model`` and ``show_history`` in ``st.session_state`` so they
    survive Streamlit reruns.
    """
    st.set_page_config(
        page_title="Multi-Model Image Caption Generator",
        page_icon="πŸ–ΌοΈ",
        layout="wide"
    )
    st.title("πŸ–ΌοΈ Multi-Model Image Caption Generator")
    st.markdown("Generate captions using OpenAI GPT-4V, Google Gemini, and GROQ Vision models")

    # Initialize session state once; reruns reuse the same helper objects.
    if 'caption_history' not in st.session_state:
        st.session_state.caption_history = CaptionHistory()
    if 'caption_generator' not in st.session_state:
        st.session_state.caption_generator = MultiModelCaptionGenerator()

    # Sidebar for API configuration
    with st.sidebar:
        st.header("πŸ”‘ API Configuration")
        # Show API status for each provider key loaded at module import time.
        if openai_key:
            st.success("βœ… OpenAI API Key loaded from .env")
        else:
            st.warning("⚠️ OpenAI API Key not found in .env")
        if gemini_key:
            st.success("βœ… Gemini API Key loaded from .env")
        else:
            st.warning("⚠️ Gemini API Key not found in .env")
        if groq_key:
            st.success("βœ… GROQ API Key loaded from .env")
        else:
            st.warning("⚠️ GROQ API Key not found in .env")

        # Explicit configure step: passes whatever keys were found (possibly
        # None) to the generator; errors are shown rather than raised.
        if st.button("Configure APIs"):
            try:
                st.session_state.caption_generator.configure_apis(
                    openai_key=openai_key,
                    gemini_key=gemini_key,
                    groq_key=groq_key
                )
                st.success("APIs configured successfully!")
            except Exception as e:
                st.error(f"Error configuring APIs: {str(e)}")

        st.markdown("---")

        # Caption overlay settings. NOTE: the widgets (and hence the local
        # variables) below are branch-specific — `position`/`font_size`/
        # `thickness` exist only for the overlay method, `bg_color`/
        # `text_color`/`margin`/`custom_font` only for the background method.
        # The preview code in col2 branches on `caption_method` the same way.
        st.header("🎨 Caption Settings")
        caption_method = st.selectbox(
            "Caption Method",
            ["Overlay on Image", "Background Behind Image"]
        )
        if caption_method == "Overlay on Image":
            position = st.selectbox("Position", ["bottom", "top", "center"])
            font_size = st.slider("Font Size", 0.5, 3.0, 1.0, 0.1)
            thickness = st.slider("Thickness", 1, 5, 2)
        else:
            bg_color = st.color_picker("Background Color", "#000000")
            text_color = st.color_picker("Text Color", "#FFFFFF")
            margin = st.slider("Margin", 20, 100, 50)
            # Optional: Custom font path (used only if the file exists)
            custom_font = st.text_input(
                "Custom Font Path (optional)",
                placeholder="e.g., fonts/Poppins-Regular.ttf"
            )

        st.markdown("---")

        # History management: toggles a session flag read near the bottom
        # of the page, plus a destructive clear.
        st.header("πŸ“ Caption History")
        if st.button("View History"):
            st.session_state.show_history = True
        if st.button("Hide History"):
            st.session_state.show_history = False
        if st.button("Clear History"):
            st.session_state.caption_history.clear_history()
            st.success("History cleared!")

    # Main content area
    col1, col2 = st.columns([1, 1])

    with col1:
        st.header("πŸ“€ Upload Image")
        uploaded_file = st.file_uploader(
            "Choose an image...",
            type=['png', 'jpg', 'jpeg', 'bmp', 'tiff']
        )
        if uploaded_file is not None:
            # Display original image
            image = Image.open(uploaded_file)
            st.image(image, caption="Original Image", use_container_width=True)

            # Model selection: display label -> internal model key.
            st.header("πŸ€– Select Model")
            models = {
                "OpenAI GPT-4o": "openai",  # Updated model name
                "Google Gemini": "gemini",
                "GROQ Vision": "groq"
            }
            selected_model = st.selectbox("Choose a model", list(models.keys()))

            # Show model-specific info
            model_info = {
                "OpenAI GPT-4o": "Uses GPT-4o vision model for detailed image analysis",
                "Google Gemini": "Uses Gemini-1.5-flash for fast and accurate captions",
                "GROQ Vision": "Uses Llama-3.2-11b-vision for high-speed processing"
            }
            st.info(model_info[selected_model])

            if st.button("Generate Caption", type="primary"):
                # Check if APIs are configured at all before doing any work.
                if not any([openai_key, gemini_key, groq_key]):
                    st.error("Please add API keys to your .env file and click 'Configure APIs'")
                    return
                try:
                    model_key = models[selected_model]
                    # Check specific API availability for the chosen model.
                    if model_key == "openai" and not openai_key:
                        st.error("OpenAI API key not available. Please add it to your .env file.")
                        return
                    elif model_key == "gemini" and not gemini_key:
                        st.error("Gemini API key not available. Please add it to your .env file.")
                        return
                    elif model_key == "groq" and not groq_key:
                        st.error("GROQ API key not available. Please add it to your .env file.")
                        return

                    # Dispatch to the provider-specific generator method.
                    with st.spinner(f"Generating caption with {selected_model}..."):
                        if model_key == "openai":
                            caption = st.session_state.caption_generator.generate_caption_openai(image)
                        elif model_key == "gemini":
                            caption = st.session_state.caption_generator.generate_caption_gemini(image)
                        elif model_key == "groq":
                            caption = st.session_state.caption_generator.generate_caption_groq(image)

                    # Stash the result so col2 can render/edit it on rerun.
                    st.session_state.current_caption = caption
                    st.session_state.current_image = image
                    st.session_state.current_model = selected_model

                    # Add to history
                    st.session_state.caption_history.add_interaction(
                        uploaded_file.name,
                        selected_model,
                        caption
                    )
                    st.success(f"Caption generated successfully with {selected_model}!")
                except Exception as e:
                    st.error(f"Error generating caption: {str(e)}")
                    st.error("Please check your API keys and internet connection.")

    with col2:
        st.header("✨ Generated Caption & Preview")
        if hasattr(st.session_state, 'current_caption'):
            # Editable caption — the user can tweak the model output before
            # it is drawn onto the image.
            edited_caption = st.text_area(
                "Generated Caption (editable)",
                st.session_state.current_caption,
                height=100,
                help="You can edit the caption before applying it to the image"
            )
            # Update the caption if edited
            if edited_caption != st.session_state.current_caption:
                st.session_state.current_caption = edited_caption

            # Generate preview with caption
            if hasattr(st.session_state, 'current_image'):
                # Convert PIL (RGB) to OpenCV (BGR) format for the overlay helpers.
                cv_image = cv2.cvtColor(np.array(st.session_state.current_image), cv2.COLOR_RGB2BGR)
                try:
                    if caption_method == "Overlay on Image":
                        result_image = ImageCaptionOverlay.add_caption_overlay(
                            cv_image,
                            st.session_state.current_caption,
                            position=position,
                            font_size=font_size,
                            thickness=thickness
                        )
                    else:
                        # Convert "#RRGGBB" hex colors to (R, G, B) int tuples.
                        bg_rgb = tuple(int(bg_color[i:i+2], 16) for i in (1, 3, 5))
                        text_rgb = tuple(int(text_color[i:i+2], 16) for i in (1, 3, 5))
                        # Use custom font only if the path actually exists.
                        font_path = custom_font if custom_font and os.path.exists(custom_font) else None
                        result_image = ImageCaptionOverlay.add_caption_background(
                            cv_image,
                            st.session_state.current_caption,
                            font_path=font_path,
                            background_color=bg_rgb,
                            text_color=text_rgb,
                            margin=margin
                        )

                    # Convert back to PIL for display
                    result_pil = Image.fromarray(cv2.cvtColor(result_image, cv2.COLOR_BGR2RGB))
                    st.image(result_pil, caption="Image with Caption", use_container_width=True)

                    # Download button — serialize the preview as PNG in memory.
                    img_buffer = io.BytesIO()
                    result_pil.save(img_buffer, format='PNG')
                    st.download_button(
                        label="πŸ“₯ Download Image with Caption",
                        data=img_buffer.getvalue(),
                        file_name=f"captioned_{uploaded_file.name if uploaded_file else 'image'}.png",
                        mime="image/png"
                    )
                except Exception as e:
                    st.error(f"Error processing image: {str(e)}")
        else:
            st.info("πŸ‘† Upload an image and generate a caption to see the preview here")

    # History display (toggled by the sidebar View/Hide buttons).
    if getattr(st.session_state, 'show_history', False):
        st.markdown("---")
        st.header("πŸ“‹ Caption Generation History")
        history = st.session_state.caption_history.get_history()
        if history:
            # Add search/filter functionality across name, caption and model.
            search_term = st.text_input("πŸ” Search history", placeholder="Search by image name or caption...")
            filtered_history = history
            if search_term:
                filtered_history = [
                    item for item in history
                    if search_term.lower() in item['image_name'].lower()
                    or search_term.lower() in item['caption'].lower()
                    or search_term.lower() in item['model'].lower()
                ]
            if filtered_history:
                # Show last 20 items, newest first.
                for i, item in enumerate(reversed(filtered_history[-20:])):
                    with st.expander(f"{item['timestamp'][:19]} - {item['image_name']} ({item['model']})"):
                        st.write(f"**Model:** {item['model']}")
                        st.write(f"**Image:** {item['image_name']}")
                        st.write(f"**Caption:** {item['caption']}")
                        st.write(f"**Timestamp:** {item['timestamp']}")
            else:
                st.info("No matching history found.")
        else:
            st.info("No caption history available.")

    # Footer
    st.markdown("---")
    st.markdown("""
    <div style='text-align: center'>
        <p>Built with Streamlit, LangChain, OpenCV, and multi-model AI APIs</p>
        <p>Supports OpenAI GPT-4o, Google Gemini, and GROQ Vision models</p>
        <p><small>Make sure to add your API keys to the .env file</small></p>
    </div>
    """, unsafe_allow_html=True)
# Entry point when the script is executed directly (e.g. `streamlit run app.py`).
if __name__ == "__main__":
    main()