# app.py — renamed from main.py (commit 8cc196b)
from caption_history import CaptionHistory
from caption_generation import MultiModelCaptionGenerator
from caption_overlay import ImageCaptionOverlay
import io
import os
import cv2
import numpy as np
from PIL import Image
import streamlit as st
from dotenv import load_dotenv
# Pull API keys from a local .env file into the process environment.
load_dotenv()

# Provider keys; each is None when the corresponding variable is missing,
# which the UI below surfaces as a warning.
openai_key, gemini_key, groq_key = (
    os.getenv(var)
    for var in ("OPENAI_API_KEY_IC", "GEMINI_API_KEY_IC", "GROQ_API_KEY_IC")
)
def main():
    """Render the Multi-Model Image Caption Generator Streamlit page.

    Layout: a sidebar for API configuration, caption-overlay settings and
    history controls; two main columns (upload/model selection on the left,
    editable caption + rendered preview on the right); an optional history
    section; and a footer.

    Relies on module-level ``openai_key``/``gemini_key``/``groq_key`` loaded
    from the .env file, and persists ``caption_history``,
    ``caption_generator``, ``current_caption``, ``current_image``,
    ``current_model`` and ``show_history`` in ``st.session_state`` so they
    survive Streamlit reruns.
    """
    st.set_page_config(
        page_title="Multi-Model Image Caption Generator",
        page_icon="πŸ–ΌοΈ",
        layout="wide"
    )
    st.title("πŸ–ΌοΈ Multi-Model Image Caption Generator")
    st.markdown("Generate captions using OpenAI GPT-4V, Google Gemini, and GROQ Vision models")

    # Initialize session state once; reruns reuse the same helper objects.
    if 'caption_history' not in st.session_state:
        st.session_state.caption_history = CaptionHistory()
    if 'caption_generator' not in st.session_state:
        st.session_state.caption_generator = MultiModelCaptionGenerator()

    # Sidebar for API configuration
    with st.sidebar:
        st.header("πŸ”‘ API Configuration")
        # Show API status for each provider key loaded at module import time.
        if openai_key:
            st.success("βœ… OpenAI API Key loaded from .env")
        else:
            st.warning("⚠️ OpenAI API Key not found in .env")
        if gemini_key:
            st.success("βœ… Gemini API Key loaded from .env")
        else:
            st.warning("⚠️ Gemini API Key not found in .env")
        if groq_key:
            st.success("βœ… GROQ API Key loaded from .env")
        else:
            st.warning("⚠️ GROQ API Key not found in .env")

        # Explicit configure step: passes whatever keys were found (possibly
        # None) to the generator; errors are shown rather than raised.
        if st.button("Configure APIs"):
            try:
                st.session_state.caption_generator.configure_apis(
                    openai_key=openai_key,
                    gemini_key=gemini_key,
                    groq_key=groq_key
                )
                st.success("APIs configured successfully!")
            except Exception as e:
                st.error(f"Error configuring APIs: {str(e)}")

        st.markdown("---")

        # Caption overlay settings. NOTE: the widgets (and hence the local
        # variables) below are branch-specific — `position`/`font_size`/
        # `thickness` exist only for the overlay method, `bg_color`/
        # `text_color`/`margin`/`custom_font` only for the background method.
        # The preview code in col2 branches on `caption_method` the same way.
        st.header("🎨 Caption Settings")
        caption_method = st.selectbox(
            "Caption Method",
            ["Overlay on Image", "Background Behind Image"]
        )
        if caption_method == "Overlay on Image":
            position = st.selectbox("Position", ["bottom", "top", "center"])
            font_size = st.slider("Font Size", 0.5, 3.0, 1.0, 0.1)
            thickness = st.slider("Thickness", 1, 5, 2)
        else:
            bg_color = st.color_picker("Background Color", "#000000")
            text_color = st.color_picker("Text Color", "#FFFFFF")
            margin = st.slider("Margin", 20, 100, 50)
            # Optional: Custom font path (used only if the file exists)
            custom_font = st.text_input(
                "Custom Font Path (optional)",
                placeholder="e.g., fonts/Poppins-Regular.ttf"
            )

        st.markdown("---")

        # History management: toggles a session flag read near the bottom
        # of the page, plus a destructive clear.
        st.header("πŸ“ Caption History")
        if st.button("View History"):
            st.session_state.show_history = True
        if st.button("Hide History"):
            st.session_state.show_history = False
        if st.button("Clear History"):
            st.session_state.caption_history.clear_history()
            st.success("History cleared!")

    # Main content area
    col1, col2 = st.columns([1, 1])

    with col1:
        st.header("πŸ“€ Upload Image")
        uploaded_file = st.file_uploader(
            "Choose an image...",
            type=['png', 'jpg', 'jpeg', 'bmp', 'tiff']
        )
        if uploaded_file is not None:
            # Display original image
            image = Image.open(uploaded_file)
            st.image(image, caption="Original Image", use_container_width=True)

            # Model selection: display label -> internal model key.
            st.header("πŸ€– Select Model")
            models = {
                "OpenAI GPT-4o": "openai",  # Updated model name
                "Google Gemini": "gemini",
                "GROQ Vision": "groq"
            }
            selected_model = st.selectbox("Choose a model", list(models.keys()))

            # Show model-specific info
            model_info = {
                "OpenAI GPT-4o": "Uses GPT-4o vision model for detailed image analysis",
                "Google Gemini": "Uses Gemini-1.5-flash for fast and accurate captions",
                "GROQ Vision": "Uses Llama-3.2-11b-vision for high-speed processing"
            }
            st.info(model_info[selected_model])

            if st.button("Generate Caption", type="primary"):
                # Check if APIs are configured at all before doing any work.
                if not any([openai_key, gemini_key, groq_key]):
                    st.error("Please add API keys to your .env file and click 'Configure APIs'")
                    return
                try:
                    model_key = models[selected_model]
                    # Check specific API availability for the chosen model.
                    if model_key == "openai" and not openai_key:
                        st.error("OpenAI API key not available. Please add it to your .env file.")
                        return
                    elif model_key == "gemini" and not gemini_key:
                        st.error("Gemini API key not available. Please add it to your .env file.")
                        return
                    elif model_key == "groq" and not groq_key:
                        st.error("GROQ API key not available. Please add it to your .env file.")
                        return

                    # Dispatch to the provider-specific generator method.
                    with st.spinner(f"Generating caption with {selected_model}..."):
                        if model_key == "openai":
                            caption = st.session_state.caption_generator.generate_caption_openai(image)
                        elif model_key == "gemini":
                            caption = st.session_state.caption_generator.generate_caption_gemini(image)
                        elif model_key == "groq":
                            caption = st.session_state.caption_generator.generate_caption_groq(image)

                    # Stash the result so col2 can render/edit it on rerun.
                    st.session_state.current_caption = caption
                    st.session_state.current_image = image
                    st.session_state.current_model = selected_model

                    # Add to history
                    st.session_state.caption_history.add_interaction(
                        uploaded_file.name,
                        selected_model,
                        caption
                    )
                    st.success(f"Caption generated successfully with {selected_model}!")
                except Exception as e:
                    st.error(f"Error generating caption: {str(e)}")
                    st.error("Please check your API keys and internet connection.")

    with col2:
        st.header("✨ Generated Caption & Preview")
        if hasattr(st.session_state, 'current_caption'):
            # Editable caption — the user can tweak the model output before
            # it is drawn onto the image.
            edited_caption = st.text_area(
                "Generated Caption (editable)",
                st.session_state.current_caption,
                height=100,
                help="You can edit the caption before applying it to the image"
            )
            # Update the caption if edited
            if edited_caption != st.session_state.current_caption:
                st.session_state.current_caption = edited_caption

            # Generate preview with caption
            if hasattr(st.session_state, 'current_image'):
                # Convert PIL (RGB) to OpenCV (BGR) format for the overlay helpers.
                cv_image = cv2.cvtColor(np.array(st.session_state.current_image), cv2.COLOR_RGB2BGR)
                try:
                    if caption_method == "Overlay on Image":
                        result_image = ImageCaptionOverlay.add_caption_overlay(
                            cv_image,
                            st.session_state.current_caption,
                            position=position,
                            font_size=font_size,
                            thickness=thickness
                        )
                    else:
                        # Convert "#RRGGBB" hex colors to (R, G, B) int tuples.
                        bg_rgb = tuple(int(bg_color[i:i+2], 16) for i in (1, 3, 5))
                        text_rgb = tuple(int(text_color[i:i+2], 16) for i in (1, 3, 5))
                        # Use custom font only if the path actually exists.
                        font_path = custom_font if custom_font and os.path.exists(custom_font) else None
                        result_image = ImageCaptionOverlay.add_caption_background(
                            cv_image,
                            st.session_state.current_caption,
                            font_path=font_path,
                            background_color=bg_rgb,
                            text_color=text_rgb,
                            margin=margin
                        )

                    # Convert back to PIL for display
                    result_pil = Image.fromarray(cv2.cvtColor(result_image, cv2.COLOR_BGR2RGB))
                    st.image(result_pil, caption="Image with Caption", use_container_width=True)

                    # Download button — serialize the preview as PNG in memory.
                    img_buffer = io.BytesIO()
                    result_pil.save(img_buffer, format='PNG')
                    st.download_button(
                        label="πŸ“₯ Download Image with Caption",
                        data=img_buffer.getvalue(),
                        file_name=f"captioned_{uploaded_file.name if uploaded_file else 'image'}.png",
                        mime="image/png"
                    )
                except Exception as e:
                    st.error(f"Error processing image: {str(e)}")
        else:
            st.info("πŸ‘† Upload an image and generate a caption to see the preview here")

    # History display (toggled by the sidebar View/Hide buttons).
    if getattr(st.session_state, 'show_history', False):
        st.markdown("---")
        st.header("πŸ“‹ Caption Generation History")
        history = st.session_state.caption_history.get_history()
        if history:
            # Add search/filter functionality across name, caption and model.
            search_term = st.text_input("πŸ” Search history", placeholder="Search by image name or caption...")
            filtered_history = history
            if search_term:
                filtered_history = [
                    item for item in history
                    if search_term.lower() in item['image_name'].lower()
                    or search_term.lower() in item['caption'].lower()
                    or search_term.lower() in item['model'].lower()
                ]
            if filtered_history:
                # Show last 20 items, newest first.
                for i, item in enumerate(reversed(filtered_history[-20:])):
                    with st.expander(f"{item['timestamp'][:19]} - {item['image_name']} ({item['model']})"):
                        st.write(f"**Model:** {item['model']}")
                        st.write(f"**Image:** {item['image_name']}")
                        st.write(f"**Caption:** {item['caption']}")
                        st.write(f"**Timestamp:** {item['timestamp']}")
            else:
                st.info("No matching history found.")
        else:
            st.info("No caption history available.")

    # Footer
    st.markdown("---")
    st.markdown("""
    <div style='text-align: center'>
        <p>Built with Streamlit, LangChain, OpenCV, and multi-model AI APIs</p>
        <p>Supports OpenAI GPT-4o, Google Gemini, and GROQ Vision models</p>
        <p><small>Make sure to add your API keys to the .env file</small></p>
    </div>
    """, unsafe_allow_html=True)
# Entry point when the script is executed directly (e.g. `streamlit run app.py`).
if __name__ == "__main__":
    main()