"""Streamlit front end for VisualMind AI.

Uploads an image, captions it with BLIP, analyses it with the Groq vision
model, and then offers a streamed follow-up Q&A chat about the image.
"""

import os
from concurrent.futures import ThreadPoolExecutor

import streamlit as st
from dotenv import load_dotenv

from models.blip_model import get_blip_caption
from models.groq_vision import (
    answer_followup_stream,
    get_initial_analysis,
    get_suggested_questions,
)
from utils.image_utils import get_image_info, validate_and_process

load_dotenv()

st.set_page_config(
    page_title="VisualMind AI",
    page_icon="👁️",
    layout="wide",
    initial_sidebar_state="expanded",
)


def get_api_key(key: str) -> str:
    """Return the secret named *key*, or an empty string when it is not set.

    Streamlit secrets are consulted first; the process environment (which
    ``load_dotenv()`` has already populated) is the fallback.
    """
    try:
        val = st.secrets.get(key)
        if val:
            return val
    except Exception:
        # Deliberate best-effort: any failure while reading Streamlit secrets
        # simply falls through to the environment lookup below.
        pass
    return os.getenv(key, "")


# ── Theme init (must happen before CSS is applied) ────────────────────────────
if "theme" not in st.session_state:
    st.session_state.theme = "light"


def get_css(dark: bool) -> str:
    """Return the stylesheet markup for the dark (``True``) or light theme.

    NOTE(review): the template returned below is empty in this view, so the
    palette values are currently unused — confirm the stylesheet body was not
    lost before removing any of them.
    """
    if dark:
        bg = "#07070d"
        sb_bg = "rgba(8,8,18,0.97)"
        sb_border = "rgba(255,255,255,0.055)"
        surface = "rgba(255,255,255,0.025)"
        border = "rgba(255,255,255,0.06)"
        text_p = "#f1f5f9"
        text_s = "rgba(248,250,252,0.78)"
        text_m = "rgba(248,250,252,0.42)"
        text_mm = "rgba(248,250,252,0.35)"
        text_lbl = "rgba(248,250,252,0.6)"
        text_slbl = "rgba(248,250,252,0.65)"
        input_bg = "rgba(255,255,255,0.038)"
        input_bd = "rgba(255,255,255,0.09)"
        input_bgf = "rgba(255,255,255,0.055)"
        dl_bg = "rgba(255,255,255,0.04)"
        dl_color = "rgba(248,250,252,0.75)"
        dl_border = "rgba(255,255,255,0.1)"
        hr_grad = "transparent, rgba(255,255,255,0.07), transparent"
        hr_sb = "rgba(255,255,255,0.05)"
        caption_c = "rgba(248,250,252,0.38)"
        md_p = "rgba(248,250,252,0.78)"
        exp_sum = "rgba(248,250,252,0.8)"
        chat_hover = "rgba(255,255,255,0.035)"
        img_shadow = "0 8px 48px rgba(0,0,0,0.45)"
        img_shadow2 = "0 14px 64px rgba(0,0,0,0.55)"
    else:
        bg = "#f0f2f8"
        sb_bg = "#ffffff"
        sb_border = "rgba(0,0,0,0.07)"
        surface = "rgba(0,0,0,0.028)"
        border = "rgba(0,0,0,0.07)"
        text_p = "#0f172a"
        text_s = "rgba(15,23,42,0.72)"
        text_m = "rgba(15,23,42,0.48)"
        text_mm = "rgba(15,23,42,0.38)"
        text_lbl = "rgba(15,23,42,0.6)"
        text_slbl = "rgba(15,23,42,0.65)"
        input_bg = "rgba(0,0,0,0.035)"
        input_bd = "rgba(0,0,0,0.1)"
        input_bgf = "rgba(124,58,237,0.05)"
        dl_bg = "rgba(0,0,0,0.04)"
        dl_color = "rgba(15,23,42,0.65)"
        dl_border = "rgba(0,0,0,0.1)"
        hr_grad = "transparent, rgba(0,0,0,0.07), transparent"
        hr_sb = "rgba(0,0,0,0.06)"
        caption_c = "rgba(15,23,42,0.4)"
        md_p = "rgba(15,23,42,0.72)"
        exp_sum = "rgba(15,23,42,0.8)"
        chat_hover = "rgba(0,0,0,0.035)"
        img_shadow = "0 8px 48px rgba(0,0,0,0.12)"
        img_shadow2 = "0 14px 64px rgba(0,0,0,0.2)"

    return f""" """


st.markdown(get_css(st.session_state.theme == "dark"), unsafe_allow_html=True)

# ── Read API keys silently from environment / Streamlit secrets ───────────────
groq_key = get_api_key("GROQ_API_KEY")
hf_token = get_api_key("HF_TOKEN")

# ── Sidebar ───────────────────────────────────────────────────────────────────
with st.sidebar:
    st.markdown(
        """
👁️
VisualMind AI
Dual-Model Vision
""",
        unsafe_allow_html=True,
    )

    st.divider()

    # Theme toggle
    def _on_theme_change():
        """Copy the toggle widget's state into the persistent theme setting."""
        st.session_state.theme = "dark" if st.session_state._theme_toggle else "light"

    st.toggle(
        "Dark mode",
        value=(st.session_state.theme == "dark"),
        key="_theme_toggle",
        on_change=_on_theme_change,
    )

    st.divider()

    st.markdown(
        "\nSession Stats\n",
        unsafe_allow_html=True,
    )

    if "total_tokens" not in st.session_state:
        st.session_state.total_tokens = 0
    if "qa_turns" not in st.session_state:
        st.session_state.qa_turns = 0

    # NOTE: these metrics render before the analysis and Q&A sections below
    # update the counters, so a run's usage appears here on the next rerun.
    col_a, col_b = st.columns(2)
    with col_a:
        st.metric("Tokens", f"{st.session_state.total_tokens:,}")
    with col_b:
        st.metric("Q&A Turns", st.session_state.qa_turns)

    st.divider()

    st.markdown(
        """
Models
🤗 BLIP-large
HuggingFace Inference API
⚡ Llama 4 Scout
Groq — 17B vision model
""",
        unsafe_allow_html=True,
    )

    # Show a warning if keys are missing (no inputs — just a hint)
    if not groq_key or not hf_token:
        st.divider()
        missing = [
            name
            for name, value in (("GROQ_API_KEY", groq_key), ("HF_TOKEN", hf_token))
            if not value
        ]
        st.warning(f"Missing env vars: {', '.join(missing)}")

# ── Hero ──────────────────────────────────────────────────────────────────────
st.markdown(
    """
✦\u00a0 Dual-Model Visual Intelligence
VisualMind AI
Upload an image. Instantly understand everything in it.
""",
    unsafe_allow_html=True,
)

# ── Session state init ────────────────────────────────────────────────────────
_defaults: dict = {
    "image_bytes": None,
    "blip_caption": None,
    "groq_analysis": None,
    "conversation_history": [],
    "image_info": None,
    "suggested_questions": [],
    "pending_question": "",
}
for _key, _default in _defaults.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default

# ── File uploader ─────────────────────────────────────────────────────────────
uploaded_file = st.file_uploader(
    "Drop an image here or click to browse",
    type=["jpg", "jpeg", "png", "webp", "gif"],
    help="Max 4 MB — JPEG, PNG, WebP, GIF",
    label_visibility="visible",
)

if uploaded_file:
    image_bytes, error = validate_and_process(uploaded_file)

    if error:
        st.error(error)
    else:
        # A different image invalidates every cached result and the chat.
        if st.session_state.image_bytes != image_bytes:
            st.session_state.image_bytes = image_bytes
            st.session_state.blip_caption = None
            st.session_state.groq_analysis = None
            st.session_state.conversation_history = []
            st.session_state.image_info = get_image_info(image_bytes)
            st.session_state.suggested_questions = []
            st.session_state.pending_question = ""

        need_blip = st.session_state.blip_caption is None
        need_groq = st.session_state.groq_analysis is None

        if need_blip or need_groq:
            if hf_token and groq_key and need_blip and need_groq:
                # Both models are needed and both keys exist: overlap the calls.
                with st.spinner("BLIP + Llama 4 Scout analyzing in parallel…"):
                    with ThreadPoolExecutor(max_workers=2) as executor:
                        fut_blip = executor.submit(get_blip_caption, image_bytes, hf_token)
                        fut_groq = executor.submit(get_initial_analysis, image_bytes, groq_key)
                        st.session_state.blip_caption = fut_blip.result()
                        analysis, tokens = fut_groq.result()
                        st.session_state.groq_analysis = analysis
                        st.session_state.total_tokens += tokens
            else:
                if need_blip:
                    if hf_token:
                        with st.spinner("BLIP generating caption…"):
                            st.session_state.blip_caption = get_blip_caption(image_bytes, hf_token)
                    else:
                        st.session_state.blip_caption = "⚠️ HuggingFace token not set."

                if need_groq:
                    if groq_key:
                        with st.spinner("Llama 4 Scout analyzing image…"):
                            analysis, tokens = get_initial_analysis(image_bytes, groq_key)
                            st.session_state.groq_analysis = analysis
                            st.session_state.total_tokens += tokens
                    else:
                        st.session_state.groq_analysis = "⚠️ Groq API key not set."

        # Generate suggested questions once analysis is available
        if (
            not st.session_state.suggested_questions
            and st.session_state.groq_analysis
            and not st.session_state.groq_analysis.startswith(("⚠️", "Groq API error"))
        ):
            st.session_state.suggested_questions = get_suggested_questions(
                st.session_state.groq_analysis
            )

        st.divider()

        # ── Image + Analysis ──────────────────────────────────────────────────
        col_img, col_analysis = st.columns([0.42, 0.58], gap="large")

        with col_img:
            st.markdown(
                "\n"
                "📸\u00a0 Uploaded Image\n",
                unsafe_allow_html=True,
            )
            st.image(image_bytes, use_container_width=True)
            info = st.session_state.image_info
            st.caption(
                f"{info['width']} × {info['height']} px · {info['size_kb']} KB · {info['mode']}"
            )

        with col_analysis:
            st.markdown(
                "\n"
                "🤖\u00a0 AI Analysis\n",
                unsafe_allow_html=True,
            )

            # BLIP card
            st.markdown(
                """
🤗\u00a0 HuggingFace BLIP-large — Caption
""",
                unsafe_allow_html=True,
            )
            st.info(st.session_state.blip_caption or "—")

            st.markdown("\n", unsafe_allow_html=True)

            # Groq card
            st.markdown(
                """
⚡\u00a0 Llama 4 Scout (Groq) — Deep Analysis
""",
                unsafe_allow_html=True,
            )
            st.success(st.session_state.groq_analysis or "—")

            # Action row
            st.markdown("\n", unsafe_allow_html=True)
            btn_dl, btn_clear, _ = st.columns([0.38, 0.35, 0.27])

            with btn_dl:
                combined_text = (
                    f"=== VisualMind AI Analysis ===\n\n"
                    f"Image: {info['width']}×{info['height']}px, {info['size_kb']} KB\n\n"
                    f"--- BLIP Caption ---\n{st.session_state.blip_caption}\n\n"
                    f"--- Llama 4 Scout Analysis ---\n{st.session_state.groq_analysis}\n\n"
                    f"--- Conversation ---\n"
                    + "\n".join(
                        f"[{m['role'].upper()}] {m['content']}"
                        for m in st.session_state.conversation_history
                    )
                )
                st.download_button(
                    "⬇ Download",
                    data=combined_text,
                    file_name="visualmind_analysis.txt",
                    mime="text/plain",
                    use_container_width=True,
                )

            with btn_clear:
                if st.button("🗑 Clear Chat", use_container_width=True):
                    st.session_state.conversation_history = []
                    st.session_state.pending_question = ""
                    st.rerun()

            # Copy-friendly plain-text view
            st.markdown("\n", unsafe_allow_html=True)
            with st.expander("📋 Copy Analysis Text", expanded=False):
                st.code(
                    f"BLIP Caption:\n{st.session_state.blip_caption or '—'}\n\n"
                    f"Llama 4 Scout Analysis:\n{st.session_state.groq_analysis or '—'}",
                    language=None,
                )

        # ── Chat ──────────────────────────────────────────────────────────────
        st.divider()
        st.markdown(
            """
💬\u00a0 Follow-Up Q&A
Ask anything about the image — objects, text, colors, context, comparisons.
""",
            unsafe_allow_html=True,
        )

        # Suggested questions (shown only before first Q&A turn)
        if st.session_state.suggested_questions and not st.session_state.conversation_history:
            st.markdown(
                "\n✦ Suggested questions\n",
                unsafe_allow_html=True,
            )
            sq_cols = st.columns(len(st.session_state.suggested_questions), gap="small")
            for sq_idx, (sq_col, sq) in enumerate(
                zip(sq_cols, st.session_state.suggested_questions)
            ):
                # The index keeps widget keys unique even when two questions
                # share the same first 24 characters.
                if sq_col.button(sq, use_container_width=True, key=f"sq_{sq_idx}_{sq[:24]}"):
                    st.session_state.pending_question = sq
                    st.rerun()
            st.markdown("\n", unsafe_allow_html=True)

        for message in st.session_state.conversation_history:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

        user_question = st.chat_input("What would you like to know about this image?")

        # A question queued by a suggestion button takes precedence and is
        # consumed exactly once.
        active_question = user_question
        if st.session_state.pending_question:
            active_question = st.session_state.pending_question
            st.session_state.pending_question = ""

        if active_question:
            if not groq_key:
                st.warning("GROQ_API_KEY is not set — add it to your .env or Space secrets.")
            else:
                with st.chat_message("user"):
                    st.markdown(active_question)

                with st.chat_message("assistant"):
                    # Passed to the streaming helper; its first element, if
                    # any, is read back below as this turn's token count.
                    token_bucket: list[int] = []
                    answer = st.write_stream(
                        answer_followup_stream(
                            st.session_state.image_bytes,
                            active_question,
                            st.session_state.conversation_history,
                            groq_key,
                            token_bucket,
                        )
                    )
                    tokens = token_bucket[0] if token_bucket else 0
                    if tokens:
                        st.caption(f"Tokens used this turn: {tokens:,}")

                st.session_state.conversation_history.append(
                    {"role": "user", "content": active_question}
                )
                st.session_state.conversation_history.append(
                    {"role": "assistant", "content": answer}
                )
                st.session_state.total_tokens += tokens
                st.session_state.qa_turns += 1

# ── Landing (no image yet) ────────────────────────────────────────────────────
else:
    st.markdown("\n", unsafe_allow_html=True)
    st.markdown(
        "\n"
        "What can VisualMind AI do?\n",
        unsafe_allow_html=True,
    )

    use_cases = [
        ("🛍️", "E-Commerce", "Product tagging, quality checks, listing copy generation"),
        ("♿", "Accessibility", "Auto alt-text for websites, screen-reader descriptions"),
        ("🔍", "Research", "Charts, slides, documents, scene Q&A, OCR context"),
    ]
    cols = st.columns(3, gap="medium")
    for col, (icon, title, desc) in zip(cols, use_cases):
        with col:
            st.markdown(
                f"""
{icon}
{title}
{desc}
""",
                unsafe_allow_html=True,
            )