import os
from concurrent.futures import ThreadPoolExecutor
import streamlit as st
from dotenv import load_dotenv
from models.blip_model import get_blip_caption
from models.groq_vision import answer_followup_stream, get_initial_analysis, get_suggested_questions
from utils.image_utils import get_image_info, validate_and_process
# Load variables from a local .env file (python-dotenv) before any key lookup.
load_dotenv()

# Page-level settings, collected in one mapping and applied in a single call.
# NOTE(review): the page_icon text looks mis-encoded (probably an emoji read
# with the wrong charset) — confirm the intended glyph.
_page_settings = {
    "page_title": "VisualMind AI",
    "page_icon": "ποΈ",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_page_settings)
def get_api_key(key: str) -> str:
    """Look up a credential by name.

    Streamlit secrets are consulted first. If that lookup raises, or yields a
    falsy value, the process environment is used instead.

    Args:
        key: Name of the secret / environment variable.

    Returns:
        The truthy secret value if one is found, otherwise the environment
        variable's value, or ``""`` when that is unset too.
    """
    try:
        # ``or None`` collapses every falsy secret to a single "not found" marker.
        secret = st.secrets.get(key) or None
    except Exception:
        # Best-effort: any failure reading secrets falls through to the env.
        secret = None
    if secret is not None:
        return secret
    return os.getenv(key, "")
# ── Theme init (must happen before CSS is applied) ──
# A session with no stored theme yet starts in light mode.
if "theme" not in st.session_state:
    st.session_state["theme"] = "light"
def get_css(dark: bool) -> str:
    """Return the theme markup that the caller injects via ``st.markdown``.

    Args:
        dark: ``True`` selects the dark palette, ``False`` the light one.

    Returns:
        The rendered f-string template.

    NOTE(review): as visible here the template body is empty, so none of the
    palette names below are interpolated. Presumably the CSS rules reference
    these names and were lost or elided from this view — confirm against the
    original file before renaming or removing any of them.
    """
    if dark:
        # Dark palette: near-black background with translucent white overlays.
        bg = "#07070d"
        sb_bg = "rgba(8,8,18,0.97)"
        sb_border = "rgba(255,255,255,0.055)"
        surface = "rgba(255,255,255,0.025)"
        border = "rgba(255,255,255,0.06)"
        text_p = "#f1f5f9"
        text_s = "rgba(248,250,252,0.78)"
        text_m = "rgba(248,250,252,0.42)"
        text_mm = "rgba(248,250,252,0.35)"
        text_lbl = "rgba(248,250,252,0.6)"
        text_slbl = "rgba(248,250,252,0.65)"
        input_bg = "rgba(255,255,255,0.038)"
        input_bd = "rgba(255,255,255,0.09)"
        input_bgf = "rgba(255,255,255,0.055)"
        dl_bg = "rgba(255,255,255,0.04)"
        dl_color = "rgba(248,250,252,0.75)"
        dl_border = "rgba(255,255,255,0.1)"
        hr_grad = "transparent, rgba(255,255,255,0.07), transparent"
        hr_sb = "rgba(255,255,255,0.05)"
        caption_c = "rgba(248,250,252,0.38)"
        md_p = "rgba(248,250,252,0.78)"
        exp_sum = "rgba(248,250,252,0.8)"
        chat_hover = "rgba(255,255,255,0.035)"
        img_shadow = "0 8px 48px rgba(0,0,0,0.45)"
        img_shadow2= "0 14px 64px rgba(0,0,0,0.55)"
    else:
        # Light palette: pale background with translucent black overlays.
        bg = "#f0f2f8"
        sb_bg = "#ffffff"
        sb_border = "rgba(0,0,0,0.07)"
        surface = "rgba(0,0,0,0.028)"
        border = "rgba(0,0,0,0.07)"
        text_p = "#0f172a"
        text_s = "rgba(15,23,42,0.72)"
        text_m = "rgba(15,23,42,0.48)"
        text_mm = "rgba(15,23,42,0.38)"
        text_lbl = "rgba(15,23,42,0.6)"
        text_slbl = "rgba(15,23,42,0.65)"
        input_bg = "rgba(0,0,0,0.035)"
        input_bd = "rgba(0,0,0,0.1)"
        input_bgf = "rgba(124,58,237,0.05)"
        dl_bg = "rgba(0,0,0,0.04)"
        dl_color = "rgba(15,23,42,0.65)"
        dl_border = "rgba(0,0,0,0.1)"
        hr_grad = "transparent, rgba(0,0,0,0.07), transparent"
        hr_sb = "rgba(0,0,0,0.06)"
        caption_c = "rgba(15,23,42,0.4)"
        md_p = "rgba(15,23,42,0.72)"
        exp_sum = "rgba(15,23,42,0.8)"
        chat_hover = "rgba(0,0,0,0.035)"
        img_shadow = "0 8px 48px rgba(0,0,0,0.12)"
        img_shadow2= "0 14px 64px rgba(0,0,0,0.2)"
    return f"""
"""
# Inject the markup for whichever theme the session currently has selected.
_dark_mode = st.session_state.theme == "dark"
st.markdown(get_css(_dark_mode), unsafe_allow_html=True)

# ── Read API keys silently from environment / Streamlit secrets ──
groq_key, hf_token = get_api_key("GROQ_API_KEY"), get_api_key("HF_TOKEN")
# ── Sidebar ──
# Branding, theme switch, running session stats, model list, and a hint when
# credentials are missing.
# NOTE(review): block nesting was reconstructed from a flattened source;
# confirm that the missing-key warning is meant to live inside the sidebar.
with st.sidebar:
    st.markdown(
        """
ποΈ
VisualMind AI
Dual-Model Vision
""",
        unsafe_allow_html=True,
    )
    st.divider()

    # Theme toggle
    def _on_theme_change():
        """Copy the toggle widget's state into ``st.session_state.theme``."""
        st.session_state.theme = "dark" if st.session_state._theme_toggle else "light"

    st.toggle(
        "Dark mode",
        value=(st.session_state.theme == "dark"),
        key="_theme_toggle",
        on_change=_on_theme_change,
    )
    st.divider()
    # The literal was split across two physical lines (an unterminated string);
    # an explicit newline escape emits the same text from a valid literal.
    st.markdown(
        "Session Stats\n",
        unsafe_allow_html=True,
    )
    # Counters are created on first use and then kept in session state.
    if "total_tokens" not in st.session_state:
        st.session_state.total_tokens = 0
    if "qa_turns" not in st.session_state:
        st.session_state.qa_turns = 0
    col_a, col_b = st.columns(2)
    with col_a:
        st.metric("Tokens", f"{st.session_state.total_tokens:,}")
    with col_b:
        st.metric("Q&A Turns", st.session_state.qa_turns)
    st.divider()
    st.markdown(
        """
Models
π€ BLIP-large
HuggingFace Inference API
β‘ Llama 4 Scout
Groq β 17B vision model
""",
        unsafe_allow_html=True,
    )
    # Show a warning if keys are missing (no inputs — just a hint)
    missing = [
        name
        for name, value in (("GROQ_API_KEY", groq_key), ("HF_TOKEN", hf_token))
        if not value
    ]
    if missing:
        st.divider()
        st.warning(f"Missing env vars: {', '.join(missing)}")
# ── Hero ──
# Banner text rendered at the top of the main area on every run.
_hero_markup = """
β¦ Dual-Model Visual Intelligence
VisualMind AI
Upload an image. Instantly understand everything in it.
"""
st.markdown(_hero_markup, unsafe_allow_html=True)
# ── Session state init ──
# The mapping is rebuilt on every script run, so the list defaults are fresh
# objects each time rather than shared between sessions.
_defaults: dict = dict(
    image_bytes=None,
    blip_caption=None,
    groq_analysis=None,
    conversation_history=[],
    image_info=None,
    suggested_questions=[],
    pending_question="",
)
for _key, _default in _defaults.items():
    # Values already stored for this session are left untouched.
    if _key in st.session_state:
        continue
    st.session_state[_key] = _default
# ── File uploader ──
_uploader_label = "Drop an image here or click to browse"
_accepted_extensions = ["jpg", "jpeg", "png", "webp", "gif"]
uploaded_file = st.file_uploader(
    _uploader_label,
    type=_accepted_extensions,
    help="Max 4 MB β JPEG, PNG, WebP, GIF",
    label_visibility="visible",
)
# Main flow: with an upload, validate it, run both models, then render the
# analysis and the follow-up chat; without one, render the landing cards.
# NOTE(review): block nesting was reconstructed from a flattened source;
# confirm the layout nesting (action row / expander inside the analysis
# column) against the original file.
if uploaded_file:
    image_bytes, error = validate_and_process(uploaded_file)
    if error:
        st.error(error)
    else:
        # A different image invalidates everything derived from the old one.
        if st.session_state.image_bytes != image_bytes:
            st.session_state.image_bytes = image_bytes
            st.session_state.blip_caption = None
            st.session_state.groq_analysis = None
            st.session_state.conversation_history = []
            st.session_state.image_info = get_image_info(image_bytes)
            st.session_state.suggested_questions = []
            st.session_state.pending_question = ""
        need_blip = st.session_state.blip_caption is None
        need_groq = st.session_state.groq_analysis is None
        if need_blip or need_groq:
            if hf_token and groq_key and need_blip and need_groq:
                # Both results are outstanding and both keys exist: issue the
                # two calls concurrently on two worker threads.
                with st.spinner("BLIP + Llama 4 Scout analyzing in parallelβ¦"):
                    with ThreadPoolExecutor(max_workers=2) as executor:
                        fut_blip = executor.submit(get_blip_caption, image_bytes, hf_token)
                        fut_groq = executor.submit(get_initial_analysis, image_bytes, groq_key)
                        st.session_state.blip_caption = fut_blip.result()
                        analysis, tokens = fut_groq.result()
                        st.session_state.groq_analysis = analysis
                        st.session_state.total_tokens += tokens
            else:
                # Otherwise fill in whatever is missing one call at a time,
                # storing a placeholder message when a key is absent.
                if need_blip:
                    if hf_token:
                        with st.spinner("BLIP generating captionβ¦"):
                            st.session_state.blip_caption = get_blip_caption(image_bytes, hf_token)
                    else:
                        st.session_state.blip_caption = "β οΈ HuggingFace token not set."
                if need_groq:
                    if groq_key:
                        with st.spinner("Llama 4 Scout analyzing imageβ¦"):
                            analysis, tokens = get_initial_analysis(image_bytes, groq_key)
                            st.session_state.groq_analysis = analysis
                            st.session_state.total_tokens += tokens
                    else:
                        st.session_state.groq_analysis = "β οΈ Groq API key not set."
        # Generate suggested questions once analysis is available.
        # The prefix check skips the placeholder stored above; the
        # "Groq API error" prefix is presumably emitted by the helper — confirm.
        if (
            not st.session_state.suggested_questions
            and st.session_state.groq_analysis
            and not st.session_state.groq_analysis.startswith(("β οΈ", "Groq API error"))
        ):
            st.session_state.suggested_questions = get_suggested_questions(
                st.session_state.groq_analysis
            )
        st.divider()
        # ── Image + Analysis ──
        col_img, col_analysis = st.columns([0.42, 0.58], gap="large")
        with col_img:
            st.markdown(
                ""
                "πΈ Uploaded Image\n",
                unsafe_allow_html=True,
            )
            st.image(image_bytes, use_container_width=True)
            info = st.session_state.image_info
            st.caption(
                f"{info['width']} Γ {info['height']} px Β· {info['size_kb']} KB Β· {info['mode']}"
            )
        with col_analysis:
            st.markdown(
                ""
                "π€ AI Analysis\n",
                unsafe_allow_html=True,
            )
            # BLIP card
            st.markdown(
                """
π€ HuggingFace BLIP-large β Caption
""",
                unsafe_allow_html=True,
            )
            st.info(st.session_state.blip_caption or "β")
            st.markdown("", unsafe_allow_html=True)
            # Groq card
            st.markdown(
                """
β‘ Llama 4 Scout (Groq) β Deep Analysis
""",
                unsafe_allow_html=True,
            )
            st.success(st.session_state.groq_analysis or "β")
            # Action row
            st.markdown("", unsafe_allow_html=True)
            btn_dl, btn_clear, _ = st.columns([0.38, 0.35, 0.27])
            with btn_dl:
                # Plain-text export: image facts, both model outputs, then the
                # chat transcript with one "[ROLE] content" line per message.
                combined_text = (
                    f"=== VisualMind AI Analysis ===\n\n"
                    f"Image: {info['width']}Γ{info['height']}px, {info['size_kb']} KB\n\n"
                    f"--- BLIP Caption ---\n{st.session_state.blip_caption}\n\n"
                    f"--- Llama 4 Scout Analysis ---\n{st.session_state.groq_analysis}\n\n"
                    f"--- Conversation ---\n"
                    + "\n".join(
                        f"[{m['role'].upper()}] {m['content']}"
                        for m in st.session_state.conversation_history
                    )
                )
                st.download_button(
                    "β¬ Download",
                    data=combined_text,
                    file_name="visualmind_analysis.txt",
                    mime="text/plain",
                    use_container_width=True,
                )
            with btn_clear:
                if st.button("π Clear Chat", use_container_width=True):
                    st.session_state.conversation_history = []
                    st.session_state.pending_question = ""
                    st.rerun()
            # Copy-friendly plain-text view
            st.markdown("", unsafe_allow_html=True)
            with st.expander("π Copy Analysis Text", expanded=False):
                st.code(
                    f"BLIP Caption:\n{st.session_state.blip_caption or 'β'}\n\n"
                    f"Llama 4 Scout Analysis:\n{st.session_state.groq_analysis or 'β'}",
                    language=None,
                )
        # ── Chat ──
        st.divider()
        st.markdown(
            """
π¬ Follow-Up Q&A
Ask anything about the image β objects, text, colors, context, comparisons.
""",
            unsafe_allow_html=True,
        )
        # Suggested questions (shown only before first Q&A turn)
        if st.session_state.suggested_questions and not st.session_state.conversation_history:
            st.markdown(
                "β¦ Suggested questions\n",
                unsafe_allow_html=True,
            )
            sq_cols = st.columns(len(st.session_state.suggested_questions), gap="small")
            for sq_idx, (sq_col, sq) in enumerate(
                zip(sq_cols, st.session_state.suggested_questions)
            ):
                # The position is part of the key: two questions that share
                # their first 24 characters (or are identical) would otherwise
                # collide and Streamlit rejects duplicate widget keys.
                if sq_col.button(sq, use_container_width=True, key=f"sq_{sq_idx}_{sq[:24]}"):
                    # Hand the question to the next run, where it is consumed
                    # exactly like typed chat input.
                    st.session_state.pending_question = sq
                    st.rerun()
            st.markdown("", unsafe_allow_html=True)
        # Replay the stored transcript before handling any new question.
        for message in st.session_state.conversation_history:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])
        user_question = st.chat_input("What would you like to know about this image?")
        active_question = user_question
        # A queued suggested question takes precedence and is cleared once read.
        if st.session_state.pending_question:
            active_question = st.session_state.pending_question
            st.session_state.pending_question = ""
        if active_question:
            if not groq_key:
                st.warning("GROQ_API_KEY is not set β add it to your .env or Space secrets.")
            else:
                with st.chat_message("user"):
                    st.markdown(active_question)
                with st.chat_message("assistant"):
                    # The helper receives this list as an out-parameter; its
                    # first element, if any, is read as the turn's token count.
                    token_bucket: list[int] = []
                    answer = st.write_stream(
                        answer_followup_stream(
                            st.session_state.image_bytes,
                            active_question,
                            st.session_state.conversation_history,
                            groq_key,
                            token_bucket,
                        )
                    )
                    tokens = token_bucket[0] if token_bucket else 0
                    if tokens:
                        st.caption(f"Tokens used this turn: {tokens:,}")
                # Record the turn only after streaming finished, so the history
                # passed to the helper above did not yet contain it.
                st.session_state.conversation_history.append({"role": "user", "content": active_question})
                st.session_state.conversation_history.append({"role": "assistant", "content": answer})
                st.session_state.total_tokens += tokens
                st.session_state.qa_turns += 1
# ── Landing (no image yet) ──
else:
    st.markdown("", unsafe_allow_html=True)
    st.markdown(
        ""
        "What can VisualMind AI do?\n",
        unsafe_allow_html=True,
    )
    use_cases = [
        ("ποΈ", "E-Commerce", "Product tagging, quality checks, listing copy generation"),
        ("βΏ", "Accessibility", "Auto alt-text for websites, screen-reader descriptions"),
        ("π", "Research", "Charts, slides, documents, scene Q&A, OCR context"),
    ]
    cols = st.columns(3, gap="medium")
    for col, (icon, title, desc) in zip(cols, use_cases):
        with col:
            # NOTE(review): the card template is empty in this view, so icon,
            # title and desc are never rendered — presumably the markup that
            # interpolates them was lost; confirm against the original file.
            st.markdown(
                f"""
""",
                unsafe_allow_html=True,
            )