"""Streamlit app: classify the dish in an uploaded food image with moondream2."""

import streamlit as st
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor


# Load the model and processor
@st.cache_resource
def load_model():
    """Load and cache the moondream2 model + processor (runs once per session).

    Returns:
        tuple: (model, processor) ready for inference on ``model.device``.
    """
    st.text("Loading model...")
    processor = AutoProcessor.from_pretrained(
        "vikhyatk/moondream2", trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        "vikhyatk/moondream2",
        revision="2025-03-27",  # pin revision so remote code is reproducible
        trust_remote_code=True,
        # device_map="auto"  # Enable if running on GPU
    )
    st.text("Model loaded successfully!")
    return model, processor


model, processor = load_model()

# File uploader
uploaded_file = st.file_uploader(
    "Upload an image of a dish", type=["jpg", "jpeg", "png"]
)

if uploaded_file:
    image = Image.open(uploaded_file).convert("RGB")
    # use_container_width replaces the deprecated use_column_width parameter
    st.image(image, caption="Uploaded Image", use_container_width=True)

    # Auto-ask the food question
    question = "What food is in this image?"  # or try: "What is the name of the dish?"

    if st.button("Classify Dish"):
        # NOTE(review): moondream2's trust_remote_code builds often expose
        # model.query(image, question) rather than a standard processor/generate
        # pipeline — confirm this AutoProcessor path works for this revision.
        inputs = processor(image, question, return_tensors="pt").to(model.device)
        with torch.no_grad():  # inference only; no gradients needed
            output = model.generate(**inputs, max_new_tokens=64)
        answer = processor.batch_decode(output, skip_special_tokens=True)[0].strip()
        st.success(f"🍽️ Predicted Dish: **{answer}**")