"""Streamlit multimodal assistant: upload an image, OCR its text, run CLIP on it."""

import streamlit as st
from PIL import Image
import torch
import easyocr
from transformers import CLIPProcessor, CLIPModel


# ---- Load CLIP Model ---- #
@st.cache_resource
def load_clip_model():
    """Load and cache the CLIP model and its processor.

    Returns:
        tuple: (CLIPModel, CLIPProcessor) loaded from the Hugging Face hub.
    """
    # NOTE(review): "fxmarty/clip-vision-model-tiny" is a vision-tower test
    # checkpoint; ignore_mismatched_sizes suppresses the head-size mismatch,
    # but text-side features may be meaningless — confirm the intended model.
    model = CLIPModel.from_pretrained(
        "fxmarty/clip-vision-model-tiny",
        ignore_mismatched_sizes=True,  # Fix model size mismatch
    )
    processor = CLIPProcessor.from_pretrained("fxmarty/clip-vision-model-tiny")
    return model, processor


model, processor = load_clip_model()


# ---- Load OCR (EasyOCR) ---- #
@st.cache_resource
def load_ocr():
    """Load and cache the English-language EasyOCR reader."""
    return easyocr.Reader(['en'])


reader = load_ocr()

# ---- Streamlit UI ---- #
st.set_page_config(page_title="Multimodal AI Assistant", layout="wide")
st.title("🖼️ Multimodal AI Assistant")
st.write("Upload an image and ask a question about it!")

# ---- Upload Image ---- #
uploaded_file = st.file_uploader("📤 Upload an image", type=["jpg", "png", "jpeg"])

if uploaded_file is not None:
    # Display Image
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Extract Text using OCR.
    # BUG FIX: readtext() accepts a file path, raw bytes, or a numpy array —
    # not a Streamlit UploadedFile (whose read position Image.open() has also
    # already consumed). Pass the underlying bytes instead.
    with st.spinner("🔍 Extracting text from image..."):
        extracted_text = reader.readtext(uploaded_file.getvalue(), detail=0)

    st.write("### 📝 Extracted Text:")
    if extracted_text:
        # readtext(detail=0) returns a list of strings; join for readability
        # instead of rendering the raw Python list.
        st.success(" ".join(extracted_text))
    else:
        st.warning("No readable text found in the image.")

    # ---- Ask a Question About the Image ---- #
    user_question = st.text_input("🤖 Ask a question about the image:")

    if user_question:
        with st.spinner("🔍 Analyzing image and generating response..."):
            inputs = processor(text=[user_question], images=image, return_tensors="pt")
            # BUG FIX: get_image_features() only accepts pixel_values; passing
            # the processor's text keys (input_ids, attention_mask) raises
            # TypeError. NOTE(review): the question therefore does not yet
            # influence the output — this is placeholder behavior.
            outputs = model.get_image_features(pixel_values=inputs["pixel_values"])

        st.write("### 🏆 AI Response:")
        st.write("CLIP Model has processed the image! (Further improvements coming soon)")