"""Streamlit app: upload an image, OCR its text with EasyOCR, and ask an LLM about it.

The OpenAI API key is read from the ``OPENAI_API_KEY`` environment variable.
"""

import io
import os

import easyocr
import numpy as np
import openai  # Using OpenAI GPT (or replace with GROQ API)
import streamlit as st
import torch
from PIL import Image
from transformers import CLIPModel, CLIPImageProcessor

# set_page_config() must be the first Streamlit command executed in the script.
st.set_page_config(page_title="Multimodal AI Assistant", layout="wide")

# Chat model used for question answering.
OPENAI_MODEL = "gpt-3.5-turbo"

# System prompt sent with every question.
SYSTEM_PROMPT = (
    "You are an AI assistant helping answer questions based on extracted "
    "text from an image."
)


# ---- Load CLIP Model (Vision Only) ---- #
@st.cache_resource
def load_clip_model():
    """Load the CLIP model and its image processor once per server process.

    Returns:
        A ``(model, processor)`` tuple.
    """
    model = CLIPModel.from_pretrained(
        "fxmarty/clip-vision-model-tiny",
        ignore_mismatched_sizes=True,  # tolerate checkpoint/config size mismatch
    )
    processor = CLIPImageProcessor.from_pretrained("fxmarty/clip-vision-model-tiny")
    return model, processor


# NOTE(review): `model` and `processor` are not referenced anywhere else in the
# visible script; they are kept so module-level behavior stays unchanged.
model, processor = load_clip_model()


# ---- Load OCR (EasyOCR) ---- #
@st.cache_resource
def load_ocr():
    """Create the English EasyOCR reader once per server process."""
    return easyocr.Reader(['en'])


reader = load_ocr()


def ask_llm(extracted_text, user_question, api_key):
    """Ask the chat model a question about the OCR'd text.

    Works with both the openai>=1.0 client interface and the legacy
    ``openai.ChatCompletion`` interface, whichever the installed package offers.

    Args:
        extracted_text: Text recovered from the uploaded image.
        user_question: The question typed by the user.
        api_key: OpenAI API key used to authenticate the request.

    Returns:
        The content of the first choice returned by the model.

    Raises:
        Exception: Whatever the openai package raises on network, auth, or
            request errors; the caller is expected to report it to the user.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"Extracted text: {extracted_text}\n\nQuestion: {user_question}",
        },
    ]

    # openai>=1.0 removed ChatCompletion in favor of an explicit client object.
    if hasattr(openai, "OpenAI"):
        client = openai.OpenAI(api_key=api_key)
        response = client.chat.completions.create(model=OPENAI_MODEL, messages=messages)
        return response.choices[0].message.content

    # Legacy (openai<1.0) interface, as used by the original script.
    openai.api_key = api_key
    response = openai.ChatCompletion.create(model=OPENAI_MODEL, messages=messages)
    return response["choices"][0]["message"]["content"]


# ---- Streamlit UI ---- #
st.title("🖼️ Multimodal AI Assistant")
st.write("Upload an image, extract text, and ask questions!")

# ---- Upload Image ---- #
uploaded_file = st.file_uploader("📤 Upload an image", type=["jpg", "png", "jpeg"])

extracted_text = None  # Text recovered by OCR; stays None until an image is processed

if uploaded_file is not None:
    try:
        # Normalize to RGB so the array handed to EasyOCR always has 3 channels.
        image = Image.open(uploaded_file).convert("RGB")
    except OSError:
        # PIL raises OSError (incl. UnidentifiedImageError) for corrupt/non-image data.
        st.error("Could not read the uploaded file as an image.")
    else:
        st.image(image, caption="Uploaded Image", use_container_width=True)

        # EasyOCR is given a NumPy array rather than a PIL image.
        image_np = np.array(image)

        with st.spinner("🔍 Extracting text from image..."):
            extracted_text_list = reader.readtext(image_np, detail=0)
            extracted_text = " ".join(extracted_text_list)  # Combine extracted text

        st.write("### 📝 Extracted Text:")
        if extracted_text:
            st.success(extracted_text)
        else:
            st.warning("No readable text found in the image.")

# ---- Question Answering Section ---- #
if extracted_text:
    user_question = st.text_input("💡 Ask a question about the extracted text:")

    if user_question:
        # Never hard-code credentials; take the key from the environment.
        api_key = os.environ.get("OPENAI_API_KEY")
        if not api_key:
            st.error(
                "OPENAI_API_KEY is not set. Export it in the environment "
                "before asking questions."
            )
        else:
            answer = None
            with st.spinner("🤖 Thinking..."):
                try:
                    answer = ask_llm(extracted_text, user_question, api_key)
                except Exception as exc:  # UI boundary: report instead of crashing
                    st.error(f"The language model request failed: {exc}")

            if answer is not None:
                st.write("### 🤖 AI Answer:")
                st.success(answer)