"""Gradio demo: zero-shot fashion item classification of an image URL with CLIP."""

from io import BytesIO

import gradio as gr
import requests
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# Candidate labels that CLIP scores the image against.
fashion_items = ['top', 'trousers', 'bottom', 'jumper']

# Upper bound (seconds) on the image download so a stalled host cannot
# block the request handler forever.
REQUEST_TIMEOUT_SECONDS = 10

# Load model and processor - using standard CLIP model instead
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

# CLIP processes text and images together, so no need for separate text
# preprocessing.


def predict_from_url(url: str) -> dict:
    """Classify the image found at *url* against ``fashion_items``.

    Args:
        url: HTTP(S) address of the image to classify.

    Returns:
        A dict mapping each entry of ``fashion_items`` to its softmax
        probability (a float). On failure, a single-entry dict
        ``{"Error": <message>}`` is returned instead of raising.
    """
    # Treat a missing, empty, or whitespace-only value as "no URL given".
    if not url or not url.strip():
        # NOTE(review): gr.Label presumably expects numeric confidences;
        # confirm this string-valued dict renders as intended, or consider
        # raising gr.Error here and below instead.
        return {"Error": "Please input a URL"}

    try:
        response = requests.get(url.strip(), timeout=REQUEST_TIMEOUT_SECONDS)
        # Fail on 4xx/5xx rather than handing an error page to PIL.
        response.raise_for_status()
        # convert() forces the (otherwise lazy) decode to happen inside the
        # try block and normalises palette/alpha/greyscale images to RGB.
        image = Image.open(BytesIO(response.content)).convert("RGB")
    except Exception as e:  # UI boundary: report any load failure to the user
        return {"Error": f"Failed to load image: {str(e)}"}

    inputs = processor(
        images=image,
        text=fashion_items,
        return_tensors="pt",
        padding=True,
    )

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    # One row per image; softmax over the label axis gives per-label
    # probabilities for the single input image.
    text_probs = outputs.logits_per_image.softmax(dim=-1)

    return dict(zip(fashion_items, text_probs[0].tolist()))


# Gradio interface
demo = gr.Interface(
    fn=predict_from_url,
    inputs=gr.Textbox(label="Enter Image URL"),
    outputs=gr.Label(label="Classification Results"),
    title="Fashion Item Classifier",
    allow_flagging="never",
)

# Launch the interface
demo.launch()