"""Gradio demo that classifies a fashion image (given by URL) against a fixed label list.

At import time the script loads the ``Marqo/marqo-fashionSigLIP`` model and
processor, pre-computes L2-normalised text features for ``fashion_items``,
builds a Gradio interface around ``predict_from_url`` and launches it.
"""

from io import BytesIO

import gradio as gr
import requests
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

fashion_items = ['top', 'trousers', 'jumper']

model_name = 'Marqo/marqo-fashionSigLIP'

# Force CPU usage to avoid device mapping issues.
device = torch.device('cpu')

# Upper bound (seconds) on the image download so a stalled server cannot hang a request.
REQUEST_TIMEOUT_SECONDS = 10


def _load_model(name, target_device):
    """Load the pretrained model and move it to ``target_device``.

    Several ``from_pretrained`` configurations are tried in order. Each
    attempt moves the model with ``Module.to()``, which copies the loaded
    weights. ``Module.to_empty()`` is deliberately NOT used: it re-allocates
    every parameter as uninitialised memory and would silently discard the
    pretrained weights, producing meaningless predictions.

    Args:
        name: Hugging Face model identifier.
        target_device: ``torch.device`` the model should live on.

    Returns:
        The loaded model on ``target_device``.

    Raises:
        Exception: whatever the last-resort attempt raises if every
            configuration fails.
    """
    # (log prefix on failure, extra from_pretrained kwargs) for the recoverable attempts.
    recoverable_attempts = (
        ("Primary loading method failed", {"torch_dtype": torch.float32}),
        ("Fallback method also failed", {}),
    )
    for failure_prefix, extra_kwargs in recoverable_attempts:
        try:
            loaded = AutoModel.from_pretrained(
                name,
                trust_remote_code=True,
                **extra_kwargs,
            )
            # .to() raises if parameters are still on the meta device, which
            # sends us on to the next configuration instead of running with
            # uninitialised weights.
            return loaded.to(target_device)
        except Exception as exc:
            print(f"{failure_prefix}: {exc}")

    # Last resort: disable low_cpu_mem_usage to avoid accelerate issues.
    # Any failure here propagates, since there is nothing left to try.
    loaded = AutoModel.from_pretrained(
        name,
        trust_remote_code=True,
        low_cpu_mem_usage=False,
    )
    return loaded.to(target_device)


model = _load_model(model_name, device)
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

# Pre-compute the normalised text features once; they are reused for every request.
with torch.no_grad():
    processed_texts = processor(
        text=fashion_items,
        return_tensors="pt",
        truncation=True,  # Truncate text to fit the model input size
        padding=True,  # Pad shorter sequences so that all are the same length
    )['input_ids']
    text_features = model.get_text_features(processed_texts)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)


def predict_from_url(url):
    """Classify the image at ``url`` against ``fashion_items``.

    Args:
        url: Address of the image to download.

    Returns:
        A dict mapping each entry of ``fashion_items`` to its softmax
        probability (a float) for the downloaded image.

    Raises:
        gr.Error: if ``url`` is empty, or the image cannot be downloaded or
            decoded. Gradio shows the message to the user; a dict with a
            string value is not a valid ``gr.Label`` payload.
    """
    if not url or not url.strip():
        raise gr.Error("Please input a URL")

    # NOTE(review): this fetches an arbitrary user-supplied URL from the server.
    # If the app is exposed publicly, consider restricting schemes/hosts (SSRF).
    try:
        response = requests.get(url.strip(), timeout=REQUEST_TIMEOUT_SECONDS)
        # Fail on 4xx/5xx instead of handing an HTML error page to PIL.
        response.raise_for_status()
        # Normalise palette/greyscale/alpha images to three channels.
        image = Image.open(BytesIO(response.content)).convert("RGB")
    except Exception as e:
        raise gr.Error(f"Failed to load image: {str(e)}") from e

    processed_image = processor(images=image, return_tensors="pt")['pixel_values']

    with torch.no_grad():
        image_features = model.get_image_features(processed_image)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        # Scaled similarity between the image and every label, turned into probabilities.
        text_probs = (100 * image_features @ text_features.T).softmax(dim=-1)

    return dict(zip(fashion_items, text_probs[0].tolist()))


# Gradio interface
demo = gr.Interface(
    fn=predict_from_url,
    inputs=gr.Textbox(label="Enter Image URL"),
    outputs=gr.Label(label="Classification Results"),
    title="Fashion Item Classifier",
    allow_flagging="never",
)

# Launch the interface
demo.launch()