Create app.py
app.py ADDED
@@ -0,0 +1,300 @@
import json
from pathlib import Path

import gradio as gr
import open_clip
import torch
import torch.nn as nn
from PIL import Image

# Load category mapping from JSON file
def load_category_mapping():
    with open("cat_attr_map.json", "r", encoding="utf-8") as f:
        return json.load(f)

CATEGORY_MAPPING = load_category_mapping()

class CategoryAwareAttributePredictor(nn.Module):
    def __init__(
        self,
        clip_dim=512,
        category_attributes=None,
        attribute_dims=None,
        hidden_dim=512,
        dropout_rate=0.2,
        num_hidden_layers=1,
    ):
        super(CategoryAwareAttributePredictor, self).__init__()

        self.category_attributes = category_attributes

        # Create prediction heads for each category-attribute combination
        self.attribute_predictors = nn.ModuleDict()

        for category, attributes in category_attributes.items():
            for attr_name in attributes.keys():
                key = f"{category}_{attr_name}"
                if key in attribute_dims:
                    layers = []

                    # Input layer
                    layers.append(nn.Linear(clip_dim, hidden_dim))
                    layers.append(nn.LayerNorm(hidden_dim))
                    layers.append(nn.ReLU())
                    layers.append(nn.Dropout(dropout_rate))

                    # Additional hidden layers
                    for _ in range(num_hidden_layers - 1):
                        layers.append(nn.Linear(hidden_dim, hidden_dim // 2))
                        layers.append(nn.ReLU())
                        layers.append(nn.Dropout(dropout_rate))

                        hidden_dim = hidden_dim // 2

                    # Output layer
                    layers.append(nn.Linear(hidden_dim, attribute_dims[key]))

                    self.attribute_predictors[key] = nn.Sequential(*layers)

    def forward(self, clip_features, category):
        results = {}
        category_attrs = self.category_attributes[category]

        clip_features = clip_features.float()

        for attr_name in category_attrs.keys():
            key = f"{category}_{attr_name}"
            if key in self.attribute_predictors:
                results[key] = self.attribute_predictors[key](clip_features)

        return results

class SingleImageInference:
    def __init__(self, model_path_gelu, model_path_convnext, device="cuda", cache_dir=None):
        self.device = device

        # Load models
        (
            self.model_gelu,
            self.clip_model_gelu,
            self.clip_preprocess_gelu,
            self.checkpoint_gelu,
            self.model_convnext,
            self.clip_model_convnext,
            self.clip_preprocess_convnext,
            self.checkpoint_convnext,
        ) = self.load_models(model_path_gelu, model_path_convnext, self.device, cache_dir)

    def clean_state_dict(self, state_dict):
        """Clean checkpoint state dict."""
        new_state_dict = {}
        for k, v in state_dict.items():
            name = k.replace("_orig_mod.", "")
            new_state_dict[name] = v
        return new_state_dict

    def create_clip_model_convnext(self, device, cache_dir=None):
        model, preprocess_train, _ = open_clip.create_model_and_transforms(
            "convnext_xxlarge",
            device=device,
            pretrained="laion2b_s34b_b82k_augreg_soup",
            precision="fp32",
            cache_dir=cache_dir,
        )
        model = model.float()
        return model, preprocess_train

    def create_clip_model_gelu(self, device, cache_dir=None):
        model, preprocess_train, _ = open_clip.create_model_and_transforms(
            "ViT-H-14-quickgelu",
            device=device,
            pretrained="dfn5b",
            precision="fp32",  # Explicitly set precision to fp32
            cache_dir=cache_dir,
        )
        model = model.float()
        return model, preprocess_train

    def load_models(self, model_path_gelu, model_path_convnext, device, cache_dir=None):
        # Load the CLIP model gelu
        checkpoint_gelu = torch.load(model_path_gelu, map_location="cpu", weights_only=False)
        clean_clip_checkpoint_gelu = self.clean_state_dict(
            checkpoint_gelu["clip_model_state_dict"]
        )

        clip_model_gelu, clip_preprocess_gelu = self.create_clip_model_gelu("cpu", cache_dir)
        clip_model_gelu.load_state_dict(clean_clip_checkpoint_gelu)
        clip_model_gelu = clip_model_gelu.to(device)
        del clean_clip_checkpoint_gelu
        torch.cuda.empty_cache()

        # Load the CLIP model convnext
        checkpoint_convnext = torch.load(model_path_convnext, map_location="cpu", weights_only=False)
        clean_clip_checkpoint_convnext = self.clean_state_dict(
            checkpoint_convnext["clip_model_state_dict"]
        )

        clip_model_convnext, clip_preprocess_convnext = self.create_clip_model_convnext(
            "cpu", cache_dir
        )
        clip_model_convnext.load_state_dict(clean_clip_checkpoint_convnext)
        clip_model_convnext = clip_model_convnext.to(device)
        del clean_clip_checkpoint_convnext
        torch.cuda.empty_cache()

        # Load the attribute predictor models
        model_gelu = CategoryAwareAttributePredictor(
            clip_dim=checkpoint_gelu["model_config"]["clip_dim"],
            category_attributes=checkpoint_gelu["dataset_info"]["category_mapping"],
            attribute_dims={
                key: len(values)
                for key, values in checkpoint_gelu["dataset_info"][
                    "attribute_classes"
                ].items()
            },
            hidden_dim=checkpoint_gelu["model_config"]["hidden_dim"],
            dropout_rate=checkpoint_gelu["model_config"]["dropout_rate"],
            num_hidden_layers=checkpoint_gelu["model_config"]["num_hidden_layers"],
        ).to(device)

        model_convnext = CategoryAwareAttributePredictor(
            clip_dim=checkpoint_convnext["model_config"]["clip_dim"],
            category_attributes=checkpoint_convnext["dataset_info"]["category_mapping"],
            attribute_dims={
                key: len(values)
                for key, values in checkpoint_convnext["dataset_info"][
                    "attribute_classes"
                ].items()
            },
            hidden_dim=checkpoint_convnext["model_config"]["hidden_dim"],
            dropout_rate=checkpoint_convnext["model_config"]["dropout_rate"],
            num_hidden_layers=checkpoint_convnext["model_config"]["num_hidden_layers"],
        ).to(device)

        clean_cat_checkpoint_gelu = self.clean_state_dict(checkpoint_gelu["model_state_dict"])
        model_gelu.load_state_dict(clean_cat_checkpoint_gelu)
        del clean_cat_checkpoint_gelu

        clean_cat_checkpoint_convnext = self.clean_state_dict(
            checkpoint_convnext["model_state_dict"]
        )
        model_convnext.load_state_dict(clean_cat_checkpoint_convnext)
        del clean_cat_checkpoint_convnext

        if hasattr(torch, "compile"):
            model_gelu = torch.compile(model_gelu)
            clip_model_gelu = torch.compile(clip_model_gelu)
            model_convnext = torch.compile(model_convnext)
            clip_model_convnext = torch.compile(clip_model_convnext)

        model_gelu.eval()
        clip_model_gelu.eval()
        model_convnext.eval()
        clip_model_convnext.eval()

        return (
            model_gelu,
            clip_model_gelu,
            clip_preprocess_gelu,
            checkpoint_gelu["dataset_info"],
            model_convnext,
            clip_model_convnext,
            clip_preprocess_convnext,
            checkpoint_convnext["dataset_info"],
        )

    def predict_single_image(self, image_path, category):
        """Perform inference on a single image."""
        if not Path(image_path).exists():
            raise FileNotFoundError(f"Image {image_path} does not exist!")

        # Preprocess image
        image = Image.open(image_path).convert("RGB")
        image_gelu = self.clip_preprocess_gelu(image).unsqueeze(0).to(self.device)
        image_convnext = self.clip_preprocess_convnext(image).unsqueeze(0).to(self.device)

        # Extract CLIP features
        with torch.no_grad():
            clip_features_gelu = self.clip_model_gelu.encode_image(image_gelu).float()
            clip_features_convnext = self.clip_model_convnext.encode_image(image_convnext).float()

            # Predict attributes
            predictions_gelu = self.model_gelu(clip_features_gelu, category)
            predictions_convnext = self.model_convnext(clip_features_convnext, category)

        # Ensemble predictions
        ensemble_predictions = {}
        for key, pred_gelu in predictions_gelu.items():
            pred_convnext = predictions_convnext[key].to(self.device)
            ensemble_predictions[key] = 0.5 * pred_gelu + 0.5 * pred_convnext

        # Convert predictions to attributes
        predicted_attributes = {}
        for key, pred in ensemble_predictions.items():
            _, predicted_idx = torch.max(pred, 1)
            predicted_idx = predicted_idx.item()

            attr_name = key.split("_", 1)[1]
            attr_values = self.checkpoint_gelu["attribute_classes"][key]
            if predicted_idx < len(attr_values):
                predicted_attributes[attr_name] = attr_values[predicted_idx]

        return predicted_attributes

# Function to make predictions using the provided image and category
def predict_attributes(image, category):
    try:
        # Save the uploaded image temporarily for processing
        image_path = "temp_image.jpg"
        image.save(image_path)

        # Call the inference method
        predictions = inference.predict_single_image(image_path, category)
        # Format predictions as a markdown table
        markdown_output = "### Predicted Attributes\n\n| Attribute | Value |\n|-----------|-------|\n"
        for attr, value in predictions.items():
            markdown_output += f"| {attr} | {value} |\n"
        return markdown_output

    except Exception as e:
        # Return the error as markdown so it renders in the Markdown output component
        return f"**Error:** {e}"

# Define Gradio interface
def gradio_interface():
    # Define input components
    image_input = gr.Image(label="Upload an Image", type="pil")
    category_input = gr.Dropdown(
        label="Choose Category",
        choices=["Men Tshirts", "Women Tshirts", "Sarees", "Kurtis", "Women Tops & Tunics"],
    )
    # category_input = gr.Textbox(label="Enter Category", placeholder="e.g., shoes, clothes")

    # Define output
    output = gr.Markdown(label="Predicted Attributes")

    # Create Gradio interface
    interface = gr.Interface(
        fn=predict_attributes,
        inputs=[image_input, category_input],
        outputs=output,
        title="Attribute Prediction",
        description="Upload an image and specify its category to get the predicted attributes.",
        theme="default",
        flagging_mode="never",
    )

    return interface

# Launch the Gradio app
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_path_gelu = "vith14_gelu_highest_f1.pth"
    model_path_convnext = "Final_clip_convnext_xxlarge_laion3_4_train_032301.pth"

    inference = SingleImageInference(
        model_path_gelu=model_path_gelu,
        model_path_convnext=model_path_convnext,
        device=device,
    )

    gradio_interface().launch()
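
For quick local testing outside the Gradio UI, a minimal sketch of driving the same predictor directly is shown below. It assumes app.py, cat_attr_map.json, and the two checkpoint files above are present in the working directory; "example.jpg" is a placeholder path, not a file shipped with this Space.

# Minimal usage sketch (assumption: checkpoints and cat_attr_map.json are available locally;
# "example.jpg" is a stand-in for any real product image).
import torch
from app import SingleImageInference

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
predictor = SingleImageInference(
    model_path_gelu="vith14_gelu_highest_f1.pth",
    model_path_convnext="Final_clip_convnext_xxlarge_laion3_4_train_032301.pth",
    device=device,
)
# Category must match one of the keys used at training time, e.g. the dropdown choices above.
print(predictor.predict_single_image("example.jpg", "Sarees"))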