Mandour committed
Commit · a8207bc
Parent(s): 0e9d4a5

upload initial files

- .gitignore +12 -0
- README.md +14 -0
- app.py +531 -0
- models.py +290 -0
- requirements.txt +9 -0
.gitignore ADDED
@@ -0,0 +1,12 @@
# Ignore Python virtual environments
/gp-env/
/demo-env/
/hf_cache/

# Ignore environment variable files
.env
.env.example

# Ignore __pycache__
__pycache__/
*.pyc
README.md CHANGED
@@ -12,3 +12,17 @@ short_description: Attribute Value Extraction Demo
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+## Environment Variables
+
+Set the following environment variables in your Hugging Face Space (Settings → Secrets and environment variables):
+
+- `ROBERTA_TOKEN`: Hugging Face token with access to the Roberta model weights.
+- `MERGER_MODEL_TOKEN`: Hugging Face token with access to the Merger model weights.
+
+If running locally, you can create a `.env` file with these variables:
+
+```
+ROBERTA_TOKEN=your_hf_token_here
+MERGER_MODEL_TOKEN=your_hf_token_here
+```
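For context, this is how the code in this commit consumes those secrets: both `app.py` and `models.py` call `load_dotenv()` only when a `.env` file exists, read the tokens with `os.getenv`, and pass them to `hf_hub_download` when fetching the gated checkpoints. A minimal sketch of that pattern (repo and file names are the ones used in `models.py`):

```python
# Sketch of the token-loading pattern used by app.py / models.py.
import os

# On Spaces the secrets are injected as environment variables;
# a .env file is only loaded for local development.
if os.path.exists(".env"):
    from dotenv import load_dotenv
    load_dotenv()

from huggingface_hub import hf_hub_download

roberta_token = os.getenv("ROBERTA_TOKEN")
merger_token = os.getenv("MERGER_MODEL_TOKEN")

# Same call models.py makes for the fine-tuned RoBERTa weights.
roberta_weights = hf_hub_download(
    repo_id="LomaaZakaria/Roberta_Attribute_Value_Extraction_Model",
    filename="RobertaCRFWithNOAnswerClassifier_OnFashionGenData_2epochs.pth",
    token=roberta_token,
    cache_dir="./hf_cache",
)

# And the merger checkpoint, as in load_merger_model().
merger_weights = hf_hub_download(
    repo_id="MohamedMosilhy/AttentionMergerModality",
    filename="Freezing_More_NewViTBlipAttentionMergerModality_4epochs_2e_5_withwarmup.pth",
    token=merger_token,
    cache_dir="./hf_cache",
)
```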
app.py ADDED
@@ -0,0 +1,531 @@
import gradio as gr
import pandas as pd
import json
import time
from typing import Tuple
from PIL import Image
import torch
import numpy as np
from torchvision import transforms
import os
from models import (
    get_device, get_tokenizers, get_image_processor,
    load_merger_model, get_predicated_values
)

# Load environment variables (optional for local dev; Spaces use web UI for env vars)
if os.path.exists('.env'):
    from dotenv import load_dotenv
    load_dotenv()

# Global constants
ATTRIBUTES_LIST = ['sleeve', 'color', 'type', 'pattern',
                   'material', 'style', 'neck', 'gender', 'brand']
MAX_SEQ_LENGTH = 256
DECODER_MAX_SEQ_LENGTH = 64

# Global variables for model components
MODEL_COMPONENTS = None
MODEL_LOADED = False


def initialize_model_and_tokenizers():
    """Initialize model and tokenizers once"""
    global MODEL_COMPONENTS, MODEL_LOADED

    if MODEL_LOADED and MODEL_COMPONENTS:
        return MODEL_COMPONENTS

    try:
        print("🔄 Loading AI model components...")
        device = get_device()
        bert_tokenizer, roberta_tokenizer = get_tokenizers()
        image_processor = get_image_processor()
        model = load_merger_model(bert_tokenizer, device)

        MODEL_COMPONENTS = {
            'model': model,
            'bert_tokenizer': bert_tokenizer,
            'roberta_tokenizer': roberta_tokenizer,
            'image_processor': image_processor,
            'device': device
        }
        MODEL_LOADED = True
        print("✅ Model loaded successfully!")
        return MODEL_COMPONENTS
    except Exception as e:
        print(f"❌ Failed to load model: {str(e)}")
        raise e


def validate_inputs(image, text_input: str, category: str) -> Tuple[bool, str]:
    """Validate that all inputs are provided"""
    if image is None:
        return False, "❌ Please upload an image file"

    if not text_input or text_input.strip() == "":
        return False, "❌ Please provide a product description"

    if not category:
        return False, "❌ Please select a product category"

    return True, "✅ Inputs validated successfully"


def resize_image_for_display(image: Image.Image, target_size=(512, 512)) -> Image.Image:
    """Resize image for consistent display"""
    if image.mode != 'RGBA':
        image = image.convert('RGBA')

    # Compute new size preserving aspect ratio
    orig_w, orig_h = image.size
    max_w, max_h = target_size

    # Determine scale factor
    scale = min(max_w / orig_w, max_h / orig_h)
    new_w = int(orig_w * scale)
    new_h = int(orig_h * scale)

    # Resize with high-quality resampling
    resized = image.resize((new_w, new_h), Image.Resampling.LANCZOS)
    return resized


def preprocess_image(image: Image.Image) -> torch.Tensor:
    """Preprocess image for model input"""
    if image.mode != 'RGBA':
        image = image.convert('RGBA')

    # Apply transformations
    image_tensor = torch.tensor(np.array(image)).permute(2, 0, 1)
    image_tensor = image_tensor.unsqueeze(0)
    return image_tensor


def run_inference(image_tensor: torch.Tensor, description: str, category: str, model_components: dict) -> dict:
    """Run model inference using get_predicated_values API"""
    model = model_components['model']
    bert_tokenizer = model_components['bert_tokenizer']
    roberta_tokenizer = model_components['roberta_tokenizer']
    image_processor = model_components['image_processor']
    device = model_components['device']

    # Convert tensor to PIL Image for processor
    pil_img = transforms.ToPILImage()(image_tensor.squeeze(0).cpu())
    start_time = time.time()

    results = get_predicated_values(
        model, category, pil_img, description,
        image_processor, bert_tokenizer, roberta_tokenizer, device
    )

    end_time = time.time()

    # Format for UI
    total_attributes = len([a for a in results if a["value"] and a["value"] != "N/A"])
    avg_confidence = np.mean([a["confidence"] for a in results if a["value"]
                              and a["value"] != "N/A"]) if total_attributes > 0 else 0

    return {
        "attributes": results,
        "total_attributes": total_attributes,
        "avg_confidence": avg_confidence,
        "processing_time": end_time - start_time
    }


def get_confidence_color(confidence: float) -> str:
    """Get color based on confidence level"""
    if confidence >= 0.8:
        return "#28a745"  # Green
    elif confidence >= 0.6:
        return "#ffc107"  # Yellow
    else:
        return "#dc3545"  # Red


def format_results_html(results: dict) -> str:
    """Format results as HTML for display"""
    if not results or results["total_attributes"] == 0:
        return """
        <div style="padding: 20px; text-align: center; background-color: #fff3cd; border-radius: 10px; border: 1px solid #ffeaa7;">
            <h3 style="color: #856404; margin: 0;">🔍 No attributes extracted</h3>
            <p style="color: #856404; margin: 10px 0 0 0;">Try with a different image or more detailed description.</p>
        </div>
        """

    html = """
    <div style="padding: 20px;">
        <h3 style="color: #333; margin-bottom: 20px; font-size: 1.5em;">📊 Extracted Attributes</h3>
    """

    for attr in results["attributes"]:
        if attr["value"] != "N/A":
            confidence = attr["confidence"]
            color = get_confidence_color(confidence)

            html += f"""
            <div style="
                background: white;
                padding: 15px;
                margin-bottom: 10px;
                border-radius: 10px;
                box-shadow: 0 2px 10px rgba(0,0,0,0.1);
                border-left: 4px solid #667eea;
                display: flex;
                justify-content: space-between;
                align-items: center;
            ">
                <div>
                    <strong style="color: #333; font-size: 1.1em;">{attr["name"].title()}</strong>
                    <span style="color: #666; margin-left: 10px;">{attr["value"]}</span>
                </div>
                <div style="
                    background-color: {color};
                    color: white;
                    padding: 4px 8px;
                    border-radius: 12px;
                    font-size: 0.8em;
                    font-weight: bold;
                ">
                    {confidence:.1%}
                </div>
            </div>
            """

    # Add summary statistics
    html += f"""
        <div style="
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 15px;
            border-radius: 10px;
            margin-top: 20px;
            text-align: center;
        ">
            <h4 style="margin: 0;">📈 Summary</h4>
            <p style="margin: 10px 0 0 0;">
                <strong>{results["total_attributes"]}</strong> attributes extracted |
                <strong>{results["avg_confidence"]:.1%}</strong> avg confidence |
                <strong>{results["processing_time"]:.2f}s</strong> processing time
            </p>
        </div>
    </div>
    """

    return html


def create_download_files(results: dict) -> Tuple[str, str]:
    """Create JSON and CSV files for download"""
    if not results:
        return None, None

    # JSON file
    json_content = json.dumps(results, indent=2)
    json_file = "attributes.json"
    with open(json_file, "w") as f:
        f.write(json_content)

    # CSV file
    df = pd.DataFrame(results["attributes"])
    csv_file = "attributes.csv"
    df.to_csv(csv_file, index=False)

    return json_file, csv_file


def process_inputs(image, category, description, progress=gr.Progress()):
    """Main processing function"""
    global MODEL_COMPONENTS

    # Initialize model if needed
    if not MODEL_LOADED:
        progress(0.1, desc="Loading AI model...")
        try:
            MODEL_COMPONENTS = initialize_model_and_tokenizers()
        except Exception as e:
            error_msg = f"❌ Failed to load model: {str(e)}"
            return None, error_msg, None, None, None

    # Validate inputs
    is_valid, validation_message = validate_inputs(image, description, category)
    if not is_valid:
        return None, validation_message, None, None, None

    try:
        # Step 1: Image preprocessing
        progress(0.3, desc="📸 Preprocessing image...")
        resized_image = resize_image_for_display(image, (512, 512))
        image_tensor = preprocess_image(resized_image)

        # Step 2: Model inference
        progress(0.7, desc="🧠 Running AI inference...")
        results = run_inference(image_tensor, description, category, MODEL_COMPONENTS)

        # Step 3: Format results
        progress(0.9, desc="📊 Formatting results...")
        results_html = format_results_html(results)

        # Create download files
        json_file, csv_file = create_download_files(results)

        progress(1.0, desc="✅ Processing complete!")

        success_msg = f"🎉 Successfully extracted {results['total_attributes']} attributes!"
        return resized_image, success_msg, results_html, json_file, csv_file

    except Exception as e:
        error_msg = f"❌ Processing failed: {str(e)}"
        return None, error_msg, None, None, None


# Custom CSS for styling
custom_css = """
/* Global styling */
.gradio-container {
    max-width: 1200px !important;
    margin: auto !important;
}

/* Header styling */
.header-text {
    text-align: center;
    color: #333;
    margin-bottom: 30px;
}

/* Input section styling */
.input-section {
    background: #f8f9fa;
    padding: 20px;
    border-radius: 15px;
    margin-bottom: 20px;
}

/* Button styling */
.primary-button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    border: none !important;
    color: white !important;
    font-weight: bold !important;
    padding: 12px 24px !important;
    border-radius: 25px !important;
    font-size: 16px !important;
}

/* Results section styling */
.results-section {
    background: white;
    padding: 20px;
    border-radius: 15px;
    box-shadow: 0 4px 15px rgba(0,0,0,0.1);
}

/* Status message styling */
.status-positive {
    color: #28a745;
    font-weight: bold;
    padding: 10px;
    background-color: #d4edda;
    border-radius: 8px;
    border: 1px solid #c3e6cb;
}

.status-negative {
    color: #721c24;
    font-weight: bold;
    padding: 10px;
    background-color: #f8d7da;
    border-radius: 8px;
    border: 1px solid #f5c6cb;
}

/* Info box styling */
.info-box {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 20px;
    border-radius: 15px;
    margin: 20px 0;
}

/* Tips styling */
.tips-section {
    background: #e9ecef;
    padding: 15px;
    border-radius: 10px;
    margin-top: 20px;
}
"""


# Create Gradio interface
def create_interface():
    """Create the main Gradio interface"""

    with gr.Blocks(css=custom_css, title="AI Attribute Extractor", theme=gr.themes.Soft()) as demo:

        # Header
        gr.HTML("""
        <div class="header-text">
            <h1>🔍 AI Attribute Extractor</h1>
            <p style="font-size: 1.1em; color: #666;">Upload an image and provide text to extract detailed attributes using AI</p>
        </div>
        """)

        with gr.Row():
            # Left column - Input section
            with gr.Column(scale=1):
                gr.HTML("<h2>📤 Input Section</h2>")

                # Image upload
                image_input = gr.Image(
                    label="Upload Product Image",
                    type="pil",
                    height=300,
                    elem_classes=["input-section"]
                )

                # Category selection
                category_input = gr.Dropdown(
                    choices=["clothing", "bags", "shoes", "accessories"],
                    label="Product Category",
                    value="clothing",
                    elem_classes=["input-section"]
                )

                # Text description
                text_input = gr.Textbox(
                    label="Product Description",
                    placeholder="Describe the product in detail...",
                    lines=4,
                    elem_classes=["input-section"]
                )

                # Process button
                process_btn = gr.Button(
                    "🚀 Extract Attributes",
                    variant="primary",
                    size="lg",
                    elem_classes=["primary-button"]
                )

                # Status message
                status_msg = gr.HTML(label="Status")

            # Right column - Results section
            with gr.Column(scale=1):
                gr.HTML("<h2>📊 Results Section</h2>")

                # Processed image display
                processed_image = gr.Image(
                    label="Processed Image",
                    height=300,
                    elem_classes=["results-section"]
                )

                # Results display
                results_html = gr.HTML(
                    label="Extracted Attributes",
                    elem_classes=["results-section"]
                )

                # Download buttons
                with gr.Row():
                    json_download = gr.File(
                        label="📄 Download JSON",
                        visible=False
                    )
                    csv_download = gr.File(
                        label="📊 Download CSV",
                        visible=False
                    )

        # Info section
        with gr.Row():
            with gr.Column():
                gr.HTML("""
                <div class="info-box">
                    <h3>ℹ️ About This Tool</h3>
                    <p>This AI-powered tool extracts product attributes from images and text descriptions using:</p>
                    <ul>
                        <li><strong>🖼️ Vision Transformer (DeiT)</strong> for image analysis</li>
                        <li><strong>🔤 BERT & RoBERTa</strong> for text understanding</li>
                        <li><strong>🧠 Hierarchical Fusion</strong> for multimodal learning</li>
                        <li><strong>⚡ LoRA/DoRA</strong> for efficient fine-tuning</li>
                    </ul>
                </div>
                """)

            with gr.Column():
                gr.HTML(f"""
                <div class="tips-section">
                    <h3>🎯 Tips for Better Results</h3>
                    <ul>
                        <li>Use clear, well-lit images</li>
                        <li>Provide detailed descriptions</li>
                        <li>Include specific product details</li>
                        <li>Avoid blurry or low-quality images</li>
                    </ul>
                    <h4>Supported Attributes:</h4>
                    <p>{', '.join([attr.title() for attr in ATTRIBUTES_LIST])}</p>
                </div>
                """)

        # Event handlers
        def update_status(message: str, is_error: bool = False):
            """Update status message with styling"""
            class_name = "status-negative" if is_error else "status-positive"
            return f'<div class="{class_name}">{message}</div>'

        def process_and_update(image, category, description):
            """Process inputs and update all outputs"""
            processed_img, status, results, json_file, csv_file = process_inputs(
                image, category, description
            )

            # Update status with styling
            is_error = status.startswith("❌")
            styled_status = update_status(status, is_error)

            # Show download buttons if successful
            json_visible = json_file is not None
            csv_visible = csv_file is not None

            return (
                processed_img,
                styled_status,
                results,
                gr.update(value=json_file, visible=json_visible),
                gr.update(value=csv_file, visible=csv_visible)
            )

        # Connect the process button
        process_btn.click(
            fn=process_and_update,
            inputs=[image_input, category_input, text_input],
            outputs=[processed_image, status_msg, results_html, json_download, csv_download]
        )

        # Example inputs
        gr.Examples(
            examples=[
                [
                    "https://example.com/sample_image.jpg",  # You can replace with actual sample images
                    "clothing",
                    "A stylish red cotton t-shirt with short sleeves and a round neck, perfect for casual wear."
                ]
            ],
            inputs=[image_input, category_input, text_input],
            label="Try these examples"
        )

    return demo


# Launch the app
if __name__ == "__main__":
    # Initialize model on startup
    print("Initializing AI Attribute Extractor...")

    # Create and launch the interface
    demo = create_interface()

    # Launch configuration
    demo.launch(
        server_name="0.0.0.0",  # For Hugging Face Spaces
        server_port=7860,       # Default port for Hugging Face Spaces
        share=False,            # Set to True for public sharing
        debug=False,            # Set to True for development
        show_error=True,        # Show error messages
        quiet=False             # Set to True to reduce logging
    )
models.py ADDED
@@ -0,0 +1,290 @@
from transformers import (AutoProcessor,
                          RobertaConfig,
                          BertTokenizerFast,
                          RobertaTokenizerFast,
                          RobertaModel,
                          BlipForQuestionAnswering)
from huggingface_hub import hf_hub_download
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os

# Load environment variables (optional for local dev; Spaces use web UI for env vars)
if os.path.exists('.env'):
    from dotenv import load_dotenv
    load_dotenv()

ATTRIBUTES_LIST = ['sleeve', 'type', 'pattern', 'material',
                   'neck', 'color', 'style', 'brand', 'gender']

HF_CACHE_DIR = "./hf_cache"


def get_device():
    return "cuda" if torch.cuda.is_available() else "cpu"


def get_tokenizers():
    bert_tokenizer = BertTokenizerFast.from_pretrained(
        "google-bert/bert-base-uncased", cache_dir=HF_CACHE_DIR)
    roberta_tokenizer = RobertaTokenizerFast.from_pretrained(
        "FacebookAI/roberta-base", cache_dir=HF_CACHE_DIR)
    bert_tokenizer.add_special_tokens({'bos_token': '[DEC]'})
    return bert_tokenizer, roberta_tokenizer


def get_image_processor():
    return AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base", cache_dir=HF_CACHE_DIR)


class AttentionModalityMerger(nn.Module):
    def __init__(self, text_dim, image_dim):
        super().__init__()
        self.text_layer_norm = nn.LayerNorm(text_dim)
        self.image_layer_norm = nn.LayerNorm(image_dim)
        self.linear = nn.Linear(
            in_features=image_dim + text_dim, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text_embedds, image_features, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(
            -1).expand(text_embedds.size()).float()
        text_embedds = input_mask_expanded * text_embedds
        text_embedds = text_embedds.sum(dim=1)
        text_embedds_norm = self.text_layer_norm(text_embedds)
        image_features = image_features.sum(dim=1)
        image_features_norm = self.image_layer_norm(image_features)
        text_image_embedds = torch.cat(
            [text_embedds_norm, image_features_norm], axis=-1)
        gate_output = self.linear(text_image_embedds)
        p_txt = self.sigmoid(gate_output)
        p_img = 1 - p_txt
        scaled_text = p_txt * text_embedds_norm
        scaled_image = p_img * image_features_norm
        final_output = torch.cat([scaled_text, scaled_image], dim=-1)
        return final_output, p_txt, p_img


class RobertaTokenClassificationWithCRF(nn.Module):
    def __init__(self, vocab_size, device, roberta_token=None):
        if roberta_token is None:
            roberta_token = os.getenv("ROBERTA_TOKEN")
        super().__init__()
        self.vocab_size = vocab_size
        self.config = RobertaConfig()
        self.roberta = RobertaModel.from_pretrained(
            "FacebookAI/roberta-base", output_hidden_states=True, cache_dir=HF_CACHE_DIR)
        self.freeze_layers()
        self._loadTextWeights(device, roberta_token)

    def _loadTextWeights(self, device, roberta_token):
        repo_id = "LomaaZakaria/Roberta_Attribute_Value_Extraction_Model"
        weights_file_name = "RobertaCRFWithNOAnswerClassifier_OnFashionGenData_2epochs.pth"
        weights_file_path = hf_hub_download(
            repo_id=repo_id, filename=weights_file_name, token=roberta_token, cache_dir=HF_CACHE_DIR)
        state_dict = torch.load(
            weights_file_path, weights_only=True, map_location=device)
        text_model_state_dict = self.roberta.state_dict()
        filtered_state_dict = {
            k: v for k, v in state_dict.items()
            if k in text_model_state_dict and v.shape == text_model_state_dict[k].shape
        }
        self.roberta.load_state_dict(filtered_state_dict, strict=False)

    def freeze_layers(self):
        self.roberta.embeddings.requires_grad_(False)
        for layers in self.roberta.encoder.layer[:8]:
            for p in layers.parameters():
                p.requires_grad = False

    def forward(self, token_ids, attention_mask):
        outputs = self.roberta(input_ids=token_ids,
                               attention_mask=attention_mask)
        last_hidden_state = outputs.hidden_states[-1]
        return last_hidden_state


class ImageModel(nn.Module):
    def __init__(self):
        super(ImageModel, self).__init__()
        self.vision_model = BlipForQuestionAnswering.from_pretrained(
            "Salesforce/blip-vqa-base", cache_dir=HF_CACHE_DIR).vision_model
        self._freezeLayers()

    def _freezeLayers(self):
        self.vision_model.embeddings.requires_grad_(False)
        for layer in self.vision_model.encoder.layers[:8]:
            for p in layer.parameters():
                p.requires_grad = False

    def forward(self, x):
        return self.vision_model(x).last_hidden_state


class MergerModel(nn.Module):
    def __init__(self, vocab_size, device, roberta_token=None):
        if roberta_token is None:
            roberta_token = os.getenv("ROBERTA_TOKEN")
        super().__init__()
        self.text_decoder = BlipForQuestionAnswering.from_pretrained(
            "Salesforce/blip-vqa-base", cache_dir=HF_CACHE_DIR).text_decoder
        self.text_encoder = RobertaTokenClassificationWithCRF(
            vocab_size, device, roberta_token)
        self.vision_model = ImageModel()
        text_dim, image_dim = self.text_encoder.config.hidden_size, 768
        self.attention_merger = AttentionModalityMerger(text_dim, image_dim)
        self.linear = nn.Linear(in_features=text_dim +
                                image_dim, out_features=text_dim)

    def forward(self, **inputs):
        text_encoder = self.text_encoder(
            token_ids=inputs['encoder_token_ids'], attention_mask=inputs['encoder_attention_mask'])
        vision_encoder = self.vision_model(x=inputs['image'])
        merger_output, p_txt, p_img = self.attention_merger(
            text_encoder, vision_encoder, attention_mask=inputs['encoder_attention_mask'])
        merger_output = merger_output.unsqueeze(1)
        batch_size = vision_encoder.shape[0]
        merger_output_mask = torch.ones(
            (batch_size, 1), dtype=torch.long, device=vision_encoder.device)
        merger_output_linear = self.linear(merger_output)
        decoder_output = self.text_decoder(
            input_ids=inputs['decoder_input_token_ids'],
            attention_mask=inputs['decoder_input_attention_mask'],
            encoder_hidden_states=merger_output_linear,
            encoder_attention_mask=merger_output_mask,
            return_dict=True,
            return_logits=True
        )
        logits = decoder_output
        return logits, p_txt, p_img


def load_merger_model(bert_tokenizer, device, model_token=None):
    if model_token is None:
        model_token = os.getenv("MERGER_MODEL_TOKEN")
    vocab_size = len(bert_tokenizer)
    model = MergerModel(vocab_size, device)
    repo_id = "MohamedMosilhy/AttentionMergerModality"
    weights_file_name = "Freezing_More_NewViTBlipAttentionMergerModality_4epochs_2e_5_withwarmup.pth"
    weights_file_path = hf_hub_download(
        repo_id=repo_id, filename=weights_file_name, token=model_token, cache_dir=HF_CACHE_DIR)
    model.load_state_dict(torch.load(
        weights_file_path, weights_only=True, map_location=device))
    model.to(device)
    model.eval()
    return model


def model_generate(model, data, text_tokenizer, device, labels=None, max_generated_length=50, testing=False, return_confidence=False):
    if labels is None:
        labels = '[DEC]'
        token_labels = text_tokenizer.convert_tokens_to_ids([labels])
    else:
        token_labels = text_tokenizer.convert_tokens_to_ids([labels])
    model.eval()
    confidences = []
    for index in range(max_generated_length):
        decoder_inputs = text_tokenizer(
            text=labels, max_length=65, padding='max_length', add_special_tokens=False, return_tensors="pt")
        decoder_data = {
            "decoder_input_token_ids": decoder_inputs['input_ids'],
            "decoder_input_attention_mask": decoder_inputs['attention_mask']
        }
        inputs = {
            "image": data['image'].unsqueeze(0).to(device),
            "encoder_token_ids": data['encoder_token_ids'].unsqueeze(0).to(device),
            "encoder_attention_mask": data['encoder_attention_mask'].unsqueeze(0).to(device),
            "decoder_input_token_ids": decoder_data['decoder_input_token_ids'].to(device),
            "decoder_input_attention_mask": decoder_data['decoder_input_attention_mask'].to(device)
        }
        with torch.no_grad():
            logits, _, _ = model(**inputs)
        probs = F.softmax(logits, dim=-1)
        predicated_label = torch.argmax(
            probs[:, index, :], dim=-1).cpu().numpy()
        # Get confidence for this token
        confidence = float(
            probs[0, index, predicated_label[0]].cpu().item())
        confidences.append(confidence)
        token_labels.append(predicated_label[0])
        predicted_tokens = text_tokenizer.convert_ids_to_tokens(
            predicated_label)
        labels = text_tokenizer.decode(token_labels)
        if predicted_tokens[0] == text_tokenizer.sep_token:
            break
    predicated_attribute_value = text_tokenizer.decode(token_labels)
    if testing:
        token_labels = np.array(token_labels)
        dec_token_id = text_tokenizer.bos_token_id
        token_labels = token_labels[token_labels != dec_token_id]
        return token_labels
    if return_confidence:
        # Use the minimum confidence across the generated tokens as the attribute confidence
        return predicated_attribute_value, min(confidences) if confidences else 0.0
    return predicated_attribute_value


# Define which attributes are relevant for each category
CATEGORY_ATTRIBUTES = {
    "clothing": ['sleeve', 'type', 'pattern', 'material', 'neck', 'color', 'style', 'brand', 'gender'],
    "bags": ['type', 'pattern', 'material', 'color', 'style', 'brand', 'gender'],
    "shoes": ['type', 'pattern', 'material', 'color', 'style', 'brand', 'gender'],
    "accessories": ['type', 'pattern', 'material', 'color', 'style', 'brand', 'gender'],
}


def get_predicated_values(
    model, category, img, desc, image_processor, bert_tokenizer, roberta_tokenizer, device, max_seq_length=256
):
    results = []

    def _combined_with_CategoriesAttributes(desc, category, attribute):
        return category + ' ' + attribute

    def imageProcesser(img):
        return image_processor(img)

    def _tokenizeText(image, desc, category, attribute):
        combined_desc = _combined_with_CategoriesAttributes(
            desc, category, attribute)
        image_inputs = imageProcesser(image)
        text_encoder_inputs = roberta_tokenizer(
            combined_desc,
            desc,
            max_length=max_seq_length,
            padding='max_length',
            return_tensors='np'
        )
        return image_inputs, text_encoder_inputs

    # Normalize category to lower-case and pick attributes
    category_key = str(category).strip().lower()
    attributes = CATEGORY_ATTRIBUTES.get(category_key, CATEGORY_ATTRIBUTES["clothing"])

    image = img
    for attribute in attributes:
        image_inputs, text_encoder_inputs = _tokenizeText(
            image, desc, category, attribute)
        image_data = torch.from_numpy(np.array(image_inputs['pixel_values']))
        encoder_token_ids = torch.from_numpy(
            np.array(text_encoder_inputs['input_ids']))
        encoder_attn_mask = torch.from_numpy(
            np.array(text_encoder_inputs['attention_mask']))
        inputs = {
            "image": image_data.squeeze(0),
            "encoder_token_ids": encoder_token_ids.squeeze(0),
            "encoder_attention_mask": encoder_attn_mask.squeeze(0),
        }

        predicated_value, confidence = model_generate(
            model, inputs, text_tokenizer=bert_tokenizer, device=device, return_confidence=True
        )
        # Remove [DEC] and [SEP] tokens and strip whitespace
        clean_value = predicated_value.replace('[DEC]', '').replace('[SEP]', '').strip()
        if clean_value != 'not specified':
            results.append(
                {"name": attribute, "value": clean_value,
                 "confidence": float(confidence)}
            )
    return results
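For orientation, the helpers in `models.py` are meant to be called in the same order `app.py` uses them. A minimal usage sketch, assuming the two HF tokens are set in the environment and using a placeholder image path:

```python
# Usage sketch for the models.py API (mirrors the flow in app.py).
from PIL import Image
from models import (get_device, get_tokenizers, get_image_processor,
                    load_merger_model, get_predicated_values)

device = get_device()                                 # "cuda" if available, else "cpu"
bert_tokenizer, roberta_tokenizer = get_tokenizers()  # decoder vocab (BERT) / encoder vocab (RoBERTa)
image_processor = get_image_processor()               # BLIP image processor

# Downloads the gated checkpoints; requires ROBERTA_TOKEN and MERGER_MODEL_TOKEN.
model = load_merger_model(bert_tokenizer, device)

img = Image.open("example_product.jpg")               # placeholder path
description = "A stylish red cotton t-shirt with short sleeves and a round neck."
results = get_predicated_values(
    model, "clothing", img, description,
    image_processor, bert_tokenizer, roberta_tokenizer, device,
)
# Each item has the shape {"name": <attribute>, "value": <predicted value>, "confidence": <float>}.
print(results)
```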
requirements.txt ADDED
@@ -0,0 +1,9 @@
gradio>=4.0.0
torch>=1.9.0
torchvision>=0.10.0
transformers>=4.20.0
huggingface-hub
python-dotenv
Pillow>=9.0.0
pandas>=1.3.0
numpy>=1.21.0