Leacb4 commited on
Commit
398de18
·
verified ·
1 Parent(s): 29ff4f9

Upload color_model.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. color_model.py +240 -317
color_model.py CHANGED
@@ -1,272 +1,31 @@
 
1
  import os
2
- import time
3
  import json
4
  import torch
5
  from torch.utils.data import Dataset, DataLoader
6
  from torchvision import transforms, models
7
  from PIL import Image
8
- import requests
9
- from io import BytesIO
10
  import torch.nn as nn
11
  import torch.nn.functional as F
12
  import pandas as pd
13
- from tqdm.auto import tqdm
14
-
15
- import asyncio
16
- import aiohttp
17
- import pandas as pd
18
- import os
19
- from pathlib import Path
20
- from tqdm.asyncio import tqdm
21
- import ssl
22
  import logging
23
- from typing import Optional, List, Tuple
24
- from urllib.parse import urlparse
25
- import hashlib
26
- from config import local_dataset_path
27
 
28
  # Configure logging
29
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
30
  logger = logging.getLogger(__name__)
31
-
32
class ImageDownloader:
    """Asynchronous image downloader with retry logic, SSL control, and stats.

    Reads image URLs from a DataFrame's ``image`` column and saves them into
    ``output_dir``, skipping files that already exist on disk.
    """

    def __init__(self,
                 output_dir: str = "athleta_images",
                 max_concurrent: int = 10,
                 timeout: int = 30,
                 retry_attempts: int = 3,
                 verify_ssl: bool = True):
        """
        Initialize the ImageDownloader.

        Args:
            output_dir: Directory to save downloaded images
            max_concurrent: Maximum number of concurrent downloads
            timeout: Request timeout in seconds
            retry_attempts: Number of retry attempts for failed downloads
            verify_ssl: Whether to verify SSL certificates
        """
        self.output_dir = Path(output_dir)
        self.max_concurrent = max_concurrent
        self.timeout = aiohttp.ClientTimeout(total=timeout)
        self.retry_attempts = retry_attempts
        self.verify_ssl = verify_ssl

        # Create output directory up front so workers can write immediately.
        self.output_dir.mkdir(exist_ok=True)

        # Aggregate counters reported by print_statistics().
        self.stats = {
            'total': 0,
            'downloaded': 0,
            'skipped': 0,
            'failed': 0,
            'retries': 0
        }

    def _create_ssl_context(self) -> Optional[ssl.SSLContext]:
        """Return a permissive SSL context when verification is disabled, else None."""
        if not self.verify_ssl:
            ssl_context = ssl.create_default_context()
            ssl_context.check_hostname = False
            ssl_context.verify_mode = ssl.CERT_NONE
            return ssl_context
        return None

    def _generate_filename(self, url: str, index: int) -> str:
        """Generate a safe filename from the URL path, a URL hash, or the row index."""
        try:
            # Prefer the real filename embedded in the URL path.
            parsed_url = urlparse(url)
            filename = os.path.basename(parsed_url.path)
            if filename and '.' in filename:
                return filename
        except Exception:
            pass

        # Fallback: a short URL hash keeps names stable across runs.
        try:
            url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
            return f"image_{url_hash}.jpg"
        except Exception:
            return f"image_{index}.jpg"

    async def _download_single_image(self,
                                     session: aiohttp.ClientSession,
                                     url: str,
                                     save_path: Path,
                                     index: int) -> bool:
        """
        Download a single image with retry logic.

        Returns:
            bool: True if successful, False otherwise
        """
        # Computed once: the per-request SSL override does not change per attempt.
        ssl_context = self._create_ssl_context()

        for attempt in range(self.retry_attempts):
            try:
                if attempt > 0:
                    self.stats['retries'] += 1
                    logger.info(f"Retry {attempt}/{self.retry_attempts} for {url}")

                # BUGFIX: ClientSession.get() does not accept a `connector`
                # keyword (connectors are bound to the session itself), so the
                # original `session.get(url, ssl=..., connector=...)` raised
                # TypeError. Only the per-request `ssl` override is valid here.
                async with session.get(url, ssl=ssl_context) as response:
                    if response.status == 200:
                        content = await response.read()

                        # Validate that it's actually an image
                        if len(content) < 1024:  # Too small to be a real image
                            logger.warning(f"Image too small, skipping: {url}")
                            return False

                        # Ensure directory exists
                        save_path.parent.mkdir(parents=True, exist_ok=True)

                        # Write file
                        with open(save_path, 'wb') as f:
                            f.write(content)

                        logger.debug(f"Successfully downloaded: {save_path}")
                        return True

                    elif response.status == 404:
                        # Permanent failure: retrying a 404 will not help.
                        logger.warning(f"Image not found (404): {url}")
                        return False

                    else:
                        logger.warning(f"HTTP {response.status} for {url}")
                        if attempt == self.retry_attempts - 1:
                            return False

            except asyncio.TimeoutError:
                logger.warning(f"Timeout downloading {url} (attempt {attempt + 1})")
                if attempt == self.retry_attempts - 1:
                    return False

            except Exception as e:
                logger.error(f"Error downloading {url}: {str(e)}")
                if attempt == self.retry_attempts - 1:
                    return False

        return False

    async def _download_batch(self,
                              session: aiohttp.ClientSession,
                              batch: List[Tuple[str, Path, int]]) -> None:
        """Download a batch of images concurrently, bounded by max_concurrent."""
        semaphore = asyncio.Semaphore(self.max_concurrent)

        async def download_with_semaphore(url, save_path, index):
            async with semaphore:
                if save_path.exists():
                    logger.debug(f"File already exists, skipping: {save_path}")
                    self.stats['skipped'] += 1
                    return

                success = await self._download_single_image(session, url, save_path, index)
                if success:
                    self.stats['downloaded'] += 1
                else:
                    self.stats['failed'] += 1

        tasks = [download_with_semaphore(url, save_path, index)
                 for url, save_path, index in batch]
        # return_exceptions=True keeps one failed task from cancelling the rest.
        await asyncio.gather(*tasks, return_exceptions=True)

    def _prepare_download_tasks(self, df: pd.DataFrame) -> List[Tuple[str, Path, int]]:
        """Build (url, save_path, index) tuples for every valid URL in df['image']."""
        tasks = []

        for index, row in df.iterrows():
            # Check if image URL is valid
            if pd.isna(row.get('image')) or not isinstance(row.get('image'), str):
                logger.debug(f"Skipping row {index}: invalid image URL")
                continue

            url = row['image'].strip()
            if not url or not url.startswith(('http://', 'https://')):
                logger.debug(f"Skipping row {index}: invalid URL format")
                continue

            # Generate filename
            filename = self._generate_filename(url, index)
            save_path = self.output_dir / filename

            tasks.append((url, save_path, index))

        return tasks

    async def download_all_images(self, df: pd.DataFrame) -> None:
        """Download all images from the DataFrame, batching to limit server load."""
        logger.info("Preparing download tasks...")
        tasks = self._prepare_download_tasks(df)
        self.stats['total'] = len(tasks)

        if not tasks:
            logger.warning("No valid image URLs found in the dataset")
            return

        logger.info(f"Found {len(tasks)} valid image URLs to download")

        # Session-level connector: the correct place to install a custom
        # SSL context for every request in this session.
        ssl_context = self._create_ssl_context()
        connector = aiohttp.TCPConnector(ssl=ssl_context) if ssl_context else None

        async with aiohttp.ClientSession(
            timeout=self.timeout,
            connector=connector,
            headers={'User-Agent': 'Mozilla/5.0 (compatible; ImageDownloader/1.0)'}
        ) as session:

            # Process in batches to avoid overwhelming the server
            batch_size = self.max_concurrent * 2
            for i in range(0, len(tasks), batch_size):
                batch = tasks[i:i + batch_size]
                logger.info(f"Processing batch {i//batch_size + 1}/{(len(tasks)-1)//batch_size + 1}")

                await self._download_batch(session, batch)

                # Small delay between batches to be respectful
                if i + batch_size < len(tasks):
                    await asyncio.sleep(1)

    def print_statistics(self) -> None:
        """Print download statistics."""
        logger.info("Download Statistics:")
        logger.info(f"  Total URLs processed: {self.stats['total']}")
        logger.info(f"  Successfully downloaded: {self.stats['downloaded']}")
        logger.info(f"  Skipped (already exists): {self.stats['skipped']}")
        logger.info(f"  Failed: {self.stats['failed']}")
        logger.info(f"  Retry attempts: {self.stats['retries']}")

        if self.stats['total'] > 0:
            success_rate = (self.stats['downloaded'] / self.stats['total']) * 100
            logger.info(f"  Success rate: {success_rate:.1f}%")
249
-
250
- import os
251
- import time
252
- import json
253
- import torch
254
- from torch.utils.data import Dataset, DataLoader
255
- from torchvision import transforms, models
256
- from PIL import Image
257
- import requests
258
- from io import BytesIO
259
- import torch.nn as nn
260
- import torch.nn.functional as F
261
- import pandas as pd
262
- from tqdm.auto import tqdm
263
-
264
  class ColorDataset(Dataset):
265
  def __init__(self, dataframe, tokenizer, transform=None):
266
  """
267
- dataframe : pd.DataFrame avec colonnes 'image_url' et 'text'
268
- tokenizer : fonction qui convertit texte -> list d'entiers (tokens)
269
- transform : transformations image
270
  """
271
  self.df = dataframe.reset_index(drop=True)
272
  self.tokenizer = tokenizer
@@ -282,20 +41,15 @@ class ColorDataset(Dataset):
282
 
283
  def __getitem__(self, idx):
284
  row = self.df.iloc[idx]
285
- try:
286
- src = row.get('local_image_path', None)
287
- if not src or not os.path.isfile(src):
288
- return None # filtered by collate
289
- img = Image.open(src).convert("RGB")
290
- img = self.transform(img)
291
- tokens = torch.tensor(self.tokenizer(row['text']), dtype=torch.long)
292
- return img, tokens
293
- except Exception:
294
- return None
295
-
296
- from collections import defaultdict
297
-
298
- class SimpleTokenizer:
299
  def __init__(self):
300
  self.word2idx = defaultdict(lambda: 0) # 0 = pad/unknown
301
  self.idx2word = {}
@@ -339,8 +93,11 @@ class SimpleTokenizer:
339
  self.idx2word = {int(v): k for k, v in word2idx_dict.items() if int(v) > 0}
340
  self.counter = max(self.word2idx.values(), default=0) + 1
341
 
 
 
 
342
  class ImageEncoder(nn.Module):
343
- def __init__(self, embedding_dim=16):
344
  super().__init__()
345
  self.backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
346
  self.backbone.fc = nn.Sequential(
@@ -353,7 +110,7 @@ class ImageEncoder(nn.Module):
353
  return F.normalize(x, dim=-1)
354
 
355
  class TextEncoder(nn.Module):
356
- def __init__(self, vocab_size, embedding_dim=16):
357
  super().__init__()
358
  self.embedding = nn.Embedding(vocab_size, 32, padding_idx=0) # Keep 32 dimensions
359
  self.dropout = nn.Dropout(0.1) # Add regularization
@@ -370,15 +127,52 @@ class TextEncoder(nn.Module):
370
  return F.normalize(self.fc(mean), dim=-1)
371
 
372
  class ColorCLIP(nn.Module):
373
- def __init__(self, vocab_size, embedding_dim=16): # Keep 16 dimensions
 
 
 
 
 
 
 
 
 
 
 
374
  super().__init__()
 
 
375
  self.image_encoder = ImageEncoder(embedding_dim)
376
  self.text_encoder = TextEncoder(vocab_size, embedding_dim)
 
377
 
378
  def forward(self, image, text, lengths=None):
 
 
 
 
 
 
 
 
 
 
 
379
  return self.image_encoder(image), self.text_encoder(text, lengths)
380
 
381
  def get_text_embeddings(self, texts: List[str]) -> torch.Tensor:
 
 
 
 
 
 
 
 
 
 
 
 
382
  token_lists = [self.tokenizer(t) for t in texts]
383
  max_len = max((len(toks) for toks in token_lists), default=0)
384
  padded = [toks + [0] * (max_len - len(toks)) for toks in token_lists]
@@ -387,17 +181,143 @@ class ColorCLIP(nn.Module):
387
  with torch.no_grad():
388
  emb = self.text_encoder(input_ids, lengths)
389
  return emb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
 
391
 
 
 
 
392
def clip_loss(image_emb, text_emb, temperature=0.07):
    """Symmetric CLIP contrastive loss over a batch of paired embeddings.

    The i-th image is the positive match for the i-th text; cross-entropy is
    applied in both directions (image->text and text->image) and averaged.
    """
    similarity = torch.matmul(image_emb, text_emb.T) / temperature
    targets = torch.arange(image_emb.size(0), device=image_emb.device)
    image_to_text = F.cross_entropy(similarity, targets)
    text_to_image = F.cross_entropy(similarity.T, targets)
    return (image_to_text + text_to_image) / 2
398
 
399
- # Collate qui pad les séquences et filtre les None
400
  def collate_batch(batch):
 
 
 
 
 
 
 
 
 
401
  batch = [b for b in batch if b is not None]
402
  if len(batch) == 0:
403
  return None
@@ -410,33 +330,32 @@ def collate_batch(batch):
410
 
411
 
412
  if __name__ == "__main__":
413
- # Chargement + split train/test + cache local
414
- tokenizer = SimpleTokenizer()
415
- df = pd.read_csv('df_color_with_local_paths.csv')
 
 
 
 
 
 
 
 
 
 
 
416
 
417
- # Reduce to main colors only (11 classes instead of 34)
418
  main_colors = ['beige', 'black', 'blue', 'brown', 'green', 'orange', 'pink', 'purple', 'red', 'white', 'yellow']
419
- df = df[df['color'].isin(main_colors)].copy()
420
  print(f"📊 Filtered dataset: {len(df)} samples with {len(main_colors)} colors")
421
- print(f"🎨 Colors: {sorted(df['color'].unique())}")
422
 
423
- tokenizer.fit(df['text'].tolist())
424
-
425
- # If no local paths column, download/calc it once
426
- if 'local_image_path' not in df.columns or df['local_image_path'].isna().all():
427
- downloader = ImageDownloader(
428
- csv_path='new/df_color_with_local_paths.csv',
429
- images_dir='data/images',
430
- max_workers=16,
431
- timeout=10
432
- )
433
- df_local = downloader.download_all_images()
434
- else:
435
- df_local = df
436
 
437
  # Filter only rows with a valid local file
438
- df_local = df_local[df_local['local_image_path'].astype(str).str.len() > 0]
439
- df_local = df_local[df_local['local_image_path'].apply(lambda p: os.path.isfile(p))]
440
  df_local = df_local.reset_index(drop=True)
441
 
442
 
@@ -450,30 +369,27 @@ if __name__ == "__main__":
450
  train_dataset = ColorDataset(df_train, tokenizer)
451
  test_dataset = ColorDataset(df_test, tokenizer)
452
 
453
- train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_batch, num_workers=0)
454
- test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_batch, num_workers=0)
455
 
456
- device = "mps" if hasattr(torch.backends, "mps") and torch.backends.mps.is_available() else "cpu"
457
  print(f"Using device: {device}")
458
 
459
- model = ColorCLIP(vocab_size=tokenizer.counter).to(device)
460
- optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5) # Add weight decay
461
 
462
  # Save tokenizer vocab once (or update) so evaluation can reload the same mapping
463
  here = os.path.dirname(__file__)
464
- vocab_out = os.path.join(here, "tokenizer_vocab.json")
465
  with open(vocab_out, "w") as f:
466
  json.dump(dict(tokenizer.word2idx), f)
 
467
 
468
 
469
- from collections import defaultdict
470
-
471
-
472
- EPOCHS = 50 # Increased from 10 to 50
473
- for epoch in range(EPOCHS):
474
  model.train()
475
- pbar = tqdm(total=len(train_loader), desc=f"Epoch {epoch+1}/{EPOCHS} - train", leave=False)
476
- last_loss = None
477
  for batch in train_loader:
478
  if batch is None:
479
  pbar.update(1)
@@ -487,20 +403,22 @@ if __name__ == "__main__":
487
  loss = clip_loss(img_emb, text_emb)
488
  loss.backward()
489
  optimizer.step()
490
- last_loss = loss.item()
491
- pbar.set_postfix({"loss": f"{last_loss:.4f}"})
492
  pbar.update(1)
493
  pbar.close()
494
- if last_loss is not None:
495
- print(f"[Train] Epoch {epoch+1}/{EPOCHS} - last batch loss: {last_loss:.4f}")
 
 
496
  else:
497
- print(f"[Train] Epoch {epoch+1}/{EPOCHS} - no valid batches")
498
 
499
  # Eval rapide sur test avec barre
500
  model.eval()
501
  test_losses = []
502
  with torch.no_grad():
503
- pbar_t = tqdm(total=len(test_loader), desc=f"Epoch {epoch+1}/{EPOCHS} - test", leave=False)
504
  for batch in test_loader:
505
  if batch is None:
506
  pbar_t.update(1)
@@ -514,15 +432,20 @@ if __name__ == "__main__":
514
  pbar_t.update(1)
515
  pbar_t.close()
516
  if len(test_losses) > 0:
517
- print(f"[Test ] Epoch {epoch+1}/{EPOCHS} - avg loss: {sum(test_losses)/len(test_losses):.4f}")
 
518
  else:
519
- print(f"[Test ] Epoch {epoch+1}/{EPOCHS} - no valid batches")
520
 
521
  # --- Save checkpoint at every epoch ---
522
  ckpt_dir = here
523
- latest_path = os.path.join(ckpt_dir, "colorclip_image_text.pt")
524
- epoch_path = os.path.join(ckpt_dir, f"colorclip_image_text_epoch_{epoch+1}.pt")
525
- state_dict = model.state_dict()
526
- torch.save(state_dict, latest_path)
527
- torch.save(state_dict, epoch_path)
 
 
 
 
528
  print(f"[Save ] Saved checkpoints: {latest_path} and {epoch_path}")
 
1
+ import config
2
  import os
 
3
  import json
4
  import torch
5
  from torch.utils.data import Dataset, DataLoader
6
  from torchvision import transforms, models
7
  from PIL import Image
 
 
8
  import torch.nn as nn
9
  import torch.nn.functional as F
10
  import pandas as pd
11
+ from tqdm.auto import tqdm
12
+ from collections import defaultdict
13
+ from typing import Optional, List
 
 
 
 
 
 
14
  import logging
15
+
 
 
 
16
 
17
  # Configure logging
18
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
19
  logger = logging.getLogger(__name__)
20
+ # -------------------------------
21
+ # Dataset Classes
22
+ # -------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  class ColorDataset(Dataset):
24
  def __init__(self, dataframe, tokenizer, transform=None):
25
  """
26
+ dataframe : pd.DataFrame with columns image and text columns
27
+ tokenizer : function that converts text -> list of integers (tokens)
28
+ transform : transformations on the image
29
  """
30
  self.df = dataframe.reset_index(drop=True)
31
  self.tokenizer = tokenizer
 
41
 
42
  def __getitem__(self, idx):
43
  row = self.df.iloc[idx]
44
+ img = Image.open(config.column_local_image_path).convert("RGB")
45
+ img = self.transform(img)
46
+ tokens = torch.tensor(self.tokenizer(row[config.text_column]), dtype=torch.long)
47
+ return img, tokens
48
+
49
+ # -------------------------------
50
+ # Tokenizer
51
+ # -------------------------------
52
+ class Tokenizer:
 
 
 
 
 
53
  def __init__(self):
54
  self.word2idx = defaultdict(lambda: 0) # 0 = pad/unknown
55
  self.idx2word = {}
 
93
  self.idx2word = {int(v): k for k, v in word2idx_dict.items() if int(v) > 0}
94
  self.counter = max(self.word2idx.values(), default=0) + 1
95
 
96
+ # -------------------------------
97
+ # Model Components
98
+ # -------------------------------
99
  class ImageEncoder(nn.Module):
100
+ def __init__(self, embedding_dim=config.color_emb_dim):
101
  super().__init__()
102
  self.backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
103
  self.backbone.fc = nn.Sequential(
 
110
  return F.normalize(x, dim=-1)
111
 
112
  class TextEncoder(nn.Module):
113
+ def __init__(self, vocab_size, embedding_dim=config.color_emb_dim):
114
  super().__init__()
115
  self.embedding = nn.Embedding(vocab_size, 32, padding_idx=0) # Keep 32 dimensions
116
  self.dropout = nn.Dropout(0.1) # Add regularization
 
127
  return F.normalize(self.fc(mean), dim=-1)
128
 
129
  class ColorCLIP(nn.Module):
130
+ """
131
+ Color CLIP model for learning color-aligned image-text embeddings.
132
+ """
133
+ def __init__(self, vocab_size, embedding_dim=config.color_emb_dim, tokenizer=None):
134
+ """
135
+ Initialize ColorCLIP model.
136
+
137
+ Args:
138
+ vocab_size: Size of the vocabulary for text encoding
139
+ embedding_dim: Dimension of the embedding space (default: color_emb_dim)
140
+ tokenizer: Optional Tokenizer instance (will create one if None)
141
+ """
142
  super().__init__()
143
+ self.vocab_size = vocab_size
144
+ self.embedding_dim = embedding_dim
145
  self.image_encoder = ImageEncoder(embedding_dim)
146
  self.text_encoder = TextEncoder(vocab_size, embedding_dim)
147
+ self.tokenizer = tokenizer
148
 
149
  def forward(self, image, text, lengths=None):
150
+ """
151
+ Forward pass through the model.
152
+
153
+ Args:
154
+ image: Image tensor [B, C, H, W]
155
+ text: Text token tensor [B, T]
156
+ lengths: Optional sequence lengths tensor [B]
157
+
158
+ Returns:
159
+ Tuple of (image_embeddings, text_embeddings)
160
+ """
161
  return self.image_encoder(image), self.text_encoder(text, lengths)
162
 
163
  def get_text_embeddings(self, texts: List[str]) -> torch.Tensor:
164
+ """
165
+ Get text embeddings for a list of text strings.
166
+
167
+ Args:
168
+ texts: List of text strings
169
+
170
+ Returns:
171
+ Text embeddings tensor [batch_size, embedding_dim]
172
+ """
173
+ if self.tokenizer is None:
174
+ raise ValueError("Tokenizer must be set before calling get_text_embeddings")
175
+
176
  token_lists = [self.tokenizer(t) for t in texts]
177
  max_len = max((len(toks) for toks in token_lists), default=0)
178
  padded = [toks + [0] * (max_len - len(toks)) for toks in token_lists]
 
181
  with torch.no_grad():
182
  emb = self.text_encoder(input_ids, lengths)
183
  return emb
184
+
185
    @classmethod
    def from_pretrained(cls, model_path: str, vocab_path: Optional[str] = None, device: str = "cpu", repo_id: Optional[str] = None):
        """
        Load a pretrained ColorCLIP model from a file path or Hugging Face Hub.

        Args:
            model_path: Path to the model checkpoint (.pt file) or filename if using repo_id
            vocab_path: Optional path to tokenizer vocabulary JSON file or filename if using repo_id
            device: Device to load the model on (default: "cpu")
            repo_id: Optional Hugging Face repository ID (e.g., "username/model-name")
                     If provided, model_path and vocab_path should be filenames within the repo

        Returns:
            ColorCLIP model instance (in eval mode; tokenizer attached when vocab_path loads)

        Example:
            # Load from local file
            model = ColorCLIP.from_pretrained("color_model.pt", "tokenizer_vocab.json")

            # Load from Hugging Face Hub
            from huggingface_hub import hf_hub_download
            model_file = hf_hub_download(repo_id="username/model-name", filename="color_model.pt")
            vocab_file = hf_hub_download(repo_id="username/model-name", filename="tokenizer_vocab.json")
            model = ColorCLIP.from_pretrained(model_file, vocab_file)
        """
        device_obj = torch.device(device)

        # Support loading from Hugging Face Hub if repo_id is provided.
        # hf_hub_download resolves the filenames into local cache paths.
        if repo_id:
            try:
                from huggingface_hub import hf_hub_download
                model_path = hf_hub_download(repo_id=repo_id, filename=model_path)
                if vocab_path:
                    vocab_path = hf_hub_download(repo_id=repo_id, filename=vocab_path)
            except ImportError:
                raise ImportError("huggingface_hub is required to load models from Hugging Face. Install it with: pip install huggingface-hub")

        # Load model checkpoint.
        # NOTE(review): torch.load on a downloaded file deserializes pickled
        # data — only load checkpoints from trusted repos; consider
        # weights_only=True on newer torch versions.
        checkpoint = torch.load(model_path, map_location=device_obj)

        # Extract vocab size and embedding dimension from checkpoint
        if isinstance(checkpoint, dict):
            # Try to get vocab_size from metadata first (written by save_pretrained)
            vocab_size = checkpoint.get('vocab_size', None)
            # NOTE(review): fallback of 16 is hard-coded here while the class
            # default is config.color_emb_dim — confirm they agree.
            embedding_dim = checkpoint.get('embedding_dim', 16)

            # If not in metadata, try to infer from model state:
            # the embedding table's first dimension is the vocabulary size.
            if vocab_size is None:
                state_dict = checkpoint.get('model_state_dict', checkpoint)
                if 'text_encoder.embedding.weight' in state_dict:
                    vocab_size = state_dict['text_encoder.embedding.weight'].shape[0]
                else:
                    raise ValueError("Could not determine vocab_size from checkpoint")

            # Load state dict (supports both wrapped and raw state-dict checkpoints)
            state_dict = checkpoint.get('model_state_dict', checkpoint)
        else:
            raise ValueError("Checkpoint must be a dictionary")

        # Initialize model with the recovered hyper-parameters
        model = cls(vocab_size=vocab_size, embedding_dim=embedding_dim)
        model.load_state_dict(state_dict)
        model = model.to(device_obj)

        # Load tokenizer if vocab path is provided
        # NOTE(review): assumes Tokenizer exposes load_vocab(dict) matching the
        # word2idx JSON written at training time — confirm against Tokenizer.
        if vocab_path and os.path.exists(vocab_path):
            tokenizer = Tokenizer()
            with open(vocab_path, 'r') as f:
                vocab_dict = json.load(f)
            tokenizer.load_vocab(vocab_dict)
            model.tokenizer = tokenizer

        # Inference-ready by default (disables dropout etc.)
        model.eval()
        return model
259
+
260
+ def save_pretrained(self, save_directory: str, vocab_path: Optional[str] = None):
261
+ """
262
+ Save the model and optionally the tokenizer vocabulary.
263
+
264
+ Args:
265
+ save_directory: Directory to save the model
266
+ vocab_path: Optional path to save tokenizer vocabulary
267
+ """
268
+ os.makedirs(save_directory, exist_ok=True)
269
+
270
+ # Save model checkpoint
271
+ model_path = os.path.join(save_directory, config.color_model_path)
272
+ checkpoint = {
273
+ 'model_state_dict': self.state_dict(),
274
+ 'vocab_size': self.vocab_size,
275
+ 'embedding_dim': self.embedding_dim
276
+ }
277
+ torch.save(checkpoint, model_path)
278
+
279
+ # Save tokenizer vocabulary if available
280
+ if self.tokenizer is not None:
281
+ vocab_dict = dict(self.tokenizer.word2idx)
282
+ if vocab_path is None:
283
+ vocab_path = os.path.join(save_directory, config.tokeniser_path)
284
+ with open(vocab_path, 'w') as f:
285
+ json.dump(vocab_dict, f)
286
+
287
+ return model_path, vocab_path
288
 
289
 
290
+ # -------------------------------
291
+ # Loss Functions and Utilities
292
+ # -------------------------------
293
def clip_loss(image_emb, text_emb, temperature=0.07):
    """Bidirectional contrastive (CLIP) loss.

    Args:
        image_emb: Image embeddings [batch_size, embedding_dim].
        text_emb: Text embeddings [batch_size, embedding_dim].
        temperature: Temperature scaling applied to the similarity logits.

    Returns:
        Scalar tensor: the mean of the image->text and text->image
        cross-entropy losses, where pair i is the positive for row i.
    """
    scaled = (image_emb @ text_emb.T).div(temperature)
    idx = torch.arange(scaled.size(0), device=image_emb.device)
    return (F.cross_entropy(scaled, idx) + F.cross_entropy(scaled.t(), idx)) / 2
310
 
 
311
  def collate_batch(batch):
312
+ """
313
+ Collate function for DataLoader that pads sequences and filters None values.
314
+
315
+ Args:
316
+ batch: List of (image, tokens) tuples or None
317
+
318
+ Returns:
319
+ Tuple of (images, padded_tokens, lengths) or None if batch is empty
320
+ """
321
  batch = [b for b in batch if b is not None]
322
  if len(batch) == 0:
323
  return None
 
330
 
331
 
332
  if __name__ == "__main__":
333
+ """
334
+ Training script for ColorCLIP model.
335
+ This code only runs when the file is executed directly, not when imported.
336
+ """
337
+ # Configuration
338
+ batch_size = 16
339
+ lr = 1e-4
340
+ epochs=50
341
+
342
+
343
+
344
+ # Load dataset and split train/test
345
+ tokenizer = Tokenizer()
346
+ df = pd.read_csv(config.local_dataset_path)
347
 
348
+ # Data preparation: Reduce to main colors only (11 classes instead of 34)
349
  main_colors = ['beige', 'black', 'blue', 'brown', 'green', 'orange', 'pink', 'purple', 'red', 'white', 'yellow']
350
+ df = df[df[config.color_column].isin(main_colors)].copy()
351
  print(f"📊 Filtered dataset: {len(df)} samples with {len(main_colors)} colors")
352
+ print(f"🎨 Colors: {sorted(df[config.color_column].unique())}")
353
 
354
+ tokenizer.fit(df[config.text_column].tolist())
 
 
 
 
 
 
 
 
 
 
 
 
355
 
356
  # Filter only rows with a valid local file
357
+ df_local = df[df[config.column_local_image_path].astype(str).str.len() > 0]
358
+ df_local = df_local[df_local[config.column_local_image_path].apply(lambda p: os.path.isfile(p))]
359
  df_local = df_local.reset_index(drop=True)
360
 
361
 
 
369
  train_dataset = ColorDataset(df_train, tokenizer)
370
  test_dataset = ColorDataset(df_test, tokenizer)
371
 
372
+ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch, num_workers=0)
373
+ test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch, num_workers=0)
374
 
375
+ device = config.device
376
  print(f"Using device: {device}")
377
 
378
+ model = ColorCLIP(vocab_size=tokenizer.counter, embedding_dim=config.color_emb_dim, tokenizer=tokenizer).to(device)
379
+ optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5) # Add weight decay
380
 
381
  # Save tokenizer vocab once (or update) so evaluation can reload the same mapping
382
  here = os.path.dirname(__file__)
383
+ vocab_out = os.path.join(here, config.tokeniser_path)
384
  with open(vocab_out, "w") as f:
385
  json.dump(dict(tokenizer.word2idx), f)
386
+ print(f"Tokenizer vocabulary saved to: {vocab_out}")
387
 
388
 
389
+ for epoch in range(epochs):
 
 
 
 
390
  model.train()
391
+ pbar = tqdm(total=len(train_loader), desc=f"Epoch {epoch+1}/{epochs} - train", leave=False)
392
+ epoch_losses = []
393
  for batch in train_loader:
394
  if batch is None:
395
  pbar.update(1)
 
403
  loss = clip_loss(img_emb, text_emb)
404
  loss.backward()
405
  optimizer.step()
406
+ epoch_losses.append(loss.item())
407
+ pbar.set_postfix({"loss": f"{loss.item():.4f}", "avg": f"{sum(epoch_losses)/len(epoch_losses):.4f}"})
408
  pbar.update(1)
409
  pbar.close()
410
+
411
+ avg_train_loss = sum(epoch_losses) / len(epoch_losses) if epoch_losses else None
412
+ if avg_train_loss is not None:
413
+ print(f"[Train] Epoch {epoch+1}/{epochs} - avg loss: {avg_train_loss:.4f}")
414
  else:
415
+ print(f"[Train] Epoch {epoch+1}/{epochs} - no valid batches")
416
 
417
  # Eval rapide sur test avec barre
418
  model.eval()
419
  test_losses = []
420
  with torch.no_grad():
421
+ pbar_t = tqdm(total=len(test_loader), desc=f"Epoch {epoch+1}/{epochs} - test", leave=False)
422
  for batch in test_loader:
423
  if batch is None:
424
  pbar_t.update(1)
 
432
  pbar_t.update(1)
433
  pbar_t.close()
434
  if len(test_losses) > 0:
435
+ avg_test_loss = sum(test_losses) / len(test_losses)
436
+ print(f"[Test ] Epoch {epoch+1}/{epochs} - avg loss: {avg_test_loss:.4f}")
437
  else:
438
+ print(f"[Test ] Epoch {epoch+1}/{epochs} - no valid batches")
439
 
440
  # --- Save checkpoint at every epoch ---
441
  ckpt_dir = here
442
+ latest_path = os.path.join(ckpt_dir, config.color_model_path)
443
+ epoch_path = os.path.join(ckpt_dir, f"color_model_epoch_{epoch+1}.pt")
444
+ checkpoint = {
445
+ 'model_state_dict': model.state_dict(),
446
+ 'vocab_size': model.vocab_size,
447
+ 'embedding_dim': model.embedding_dim
448
+ }
449
+ torch.save(checkpoint, latest_path)
450
+ torch.save(checkpoint, epoch_path)
451
  print(f"[Save ] Saved checkpoints: {latest_path} and {epoch_path}")