Leacb4
/

gap-clip

+import os
+import time
+import json
+import torch
+from torch.utils.data import Dataset, DataLoader
+from torchvision import transforms, models
+from PIL import Image
+import requests
+from io import BytesIO
+import torch.nn as nn
+import torch.nn.functional as F
+import pandas as pd
+from tqdm.auto import tqdm
+import asyncio
+import aiohttp
+import pandas as pd
+import os
+from pathlib import Path
+from tqdm.asyncio import tqdm
+import ssl
+import logging
+from typing import Optional, List, Tuple
+from urllib.parse import urlparse
+import hashlib
+from config import local_dataset_path
# Configure logging: timestamped INFO-level records on the root logger, plus a
# module-level logger shared by everything defined in this file.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class ImageDownloader:
    """Concurrently download the images listed in a DataFrame's ``image`` column.

    Requests share one aiohttp session, at most ``max_concurrent`` of them run
    at the same time, and each URL is attempted up to ``retry_attempts`` times.
    Files that already exist in ``output_dir`` are skipped. Counters for the
    run are accumulated in ``self.stats``.
    """

    def __init__(self,
                 output_dir: str = "athleta_images",
                 max_concurrent: int = 10,
                 timeout: int = 30,
                 retry_attempts: int = 3,
                 verify_ssl: bool = True):
        """
        Initialize the ImageDownloader.

        Args:
            output_dir: Directory to save downloaded images
            max_concurrent: Maximum number of concurrent downloads
            timeout: Request timeout in seconds
            retry_attempts: Number of attempts made per URL before giving up
            verify_ssl: Whether to verify SSL certificates
        """
        self.output_dir = Path(output_dir)
        self.max_concurrent = max_concurrent
        self.timeout = aiohttp.ClientTimeout(total=timeout)
        self.retry_attempts = retry_attempts
        self.verify_ssl = verify_ssl
        # parents=True so nested targets such as "data/images" work on a
        # fresh checkout where the parent directory does not exist yet.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # Statistics
        self.stats = {
            'total': 0,
            'downloaded': 0,
            'skipped': 0,
            'failed': 0,
            'retries': 0
        }

    def _create_ssl_context(self) -> Optional[ssl.SSLContext]:
        """Return a non-verifying SSL context, or None when verification is on.

        None means "use the HTTP client's default certificate verification".
        """
        if self.verify_ssl:
            return None
        ssl_context = ssl.create_default_context()
        # check_hostname must be disabled before verify_mode can be CERT_NONE.
        ssl_context.check_hostname = False
        ssl_context.verify_mode = ssl.CERT_NONE
        return ssl_context

    def _generate_filename(self, url: str, index: int) -> str:
        """Generate a safe filename from the URL, falling back to a hash or index.

        The last path segment of the URL is used when it looks like a file
        name (contains a dot). Otherwise the name is ``image_<md5[:8]>.jpg``,
        and if the URL cannot be hashed, ``image_<index>.jpg``.

        NOTE(review): different URLs that end in the same file name map to the
        same local file, and the later one is then skipped as "already exists".
        The naming scheme is kept so existing download caches stay valid.
        """
        try:
            filename = os.path.basename(urlparse(url).path)
        except (ValueError, TypeError, AttributeError):
            filename = ""
        # "." and ".." contain a dot but are directory references, not names.
        if isinstance(filename, str) and '.' in filename and filename not in ('.', '..'):
            return filename
        # Fallback: use URL hash or index (md5 is only a short fingerprint
        # here, not a security measure).
        try:
            url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
        except (AttributeError, UnicodeError, ValueError):
            return f"image_{index}.jpg"
        return f"image_{url_hash}.jpg"

    async def _download_single_image(self,
                                     session: "aiohttp.ClientSession",
                                     url: str,
                                     save_path: Path,
                                     index: int) -> bool:
        """
        Download a single image with retry logic.

        Args:
            session: Open aiohttp session used for the request
            url: Image URL to fetch
            save_path: Destination file path
            index: Row index of the URL (unused here; kept for a uniform
                task signature)

        Returns:
            bool: True if successful, False otherwise
        """
        # Built once per call, not per attempt. Only passed when verification
        # is disabled, so the client's default is used otherwise. A connector
        # cannot be supplied per request; it belongs to the session.
        ssl_context = self._create_ssl_context()
        request_kwargs = {'ssl': ssl_context} if ssl_context is not None else {}

        for attempt in range(self.retry_attempts):
            if attempt > 0:
                self.stats['retries'] += 1
                logger.info(f"Retry {attempt}/{self.retry_attempts} for {url}")
            try:
                async with session.get(url, **request_kwargs) as response:
                    if response.status == 404:
                        # A missing resource will not appear on retry.
                        logger.warning(f"Image not found (404): {url}")
                        return False
                    if response.status != 200:
                        logger.warning(f"HTTP {response.status} for {url}")
                        continue
                    content = await response.read()

                # Validate that it's actually an image
                if len(content) < 1024:  # Too small to be a real image
                    logger.warning(f"Image too small, skipping: {url}")
                    return False

                # Ensure directory exists
                save_path.parent.mkdir(parents=True, exist_ok=True)
                # Write to a sibling temp file and rename, so an interrupted
                # write never leaves a truncated file that a later run would
                # skip as "already exists".
                partial_path = save_path.with_name(save_path.name + ".part")
                partial_path.write_bytes(content)
                partial_path.replace(save_path)
                logger.debug(f"Successfully downloaded: {save_path}")
                return True
            except asyncio.TimeoutError:
                logger.warning(f"Timeout downloading {url} (attempt {attempt + 1})")
            except Exception as e:
                # Best effort: any other failure is logged and retried.
                logger.error(f"Error downloading {url}: {str(e)}")
        return False

    async def _download_batch(self,
                              session: "aiohttp.ClientSession",
                              batch: List[Tuple[str, Path, int]]) -> None:
        """Download a batch of images concurrently and update ``self.stats``."""
        semaphore = asyncio.Semaphore(self.max_concurrent)

        async def download_with_semaphore(url, save_path, index):
            async with semaphore:
                if save_path.exists():
                    logger.debug(f"File already exists, skipping: {save_path}")
                    self.stats['skipped'] += 1
                    return
                success = await self._download_single_image(session, url, save_path, index)
                if success:
                    self.stats['downloaded'] += 1
                else:
                    self.stats['failed'] += 1

        jobs = [download_with_semaphore(url, save_path, index)
                for url, save_path, index in batch]
        results = await asyncio.gather(*jobs, return_exceptions=True)
        # return_exceptions=True hands errors back as values; surface them
        # instead of silently dropping them from the statistics.
        for (url, _save_path, _index), result in zip(batch, results):
            if isinstance(result, BaseException):
                self.stats['failed'] += 1
                logger.error(f"Unexpected error downloading {url}: {result!r}")

    def _prepare_download_tasks(self, df: pd.DataFrame) -> List[Tuple[str, Path, int]]:
        """Build ``(url, save_path, row_index)`` tuples from the ``image`` column.

        Rows whose ``image`` value is missing, not a string, or not an
        http(s) URL are skipped.
        """
        tasks = []
        for index, row in df.iterrows():
            raw_url = row.get('image')
            # Check if image URL is valid
            if pd.isna(raw_url) or not isinstance(raw_url, str):
                logger.debug(f"Skipping row {index}: invalid image URL")
                continue
            url = raw_url.strip()
            if not url or not url.startswith(('http://', 'https://')):
                logger.debug(f"Skipping row {index}: invalid URL format")
                continue
            # Generate filename
            filename = self._generate_filename(url, index)
            tasks.append((url, self.output_dir / filename, index))
        return tasks

    async def download_all_images(self, df: pd.DataFrame) -> None:
        """Download all images from the DataFrame.

        Work is split into batches of ``2 * max_concurrent`` URLs with a
        one-second pause between batches.
        """
        logger.info("Preparing download tasks...")
        tasks = self._prepare_download_tasks(df)
        self.stats['total'] = len(tasks)
        if not tasks:
            logger.warning("No valid image URLs found in the dataset")
            return
        logger.info(f"Found {len(tasks)} valid image URLs to download")
        # Create session with proper configuration; the session owns the
        # connector and closes it on exit.
        ssl_context = self._create_ssl_context()
        connector = aiohttp.TCPConnector(ssl=ssl_context) if ssl_context else None
        async with aiohttp.ClientSession(
            timeout=self.timeout,
            connector=connector,
            headers={'User-Agent': 'Mozilla/5.0 (compatible; ImageDownloader/1.0)'}
        ) as session:
            # Process in batches to avoid overwhelming the server
            batch_size = self.max_concurrent * 2
            total_batches = (len(tasks) - 1) // batch_size + 1
            for i in range(0, len(tasks), batch_size):
                batch = tasks[i:i + batch_size]
                logger.info(f"Processing batch {i//batch_size + 1}/{total_batches}")
                await self._download_batch(session, batch)
                # Small delay between batches to be respectful
                if i + batch_size < len(tasks):
                    await asyncio.sleep(1)

    def print_statistics(self) -> None:
        """Log the download counters and the resulting success rate."""
        logger.info("Download Statistics:")
        logger.info(f"  Total URLs processed: {self.stats['total']}")
        logger.info(f"  Successfully downloaded: {self.stats['downloaded']}")
        logger.info(f"  Skipped (already exists): {self.stats['skipped']}")
        logger.info(f"  Failed: {self.stats['failed']}")
        logger.info(f"  Retry attempts: {self.stats['retries']}")
        if self.stats['total'] > 0:
            success_rate = (self.stats['downloaded'] / self.stats['total']) * 100
            logger.info(f"  Success rate: {success_rate:.1f}%")
+import os
+import time
+import json
+import torch
+from torch.utils.data import Dataset, DataLoader
+from torchvision import transforms, models
+from PIL import Image
+import requests
+from io import BytesIO
+import torch.nn as nn
+import torch.nn.functional as F
+import pandas as pd
+from tqdm.auto import tqdm
class ColorDataset(Dataset):
    """Image/text pairs read from a DataFrame of local image files."""

    def __init__(self, dataframe, tokenizer, transform=None):
        """
        Args:
            dataframe: pd.DataFrame with columns 'local_image_path' (path of
                an image file on disk) and 'text'.
            tokenizer: callable converting a text into a list of integer
                token ids.
            transform: image transform applied to the RGB PIL image; when
                omitted, a 224x224 resize, tensor conversion and
                normalization with mean [0.485, 0.456, 0.406] and
                std [0.229, 0.224, 0.225] is used.
        """
        self.df = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.transform = transform or transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        """Return the number of rows, including rows whose image is missing."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image_tensor, token_tensor)`` for row ``idx``, or None.

        None is returned when the row has no usable image file or when
        loading/tokenizing fails; the collate function is expected to drop
        those entries.
        """
        row = self.df.iloc[idx]
        src = row.get('local_image_path', None)
        # Reject NaN/None/empty paths explicitly rather than relying on
        # os.path.isfile raising for non-path values.
        if not isinstance(src, (str, os.PathLike)) or not src or not os.path.isfile(src):
            return None  # filtered by collate
        try:
            # Context manager closes the file handle deterministically.
            with Image.open(src) as raw_image:
                img = self.transform(raw_image.convert("RGB"))
            tokens = torch.tensor(self.tokenizer(row['text']), dtype=torch.long)
        except Exception:
            # Best effort: a corrupt image or bad text drops the sample, but
            # the reason stays discoverable in debug logs.
            logging.getLogger(__name__).debug("Skipping sample %s (%s)", idx, src, exc_info=True)
            return None
        return img, tokens
+from collections import defaultdict
class SimpleTokenizer:
    """Whitespace tokenizer that keeps color and garment keywords.

    Word ids start at 1; id 0 is shared by padding and unknown words.
    """

    # Sets give O(1) membership tests and are built once, not per call.
    _COLOR_KEYWORDS = frozenset([
        'red', 'blue', 'green', 'yellow', 'purple', 'pink', 'orange',
        'brown', 'black', 'white', 'gray', 'navy', 'beige', 'aqua', 'lime',
        'violet', 'turquoise', 'teal', 'tan', 'snow', 'silver', 'plum',
        'olive', 'fuchsia', 'gold', 'cream', 'ivory', 'maroon',
    ])
    _DESCRIPTIVE_WORDS = frozenset([
        'shirt', 'dress', 'top', 'bottom', 'shoe', 'bag', 'hat', 'short',
        'long', 'sleeve',
    ])

    def __init__(self):
        # defaultdict(int) keeps the "missing -> 0" behavior for direct
        # indexing while staying picklable (a lambda factory is not).
        self.word2idx = defaultdict(int)  # 0 = pad/unknown
        self.idx2word = {}
        self.counter = 1  # next id to assign

    def preprocess_text(self, text):
        """Lower-case ``text`` and keep only color/descriptive keywords.

        If no keyword is found, the whole lower-cased text is returned.
        None and NaN are treated as empty text; other non-string values are
        converted with ``str``.
        """
        if not isinstance(text, str):
            # `text != text` is true only for NaN.
            text = "" if text is None or text != text else str(text)
        lowered = text.lower()
        kept = [
            word for word in lowered.split()
            if word in self._COLOR_KEYWORDS or word in self._DESCRIPTIVE_WORDS
        ]
        return ' '.join(kept) if kept else lowered

    def fit(self, texts):
        """Assign a new id to every preprocessed word not yet in the vocabulary."""
        for text in texts:
            for word in self.preprocess_text(text).split():
                if word not in self.word2idx:
                    self.word2idx[word] = self.counter
                    self.idx2word[self.counter] = word
                    self.counter += 1

    def __call__(self, text):
        """Return the list of token ids for ``text`` (0 for unknown words)."""
        # .get() instead of indexing: indexing a defaultdict would insert
        # every unknown word with id 0, so a later fit() would never give it
        # a real id and the saved vocabulary would fill with 0 entries.
        lookup = self.word2idx.get
        return [lookup(word, 0) for word in self.preprocess_text(text).split()]

    def load_vocab(self, word2idx_dict):
        """Replace the vocabulary with ``word2idx_dict`` (values coerced to int)."""
        self.word2idx = defaultdict(int, {k: int(v) for k, v in word2idx_dict.items()})
        self.idx2word = {int(v): k for k, v in word2idx_dict.items() if int(v) > 0}
        self.counter = max(self.word2idx.values(), default=0) + 1
class ImageEncoder(nn.Module):
    """ResNet-18 whose classification head is replaced by an embedding projection."""

    def __init__(self, embedding_dim=16, pretrained=True, dropout=0.1):
        """
        Args:
            embedding_dim: size of the output embedding.
            pretrained: load torchvision's default ResNet-18 weights when
                True. Pass False to skip the weight download, e.g. when a
                checkpoint is loaded right after construction.
            dropout: dropout probability applied before the projection.
        """
        super().__init__()
        weights = models.ResNet18_Weights.DEFAULT if pretrained else None
        self.backbone = models.resnet18(weights=weights)
        in_features = self.backbone.fc.in_features
        # Keep the (Dropout, Linear) layout: checkpoint keys depend on the
        # positions inside this Sequential.
        self.backbone.fc = nn.Sequential(
            nn.Dropout(dropout),  # Add regularization
            nn.Linear(in_features, embedding_dim)
        )

    def forward(self, x):
        """Return L2-normalized embeddings (normalized over the last dim) for ``x``."""
        features = self.backbone(x)
        return F.normalize(features, dim=-1)
class TextEncoder(nn.Module):
    """Mean-pooled token embeddings projected to a unit-length text embedding."""

    def __init__(self, vocab_size, embedding_dim=16):
        """
        Args:
            vocab_size: number of rows in the token embedding table.
            embedding_dim: size of the output embedding.
        """
        super().__init__()
        token_dim = 32  # Keep 32 dimensions
        self.embedding = nn.Embedding(vocab_size, token_dim, padding_idx=0)
        self.dropout = nn.Dropout(p=0.1)  # Add regularization
        self.fc = nn.Linear(token_dim, embedding_dim)

    def forward(self, x, lengths=None):
        """Encode a batch of token ids.

        Args:
            x: integer token ids; the code pools over dim 1.
            lengths: optional per-row token counts. When given, the summed
                embeddings are divided by ``lengths`` (clamped to at least 1)
                instead of by the padded sequence length.

        Returns:
            Embeddings L2-normalized over the last dimension.
        """
        token_vectors = self.dropout(self.embedding(x))  # [B, T, 32]
        if lengths is None:
            pooled = token_vectors.mean(dim=1)
        else:
            divisor = lengths.unsqueeze(1).clamp_min(1)
            pooled = token_vectors.sum(dim=1) / divisor  # [B, 32]
        projected = self.fc(pooled)
        return F.normalize(projected, dim=-1)
class ColorCLIP(nn.Module):
    """Two-tower model pairing an image encoder with a text encoder."""

    def __init__(self, vocab_size, embedding_dim=16, tokenizer=None):  # Keep 16 dimensions
        """
        Args:
            vocab_size: size of the text encoder's vocabulary.
            embedding_dim: size of both image and text embeddings.
            tokenizer: optional callable mapping a string to a list of token
                ids; only needed by ``get_text_embeddings``. It can also be
                assigned later through ``model.tokenizer``.
        """
        super().__init__()
        self.image_encoder = ImageEncoder(embedding_dim)
        self.text_encoder = TextEncoder(vocab_size, embedding_dim)
        self.embedding_dim = embedding_dim
        # Plain attribute (not a sub-module), so it never enters state_dict.
        self.tokenizer = tokenizer

    def forward(self, image, text, lengths=None):
        """Return ``(image_embeddings, text_embeddings)`` for one batch."""
        return self.image_encoder(image), self.text_encoder(text, lengths)

    def get_text_embeddings(self, texts: List[str]) -> torch.Tensor:
        """Embed raw strings with the text encoder, without gradients.

        Texts are tokenized, right-padded with 0 to a common length and
        encoded with the text encoder temporarily in eval mode, so dropout
        does not randomize the result.

        Args:
            texts: strings to embed.

        Returns:
            Tensor with one embedding row per text; shape
            ``(0, embedding_dim)`` when ``texts`` is empty.

        Raises:
            AttributeError: if no tokenizer has been attached to the model.
        """
        tokenizer = getattr(self, "tokenizer", None)
        if tokenizer is None:
            raise AttributeError(
                "ColorCLIP has no tokenizer; pass tokenizer=... to the "
                "constructor or assign model.tokenizer before calling "
                "get_text_embeddings()"
            )
        device = next(self.parameters()).device
        token_lists = [tokenizer(t) for t in texts]
        if not token_lists:
            # An empty batch cannot be pushed through the encoder.
            return torch.empty((0, self.embedding_dim), device=device)
        max_len = max(len(toks) for toks in token_lists)
        padded = [toks + [0] * (max_len - len(toks)) for toks in token_lists]
        input_ids = torch.tensor(padded, dtype=torch.long, device=device)
        lengths = torch.tensor([len(toks) for toks in token_lists], dtype=torch.long, device=device)
        was_training = self.text_encoder.training
        self.text_encoder.eval()
        try:
            with torch.no_grad():
                emb = self.text_encoder(input_ids, lengths)
        finally:
            # Restore whatever mode the caller had set.
            self.text_encoder.train(was_training)
        return emb
def clip_loss(image_emb, text_emb, temperature=0.07):
    """Symmetric contrastive loss between paired image and text embeddings.

    Row ``i`` of ``image_emb`` is treated as the match of row ``i`` of
    ``text_emb``. The result is the mean of the image-to-text and
    text-to-image cross-entropies over the temperature-scaled similarities.
    """
    similarity = torch.matmul(image_emb, text_emb.t()) / temperature
    targets = torch.arange(image_emb.shape[0], device=image_emb.device)
    image_to_text = F.cross_entropy(similarity, targets)
    text_to_image = F.cross_entropy(similarity.t(), targets)
    return (image_to_text + text_to_image) / 2
# Collate function that pads token sequences and filters out None samples
def collate_batch(batch):
    """Merge dataset samples into ``(images, padded_tokens, lengths)``.

    Samples equal to None are dropped. Returns None when nothing is left.
    Token sequences are right-padded with 0 to the longest one in the batch,
    and ``lengths`` holds each sequence's original length.
    """
    samples = [sample for sample in batch if sample is not None]
    if not samples:
        return None
    images = torch.stack([image for image, _ in samples], dim=0)
    token_seqs = [token_ids for _, token_ids in samples]
    lengths = torch.tensor([seq.size(0) for seq in token_seqs], dtype=torch.long)
    tokens_padded = nn.utils.rnn.pad_sequence(token_seqs, batch_first=True, padding_value=0)
    return images, tokens_padded, lengths
def _select_device() -> str:
    """Return the best available torch device name: CUDA, then Apple MPS, then CPU."""
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        return "mps"
    return "cpu"


def _attach_local_image_paths(df, images_dir='data/images'):
    """Download the images referenced by ``df`` and return a copy with 'local_image_path'.

    Rows the downloader does not consider downloadable get an empty path.
    Files that failed to download are removed later by the caller's
    "file exists" filter.
    """
    # Created here as well, in case the downloader does not create parents.
    os.makedirs(images_dir, exist_ok=True)
    downloader = ImageDownloader(output_dir=images_dir, max_concurrent=16, timeout=10)
    # download_all_images is a coroutine and needs the DataFrame.
    asyncio.run(downloader.download_all_images(df))
    downloader.print_statistics()
    # Re-derive the row -> file mapping from the downloader's own task list,
    # so the paths match what was written to disk.
    planned = {
        index: str(save_path)
        for _url, save_path, index in downloader._prepare_download_tasks(df)
    }
    df_with_paths = df.copy()
    df_with_paths['local_image_path'] = [planned.get(index, '') for index in df_with_paths.index]
    return df_with_paths


def main() -> None:
    """Train ColorCLIP on the local color dataset, checkpointing every epoch."""
    # Load data + train/test split + local cache
    tokenizer = SimpleTokenizer()
    df = pd.read_csv('df_color_with_local_paths.csv')
    # Reduce to main colors only (11 classes instead of 34)
    main_colors = ['beige', 'black', 'blue', 'brown', 'green', 'orange', 'pink', 'purple', 'red', 'white', 'yellow']
    df = df[df['color'].isin(main_colors)].copy()
    print(f"📊 Filtered dataset: {len(df)} samples with {len(main_colors)} colors")
    print(f"🎨 Colors: {sorted(df['color'].unique())}")
    tokenizer.fit(df['text'].tolist())

    # If no local paths column, download/calc it once
    if 'local_image_path' not in df.columns or df['local_image_path'].isna().all():
        df_local = _attach_local_image_paths(df)
    else:
        df_local = df

    # Filter only rows with a valid local file. The isinstance check keeps
    # NaN paths from reaching os.path.isfile, which raises TypeError on floats.
    has_file = df_local['local_image_path'].apply(
        lambda p: isinstance(p, str) and len(p) > 0 and os.path.isfile(p)
    ).astype(bool)
    df_local = df_local[has_file].reset_index(drop=True)

    # split 90/10
    df_local = df_local.sample(frac=1.0, random_state=42).reset_index(drop=True)
    split_idx = int(0.9 * len(df_local))
    df_train = df_local.iloc[:split_idx].reset_index(drop=True)
    df_test = df_local.iloc[split_idx:].reset_index(drop=True)
    if len(df_train) == 0:
        # A shuffling DataLoader cannot be built over an empty dataset.
        raise SystemExit("No training rows with an existing local image file; nothing to train on.")

    train_dataset = ColorDataset(df_train, tokenizer)
    test_dataset = ColorDataset(df_test, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_batch, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_batch, num_workers=0)

    device = _select_device()
    print(f"Using device: {device}")
    model = ColorCLIP(vocab_size=tokenizer.counter).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)  # Add weight decay

    # Save tokenizer vocab once (or update) so evaluation can reload the same mapping
    here = os.path.dirname(os.path.abspath(__file__))
    vocab_out = os.path.join(here, "tokenizer_vocab.json")
    with open(vocab_out, "w") as f:
        json.dump(dict(tokenizer.word2idx), f)

    EPOCHS = 50  # Increased from 10 to 50
    for epoch in range(EPOCHS):
        model.train()
        pbar = tqdm(total=len(train_loader), desc=f"Epoch {epoch+1}/{EPOCHS} - train", leave=False)
        last_loss = None
        for batch in train_loader:
            if batch is None:
                # Every sample of this batch was filtered out by the collate step.
                pbar.update(1)
                continue
            imgs, texts, lengths = batch
            imgs = imgs.to(device)
            texts = texts.to(device)
            lengths = lengths.to(device)
            optimizer.zero_grad()
            img_emb, text_emb = model(imgs, texts, lengths)
            loss = clip_loss(img_emb, text_emb)
            loss.backward()
            optimizer.step()
            last_loss = loss.item()
            pbar.set_postfix({"loss": f"{last_loss:.4f}"})
            pbar.update(1)
        pbar.close()
        if last_loss is not None:
            print(f"[Train] Epoch {epoch+1}/{EPOCHS} - last batch loss: {last_loss:.4f}")
        else:
            print(f"[Train] Epoch {epoch+1}/{EPOCHS} - no valid batches")

        # Quick evaluation on the test split, with a progress bar
        model.eval()
        test_losses = []
        with torch.no_grad():
            pbar_t = tqdm(total=len(test_loader), desc=f"Epoch {epoch+1}/{EPOCHS} - test", leave=False)
            for batch in test_loader:
                if batch is None:
                    pbar_t.update(1)
                    continue
                imgs, texts, lengths = batch
                imgs = imgs.to(device)
                texts = texts.to(device)
                lengths = lengths.to(device)
                img_emb, text_emb = model(imgs, texts, lengths)
                test_losses.append(clip_loss(img_emb, text_emb).item())
                pbar_t.update(1)
            pbar_t.close()
        if len(test_losses) > 0:
            print(f"[Test ] Epoch {epoch+1}/{EPOCHS} - avg loss: {sum(test_losses)/len(test_losses):.4f}")
        else:
            print(f"[Test ] Epoch {epoch+1}/{EPOCHS} - no valid batches")

        # --- Save checkpoint at every epoch ---
        ckpt_dir = here
        latest_path = os.path.join(ckpt_dir, "colorclip_image_text.pt")
        epoch_path = os.path.join(ckpt_dir, f"colorclip_image_text_epoch_{epoch+1}.pt")
        state_dict = model.state_dict()
        torch.save(state_dict, latest_path)
        torch.save(state_dict, epoch_path)
        print(f"[Save ] Saved checkpoints: {latest_path} and {epoch_path}")


if __name__ == "__main__":
    main()