syntheticbot
/

clip-face-attribute-classifier

+import pandas as pd
+import torch
+from PIL import Image
+from sklearn.metrics import classification_report, accuracy_score
+from transformers import CLIPImageProcessor
+import os
+from tqdm import tqdm
+# IMPORTANT: This line imports your custom model class from the training script.
+# Ensure 'train_clip.py' is in the same directory.
+from train_clip import MultiTaskClipVisionModel
+# --- 1. Configuration ---
+# Verify this path is correct. It should point to the directory where the
+# 'pytorch_model.bin' and 'preprocessor_config.json' files for your best model are located.
+MODEL_PATH = "./clip-fairface-finetuned/best_model" # Or "./clip-fairface-finetuned/checkpoint-XXXX"
+VAL_CSV = './fairface_label_val.csv'
+BASE_PATH = './'
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {DEVICE}")
+print(f"Loading model from: {MODEL_PATH}")
+# --- 2. Load Label Mappings (must be identical to training) ---
+# We load the TRAIN csv to ensure the label mappings are consistent with what the model was trained on.
+train_df = pd.read_csv('./fairface_label_train.csv')
+age_labels = sorted(train_df['age'].unique())
+gender_labels = sorted(train_df['gender'].unique())
+race_labels = sorted(train_df['race'].unique())
+label_mappings = {
+    'age': {label: i for i, label in enumerate(age_labels)},
+    'gender': {label: i for i, label in enumerate(gender_labels)},
+    'race': {label: i for i, label in enumerate(race_labels)},
+}
+# Create reverse mappings from ID back to human-readable label
+id_mappings = {
+    'age': {i: label for label, i in label_mappings['age'].items()},
+    'gender': {i: label for label, i in label_mappings['gender'].items()},
+    'race': {i: label for label, i in label_mappings['race'].items()},
+}
+NUM_LABELS = {
+    'age': len(age_labels),
+    'gender': len(gender_labels),
+    'race': len(race_labels),
+}
+# --- 3. Load Model and Processor ---
+print("Loading processor and model...")
+processor = CLIPImageProcessor.from_pretrained(MODEL_PATH)
+model = MultiTaskClipVisionModel(num_labels=NUM_LABELS)
+# Load the saved model weights. `map_location` ensures it works even if you trained on GPU and now use CPU.
+model.load_state_dict(torch.load(os.path.join(MODEL_PATH, 'pytorch_model.bin'), map_location=torch.device(DEVICE)))
+model.to(DEVICE)
+model.eval() # Set the model to evaluation mode
+print("Model loaded successfully.")
+# --- 4. Evaluation on Validation Set ---
+def evaluate_on_dataset():
+    print(f"\nEvaluating on validation data from: {VAL_CSV}")
+    val_df = pd.read_csv(VAL_CSV)
+    # Lists to store all predictions and true labels
+    all_preds = {'age': [], 'gender': [], 'race': []}
+    all_true = {'age': [], 'gender': [], 'race': []}
+    # Disable gradient calculations for efficiency
+    with torch.no_grad():
+        # Use tqdm for a nice progress bar
+        for index, row in tqdm(val_df.iterrows(), total=val_df.shape[0], desc="Evaluating"):
+            image_path = os.path.join(BASE_PATH, row['file'])
+            image = Image.open(image_path).convert("RGB")
+            # Process the image and move to the correct device
+            inputs = processor(images=image, return_tensors="pt").to(DEVICE)
+            # Get model predictions
+            outputs = model(pixel_values=inputs['pixel_values'])
+            logits = outputs['logits']
+            # Process predictions for each task
+            for task in ['age', 'gender', 'race']:
+                pred_id = torch.argmax(logits[task], dim=-1).item()
+                true_label = row[task]
+                true_id = label_mappings[task][true_label]
+                all_preds[task].append(pred_id)
+                all_true[task].append(true_id)
+    # --- Print Reports ---
+    print("\n--- Evaluation Results ---")
+    for task in ['age', 'gender', 'race']:
+        task_preds = all_preds[task]
+        task_true = all_true[task]
+        task_labels = list(label_mappings[task].keys())
+        task_target_names = [id_mappings[task][i] for i in range(len(task_labels))]
+        accuracy = accuracy_score(task_true, task_preds)
+        report = classification_report(
+            task_true,
+            task_preds,
+            target_names=task_target_names,
+            zero_division=0
+        )
+        print(f"\n--- {task.upper()} CLASSIFICATION REPORT ---")
+        print(f"Overall Accuracy: {accuracy:.4f}")
+        print(report)
+# --- 5. Function for Single Image Prediction ---
+def predict_single_image(image_path):
+    print(f"\n--- Predicting for single image: {image_path} ---")
+    if not os.path.exists(image_path):
+        print(f"Error: Image path not found at '{image_path}'")
+        return
+    image = Image.open(image_path).convert("RGB")
+    inputs = processor(images=image, return_tensors="pt").to(DEVICE)
+    with torch.no_grad():
+        outputs = model(pixel_values=inputs['pixel_values'])
+        logits = outputs['logits']
+    predictions = {}
+    for task in ['age', 'gender', 'race']:
+        pred_id = torch.argmax(logits[task], dim=-1).item()
+        pred_label = id_mappings[task][pred_id]
+        predictions[task] = pred_label
+    print("Predictions:")
+    for task, label in predictions.items():
+        print(f"  - {task.capitalize()}: {label}")
+    return predictions
+if __name__ == "__main__":
+    # Run the full evaluation on the validation dataset
+    evaluate_on_dataset()
+    # --- Example of single image prediction ---
+    # IMPORTANT: Change this path to an image you want to test
+    sample_image_path = 'val/1.jpg'
+    predict_single_image(sample_image_path)

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e811883e6f247acc61a869a938b9523d1eb1d34fa3c1e882b3f033a49b8cb72d
+size 1212846240

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "crop_size": {
+    "height": 224,
+    "width": 224
+  },
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "CLIPImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 224
+  }
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,35 @@

+# This file lists the required packages for the clip-face-attribute-classifier project.
+# Install them using: pip install -r requirements.txt
+# --- Hugging Face Libraries ---
+# Core library for models, Trainer, TrainingArguments, and processors
+transformers==4.38.2
+# Used for data handling and creating Dataset objects
+datasets==2.18.0
+# For efficient training and hardware acceleration with the Trainer
+accelerate==0.27.2
+# For interacting with the Hugging Face Hub (login, upload, etc.)
+huggingface_hub==0.21.4
+# --- Core Deep Learning Framework ---
+# The fundamental deep learning library
+torch==2.2.1
+# Companion library for computer vision tasks in PyTorch
+torchvision==0.17.1
+# --- Data Handling and Metrics ---
+# For reading and manipulating the .csv label files
+pandas==2.2.1
+# For calculating evaluation metrics like accuracy, precision, recall, and F1-score
+scikit-learn==1.4.1.post1
+# --- Utilities ---
+# For opening and handling image files
+Pillow==10.2.0
+# For creating progress bars during evaluation
+tqdm==4.66.2
+# For loading the safer .safetensors model format
+safetensors==0.4.2

train_clip.py ADDED Viewed

	@@ -0,0 +1,232 @@

+import pandas as pd
+import torch
+import torch.nn as nn
+from PIL import Image
+from sklearn.metrics import accuracy_score
+from transformers import (
+    Trainer,
+    TrainingArguments,
+    CLIPVisionModel,
+    CLIPImageProcessor,
+)
+from torch.utils.data import Dataset
+import os
+os.environ["WANDB_DISABLED"] = "true"
+# --- 1. Configuration ---
+# Define paths and model name
+BASE_PATH = './'  # Assumes the script is run from the 'fairface' directory
+TRAIN_CSV = os.path.join(BASE_PATH, 'fairface_label_train.csv')
+VAL_CSV = os.path.join(BASE_PATH, 'fairface_label_val.csv')
+MODEL_NAME = "openai/clip-vit-large-patch14"
+OUTPUT_DIR = "./clip-fairface-finetuned"
+# --- 2. Load and Prepare Label Mappings ---
+# Load training data to create consistent label-to-ID mappings
+train_df = pd.read_csv(TRAIN_CSV)
+# Create sorted unique label lists to ensure consistent mapping
+age_labels = sorted(train_df['age'].unique())
+gender_labels = sorted(train_df['gender'].unique())
+race_labels = sorted(train_df['race'].unique())
+# Create label-to-ID mappings for each task
+label_mappings = {
+    'age': {label: i for i, label in enumerate(age_labels)},
+    'gender': {label: i for i, label in enumerate(gender_labels)},
+    'race': {label: i for i, label in enumerate(race_labels)},
+}
+NUM_LABELS = {
+    'age': len(age_labels),
+    'gender': len(gender_labels),
+    'race': len(race_labels),
+}
+print(f"Number of labels: Age={NUM_LABELS['age']}, Gender={NUM_LABELS['gender']}, Race={NUM_LABELS['race']}")
+# --- 3. Custom Dataset ---
+class FairFaceDataset(Dataset):
+    def __init__(self, csv_file, image_processor, label_maps, base_path):
+        self.df = pd.read_csv(csv_file)
+        self.image_processor = image_processor
+        self.label_maps = label_maps
+        self.base_path = base_path
+    def __len__(self):
+        return len(self.df)
+    def __getitem__(self, idx):
+        row = self.df.iloc[idx]
+        # Construct the full path to the image
+        image_path = os.path.join(self.base_path, row['file'])
+        image = Image.open(image_path).convert("RGB")
+        # Process the image
+        inputs = {}
+        inputs['pixel_values'] = self.image_processor(images=image, return_tensors="pt").pixel_values.squeeze(0)
+        # Process labels into a dictionary of tensors
+        inputs['labels'] = {
+            'age': torch.tensor(self.label_maps['age'][row['age']], dtype=torch.long),
+            'gender': torch.tensor(self.label_maps['gender'][row['gender']], dtype=torch.long),
+            'race': torch.tensor(self.label_maps['race'][row['race']], dtype=torch.long),
+        }
+        return inputs
+# --- 4. Custom Model Definition ---
+# --- 4. Custom Model Definition (Corrected for Gradient Checkpointing) ---
+class MultiTaskClipVisionModel(nn.Module):
+    # Add this class attribute to signal to the Trainer that we support this
+    supports_gradient_checkpointing = True
+    def __init__(self, num_labels):
+        super(MultiTaskClipVisionModel, self).__init__()
+        self.vision_model = CLIPVisionModel.from_pretrained(MODEL_NAME)
+        # Freeze all parameters of the vision model first
+        for param in self.vision_model.parameters():
+            param.requires_grad = False
+        # Unfreeze the last few layers for fine-tuning.
+        for layer in self.vision_model.vision_model.encoder.layers[-3:]: # Unfreeze last 3 transformer layers
+             for param in layer.parameters():
+                 param.requires_grad = True
+        # Define classification heads for each task
+        hidden_size = self.vision_model.config.hidden_size
+        self.age_head = nn.Linear(hidden_size, num_labels['age'])
+        self.gender_head = nn.Linear(hidden_size, num_labels['gender'])
+        self.race_head = nn.Linear(hidden_size, num_labels['race'])
+    # ADD THIS METHOD: This will be called by the Trainer
+    def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
+        """Activates gradient checkpointing for the underlying vision model."""
+        self.vision_model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs)
+    def forward(self, pixel_values, labels=None):
+        # The forward pass now works seamlessly with gradient checkpointing enabled
+        outputs = self.vision_model(pixel_values=pixel_values)
+        pooled_output = outputs.pooler_output
+        age_logits = self.age_head(pooled_output)
+        gender_logits = self.gender_head(pooled_output)
+        race_logits = self.race_head(pooled_output)
+        loss = None
+        # If labels are provided, calculate the combined loss
+        if labels is not None:
+            loss_fct = nn.CrossEntropyLoss()
+            age_loss = loss_fct(age_logits, labels['age'])
+            gender_loss = loss_fct(gender_logits, labels['gender'])
+            race_loss = loss_fct(race_logits, labels['race'])
+            # Total loss is the sum of individual task losses
+            loss = age_loss + gender_loss + race_loss
+        return {
+            'loss': loss,
+            'logits': {
+                'age': age_logits,
+                'gender': gender_logits,
+                'race': race_logits,
+            },
+        }
+# --- 5. Data Collator and Metrics ---
+def collate_fn(batch):
+    # Stacks pixel values and organizes labels into a dictionary of tensors
+    pixel_values = torch.stack([item['pixel_values'] for item in batch])
+    labels = {
+        'age': torch.tensor([item['labels']['age'] for item in batch], dtype=torch.long),
+        'gender': torch.tensor([item['labels']['gender'] for item in batch], dtype=torch.long),
+        'race': torch.tensor([item['labels']['race'] for item in batch], dtype=torch.long),
+    }
+    return {'pixel_values': pixel_values, 'labels': labels}
+def compute_metrics(p):
+    # p is an EvalPrediction object containing predictions and label_ids
+    logits = p.predictions
+    labels = p.label_ids
+    # Extract predictions and labels for each task
+    age_preds = logits['age'].argmax(-1)
+    gender_preds = logits['gender'].argmax(-1)
+    race_preds = logits['race'].argmax(-1)
+    age_labels = labels['age']
+    gender_labels = labels['gender']
+    race_labels = labels['race']
+    # Calculate accuracy for each task
+    return {
+        'age_accuracy': accuracy_score(age_labels, age_preds),
+        'gender_accuracy': accuracy_score(gender_labels, gender_preds),
+        'race_accuracy': accuracy_score(race_labels, race_preds),
+    }
+# --- 6. Trainer Setup and Execution ---
+def main():
+    # Initialize the image processor and our custom model
+    image_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
+    model = MultiTaskClipVisionModel(num_labels=NUM_LABELS)
+    # Initialize the training and validation datasets
+    train_dataset = FairFaceDataset(
+        csv_file=TRAIN_CSV, image_processor=image_processor, label_maps=label_mappings, base_path=BASE_PATH
+    )
+    val_dataset = FairFaceDataset(
+        csv_file=VAL_CSV, image_processor=image_processor, label_maps=label_mappings, base_path=BASE_PATH
+    )
+    # Define the training arguments
+    # In your main() function, replace the old TrainingArguments with this one
+    # Define the training arguments
+    training_args = TrainingArguments(
+        output_dir=OUTPUT_DIR,
+        num_train_epochs=5,
+        # Set a batch size that fits in memory
+        per_device_train_batch_size=24,
+        per_device_eval_batch_size=32,  # Evaluation does not need accumulation and can use a larger batch size
+        # Set accumulation steps to reach the desired effective batch size (24 * 22 = 528)
+        gradient_accumulation_steps=22,
+        # Enable gradient checkpointing to save more memory
+        gradient_checkpointing=True,
+        warmup_steps=500,
+        weight_decay=0.01,
+        logging_dir='./logs',
+        logging_steps=10,  # Log more frequently to see progress within a large effective batch
+        evaluation_strategy="steps",
+        eval_steps=250, # You might want to evaluate less frequently with larger batches
+        save_strategy="steps",
+        save_steps=250,
+        load_best_model_at_end=True,
+        metric_for_best_model='gender_accuracy',
+        save_total_limit=3,
+        fp16=True,  # Mixed-precision training is essential for large models
+        remove_unused_columns=False,
+        report_to="none", # Disables wandb logging
+    )
+    # Initialize the Trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=val_dataset,
+        data_collator=collate_fn,
+        compute_metrics=compute_metrics,
+    )
+    # Start training
+    print("Starting model training...")
+    trainer.train()
+    # Save the final model and processor
+    print("Saving the best model...")
+    trainer.save_model(os.path.join(OUTPUT_DIR, "best_model"))
+    image_processor.save_pretrained(os.path.join(OUTPUT_DIR, "best_model"))
+    print("Training complete!")
+if __name__ == "__main__":
+    main()