"""
LoRA (Low-Rank Adaptation) implementation for convolutional layers.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models


class LoRALayer(nn.Module):
    """
    LoRA (Low-Rank Adaptation) wrapper for convolutional layers.

    Args:
        original_layer: The Conv2d layer to adapt
        rank: LoRA rank (default=8)
            - Lower rank (4): Fewer parameters, less overfitting risk, less capacity
            - Medium rank (8-16): Balanced trade-off (recommended for most tasks)
            - Higher rank (32+): More capacity but approaches full fine-tuning

    For small datasets (<1000 images), rank=8 provides sufficient adaptation
    capacity while keeping parameters low (~2% of the original layer).
    """
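
    # Illustrative parameter arithmetic (my example, assuming the first 3x3 conv of
    # ResNet34's layer4: 256 -> 512 channels, rank=8):
    #   frozen conv weight: 512 * 256 * 3 * 3 = 1,179,648 parameters
    #   lora_A:             8 * 256 * 3 * 3   = 18,432
    #   lora_B:             512 * 8 * 1 * 1   = 4,096
    #   trainable total:    22,528  (~1.9% of the frozen layer)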

    def __init__(self, original_layer, rank=8):
        super().__init__()
        self.original_layer = original_layer
        self.rank = rank

        out_channels = original_layer.out_channels
        in_channels = original_layer.in_channels
        kernel_size = original_layer.kernel_size

        # Low-rank factor A: maps in_channels -> rank channels with the original
        # kernel size. Small random init keeps the initial update near zero.
        self.lora_A = nn.Parameter(
            torch.randn(rank, in_channels, *kernel_size) * 0.01
        )

        # Low-rank factor B: a 1x1 conv mapping rank -> out_channels. Zero init
        # means the adapter contributes nothing before training starts.
        self.lora_B = nn.Parameter(
            torch.zeros(out_channels, rank, 1, 1)
        )

        # Freeze the wrapped layer; only lora_A and lora_B remain trainable.
        self.original_layer.weight.requires_grad = False
        if self.original_layer.bias is not None:
            self.original_layer.bias.requires_grad = False

    def forward(self, x):
        """
        Forward pass combining the original frozen weights with the LoRA adaptation.

        Mathematical formulation:
            output = W_frozen * x + (B * (A * x))

        where * denotes the convolution operation.
        """
        original_output = self.original_layer(x)

        # First LoRA convolution: project the input into the rank-dimensional
        # space, reusing the wrapped layer's stride and padding so the spatial
        # dimensions stay aligned with the original output.
        lora_output = F.conv2d(
            x,
            self.lora_A,
            stride=self.original_layer.stride,
            padding=self.original_layer.padding
        )

        # Second LoRA convolution: a 1x1 conv mapping rank channels back to
        # out_channels, completing the low-rank update B(A(x)).
        lora_output = F.conv2d(lora_output, self.lora_B)

        return original_output + lora_output
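

# Standalone usage sketch (illustrative shapes of my choosing, not part of the
# pipeline below). Wrapping a Conv2d preserves the output shape, because lora_B
# projects the rank-channel activation back to out_channels with a 1x1 conv:
#
#     conv = nn.Conv2d(64, 128, kernel_size=3, padding=1)
#     adapted = LoRALayer(conv, rank=8)
#     x = torch.randn(2, 64, 32, 32)
#     adapted(x).shape == conv(x).shape  # True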


def get_model(num_classes=2, pretrained=True):
    """
    Load ResNet34 with optional pretrained weights.

    Args:
        num_classes: Number of output classes
        pretrained: Whether to load ImageNet pretrained weights

    Returns:
        ResNet34 model
    """
    if pretrained:
        model = models.resnet34(weights=models.ResNet34_Weights.IMAGENET1K_V1)
    else:
        model = models.resnet34(weights=None)

    # Replace the classifier head with a fresh layer sized for the target classes.
    num_features = model.fc.in_features
    model.fc = nn.Linear(num_features, num_classes)

    return model


def apply_lora_to_model(model, target_layers=['layer3', 'layer4'], rank=8):
    """
    Apply LoRA adapters to specific layers in ResNet34.

    Strategy: we target layer3 and layer4 (the high-level feature extractors) because:
    - layer1 & layer2: extract low-level features (edges, textures) that are
      universal across tasks -> keep frozen, no adaptation needed
    - layer3 & layer4: extract high-level semantic features (objects, contexts)
      that are task-specific -> need slight adaptation for smoking detection
    - fc: brand-new classifier head -> fully trainable

    This approach gives us the sweet spot:
    - Full fine-tuning: 21.8M params (overfitting risk with small datasets)
    - Only fc training: ~1K params (may underfit, features not adapted)
    - LoRA on layer3+layer4: ~465K params (2.14% of the model, balanced approach)

    See the usage sketch at the bottom of this file for how the pieces fit together.

    Args:
        model: ResNet34 model
        target_layers: List of layer names to apply LoRA to
        rank: LoRA rank (default=8, adds ~2% params per adapted layer)

    Returns:
        Number of convolutional layers where LoRA was applied
    """
    # Freeze every parameter first; training is selectively re-enabled below.
    for param in model.parameters():
        param.requires_grad = False

    # The classifier head is brand new, so it is trained in full.
    for param in model.fc.parameters():
        param.requires_grad = True

    lora_count = 0

    for layer_name in target_layers:
        layer = getattr(model, layer_name)

        for block in layer:
            for name, module in block.named_modules():
                if isinstance(module, nn.Conv2d):
                    # Walk down to the conv's direct parent (the block itself or,
                    # e.g., a downsample Sequential) so the attribute can be swapped.
                    parent = block
                    attr_names = name.split('.')
                    for attr in attr_names[:-1]:
                        parent = getattr(parent, attr)

                    # Wrap the conv in a LoRALayer unless it is already wrapped.
                    current_module = getattr(parent, attr_names[-1])
                    if not isinstance(current_module, LoRALayer):
                        setattr(parent, attr_names[-1], LoRALayer(current_module, rank=rank))
                        lora_count += 1

    return lora_count


def count_parameters(model):
    """
    Count total and trainable parameters in the model.

    Returns:
        tuple: (total_params, trainable_params, trainable_percentage)
    """
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    trainable_pct = 100. * trainable_params / total_params

    return total_params, trainable_params, trainable_pct
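

# A minimal end-to-end sketch (illustration only, not part of the training code):
# build the model, attach LoRA adapters, report the trainable budget, and run a
# dummy forward pass as a shape check. pretrained=False avoids a weight download.
if __name__ == "__main__":
    model = get_model(num_classes=2, pretrained=False)
    n_adapted = apply_lora_to_model(model, target_layers=['layer3', 'layer4'], rank=8)
    total, trainable, pct = count_parameters(model)

    print(f"LoRA applied to {n_adapted} conv layers")
    print(f"Trainable params: {trainable:,} / {total:,} ({pct:.2f}%)")

    # Random batch of two 224x224 RGB images; expected output shape is (2, 2).
    dummy = torch.randn(2, 3, 224, 224)
    print(f"Output shape: {tuple(model(dummy).shape)}")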