"""
Model architectures for Indonesian herbal plant classification.

7 Latest Models (2025):

1. YOLOv11 Classification
2. EfficientNetV2-S
3. ConvNeXt V2
4. Vision Transformer (ViT)
5. Hybrid CNN + ViT (CoAtNet-style)
6. InternImage-style (ConvNeXt backbone + global context gating)
7. ConvFormer (CNN + self-attention hybrid)
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import timm

import config  # local project config; expected to define MODEL_NAMES (used in __main__)


def get_model(model_name: str, num_classes: int, pretrained: bool = True) -> nn.Module:
    """Factory function to create models by name."""
    model_name = model_name.lower()

    if model_name == "yolov11":
        return YOLOv11Classifier(num_classes, pretrained)
    elif model_name == "efficientnetv2":
        return EfficientNetV2Classifier(num_classes, pretrained)
    elif model_name == "convnextv2":
        return ConvNeXtV2Classifier(num_classes, pretrained)
    elif model_name == "vit":
        return ViTClassifier(num_classes, pretrained)
    elif model_name == "hybrid_cnn_vit":
        return HybridCNNViT(num_classes, pretrained)
    elif model_name == "internimage":
        return InternImageClassifier(num_classes, pretrained)
    elif model_name == "convformer":
        return ConvFormerClassifier(num_classes, pretrained)
    else:
        raise ValueError(f"Unknown model: {model_name}")
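

# Example usage of the factory (illustrative; the 31-class count and 224x224
# input size follow the smoke test in __main__ below):
#
#     model = get_model("convnextv2", num_classes=31, pretrained=True)
#     logits = model(torch.randn(1, 3, 224, 224))   # -> shape (1, 31)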


class YOLOv11Classifier(nn.Module):
    """YOLOv11-style image classifier.

    Note: this implementation uses a timm EfficientNetV2-S backbone with a
    lightweight classification head rather than the Ultralytics YOLO weights.
    """

    def __init__(self, num_classes: int, pretrained: bool = True):
        super().__init__()
        self.model_name = "YOLOv11-cls"

        # Feature extractor (classifier head removed with num_classes=0)
        self.backbone = timm.create_model(
            'tf_efficientnetv2_s',
            pretrained=pretrained,
            num_classes=0
        )

        self.feature_dim = self.backbone.num_features

        # Pooling + dropout + linear classification head
        self.head = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Dropout(0.2),
            nn.Linear(self.feature_dim, num_classes)
        )

    def forward(self, x):
        features = self.backbone.forward_features(x)  # (B, C, H, W) feature map
        return self.head(features)
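

# Despite the class name, YOLOv11Classifier above is backed by EfficientNetV2-S.
# A minimal sketch of using the actual Ultralytics YOLO11 classification model
# instead (assumes the `ultralytics` package and its published
# `yolo11n-cls.pt` checkpoint):
#
#     from ultralytics import YOLO
#     yolo_cls = YOLO("yolo11n-cls.pt")   # pretrained classification checkpoint
#     results = yolo_cls("leaf.jpg")      # classification Results (see results[0].probs)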


class EfficientNetV2Classifier(nn.Module):
    """EfficientNetV2-S Classifier"""

    def __init__(self, num_classes: int, pretrained: bool = True):
        super().__init__()
        self.model_name = "EfficientNetV2-S"

        self.model = timm.create_model(
            'tf_efficientnetv2_s',
            pretrained=pretrained,
            num_classes=num_classes,
            drop_rate=0.3,
            drop_path_rate=0.2
        )

    def forward(self, x):
        return self.model(x)


class ConvNeXtV2Classifier(nn.Module):
    """ConvNeXt V2 Classifier - modern pure-CNN architecture"""

    def __init__(self, num_classes: int, pretrained: bool = True):
        super().__init__()
        self.model_name = "ConvNeXtV2-Tiny"

        self.model = timm.create_model(
            'convnextv2_tiny',
            pretrained=pretrained,
            num_classes=num_classes,
            drop_path_rate=0.1
        )

    def forward(self, x):
        return self.model(x)


class ViTClassifier(nn.Module):
    """Vision Transformer (ViT) Classifier"""

    def __init__(self, num_classes: int, pretrained: bool = True):
        super().__init__()
        self.model_name = "ViT-Base-16"

        self.model = timm.create_model(
            'vit_base_patch16_224',
            pretrained=pretrained,
            num_classes=num_classes,
            drop_rate=0.1,
            attn_drop_rate=0.1
        )

    def forward(self, x):
        return self.model(x)


class HybridCNNViT(nn.Module):
    """
    Hybrid CNN + Vision Transformer (CoAtNet-style architecture).
    Combines the local feature extraction of a CNN with the global attention of a ViT.
    """

    def __init__(self, num_classes: int, pretrained: bool = True):
        super().__init__()
        self.model_name = "Hybrid-CNN-ViT"

        # CNN stem: EfficientNet-B0 feature maps from stages 2 and 3
        self.cnn_backbone = timm.create_model(
            'efficientnet_b0',
            pretrained=pretrained,
            features_only=True,
            out_indices=[2, 3]
        )

        # Channel counts of the selected EfficientNet-B0 stages
        self.cnn_channels = [40, 112]

        # 1x1 projection from CNN channels to the transformer embedding dim
        self.proj = nn.Conv2d(self.cnn_channels[1], 768, kernel_size=1)

        # Lightweight transformer encoder over the projected CNN tokens
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=768,
                nhead=12,
                dim_feedforward=3072,
                dropout=0.1,
                activation='gelu',
                batch_first=True
            ),
            num_layers=4
        )

        # Learnable [CLS] token and positional embeddings (197 = 14*14 patches + CLS)
        self.cls_token = nn.Parameter(torch.randn(1, 1, 768))
        self.pos_embed = nn.Parameter(torch.randn(1, 197, 768))

        self.norm = nn.LayerNorm(768)
        self.head = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(768, num_classes)
        )

    def forward(self, x):
        batch_size = x.shape[0]

        # Local features from the CNN stem; use the deepest selected stage
        features = self.cnn_backbone(x)
        x = features[-1]

        # Project to the transformer embedding dimension
        x = self.proj(x)

        # Flatten spatial positions into a token sequence: (B, C, H, W) -> (B, H*W, C)
        B, C, H, W = x.shape
        x = x.flatten(2).transpose(1, 2)

        # Prepend the [CLS] token
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat([cls_tokens, x], dim=1)

        # Interpolate positional embeddings if the sequence length differs
        # (e.g. for non-224 input sizes); linear interpolation needs a 3D tensor
        if x.shape[1] != self.pos_embed.shape[1]:
            pos_embed = F.interpolate(
                self.pos_embed.transpose(1, 2),  # (1, 768, 197)
                size=x.shape[1],
                mode='linear'
            ).transpose(1, 2)
        else:
            pos_embed = self.pos_embed

        x = x + pos_embed[:, :x.shape[1], :]

        # Global attention over the token sequence
        x = self.transformer(x)

        # Classify from the [CLS] token
        x = self.norm(x[:, 0])
        x = self.head(x)

        return x
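

# Shape walk-through for HybridCNNViT (illustrative, assumes a 224x224 input):
#   EfficientNet-B0 stage-3 feature map : (B, 112, 14, 14)
#   1x1 projection                      : (B, 768, 14, 14)
#   flatten + transpose                 : (B, 196, 768)
#   prepend [CLS] token                 : (B, 197, 768)  -> matches the 197-token pos_embed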


class InternImageClassifier(nn.Module):
    """
    InternImage-style Classifier.
    Paper: "InternImage: Exploring Large-Scale Vision Foundation Models with
    Deformable Convolutions" (https://arxiv.org/abs/2211.05778)
    Approximation: uses a timm ConvNeXt backbone plus a global-context gate
    instead of true deformable convolutions.
    """

    def __init__(self, num_classes: int, pretrained: bool = True):
        super().__init__()
        self.model_name = "InternImage-Tiny"

        # ConvNeXt backbone as the feature extractor (classifier head removed)
        self.backbone = timm.create_model(
            'convnext_tiny',
            pretrained=pretrained,
            num_classes=0,
            drop_path_rate=0.1
        )

        self.feature_dim = self.backbone.num_features

        # Squeeze-and-excitation style global context gate over channels
        self.global_context = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(self.feature_dim, self.feature_dim // 4, 1),
            nn.GELU(),
            nn.Conv2d(self.feature_dim // 4, self.feature_dim, 1),
            nn.Sigmoid()
        )

        self.head = nn.Sequential(
            nn.LayerNorm(self.feature_dim),
            nn.Dropout(0.2),
            nn.Linear(self.feature_dim, num_classes)
        )

    def forward(self, x):
        # Backbone feature map: (B, C, H, W)
        features = self.backbone.forward_features(x)

        # Re-weight channels with the global context gate
        context = self.global_context(features)
        features = features * context

        # Global average pool over the spatial dimensions
        x = features.mean(dim=[-2, -1])

        return self.head(x)
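

# The global_context block in InternImageClassifier is a squeeze-and-excitation
# style gate; per channel it computes
#   g  = sigmoid(W2(GELU(W1(GlobalAvgPool(F)))))   with g in (0, 1)
#   F' = F * g                                     (broadcast over H and W)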


class ConvFormerClassifier(nn.Module):
    """
    ConvFormer Classifier - Efficient CNN + Self-Attention Hybrid.
    Paper: "MetaFormer Baselines for Vision" (https://arxiv.org/abs/2210.13452)
    Uses a timm CAFormer backbone when available, with a ConvNeXt fallback,
    followed by one self-attention refinement block.
    """

    def __init__(self, num_classes: int, pretrained: bool = True):
        super().__init__()
        self.model_name = "ConvFormer-S"

        try:
            # Preferred backbone: CAFormer-S18 from the MetaFormer family
            self.backbone = timm.create_model(
                'caformer_s18',
                pretrained=pretrained,
                num_classes=0,
                drop_path_rate=0.1
            )
        except Exception:
            # Fallback if the CAFormer architecture/weights are unavailable
            print("Using ConvNeXt with attention as ConvFormer alternative")
            self.backbone = timm.create_model(
                'convnext_small',
                pretrained=pretrained,
                num_classes=0,
                drop_path_rate=0.1
            )

        self.feature_dim = self.backbone.num_features

        # Single self-attention refinement block over the backbone tokens
        self.attention = nn.MultiheadAttention(
            embed_dim=self.feature_dim,
            num_heads=8,
            dropout=0.1,
            batch_first=True
        )

        self.norm1 = nn.LayerNorm(self.feature_dim)
        self.norm2 = nn.LayerNorm(self.feature_dim)

        # Feed-forward network (transformer MLP block)
        self.ffn = nn.Sequential(
            nn.Linear(self.feature_dim, self.feature_dim * 4),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(self.feature_dim * 4, self.feature_dim),
            nn.Dropout(0.1)
        )

        self.head = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.feature_dim, num_classes)
        )

    def forward(self, x):
        # Backbone feature map: (B, C, H, W)
        features = self.backbone.forward_features(x)

        # Flatten spatial positions into tokens: (B, H*W, C)
        x = features.flatten(2).transpose(1, 2)

        # Pre-norm self-attention with a residual connection
        x_norm = self.norm1(x)
        attn_out, _ = self.attention(x_norm, x_norm, x_norm)
        x = x + attn_out

        # Pre-norm feed-forward with a residual connection
        x = x + self.ffn(self.norm2(x))

        # Mean pool over tokens
        x = x.mean(dim=1)

        return self.head(x)
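

# The refinement in ConvFormerClassifier.forward follows the standard pre-norm
# transformer block layout applied to the flattened backbone tokens:
#   x = x + MHSA(LN(x))
#   x = x + FFN(LN(x))
# followed by mean pooling over the tokens before the classification head.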


def print_model_summary():
    """Print a summary of all available models."""
    print("\n" + "=" * 60)
    print("7 LATEST MODELS FOR CLASSIFICATION (2025)")
    print("=" * 60)

    models_info = [
        ("YOLOv11-cls", "YOLOv11 Classification - Fast and efficient"),
        ("EfficientNetV2-S", "EfficientNetV2 - Optimized CNN architecture"),
        ("ConvNeXtV2-Tiny", "ConvNeXt V2 - Pure CNN with modern design"),
        ("ViT-Base-16", "Vision Transformer - Pure attention-based"),
        ("Hybrid-CNN-ViT", "CNN + Transformer hybrid (CoAtNet-style)"),
        ("InternImage-Tiny", "ConvNeXt backbone + global context gating"),
        ("ConvFormer-S", "Efficient CNN + Self-Attention hybrid")
    ]

    for i, (name, desc) in enumerate(models_info, 1):
        print(f"{i}. {name}")
        print(f"   {desc}\n")


if __name__ == "__main__":
    print_model_summary()

    # Smoke test: build each configured model and run a dummy batch through it
    num_classes = 31
    batch = torch.randn(2, 3, 224, 224)

    for model_name in config.MODEL_NAMES:
        print(f"\nTesting {model_name}...")
        model = get_model(model_name, num_classes, pretrained=False)
        model.eval()
        with torch.no_grad():
            output = model(batch)
        print(f"  Input: {batch.shape}")
        print(f"  Output: {output.shape}")
        params = sum(p.numel() for p in model.parameters())
        print(f"  Parameters: {params:,}")