Upload train_parseq.py

deda9ff verified 22 days ago

16 kB

	"""
	Training script for CLPRNet with PARSeq Tiny backbone.

	Changes from original train.py:
	- Uses CLPRNetPARSeq model instead of CLPRNet
	- Recognition loss uses PARSeq's cross-entropy on sequence output
	- No character attention maps (at_ch removed)
	- Ground-truth boxes are passed to the model for plate cropping during training
	- PARSeq uses teacher forcing during training
	"""

	from common import BaseExperiment
	from model_parseq import CLPRNetPARSeq as Model, Tokenizer
	from dataset import MyDataset, CHARACTER
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torch.optim import lr_scheduler
	import os
	import matplotlib.pyplot as plt
	import numpy as np
	import random
	from utils import provinces1, provinces2


	def set_seed(seed):
	    """Seed every random number generator this script relies on.

	    Applies ``seed`` to Python's ``random`` module, NumPy's global generator
	    and PyTorch (default generator, current CUDA device, all CUDA devices).
	    """
	    seeders = (
	        random.seed,
	        np.random.seed,
	        torch.manual_seed,
	        torch.cuda.manual_seed,
	        torch.cuda.manual_seed_all,
	    )
	    for apply_seed in seeders:
	        apply_seed(seed)


	def IoU_multi(pred_boxes, target_boxes, eps=1e-6):
	    """Row-wise IoU between two sets of (x1, y1, x2, y2) boxes.

	    The corner coordinates live on the last axis of both inputs and boxes
	    are compared one-to-one. Each coordinate is split off with size 1, so
	    the result keeps a trailing axis of length 1. ``eps`` is added to the
	    union so an empty union does not divide by zero.
	    """
	    px1, py1, px2, py2 = pred_boxes.split(1, dim=-1)
	    tx1, ty1, tx2, ty2 = target_boxes.split(1, dim=-1)

	    # Extents of the overlap rectangle; a negative extent means the boxes
	    # do not overlap along that axis and is clamped to zero.
	    overlap_w = torch.clamp(torch.min(px2, tx2) - torch.max(px1, tx1), min=0)
	    overlap_h = torch.clamp(torch.min(py2, ty2) - torch.max(py1, ty1), min=0)
	    intersection = overlap_w * overlap_h

	    area_pred = (px2 - px1) * (py2 - py1)
	    area_target = (tx2 - tx1) * (ty2 - ty1)
	    union = area_pred + area_target - intersection
	    return intersection / (union + eps)


	def IOU(box, other_boxes):
	    """IoU of one (x1, y1, x2, y2) box against every row of ``other_boxes``.

	    ``box`` is indexed as a 4-element vector and ``other_boxes`` as a
	    two-dimensional tensor whose columns 0..3 are the corners. One IoU value
	    is returned per row; 1e-6 is added to the union to avoid dividing by 0.
	    """
	    zero = torch.zeros(1, device=box.device)

	    # Corners of the overlap rectangle between `box` and each other box.
	    left = torch.max(box[0], other_boxes[:, 0])
	    top = torch.max(box[1], other_boxes[:, 1])
	    right = torch.min(box[2], other_boxes[:, 2])
	    bottom = torch.min(box[3], other_boxes[:, 3])

	    # Negative extents (no overlap) are floored at zero.
	    overlap_area = torch.max(zero, right - left) * torch.max(zero, bottom - top)

	    box_area = (box[2] - box[0]) * (box[3] - box[1])
	    other_areas = (other_boxes[:, 2] - other_boxes[:, 0]) * (other_boxes[:, 3] - other_boxes[:, 1])
	    return overlap_area / (box_area + other_areas - overlap_area + 1e-6)


	def NMS(boxes, C=0.5):
	    """Greedy non-maximum suppression.

	    Each row of ``boxes`` holds a score in column 0 and the box corners in
	    columns 1..4. Rows are visited in descending score order; a row is kept
	    and every remaining row whose IoU with it is not below ``C`` is dropped.

	    Returns an empty list when ``boxes`` is empty, otherwise the kept rows
	    stacked into a tensor.
	    """
	    if len(boxes) == 0:
	        return []

	    remaining = boxes[boxes[:, 0].argsort(descending=True)]
	    kept = []
	    while len(remaining) > 0:
	        best, rest = remaining[0], remaining[1:]
	        kept.append(best)
	        if len(rest) == 0:
	            break
	        # Only candidates that overlap the kept box by less than C survive.
	        survivors = IOU(best[1:5], rest[:, 1:5]) < C
	        remaining = rest[survivors]
	    return torch.stack(kept)


	class BCEFocalLoss(torch.nn.Module):
	    """Focal-style binary cross-entropy.

	    ``torch.log`` is applied directly to ``predict``, so it is used as a
	    probability rather than a logit. The positive term is weighted by
	    ``1 - alpha`` and the negative term by ``alpha``; both are modulated by
	    a power of ``gamma``. ``reduction`` may be ``'mean'``, ``'sum'`` or
	    anything else, in which case the element-wise loss is returned.
	    """

	    def __init__(self, gamma=2, alpha=0.25, reduction='mean'):
	        super().__init__()
	        self.gamma = gamma
	        self.alpha = alpha
	        self.reduction = reduction

	    def forward(self, predict, target):
	        """Return the focal loss between ``predict`` and ``target``."""
	        pt = predict
	        # 1e-5 keeps both logarithms finite at pt == 0 and pt == 1.
	        positive_term = (1 - self.alpha) * ((1 - pt + 1e-5) ** self.gamma) * (target * torch.log(pt + 1e-5))
	        negative_term = self.alpha * ((pt + 1e-5) ** self.gamma) * ((1 - target) * torch.log(1 - pt + 1e-5))
	        loss = -(positive_term + negative_term)

	        if self.reduction == 'mean':
	            return torch.mean(loss)
	        if self.reduction == 'sum':
	            return torch.sum(loss)
	        return loss


	class BCEWithWeightLoss(torch.nn.Module):
	    """Binary cross-entropy on probabilities with per-class weights.

	    Args:
	        weight: Two-element sequence. ``weight[0]`` scales the negative
	            (``1 - target``) term and ``weight[1]`` the positive
	            (``target``) term.
	        reduction: ``'mean'`` or ``'sum'``; any other value returns the
	            element-wise loss unchanged.
	    """

	    # The default is an immutable tuple: a list default would be one shared
	    # object stored on every instance, so mutating it through one instance
	    # would silently change all the others.
	    def __init__(self, weight=(1, 1), reduction='mean'):
	        super().__init__()
	        self.weight = weight
	        self.reduction = reduction

	    def forward(self, inputs, target):
	        """Return the weighted BCE between ``inputs`` and ``target``."""
	        negative_weight, positive_weight = self.weight[0], self.weight[1]
	        # 1e-7 keeps both logarithms finite at inputs == 0 and inputs == 1.
	        loss = -(positive_weight * target * torch.log(inputs + 1e-7) +
	                 negative_weight * (1 - target) * torch.log(1 - inputs + 1e-7))
	        if self.reduction == 'mean':
	            loss = torch.mean(loss)
	        elif self.reduction == 'sum':
	            loss = torch.sum(loss)
	        return loss


	class Experiment(BaseExperiment):

	def __init__(self, **parameter) -> None:
	    """Store the optimiser hyper-parameters and build the coordinate masks.

	    All keyword arguments are forwarded to the base class. The keys
	    ``LR_STEP_SIZE``, ``LR_STEP_GAMMA``, ``MOMENTUM``, ``WEIGHT_DECAY`` and
	    ``BETAS`` are required and copied onto the instance under the same name.
	    """
	    super().__init__(**parameter)
	    for key in ('LR_STEP_SIZE', 'LR_STEP_GAMMA', 'MOMENTUM', 'WEIGHT_DECAY', 'BETAS'):
	        setattr(self, key, parameter[key])
	    self.file_name = ''

	    # `mask` is a module-level name that is not defined in the visible part
	    # of this file; presumably a grid of per-cell (x, y) coordinates
	    # -- TODO confirm. Channel 0 becomes x_mask and channel 1 y_mask, each
	    # with a trailing axis of length 1.
	    x_channel = mask[:, :, 0].to(self.DEVICE)
	    y_channel = mask[:, :, 1].to(self.DEVICE)
	    self.x_mask = x_channel.unsqueeze(dim=2)
	    self.y_mask = y_channel.unsqueeze(dim=2)

	    self.tokenizer = Tokenizer()

	def load_model(self, pretrain: str = None):
	    """Build the model, move it to ``self.DEVICE`` and restore weights.

	    Args:
	        pretrain: Optional path, relative to ``self.WORKSPACE``, of a state
	            dict that is loaded with ``strict=False``.

	    When ``self.CHECKPOINT`` is set, the checkpoint's ``'parameter'`` entry
	    is then loaded strictly and ``self.START_EPOCH`` becomes the stored
	    ``'epoch'`` plus one.
	    """
	    self.model = Model(max_label_length=8).to(self.DEVICE)

	    if pretrain:
	        self.print(f"pretrain: {pretrain}")
	        # map_location='cpu' lets a file saved on a GPU be read on any host;
	        # load_state_dict then copies the values onto the model's device.
	        pretrained_state = torch.load(os.path.join(self.WORKSPACE, pretrain), map_location='cpu')
	        self.model.load_state_dict(pretrained_state, strict=False)

	    if self.CHECKPOINT:
	        # Read the checkpoint file once instead of once per field.
	        checkpoint = torch.load(os.path.join(self.WORKSPACE, self.CHECKPOINT), map_location='cpu')
	        self.model.load_state_dict(checkpoint['parameter'])
	        self.START_EPOCH = checkpoint['epoch'] + 1

	def load_optimizer(self):
	    """Create the AdamW optimiser with two learning-rate groups.

	    Parameters of ``self.model.parseq`` train at ``0.1 * self.LR``; every
	    other model parameter trains at ``self.LR``. Both groups share
	    ``self.WEIGHT_DECAY``. When ``self.CHECKPOINT`` is set, the optimiser
	    state stored under ``'optimizer'`` is restored.

	    NOTE(review): ``BETAS`` and ``MOMENTUM`` are read in ``__init__`` but are
	    not passed to AdamW here, so AdamW's default betas apply -- confirm that
	    this is intended.
	    """
	    parseq_params = list(self.model.parseq.parameters())
	    # Split by parameter identity rather than by name prefix, so every
	    # parameter lands in exactly one group even if another attribute's name
	    # happens to start with "parseq".
	    parseq_ids = {id(p) for p in parseq_params}
	    det_params = [p for p in self.model.parameters() if id(p) not in parseq_ids]

	    self.optimizer = torch.optim.AdamW(
	        [
	            {'params': det_params, 'lr': self.LR},
	            {'params': parseq_params, 'lr': self.LR * 0.1},  # Lower LR for PARSeq
	        ],
	        weight_decay=self.WEIGHT_DECAY,
	    )

	    if self.CHECKPOINT:
	        # map_location='cpu' lets a GPU-saved checkpoint be read on any host;
	        # Optimizer.load_state_dict moves the state to each parameter's device.
	        checkpoint = torch.load(os.path.join(self.WORKSPACE, self.CHECKPOINT), map_location='cpu')
	        self.optimizer.load_state_dict(checkpoint['optimizer'])

	def load_scheduler(self):
	    """Create the MultiStepLR scheduler and optionally restore its state.

	    The learning rate is multiplied by ``self.LR_STEP_GAMMA`` at epochs 80,
	    85 and 90. ``self.LR_STEP_SIZE`` is not used by this method. When
	    ``self.CHECKPOINT`` is set, the state stored under ``'scheduler'`` is
	    loaded.
	    """
	    self.scheduler = lr_scheduler.MultiStepLR(
	        self.optimizer,
	        milestones=[80, 85, 90],
	        gamma=self.LR_STEP_GAMMA,
	    )
	    if self.CHECKPOINT:
	        # The checkpoint file also holds model/optimizer tensors; loading
	        # them onto the CPU avoids failing on hosts without the GPU it was
	        # saved from.
	        checkpoint = torch.load(os.path.join(self.WORKSPACE, self.CHECKPOINT), map_location='cpu')
	        self.scheduler.load_state_dict(checkpoint['scheduler'])

	def forward(self, data):
	    """Run the model on one batch, feeding it ground-truth boxes and labels.

	    ``data`` is the 8-element batch tuple. Only its first item (the image
	    tensor) and its last item (``lp_lurd``, one annotation string per
	    sample) are used here. An annotation string is a ';'-separated list of
	    ``"l,t,r,b-i0,i1,..."`` entries: four integer box corners, a dash, then
	    integer indices into ``CHARACTER``.

	    Returns:
	        Whatever ``self.model`` returns for the batch.
	    """
	    # Unpack all eight fields so a batch of the wrong arity fails here.
	    x, _, _, _, _, _, _, lp_lurd = data
	    x = x.to(self.DEVICE)

	    # Indices at or above this value are dropped from the label string
	    # (presumably the last CHARACTER entry is a blank/padding symbol
	    # -- TODO confirm).
	    last_idx = len(CHARACTER) - 1

	    batch_boxes = []
	    batch_plate_labels = []
	    for b_idx in range(x.shape[0]):
	        boxes_b = []
	        for info in lp_lurd[b_idx].split(';'):
	            if '-' not in info:
	                continue
	            lurd_str, lp_str = info.split('-')
	            lurd_vals = [int(v) for v in lurd_str.split(',')]
	            lp_indices = [int(v) for v in lp_str.split(',')]

	            # Keep only boxes with four corners and a positive width/height.
	            if not (len(lurd_vals) == 4 and lurd_vals[2] > lurd_vals[0] and lurd_vals[3] > lurd_vals[1]):
	                continue
	            boxes_b.append(lurd_vals)  # [l, t, r, b]
	            # Convert indices back to the plate string.
	            batch_plate_labels.append(''.join(CHARACTER[idx] for idx in lp_indices if idx < last_idx))

	        if boxes_b:
	            batch_boxes.append(torch.tensor(boxes_b, dtype=torch.float32, device=self.DEVICE))
	        else:
	            # NOTE(review): a sample without any valid plate contributes one
	            # all-zero box but no label, so boxes and labels can differ in
	            # count; how the model treats that box is not visible here.
	            batch_boxes.append(torch.zeros((1, 4), dtype=torch.float32, device=self.DEVICE))

	    # Forward pass with GT boxes and labels (None when there are no labels).
	    return self.model(x, boxes_lurd=batch_boxes, plate_labels=batch_plate_labels or None)

	def loss(self, data, pred):
	"""Combine the attention, location, confidence and recognition losses.

	``data`` is the 8-element batch tuple and ``pred`` the 4-tuple
	``(y_detection, y_recognition, pred_at_lp, plate_counts)`` produced by
	the model. Returns the scalar
	``0.2 * location + confidence + 0.5 * recognition + 10 * attention``.
	"""
	# ch_at, lp and lp_at_rec_facal are unpacked but not used in this method.
	img, lp_at, ch_at, bboxs, lp, lurds, lp_at_rec_facal, lp_lurd = data
	y_detection, y_recognition, pred_at_lp, plate_counts = pred

	# --- Attention loss (LP attention only, no char attention) ---
	# The target gets an extra axis at dim 1 before being compared with the
	# predicted map; weight 0.1 scales the (1 - target) term and 0.9 the
	# target term of the weighted BCE.
	lp_at = lp_at.to(self.DEVICE).unsqueeze(dim=1)
	loss_at = BCEWithWeightLoss(weight=[0.1, 0.9])(pred_at_lp, lp_at)

	# --- Location loss (unchanged) ---
	bboxs = bboxs.to(self.DEVICE)
	lurds = lurds.to(self.DEVICE)
	# A cell counts as "object" when its bboxs entries sum to a positive value
	# along dim 3, and as "no object" when they sum to exactly zero.
	obj = torch.sum(bboxs, dim=3) > 0
	noobj = torch.sum(bboxs, dim=3) == 0

	# The first four detection channels are offsets: they are scaled by the
	# image width (shape[3]) / height (shape[2]) and subtracted from or added
	# to the coordinate masks to obtain the left/top/right/bottom corners.
	l, t, r, b = torch.split(y_detection[:, :, :, :4], 1, dim=-1)
	l = self.x_mask - l * img.shape[3]
	t = self.y_mask - t * img.shape[2]
	r = self.x_mask + r * img.shape[3]
	b = self.y_mask + b * img.shape[2]
	# IoU between the target corners and the decoded predictions, computed on
	# boxes flattened over the first three axes and reshaped back afterwards.
	iou = IoU_multi(
	torch.flatten(lurds, start_dim=0, end_dim=2),
	torch.flatten(torch.concat([l, t, r, b], dim=3), start_dim=0, end_dim=2)
	)
	iou = iou.view(bboxs.shape[:3])

	# -log(IoU) summed over object cells and divided by their count; the 1e-6
	# terms keep the log and the division finite.
	loss_location = -torch.log(iou + 1e-6) * obj
	loss_location = torch.sum(loss_location) / (torch.sum(obj) + 1e-6)

	# --- Confidence loss (unchanged) ---
	# Channel 4 is regressed towards the detached IoU on object cells and
	# towards zero (down-weighted by 0.1) on no-object cells, then averaged
	# over all cells.
	confidence_location = iou.detach().float()
	loss_confidence_location = nn.MSELoss(reduction='none')(y_detection[:, :, :, 4], confidence_location) * obj + \
	0.1 * nn.MSELoss(reduction='none')(y_detection[:, :, :, 4],
	torch.zeros_like(confidence_location, device=self.DEVICE)) * noobj
	loss_confidence_location = torch.mean(loss_confidence_location)

	# --- Recognition loss (PARSeq cross-entropy) ---
	# Stays at zero unless the checks below all pass.
	loss_classify = torch.tensor(0.0, device=self.DEVICE)

	if y_recognition is not None and sum(plate_counts) > 0:
	# Get target labels
	# Same ';' / '-' / ',' parsing and box-validity filter as in forward().
	batch_plate_labels = []
	for b_idx in range(img.shape[0]):
	lp_lurd_str = lp_lurd[b_idx]
	plates_info = lp_lurd_str.split(';')
	for info in plates_info:
	if '-' not in info:
	continue
	lurd_str, lp_str = info.split('-')
	lurd_vals = [int(v) for v in lurd_str.split(',')]
	lp_indices = [int(v) for v in lp_str.split(',')]
	if len(lurd_vals) == 4 and lurd_vals[2] > lurd_vals[0] and lurd_vals[3] > lurd_vals[1]:
	plate_str = ''.join([CHARACTER[idx] for idx in lp_indices if idx < len(CHARACTER) - 1])
	batch_plate_labels.append(plate_str)

	# NOTE(review): when the number of parsed labels differs from the number
	# of recognition outputs, the recognition loss is skipped without any
	# message and remains zero for this batch.
	if len(batch_plate_labels) > 0 and y_recognition.shape[0] == len(batch_plate_labels):
	# Encode targets: [BOS, c1, ..., cN, EOS, PAD...]
	targets = self.tokenizer.encode(batch_plate_labels, max_length=8, device=self.DEVICE)
	# Target for loss: [c1, c2, ..., cN, EOS, PAD...] (shift by 1)
	targets_shifted = targets[:, 1:] # Remove BOS, keep chars + EOS + PAD

	# PARSeq output: (N, max_len+1, num_tokens=74)
	# Flatten for cross-entropy
	logits_flat = y_recognition.reshape(-1, y_recognition.shape[-1]) # (N*(max_len+1), 74)
	targets_flat = targets_shifted.reshape(-1) # (N*(max_len+1),)

	# Mask out PAD positions from loss
	# PAD positions are removed by the mask; ignore_index names the same id
	# again, so it has nothing left to ignore.
	pad_mask = targets_flat != self.tokenizer.pad_id
	if pad_mask.any():
	loss_classify = F.cross_entropy(
	logits_flat[pad_mask],
	targets_flat[pad_mask],
	ignore_index=self.tokenizer.pad_id
	)

	# --- Total loss ---
	loss = 0.2 * loss_location + loss_confidence_location + 0.5 * loss_classify + 10 * loss_at
	return loss

	def before_val(self):
	self.count_iou = 0
	self.iou_list = [torch.zeros(1, device=self.DEVICE)]
	self.sample_num = 0
	self.pred_num = 1e-5
	self.count_lp = 0
	self.count_iou_lp = 0

	def evaluate(self, data, pred):
	    """Accumulate detection and recognition counters for one batch.

	    For every image the predicted boxes are decoded, filtered to confidence
	    above 0.3 and passed through NMS with threshold 0.3. The surviving boxes
	    are read with ``self.model.recognize_plates`` and every ground-truth
	    plate is compared with every surviving detection.

	    Updates ``sample_num``, ``pred_num``, ``iou_list``, ``count_iou``
	    (IoU above 0.7), ``count_lp`` (text match) and ``count_iou_lp`` (text
	    match with IoU above 0.6).
	    """
	    img, _, _, _, _, _, _, lp_lurd = data
	    y_detection, _, _, _ = pred

	    img_h, img_w = img.shape[2], img.shape[3]
	    # Indices at or above this value are dropped when building the
	    # ground-truth string.
	    last_idx = len(CHARACTER) - 1

	    for index in range(y_detection.shape[0]):
	        lp_list = []
	        lurd_list = []
	        for entry in lp_lurd[index].split(';'):
	            # Skip empty or malformed entries instead of failing on the
	            # unpack below; forward() and loss() apply the same guard.
	            if '-' not in entry:
	                continue
	            lurd, lp_str = entry.split('-')
	            lp_list.append(np.array(lp_str.split(',')).astype('int32'))
	            lurd_list.append(np.array(lurd.split(',')).astype('int32'))

	        # Decode offsets into corners around the coordinate masks.
	        l, t, r, b, c = torch.split(y_detection[index, :, :, :5], 1, dim=-1)
	        l = self.x_mask - l * img_w
	        t = self.y_mask - t * img_h
	        r = self.x_mask + r * img_w
	        b = self.y_mask + b * img_h

	        # One row per cell: [confidence, l, t, r, b].
	        out = torch.flatten(torch.concat([c, l, t, r, b], dim=2), start_dim=0, end_dim=1)
	        out = out[out[:, 0] > 0.3]

	        self.sample_num += len(lp_list)
	        if len(out) == 0:
	            continue

	        out = NMS(out, 0.3)
	        self.pred_num += len(out)

	        # Crop and recognize each detected plate.
	        input_img = img[index:index + 1].to(self.DEVICE)
	        detected_boxes = out[:, 1:5].contiguous()
	        plate_texts, _ = self.model.recognize_plates(input_img, [detected_boxes])

	        for gt_box_np, gt_indices in zip(lurd_list, lp_list):
	            gt_box = torch.from_numpy(gt_box_np).to(self.DEVICE)
	            gt_str = ''.join(CHARACTER[idx] for idx in gt_indices if idx < last_idx)
	            for j, det in enumerate(out):
	                iou = self.iou(gt_box, det[1:5])
	                self.iou_list.append(iou)
	                if iou > 0.7:
	                    self.count_iou += 1

	                # A detection without a recognised text cannot match.
	                if j >= len(plate_texts):
	                    continue
	                if plate_texts[j] == gt_str:
	                    self.count_lp += 1
	                    if iou > 0.6:
	                        self.count_iou_lp += 1

	def after_val(self):
	print(f"Val IoU Detection Accuracy:{self.count_iou / len(self.val_dataset):>7f}")
	self.iou_list = torch.concat(self.iou_list, dim=0)
	print(f"Ave IoU:{torch.mean(self.iou_list):>7f}")
	print(f"Val Recognition Accuracy:{self.count_lp / len(self.val_dataset):>7f}")
	print(f"Val Recognition and Detection Accuracy:{self.count_iou_lp / len(self.val_dataset):>7f}")
	print(f"Val sample_num:{self.sample_num:>7f}")
	print(f"Val pred_num:{self.pred_num:>7f}")
	print(f"Val recall:{self.count_iou_lp / self.sample_num:>7f}")
	print(f"Val precision:{self.count_iou_lp / self.pred_num:>7f}")

	def iou(self, box, other_boxe):
	    """Return the IoU of two boxes given as (x1, y1, x2, y2) vectors.

	    Args:
	        box: First box, indexed at positions 0..3.
	        other_boxe: Second box, indexed at positions 0..3.

	    Returns:
	        A one-element tensor holding the IoU; 0 when the union is empty.
	    """
	    zero = torch.zeros(1, device=box.device)

	    # Corners of the overlap rectangle.
	    x1 = torch.max(box[0], other_boxe[0])
	    y1 = torch.max(box[1], other_boxe[1])
	    x2 = torch.min(box[2], other_boxe[2])
	    y2 = torch.min(box[3], other_boxe[3])

	    # Negative extents (no overlap) are floored at zero.
	    overlap_area = torch.max(zero, x2 - x1) * torch.max(zero, y2 - y1)

	    box_area = (box[2] - box[0]) * (box[3] - box[1])
	    other_box_area = (other_boxe[2] - other_boxe[0]) * (other_boxe[3] - other_boxe[1])
	    union_area = box_area + other_box_area - overlap_area

	    # Two zero-area boxes give 0 / 0 = NaN, which would then turn the mean
	    # IoU into NaN; flooring the union leaves every valid box unchanged.
	    return overlap_area / torch.clamp(union_area, min=1e-6)