import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
import torch.nn.functional as F
from einops import rearrange, repeat
from einops.layers.torch import Rearrange
import math
import numpy as np
import time
from torch import einsum
import json
import os
import argparse
from transformers import PretrainedConfig, PreTrainedModel

#################################################################################
#                                                                               #
#          PART 1: Model definition (from the file you provided)                #
#                                                                               #
#################################################################################
def conv(in_channels, out_channels, kernel_size, bias=False, stride=1):
    return nn.Conv2d(
        in_channels, out_channels, kernel_size,
        padding=(kernel_size // 2), bias=bias, stride=stride)


class ConvBlock(nn.Module):
    def __init__(self, in_channel, out_channel, strides=1):
        super(ConvBlock, self).__init__()
        self.strides = strides
        self.in_channel = in_channel
        self.out_channel = out_channel
        self.block = nn.Sequential(
            nn.Conv2d(in_channel, out_channel, kernel_size=3, stride=strides, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.Conv2d(out_channel, out_channel, kernel_size=3, stride=strides, padding=1),
            nn.LeakyReLU(inplace=True),
        )
        self.conv11 = nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=strides, padding=0)

    def forward(self, x):
        out1 = self.block(x)
        out2 = self.conv11(x)
        out = out1 + out2
        return out

class LinearProjection(nn.Module):
    def __init__(self, dim, heads=8, dim_head=64, dropout=0., bias=True):
        super().__init__()
        inner_dim = dim_head * heads
        self.heads = heads
        self.to_q = nn.Linear(dim, inner_dim, bias=bias)
        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=bias)
        self.dim = dim
        self.inner_dim = inner_dim

    def forward(self, x, attn_kv=None):
        B_, N, C = x.shape
        if attn_kv is not None:
            attn_kv = attn_kv.unsqueeze(0).repeat(B_, 1, 1)
        else:
            attn_kv = x
        N_kv = attn_kv.size(1)
        q = self.to_q(x).reshape(B_, N, 1, self.heads, C // self.heads).permute(2, 0, 3, 1, 4)
        kv = self.to_kv(attn_kv).reshape(B_, N_kv, 2, self.heads, C // self.heads).permute(2, 0, 3, 1, 4)
        q = q[0]
        k, v = kv[0], kv[1]
        return q, k, v
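
# Shape sketch for LinearProjection (assuming dim_head * heads == dim, as the
# callers below guarantee): given x of shape (B_, N, C), each of q, k, v comes
# out as (B_, heads, N, C // heads). For example, B_=4, N=64, C=32, heads=2
# gives q.shape == k.shape == v.shape == torch.Size([4, 2, 64, 16]).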

class WindowAttention(nn.Module):
    def __init__(self, dim, win_size, num_heads, token_projection='linear', qkv_bias=True, qk_scale=None, attn_drop=0.,
                 proj_drop=0.):
        super().__init__()
        self.dim = dim
        self.win_size = win_size
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * win_size[0] - 1) * (2 * win_size[1] - 1), num_heads))
        coords_h = torch.arange(self.win_size[0])
        coords_w = torch.arange(self.win_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing="ij"))
        coords_flatten = torch.flatten(coords, 1)
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
        relative_coords[:, :, 0] += self.win_size[0] - 1
        relative_coords[:, :, 1] += self.win_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.win_size[1] - 1
        relative_position_index = relative_coords.sum(-1)
        self.register_buffer("relative_position_index", relative_position_index)
        trunc_normal_(self.relative_position_bias_table, std=.02)
        if token_projection == 'linear':
            self.qkv = LinearProjection(dim, num_heads, dim // num_heads, bias=qkv_bias)
        else:
            raise Exception("Projection error!")
        self.token_projection = token_projection
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, attn_kv=None, mask=None):
        B_, N, C = x.shape
        q, k, v = self.qkv(x, attn_kv)
        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))
        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.win_size[0] * self.win_size[1], self.win_size[0] * self.win_size[1], -1)
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
        ratio = attn.size(-1) // relative_position_bias.size(-1)
        relative_position_bias = repeat(relative_position_bias, 'nH l c -> nH l (c d)', d=ratio)
        attn = attn + relative_position_bias.unsqueeze(0)
        if mask is not None:
            nW = mask.shape[0]
            mask = repeat(mask, 'nW m n -> nW m (n d)', d=ratio)
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N * ratio) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N * ratio)
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)
        attn = self.attn_drop(attn)
        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
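
# Relative-position bias indexing (Swin-style), as a worked example with a 2x2
# window: offsets (dh, dw) each range over [-(Wh-1), Wh-1]; shifting by Wh-1
# and Ww-1 and scaling dh by (2*Ww - 1) maps every (dh, dw) pair to a unique
# flat index in [0, (2*Wh-1)*(2*Ww-1)), i.e. [0, 9) here, which picks one
# learned per-head bias from relative_position_bias_table for each query/key
# position pair.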

class WindowAttention_sparse(nn.Module):
    def __init__(self, dim, win_size, num_heads, token_projection='linear', qkv_bias=True, qk_scale=None, attn_drop=0.,
                 proj_drop=0.):
        super().__init__()
        self.dim = dim
        self.win_size = win_size
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * win_size[0] - 1) * (2 * win_size[1] - 1), num_heads))
        coords_h = torch.arange(self.win_size[0])
        coords_w = torch.arange(self.win_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing="ij"))
        coords_flatten = torch.flatten(coords, 1)
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
        relative_coords[:, :, 0] += self.win_size[0] - 1
        relative_coords[:, :, 1] += self.win_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.win_size[1] - 1
        relative_position_index = relative_coords.sum(-1)
        self.register_buffer("relative_position_index", relative_position_index)
        trunc_normal_(self.relative_position_bias_table, std=.02)
        if token_projection == 'linear':
            self.qkv = LinearProjection(dim, num_heads, dim // num_heads, bias=qkv_bias)
        else:
            raise Exception("Projection error!")
        self.token_projection = token_projection
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.softmax = nn.Softmax(dim=-1)
        self.relu = nn.ReLU()
        self.w = nn.Parameter(torch.ones(2))

    def forward(self, x, attn_kv=None, mask=None):
        B_, N, C = x.shape
        q, k, v = self.qkv(x, attn_kv)
        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))
        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.win_size[0] * self.win_size[1], self.win_size[0] * self.win_size[1], -1)
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
        ratio = attn.size(-1) // relative_position_bias.size(-1)
        relative_position_bias = repeat(relative_position_bias, 'nH l c -> nH l (c d)', d=ratio)
        attn = attn + relative_position_bias.unsqueeze(0)
        if mask is not None:
            nW = mask.shape[0]
            mask = repeat(mask, 'nW m n -> nW m (n d)', d=ratio)
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N * ratio) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N * ratio)
            attn0 = self.softmax(attn)
            attn1 = self.relu(attn) ** 2
        else:
            attn0 = self.softmax(attn)
            attn1 = self.relu(attn) ** 2
        w1 = torch.exp(self.w[0]) / torch.sum(torch.exp(self.w))
        w2 = torch.exp(self.w[1]) / torch.sum(torch.exp(self.w))
        attn = attn0 * w1 + attn1 * w2
        attn = self.attn_drop(attn)
        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
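
# A minimal sketch of the mixing above (illustrative values, not model
# weights): the two scalars in self.w are softmax-normalized into convex
# weights w1 + w2 == 1 that blend dense softmax attention with a sparser
# ReLU^2 attention map.
#   w = torch.tensor([0.3, 0.7])
#   w1, w2 = torch.softmax(w, dim=0)  # identical to exp(w_i) / sum(exp(w))
#   scores = torch.tensor([[1.0, -2.0, 0.5]])
#   mixed = torch.softmax(scores, -1) * w1 + torch.relu(scores) ** 2 * w2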

class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x

class LeFF(nn.Module):
    def __init__(self, dim=32, hidden_dim=128, act_layer=nn.GELU, drop=0., use_eca=False):
        super().__init__()
        self.linear1 = nn.Sequential(nn.Linear(dim, hidden_dim), act_layer())
        self.dwconv = nn.Sequential(
            nn.Conv2d(hidden_dim, hidden_dim, groups=hidden_dim, kernel_size=3, stride=1, padding=1), act_layer())
        self.linear2 = nn.Sequential(nn.Linear(hidden_dim, dim))
        self.eca = nn.Identity()

    def forward(self, x):
        bs, hw, c = x.size()
        hh = int(math.sqrt(hw))
        x = self.linear1(x)
        x = rearrange(x, ' b (h w) (c) -> b c h w ', h=hh, w=hh)
        x = self.dwconv(x)
        x = rearrange(x, ' b c h w -> b (h w) c', h=hh, w=hh)
        x = self.linear2(x)
        x = self.eca(x)
        return x

class FRFN(nn.Module):
    def __init__(self, dim=32, hidden_dim=128, act_layer=nn.GELU, drop=0., use_eca=False):
        super().__init__()
        self.linear1 = nn.Sequential(nn.Linear(dim, hidden_dim * 2),
                                     act_layer())
        self.dwconv = nn.Sequential(
            nn.Conv2d(hidden_dim, hidden_dim, groups=hidden_dim, kernel_size=3, stride=1, padding=1),
            act_layer())
        self.linear2 = nn.Sequential(nn.Linear(hidden_dim, dim))
        self.dim = dim
        self.hidden_dim = hidden_dim
        self.dim_conv = self.dim // 4
        self.dim_untouched = self.dim - self.dim_conv
        self.partial_conv3 = nn.Conv2d(self.dim_conv, self.dim_conv, 3, 1, 1, bias=False)

    def forward(self, x):
        bs, hw, c = x.size()
        hh = int(math.sqrt(hw))
        x = rearrange(x, ' b (h w) (c) -> b c h w ', h=hh, w=hh)
        x1, x2 = torch.split(x, [self.dim_conv, self.dim_untouched], dim=1)
        x1 = self.partial_conv3(x1)
        x = torch.cat((x1, x2), 1)
        x = rearrange(x, ' b c h w -> b (h w) c', h=hh, w=hh)
        x = self.linear1(x)
        x_1, x_2 = x.chunk(2, dim=-1)
        x_1 = rearrange(x_1, ' b (h w) (c) -> b c h w ', h=hh, w=hh)
        x_1 = self.dwconv(x_1)
        x_1 = rearrange(x_1, ' b c h w -> b (h w) c', h=hh, w=hh)
        x = x_1 * x_2
        x = self.linear2(x)
        return x
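
# FRFN design note: only the first dim // 4 channels pass through the 3x3
# "partial" convolution (the remaining channels are left untouched), and the
# widened hidden features are split into a depthwise-conv branch x_1 and a
# gate x_2 that are multiplied elementwise, a gated feed-forward in the spirit
# of gated linear units.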

def window_partition(x, win_size, dilation_rate=1):
    B, H, W, C = x.shape
    if dilation_rate != 1:
        x = x.permute(0, 3, 1, 2)
        assert type(dilation_rate) is int, 'dilation_rate should be an int'
        x = F.unfold(x, kernel_size=win_size, dilation=dilation_rate, padding=4 * (dilation_rate - 1), stride=win_size)
        windows = x.permute(0, 2, 1).contiguous().view(-1, C, win_size, win_size)
        windows = windows.permute(0, 2, 3, 1).contiguous()
    else:
        x = x.view(B, H // win_size, win_size, W // win_size, win_size, C)
        windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, win_size, win_size, C)
    return windows


def window_reverse(windows, win_size, H, W, dilation_rate=1):
    B = int(windows.shape[0] / (H * W / win_size / win_size))
    x = windows.view(B, H // win_size, W // win_size, win_size, win_size, -1)
    if dilation_rate != 1:
        # Fixed: permute the 6-D view `x` (the original permuted the 4-D `windows`
        # tensor, which would fail with six permutation indices).
        x = x.permute(0, 5, 3, 4, 1, 2).contiguous()
        x = F.fold(x, (H, W), kernel_size=win_size, dilation=dilation_rate, padding=4 * (dilation_rate - 1),
                   stride=win_size)
    else:
        x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x
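
# Round-trip sanity check (a sketch; dilation_rate=1 and a window-divisible,
# square input assumed):
#   x = torch.randn(2, 16, 16, 8)                        # (B, H, W, C)
#   wins = window_partition(x, 4)                        # -> (32, 4, 4, 8)
#   assert torch.equal(window_reverse(wins, 4, 16, 16), x)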

class Downsample(nn.Module):
    def __init__(self, in_channel, out_channel):
        super(Downsample, self).__init__()
        self.conv = nn.Sequential(nn.Conv2d(in_channel, out_channel, kernel_size=4, stride=2, padding=1))

    def forward(self, x):
        B, L, C = x.shape
        H = int(math.sqrt(L))
        W = int(math.sqrt(L))
        x = x.transpose(1, 2).contiguous().view(B, C, H, W)
        out = self.conv(x).flatten(2).transpose(1, 2).contiguous()
        return out


class Upsample(nn.Module):
    def __init__(self, in_channel, out_channel):
        super(Upsample, self).__init__()
        self.deconv = nn.Sequential(nn.ConvTranspose2d(in_channel, out_channel, kernel_size=2, stride=2))

    def forward(self, x):
        B, L, C = x.shape
        H = int(math.sqrt(L))
        W = int(math.sqrt(L))
        x = x.transpose(1, 2).contiguous().view(B, C, H, W)
        out = self.deconv(x).flatten(2).transpose(1, 2).contiguous()
        return out

class InputProj(nn.Module):
    def __init__(self, in_channel=3, out_channel=64, kernel_size=3, stride=1, norm_layer=None, act_layer=nn.LeakyReLU):
        super().__init__()
        self.proj = nn.Sequential(
            # Use the kernel_size argument (it was hardcoded to 3 while the padding already used it)
            nn.Conv2d(in_channel, out_channel, kernel_size=kernel_size, stride=stride, padding=kernel_size // 2),
            act_layer(inplace=True))
        self.norm = norm_layer(out_channel) if norm_layer is not None else None

    def forward(self, x):
        B, C, H, W = x.shape
        x = self.proj(x).flatten(2).transpose(1, 2).contiguous()  # (B, C, H, W) -> (B, H*W, C)
        if self.norm is not None:
            x = self.norm(x)
        return x


class OutputProj(nn.Module):
    def __init__(self, in_channel=64, out_channel=3, kernel_size=3, stride=1, norm_layer=None, act_layer=None):
        super().__init__()
        self.proj = nn.Sequential(
            nn.Conv2d(in_channel, out_channel, kernel_size=kernel_size, stride=stride, padding=kernel_size // 2))
        if act_layer is not None:
            self.proj.add_module(str(len(self.proj)), act_layer(inplace=True))
        self.norm = norm_layer(out_channel) if norm_layer is not None else None

    def forward(self, x):
        B, L, C = x.shape
        H = int(math.sqrt(L))
        W = int(math.sqrt(L))
        x = x.transpose(1, 2).view(B, C, H, W)
        x = self.proj(x)
        if self.norm is not None:
            x = self.norm(x)
        return x

class TransformerBlock(nn.Module):
    def __init__(self, dim, input_resolution, num_heads, win_size=8, shift_size=0, mlp_ratio=4., qkv_bias=True,
                 qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm,
                 token_projection='linear', token_mlp='leff', att=True, sparseAtt=False):
        super().__init__()
        self.att = att
        self.sparseAtt = sparseAtt
        self.dim = dim
        self.input_resolution = input_resolution
        self.num_heads = num_heads
        self.win_size = win_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        if min(self.input_resolution) <= self.win_size:
            self.shift_size = 0
            self.win_size = min(self.input_resolution)
        assert 0 <= self.shift_size < self.win_size, "shift_size must be in [0, win_size)"
        if self.att:
            self.norm1 = norm_layer(dim)
            if self.sparseAtt:
                self.attn = WindowAttention_sparse(dim, win_size=to_2tuple(self.win_size), num_heads=num_heads,
                                                   qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop,
                                                   proj_drop=drop, token_projection=token_projection)
            else:
                self.attn = WindowAttention(dim, win_size=to_2tuple(self.win_size), num_heads=num_heads,
                                            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop,
                                            token_projection=token_projection)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        if token_mlp in ['ffn', 'mlp']:
            self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
        elif token_mlp == 'leff':
            self.mlp = LeFF(dim, mlp_hidden_dim, act_layer=act_layer, drop=drop)
        elif token_mlp == 'frfn':
            self.mlp = FRFN(dim, mlp_hidden_dim, act_layer=act_layer, drop=drop)
        else:
            raise Exception("FFN error!")

    def forward(self, x, mask=None):
        B, L, C = x.shape
        H = int(math.sqrt(L))
        W = int(math.sqrt(L))
        attn_mask = None
        if self.shift_size > 0:
            shift_mask = torch.zeros((1, H, W, 1), device=x.device)
            h_slices = (slice(0, -self.win_size), slice(-self.win_size, -self.shift_size),
                        slice(-self.shift_size, None))
            w_slices = (slice(0, -self.win_size), slice(-self.win_size, -self.shift_size),
                        slice(-self.shift_size, None))
            cnt = 0
            for h in h_slices:
                for w in w_slices:
                    shift_mask[:, h, w, :] = cnt
                    cnt += 1
            shift_mask_windows = window_partition(shift_mask, self.win_size)
            shift_mask_windows = shift_mask_windows.view(-1, self.win_size * self.win_size)
            attn_mask = shift_mask_windows.unsqueeze(1) - shift_mask_windows.unsqueeze(2)
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        shortcut = x
        if self.att:
            x = self.norm1(x)
            x = x.view(B, H, W, C)
            if self.shift_size > 0:
                shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
            else:
                shifted_x = x
            x_windows = window_partition(shifted_x, self.win_size)
            x_windows = x_windows.view(-1, self.win_size * self.win_size, C)
            attn_windows = self.attn(x_windows, mask=attn_mask)
            attn_windows = attn_windows.view(-1, self.win_size, self.win_size, C)
            shifted_x = window_reverse(attn_windows, self.win_size, H, W)
            if self.shift_size > 0:
                x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
            else:
                x = shifted_x
            x = x.view(B, H * W, C)
            # The attention residual belongs inside this branch; otherwise an
            # attention-free block (att=False) would add x to itself.
            x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x
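
# Shifted-window masking, in brief: after torch.roll, pixels from opposite
# image borders can land in the same window, so shift_mask labels the nine
# (h_slices x w_slices) regions with distinct ids; token pairs whose region
# ids differ receive a -100 additive bias, which drives their post-softmax
# attention weight to ~0 and confines attention to spatially contiguous
# regions.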

class BasicASTLayer(nn.Module):
    def __init__(self, dim, output_dim, input_resolution, depth, num_heads, win_size, mlp_ratio=4., qkv_bias=True,
                 qk_scale=None, drop=0., attn_drop=0., drop_path=0., norm_layer=nn.LayerNorm, use_checkpoint=False,
                 token_projection='linear', token_mlp='ffn', shift_flag=True, att=False, sparseAtt=False):
        super().__init__()
        self.att = att
        self.sparseAtt = sparseAtt
        self.depth = depth
        self.use_checkpoint = use_checkpoint
        if shift_flag:
            self.blocks = nn.ModuleList([
                TransformerBlock(dim=dim, input_resolution=input_resolution, num_heads=num_heads, win_size=win_size,
                                 shift_size=0 if (i % 2 == 0) else win_size // 2, mlp_ratio=mlp_ratio,
                                 qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop, attn_drop=attn_drop,
                                 drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                                 norm_layer=norm_layer, token_projection=token_projection, token_mlp=token_mlp,
                                 att=self.att, sparseAtt=self.sparseAtt)
                for i in range(depth)])
        else:
            self.blocks = nn.ModuleList([
                TransformerBlock(dim=dim, input_resolution=input_resolution, num_heads=num_heads, win_size=win_size,
                                 shift_size=0, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop,
                                 attn_drop=attn_drop,
                                 drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                                 norm_layer=norm_layer, token_projection=token_projection, token_mlp=token_mlp,
                                 att=self.att, sparseAtt=self.sparseAtt)
                for i in range(depth)])

    def forward(self, x, mask=None):
        for blk in self.blocks:
            if self.use_checkpoint:
                # Note: checkpoint doesn't support the mask argument here, so we pass it as None
                x = checkpoint.checkpoint(blk, x, None)
            else:
                x = blk(x, mask)
        return x

class AST(nn.Module):
    def __init__(self, img_size=256, in_chans=3, dd_in=3, embed_dim=32, depths=[2, 2, 2, 2, 2, 2, 2, 2, 2],
                 num_heads=[1, 2, 4, 8, 16, 16, 8, 4, 2], win_size=8, mlp_ratio=4., qkv_bias=True, qk_scale=None,
                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, norm_layer=nn.LayerNorm, patch_norm=True,
                 use_checkpoint=False, token_projection='linear', token_mlp='leff', dowsample=Downsample,
                 upsample=Upsample, shift_flag=True, **kwargs):
        super().__init__()
        self.num_enc_layers = len(depths) // 2
        self.num_dec_layers = len(depths) // 2
        self.embed_dim = embed_dim
        self.patch_norm = patch_norm
        self.mlp_ratio = mlp_ratio
        self.token_projection = token_projection
        self.mlp = token_mlp
        self.win_size = win_size
        self.reso = img_size
        self.pos_drop = nn.Dropout(p=drop_rate)
        self.dd_in = dd_in
        enc_dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths[:self.num_enc_layers]))]
        conv_dpr = [drop_path_rate] * depths[4]
        dec_dpr = enc_dpr[::-1]
        self.input_proj = InputProj(in_channel=dd_in, out_channel=embed_dim, kernel_size=3, stride=1,
                                    act_layer=nn.LeakyReLU)
        self.output_proj = OutputProj(in_channel=2 * embed_dim, out_channel=in_chans, kernel_size=3, stride=1)
        # Encoder
        self.encoderlayer_0 = BasicASTLayer(dim=embed_dim, output_dim=embed_dim, input_resolution=(img_size, img_size),
                                            depth=depths[0], num_heads=num_heads[0], win_size=win_size,
                                            mlp_ratio=self.mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                                            drop=drop_rate, attn_drop=attn_drop_rate,
                                            drop_path=enc_dpr[sum(depths[:0]):sum(depths[:1])], norm_layer=norm_layer,
                                            use_checkpoint=use_checkpoint, token_projection=token_projection,
                                            token_mlp=token_mlp, shift_flag=shift_flag, att=False, sparseAtt=False)
        self.dowsample_0 = dowsample(embed_dim, embed_dim * 2)
        self.encoderlayer_1 = BasicASTLayer(dim=embed_dim * 2, output_dim=embed_dim * 2,
                                            input_resolution=(img_size // 2, img_size // 2), depth=depths[1],
                                            num_heads=num_heads[1], win_size=win_size, mlp_ratio=self.mlp_ratio,
                                            qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate,
                                            attn_drop=attn_drop_rate,
                                            drop_path=enc_dpr[sum(depths[:1]):sum(depths[:2])], norm_layer=norm_layer,
                                            use_checkpoint=use_checkpoint, token_projection=token_projection,
                                            token_mlp=token_mlp, shift_flag=shift_flag, att=False, sparseAtt=False)
        self.dowsample_1 = dowsample(embed_dim * 2, embed_dim * 4)
        self.encoderlayer_2 = BasicASTLayer(dim=embed_dim * 4, output_dim=embed_dim * 4,
                                            input_resolution=(img_size // (2 ** 2), img_size // (2 ** 2)),
                                            depth=depths[2], num_heads=num_heads[2], win_size=win_size,
                                            mlp_ratio=self.mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                                            drop=drop_rate, attn_drop=attn_drop_rate,
                                            drop_path=enc_dpr[sum(depths[:2]):sum(depths[:3])], norm_layer=norm_layer,
                                            use_checkpoint=use_checkpoint, token_projection=token_projection,
                                            token_mlp=token_mlp, shift_flag=shift_flag, att=False, sparseAtt=False)
        self.dowsample_2 = dowsample(embed_dim * 4, embed_dim * 8)
        self.encoderlayer_3 = BasicASTLayer(dim=embed_dim * 8, output_dim=embed_dim * 8,
                                            input_resolution=(img_size // (2 ** 3), img_size // (2 ** 3)),
                                            depth=depths[3], num_heads=num_heads[3], win_size=win_size,
                                            mlp_ratio=self.mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                                            drop=drop_rate, attn_drop=attn_drop_rate,
                                            drop_path=enc_dpr[sum(depths[:3]):sum(depths[:4])], norm_layer=norm_layer,
                                            use_checkpoint=use_checkpoint, token_projection=token_projection,
                                            token_mlp=token_mlp, shift_flag=shift_flag, att=False, sparseAtt=False)
        self.dowsample_3 = dowsample(embed_dim * 8, embed_dim * 16)
        # Bottleneck
        self.conv = BasicASTLayer(dim=embed_dim * 16, output_dim=embed_dim * 16,
                                  input_resolution=(img_size // (2 ** 4), img_size // (2 ** 4)), depth=depths[4],
                                  num_heads=num_heads[4], win_size=win_size, mlp_ratio=self.mlp_ratio,
                                  qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate,
                                  drop_path=conv_dpr, norm_layer=norm_layer, use_checkpoint=use_checkpoint,
                                  token_projection=token_projection, token_mlp=token_mlp, shift_flag=shift_flag,
                                  att=True, sparseAtt=True)
        # Decoder
        self.upsample_0 = upsample(embed_dim * 16, embed_dim * 8)
        self.decoderlayer_0 = BasicASTLayer(dim=embed_dim * 16, output_dim=embed_dim * 16,
                                            input_resolution=(img_size // (2 ** 3), img_size // (2 ** 3)),
                                            depth=depths[5], num_heads=num_heads[5], win_size=win_size,
                                            mlp_ratio=self.mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                                            drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dec_dpr[:depths[5]],
                                            norm_layer=norm_layer, use_checkpoint=use_checkpoint,
                                            token_projection=token_projection, token_mlp=token_mlp,
                                            shift_flag=shift_flag, att=True, sparseAtt=True)
        self.upsample_1 = upsample(embed_dim * 16, embed_dim * 4)
        self.decoderlayer_1 = BasicASTLayer(dim=embed_dim * 8, output_dim=embed_dim * 8,
                                            input_resolution=(img_size // (2 ** 2), img_size // (2 ** 2)),
                                            depth=depths[6], num_heads=num_heads[6], win_size=win_size,
                                            mlp_ratio=self.mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                                            drop=drop_rate, attn_drop=attn_drop_rate,
                                            drop_path=dec_dpr[sum(depths[5:6]):sum(depths[5:7])], norm_layer=norm_layer,
                                            use_checkpoint=use_checkpoint, token_projection=token_projection,
                                            token_mlp=token_mlp, shift_flag=shift_flag, att=True, sparseAtt=True)
        self.upsample_2 = upsample(embed_dim * 8, embed_dim * 2)
        self.decoderlayer_2 = BasicASTLayer(dim=embed_dim * 4, output_dim=embed_dim * 4,
                                            input_resolution=(img_size // 2, img_size // 2), depth=depths[7],
                                            num_heads=num_heads[7], win_size=win_size, mlp_ratio=self.mlp_ratio,
                                            qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate,
                                            attn_drop=attn_drop_rate,
                                            drop_path=dec_dpr[sum(depths[5:7]):sum(depths[5:8])], norm_layer=norm_layer,
                                            use_checkpoint=use_checkpoint, token_projection=token_projection,
                                            token_mlp=token_mlp, shift_flag=shift_flag, att=True, sparseAtt=True)
        self.upsample_3 = upsample(embed_dim * 4, embed_dim)
        self.decoderlayer_3 = BasicASTLayer(dim=embed_dim * 2, output_dim=embed_dim * 2,
                                            input_resolution=(img_size, img_size), depth=depths[8],
                                            num_heads=num_heads[8], win_size=win_size, mlp_ratio=self.mlp_ratio,
                                            qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate,
                                            attn_drop=attn_drop_rate,
                                            drop_path=dec_dpr[sum(depths[5:8]):sum(depths[5:9])], norm_layer=norm_layer,
                                            use_checkpoint=use_checkpoint, token_projection=token_projection,
                                            token_mlp=token_mlp, shift_flag=shift_flag, att=True, sparseAtt=True)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
    def forward(self, x, mask=None):
        y = self.input_proj(x)
        y = self.pos_drop(y)
        conv0 = self.encoderlayer_0(y, mask=mask)
        pool0 = self.dowsample_0(conv0)
        conv1 = self.encoderlayer_1(pool0, mask=mask)
        pool1 = self.dowsample_1(conv1)
        conv2 = self.encoderlayer_2(pool1, mask=mask)
        pool2 = self.dowsample_2(conv2)
        conv3 = self.encoderlayer_3(pool2, mask=mask)
        pool3 = self.dowsample_3(conv3)
        conv4 = self.conv(pool3, mask=mask)
        up0 = self.upsample_0(conv4)
        deconv0 = torch.cat([up0, conv3], -1)
        deconv0 = self.decoderlayer_0(deconv0, mask=mask)
        up1 = self.upsample_1(deconv0)
        deconv1 = torch.cat([up1, conv2], -1)
        deconv1 = self.decoderlayer_1(deconv1, mask=mask)
        up2 = self.upsample_2(deconv1)
        deconv2 = torch.cat([up2, conv1], -1)
        deconv2 = self.decoderlayer_2(deconv2, mask=mask)
        up3 = self.upsample_3(deconv2)
        deconv3 = torch.cat([up3, conv0], -1)
        deconv3 = self.decoderlayer_3(deconv3, mask=mask)
        y = self.output_proj(deconv3)
        return x + y if self.dd_in == 3 else y
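
# Quick shape check (a minimal sketch; the sqrt(L) reshapes above assume a
# square, window-divisible resolution, so 256x256 with the default win_size=8
# is safe):
#   net = AST(img_size=256)
#   x = torch.randn(1, 3, 256, 256)
#   with torch.no_grad():
#       y = net(x)
#   assert y.shape == x.shape  # dd_in == 3, so the output is the residual x + restoration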

#################################################################################
#                                                                               #
#          PART 2: The Hugging Face wrapper classes                             #
#                                                                               #
#################################################################################
class ASTConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of an `AST` model.
    """
    model_type = "ast"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)


class ASTForRestoration(PreTrainedModel):
    """
    This is the main model class that will be loaded by Hugging Face.
    """
    config_class = ASTConfig

    def __init__(self, config: ASTConfig):
        super().__init__(config)
        self.model = AST(**config.to_dict())

    def forward(self, pixel_values):
        """
        The forward pass of the model.
        """
        return self.model(pixel_values)
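
# Note on the wrapper: PretrainedConfig keeps the keyword arguments passed to
# ASTConfig(**model_params) as attributes and serializes them to config.json,
# so config.to_dict() can feed them straight back into AST. The extra
# bookkeeping keys it adds (model_type, transformers_version, ...) are absorbed
# by AST's **kwargs. (This describes transformers' usual behavior; verify
# against your installed version.)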

#################################################################################
#                                                                               #
#          PART 3: Main conversion logic                                        #
#                                                                               #
#################################################################################
if __name__ == '__main__':
    # --- Use argparse so the script is reusable ---
    parser = argparse.ArgumentParser(description="Convert AST model .pth files to Hugging Face format.")
    parser.add_argument("--pth_path", type=str, required=True, help="Path to the input .pth weight file.")
    parser.add_argument("--output_dir", type=str, required=True, help="Directory to save the Hugging Face model.")
    parser.add_argument("--task_name", type=str, default="restoration",
                        help="Name of the task (e.g., 'dehazing', 'desnowing') for logging.")
    args = parser.parse_args()

    # --- Model architecture parameters (final corrected version) ---
    model_params = {
        "img_size": 256,
        "in_chans": 3,
        "dd_in": 3,
        "embed_dim": 32,
        "depths": [1, 2, 8, 8, 2, 8, 8, 2, 1],  # <--- the critical final fix!
        "num_heads": [1, 2, 4, 8, 16, 16, 8, 4, 2],
        "win_size": 8,
        "mlp_ratio": 4.0,
        "qkv_bias": True,
        "qk_scale": None,
        "drop_rate": 0.0,
        "attn_drop_rate": 0.0,
        "drop_path_rate": 0.1,
        "patch_norm": True,
        "use_checkpoint": False,
        "token_projection": "linear",
        "token_mlp": "frfn",
        "shift_flag": True
    }

    # --- Run the conversion ---
    print(f"🚀 Task: {args.task_name.upper()} | Step 1/5: Creating the Hugging Face model instance (AST)...")
    hf_config = ASTConfig(**model_params)
    hf_model = ASTForRestoration(hf_config)
    print("✅ Model instance created!")

    print(f"🚀 Step 2/5: Loading weights from '{args.pth_path}'...")
    if not os.path.exists(args.pth_path):
        raise FileNotFoundError(f"Error: weight file '{args.pth_path}' not found. Please check the path.")
    state_dict = torch.load(args.pth_path, map_location='cpu')
    print("✅ Weight file loaded!")

    print("🚀 Step 3/5: Processing the state dict...")
    # Check whether the weights are nested under a common wrapper key
    if 'state_dict' in state_dict:
        state_dict = state_dict['state_dict']
    elif 'params_ema' in state_dict:
        state_dict = state_dict['params_ema']
    elif 'params' in state_dict:
        state_dict = state_dict['params']
    # Strip the 'module.' prefix left by DataParallel/DistributedDataParallel
    new_state_dict = {k.replace('module.', '', 1): v for k, v in state_dict.items()}
    # Load the weights
    hf_model.model.load_state_dict(new_state_dict)
    hf_model.eval()
    print("✅ Weights loaded into the model!")

    print(f"🚀 Step 4/5: Saving the model to '{args.output_dir}'...")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    hf_model.save_pretrained(args.output_dir)
    print("✅ Model and config.json saved!")

    # Create and save the image processor configuration
    image_processor_config = {
        "do_normalize": True,
        "image_mean": [0.5, 0.5, 0.5],
        "image_std": [0.5, 0.5, 0.5],
        "data_format": "channels_first"
    }
    with open(os.path.join(args.output_dir, 'preprocessor_config.json'), 'w') as f:
        json.dump(image_processor_config, f)
    print("✅ Step 5/5: Image processor config (preprocessor_config.json) saved!")

    print(f"\n🎉 Conversion for task '{args.task_name.upper()}' complete!")
    print(f"Output directory: {args.output_dir}")
    print("\nNext steps:")
    print(f"1. Copy this script itself into the output directory '{args.output_dir}' and rename it to `modeling_ast.py`.")
    print("2. Upload the whole output directory to your Hugging Face repository.")
    print("3. When loading the model from the Hub, make sure to pass `trust_remote_code=True`.")