# Hugging Face Spaces status banner ("Spaces: Sleeping") captured with the
# source — not Python code; kept only as a comment so the file parses.
| from flask import Flask, jsonify, request | |
| from flask_cors import CORS | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| import torchvision | |
| import torchvision.models as models | |
| from torchvision import transforms | |
| import torchaudio | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import base64 | |
| import io | |
| from PIL import Image | |
| from ultralytics import YOLO | |
| from PIL import Image | |
| from facenet_pytorch import MTCNN | |
| from ultralytics import YOLO | |
| app = Flask(__name__) | |
| CORS(app) | |
| binary_labels = ['Real','Spoof'] | |
| multi_voice_labels = ['Real','Text to Speech','Voice Conversion','Text to Speech + Voice Conversion'] | |
| multi_face_labels = ['Genuine Face','Printed Photo','Paper Cut','Replayed Face','3D Mask'] | |
| multi_finger_print_labels = ['Real Fingerprint','Printed Image','Gelatin Mold','Silicone Mask'] | |
| finger_print_detector = YOLO('fingerprint_best.pt') | |
| # def process_audio(encoded_audio): | |
| # decoded_audio = base64.b64decode(encoded_audio) | |
| # audio_bytes = io.BytesIO(decoded_audio) | |
| # waveform, sample_rate = torchaudio.load(audio_bytes) | |
| # if waveform.size(0) > 1: | |
| # waveform = waveform.mean(dim=0, keepdim=True) # Convert to mono by averaging channels | |
| # mel_spectrogram = torchaudio.transforms.MelSpectrogram(n_mels=80)(waveform).squeeze(0) | |
| # num_frames = mel_spectrogram.size(1) | |
| # target_length = 400 | |
| # if num_frames < target_length: | |
| # padding = target_length - num_frames | |
| # mel_spectrogram = torch.cat([mel_spectrogram, torch.zeros(mel_spectrogram.size(0), padding)], dim=1) | |
| # else: | |
| # mel_spectrogram = mel_spectrogram[:, :target_length] | |
| # mel_spectrogram = mel_spectrogram.transpose(0, 1) | |
| # length = torch.tensor([mel_spectrogram.size(0)]) | |
| # return mel_spectrogram.unsqueeze(0) ,length | |
| def process_audio(encoded_audio): | |
| decoded_audio = base64.b64decode(encoded_audio) | |
| audio_bytes = io.BytesIO(decoded_audio) | |
| waveform, sample_rate = torchaudio.load(audio_bytes) | |
| mel_spectrogram = torchaudio.transforms.MelSpectrogram(n_mels=80) | |
| normalize = torchvision.transforms.Normalize(mean=[0.5], std=[0.5]) | |
| mel_spectrogram = mel_spectrogram(waveform)[0].squeeze(0) if len(mel_spectrogram(waveform)) > 0 else mel_spectrogram(waveform).squeeze(0) | |
| num_frames = mel_spectrogram.size(1) | |
| target_length = 400 | |
| target_size=(224, 224) | |
| if num_frames < target_length: | |
| padding = target_length - num_frames | |
| mel_spectrogram = torch.cat([mel_spectrogram, torch.zeros(mel_spectrogram.size(0), padding)], dim=1) | |
| else: | |
| mel_spectrogram = mel_spectrogram[:, :target_length] | |
| mel_spectrogram = torchvision.transforms.Resize(target_size)(mel_spectrogram.unsqueeze(0)).squeeze(0) | |
| mel_spectrogram = mel_spectrogram.unsqueeze(0) | |
| mel_spectrogram = mel_spectrogram.repeat(3, 1, 1) | |
| mel_spectrogram = normalize(mel_spectrogram).unsqueeze(0) | |
| return mel_spectrogram | |
| def process_image(base64_img,extend=0): | |
| image_data = base64.b64decode(base64_img) | |
| img = Image.open(io.BytesIO(image_data)).convert('RGB') | |
| if isinstance(img, torch.Tensor): | |
| img = transforms.ToPILImage()(img) | |
| elif isinstance(img, np.ndarray): | |
| img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) | |
| mtcnn = MTCNN(keep_all=False, device='cuda' if torch.cuda.is_available() else 'cpu') | |
| boxes, _ = mtcnn.detect(img) | |
| face_detected = boxes is not None | |
| if face_detected: | |
| real_w, real_h = img.size | |
| box = boxes[0] | |
| bbox = list(map(float, box)) | |
| x1 = int(bbox[0]) | |
| x2 = int(bbox[1]) | |
| x3 = int(bbox[2]) | |
| x4 = int(bbox[3]) | |
| img = img.crop((x1,x2,x3,x4)) | |
| transformer = torchvision.transforms.Compose([ | |
| transforms.ToTensor(), | |
| transforms.Resize((224, 224), antialias=True) | |
| ]) | |
| img = transformer(img) | |
| bbox = [x1,x2,x3,x4] if boxes is not None else None | |
| return img.unsqueeze(0), face_detected, bbox | |
| def process_fingerprint_image(base64_img): | |
| image_data = base64.b64decode(base64_img) | |
| img = Image.open(io.BytesIO(image_data)).convert('RGB') | |
| if isinstance(img, torch.Tensor): | |
| img = transforms.ToPILImage()(img) | |
| elif isinstance(img, np.ndarray): | |
| img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) | |
| transformer = torchvision.transforms.Compose([ | |
| transforms.ToTensor(), | |
| transforms.Resize((224, 224), antialias=True) | |
| ]) | |
| img = transformer(img) | |
| results = finger_print_detector(img.unsqueeze(0)) | |
| is_detected = any(np.array(results[0].boxes.cls.cpu())==0) | |
| return img.unsqueeze(0),is_detected | |
| class ConformerClassifier(torch.nn.Module): | |
| def __init__(self, input_dim, num_classes, num_heads, ffn_dim, num_layers, depthwise_conv_kernel_size,dropout=0.0,use_group_norm=False,convolution_first=False): | |
| super(ConformerClassifier, self).__init__() | |
| self.conformer = torchaudio.models.Conformer( | |
| input_dim=input_dim, | |
| num_heads=num_heads, | |
| ffn_dim=ffn_dim, | |
| num_layers=num_layers, | |
| depthwise_conv_kernel_size=depthwise_conv_kernel_size, | |
| dropout=dropout, | |
| use_group_norm=use_group_norm, | |
| convolution_first=convolution_first | |
| ) | |
| self.fc = torch.nn.Linear(input_dim, num_classes) | |
| def forward(self, x, lengths): | |
| x,length = self.conformer(x, lengths) | |
| x = x.mean(dim=1) | |
| x = self.fc(x) | |
| return x | |
| def initialize_weights(m): | |
| if isinstance(m, nn.Linear): | |
| nn.init.xavier_uniform_(m.weight) | |
| if m.bias is not None: | |
| nn.init.zeros_(m.bias) | |
| class NativeAdapter(nn.Module): | |
| def __init__(self, input_dim=1024, bottleneck_dim=64): | |
| super(NativeAdapter, self).__init__() | |
| self.linear1 = nn.Linear(input_dim, bottleneck_dim) | |
| self.activ = nn.GELU() | |
| self.linear2 = nn.Linear(bottleneck_dim, input_dim) | |
| self.apply(initialize_weights) | |
| def forward(self, x): | |
| residual = x | |
| out = self.linear1(x) | |
| out = self.activ(out) | |
| out = self.linear2(out) | |
| return out + residual | |
| class EnsembleAdapter(nn.Module): | |
| def __init__(self): | |
| super(EnsembleAdapter, self).__init__() | |
| self.adapter1 = NativeAdapter() | |
| self.adapter2 = NativeAdapter() | |
| def forward(self, x): | |
| out1 = self.adapter1(x) | |
| out2 = self.adapter2(x) | |
| out = (out1 + out2) / 2 | |
| cos_sim = torch.nn.functional.cosine_similarity(out1, out2, dim=-1) | |
| cos_sim_loss = cos_sim.mean() | |
| return out, cos_sim_loss | |
| class FWTLayer(nn.Module): | |
| def __init__(self, hidden_dim=1024, std=0.02): | |
| super(FWTLayer, self).__init__() | |
| self.hidden_dim = hidden_dim | |
| self.std = std | |
| self.W_alpha = nn.Parameter(torch.randn(hidden_dim)) | |
| self.W_beta = nn.Parameter(torch.randn(hidden_dim)) | |
| def forward(self, x): | |
| alpha = torch.randn(self.hidden_dim).to(x.device) * self.std * F.softplus(self.W_alpha) | |
| beta = torch.randn(self.hidden_dim).to(x.device) * self.std * F.softplus(self.W_beta) | |
| x_transformed = x + alpha * x + beta | |
| return x_transformed | |
| class UpdatedBlock(nn.Module): | |
| def __init__(self, encoder_block): | |
| super(UpdatedBlock, self).__init__() | |
| self.ln_1 = encoder_block.ln_1 | |
| self.self_attention = encoder_block.self_attention | |
| self.dropout = encoder_block.dropout | |
| self.ensemble_adapter1 = EnsembleAdapter() | |
| self.ln_2 = encoder_block.ln_2 | |
| self.mlp = encoder_block.mlp | |
| self.ensemble_adapter2 = EnsembleAdapter() | |
| self.fwt_layer = FWTLayer() | |
| def forward(self, input): | |
| x = self.ln_1(input) | |
| x, _ = self.self_attention(x, x, x, need_weights=False) | |
| x = self.dropout(x) | |
| x, loss_1 = self.ensemble_adapter1(x) | |
| x = x + input | |
| y = self.ln_2(x) | |
| y = self.mlp(y) | |
| y, loss_2 = self.ensemble_adapter2(y) | |
| out = x + y | |
| if self.training: | |
| out = self.fwt_layer(out) | |
| return out, (loss_1 + loss_2) / 2 | |
| class UpdatedEncoder(nn.Module): | |
| def __init__(self, encoder): | |
| super(UpdatedEncoder, self).__init__() | |
| self.pos_embedding = encoder.pos_embedding | |
| self.dropout = encoder.dropout | |
| self.layers = nn.ModuleList([UpdatedBlock(layer) for layer in encoder.layers]) | |
| self.ln = encoder.ln | |
| def forward(self, x): | |
| out = x + self.pos_embedding | |
| out = self.dropout(out) | |
| total_loss = 0 | |
| for layer in self.layers: | |
| out, loss = layer(out) | |
| total_loss += loss | |
| out = self.ln(out) | |
| return out, total_loss | |
| class UpdatedViT(nn.Module): | |
| def __init__(self, base_model): | |
| super(UpdatedViT, self).__init__() | |
| self.conv_proj = base_model.conv_proj | |
| self.encoder = UpdatedEncoder(base_model.encoder) | |
| self.heads = base_model.heads | |
| self._process_input = base_model._process_input | |
| self.class_token = base_model.class_token | |
| def forward(self, x): | |
| x = self._process_input(x) | |
| n = x.shape[0] | |
| batch_class_token = self.class_token.expand(n, -1, -1) | |
| x = torch.cat([batch_class_token, x], dim=1) | |
| x, cos_loss = self.encoder(x) | |
| x = x[:, 0] | |
| x = self.heads(x) | |
| return x, cos_loss / len(self.encoder.layers) | |
| # Voice Binary Model | |
| # voice_binary_model = ConformerClassifier( | |
| # input_dim=80, | |
| # num_classes=2, | |
| # num_heads=4, | |
| # ffn_dim=128, | |
| # num_layers=4, | |
| # depthwise_conv_kernel_size=7, | |
| # dropout=0.3, | |
| # use_group_norm=False, | |
| # convolution_first=True | |
| # ) | |
| # voice_binary_model.load_state_dict(torch.load('binary_voice_model.pth',map_location='cpu')) | |
| # voice_binary_model.eval() | |
| voice_model_binary = torchvision.models.vit_l_16(weights=None,progress=True) | |
| voice_model_binary.heads=nn.Sequential( | |
| nn.Linear(1024, 512), | |
| nn.ReLU(), | |
| nn.Dropout(0.3), | |
| nn.Linear(512, 2) | |
| ) | |
| voice_binary_model = UpdatedViT(voice_model_binary) | |
| voice_binary_model.load_state_dict(torch.load('voice_weights.pth',map_location='cpu')) | |
| voice_binary_model.eval() | |
| # Voice Multi Model | |
| voice_multi_model = ConformerClassifier( | |
| input_dim=80, | |
| num_classes=4, | |
| num_heads=4, | |
| ffn_dim=128, | |
| num_layers=4, | |
| depthwise_conv_kernel_size=31, | |
| dropout=0.3, | |
| use_group_norm=False, | |
| convolution_first=True | |
| ) | |
| voice_multi_model.load_state_dict(torch.load('multi_voice_model.pth',map_location='cpu')) | |
| voice_multi_model.eval() | |
| # Vision Transformer Binary Model | |
| vit_model_binary = torchvision.models.vit_l_16(weights=None,progress=True) | |
| vit_model_binary.heads=nn.Sequential( | |
| nn.Linear(1024, 512), | |
| nn.ReLU(), | |
| nn.Dropout(0.3), | |
| nn.Linear(512, 2) | |
| ) | |
| vit_binary_model = UpdatedViT(vit_model_binary) | |
| vit_binary_model.load_state_dict(torch.load('binary_vit_model_correct.pth',map_location='cpu')) | |
| vit_binary_model.eval() | |
| # Vision Transformer Multi Model | |
| vit_model_multi = torchvision.models.vit_l_16(weights=None,progress=True) | |
| vit_model_multi.heads=nn.Sequential( | |
| nn.Linear(1024, 512), | |
| nn.ReLU(), | |
| nn.Dropout(0.3), | |
| nn.Linear(512, 5) | |
| ) | |
| vit_multi_model = UpdatedViT(vit_model_multi) | |
| vit_multi_model.load_state_dict(torch.load('multi_vit_model_correct.pth',map_location='cpu')) | |
| vit_multi_model.eval() | |
| # ConvNext Binary Model | |
| convnext_binary_model = torchvision.models.convnext_base(weights=None,progress=False) | |
| convnext_binary_model.classifier[2]=nn.Sequential( | |
| nn.Linear(1024, 512), | |
| nn.ReLU(), | |
| nn.Dropout(0.3), | |
| nn.Linear(512, 2), | |
| ) | |
| convnext_binary_model.load_state_dict(torch.load('binary_convnext_model_correct.pth',map_location='cpu')) | |
| convnext_binary_model.eval() | |
| # ConvNext Multi Model | |
| convnext_multi_model = torchvision.models.convnext_base(weights=None,progress=False) | |
| convnext_multi_model.classifier[2]=nn.Sequential( | |
| nn.Linear(1024, 512), | |
| nn.ReLU(), | |
| nn.Dropout(0.3), | |
| nn.Linear(512, 5), | |
| ) | |
| convnext_multi_model.load_state_dict(torch.load('multi_convnext_model_correct.pth',map_location='cpu')) | |
| convnext_multi_model.eval() | |
| # Fingerprint Binary Model | |
| fingerprint_binary = torchvision.models.vit_l_16(weights=None,progress=True) | |
| fingerprint_binary.heads=nn.Sequential( | |
| nn.Linear(1024, 512), | |
| nn.ReLU(), | |
| nn.Dropout(0.3), | |
| nn.Linear(512, 2) | |
| ) | |
| binary_fingerprint_model = UpdatedViT(fingerprint_binary) | |
| binary_fingerprint_model.load_state_dict(torch.load('binary_finger_print_correct1.pth',map_location='cpu')) | |
| fingerprint_binary.eval() | |
| binary_fingerprint_model.eval() | |
| # Fingerprint Multi Model | |
| fingerprint_multi = torchvision.models.vit_l_16(weights=None,progress=True) | |
| fingerprint_multi.heads=nn.Sequential( | |
| nn.Linear(1024, 512), | |
| nn.ReLU(), | |
| nn.Dropout(0.3), | |
| nn.Linear(512, 4) | |
| ) | |
| multi_fingerprint_model = UpdatedViT(fingerprint_multi) | |
| multi_fingerprint_model.load_state_dict(torch.load('multi_finger_print_correct1.pth',map_location='cpu')) | |
| fingerprint_multi.eval() | |
| multi_fingerprint_model.eval() | |
| print('Models Loaded Successfully') | |
| def home(): | |
| return "Welcome to the Antispoofing Solutions!" | |
| # @app.route('/api/voice', methods=['POST']) | |
| # def post_api_voice(): | |
| # try: | |
| # binary_mode = request.args.get('binary', 'False').lower() == 'true' | |
| # data = request.json | |
| # if not data or 'base64' not in data: | |
| # return jsonify({'error': 'Invalid input. No base64 data provided.','status':400}), 400 | |
| # encoded_audio = data['base64'] | |
| # mel_spectrogram, length = process_audio(encoded_audio) | |
| # with torch.no_grad(): | |
| # if binary_mode: | |
| # output = voice_binary_model(mel_spectrogram, length) | |
| # prob = torch.nn.functional.softmax(output[0], dim=0) | |
| # pred = torch.argmax(prob).item() | |
| # category = binary_labels[pred] | |
| # probs_dict = {binary_labels[i]: prob[i].item()*100 for i in range(len(binary_labels))} | |
| # else: | |
| # output = voice_multi_model(mel_spectrogram, length) | |
| # prob = torch.nn.functional.softmax(output[0], dim=0) | |
| # pred = torch.argmax(prob).item() | |
| # category = multi_voice_labels[pred] | |
| # probs_dict = {multi_voice_labels[i]: prob[i].item()*100 for i in range(len(multi_voice_labels))} | |
| # mode = 'binary' if binary_mode else 'multi' | |
| # response = { | |
| # 'message': 'Data received!', | |
| # 'class': category, | |
| # 'mode' : mode, | |
| # 'probs': probs_dict, | |
| # 'status':200 | |
| # } | |
| # return jsonify(response), 201 | |
| # except KeyError as e: | |
| # return jsonify({'error': f'Missing key: {str(e)}','status':400}), 400 | |
| # except Exception as e: | |
| # return jsonify({'error': str(e),'status':400}), 500 | |
| def post_api_voice(): | |
| try: | |
| binary_mode = True | |
| data = request.json | |
| if not data or 'base64' not in data: | |
| return jsonify({'error': 'Invalid input. No base64 data provided.','status':400}), 400 | |
| encoded_audio = data['base64'] | |
| mel_spectrogram = process_audio(encoded_audio) | |
| with torch.no_grad(): | |
| if binary_mode: | |
| output,_ = voice_binary_model(mel_spectrogram) | |
| prob = torch.nn.functional.softmax(output[0], dim=0) | |
| pred = torch.argmax(prob).item() | |
| category = binary_labels[pred] | |
| probs_dict = {binary_labels[i]: prob[i].item()*100 for i in range(len(binary_labels))} | |
| else: | |
| output = voice_multi_model(mel_spectrogram, length) | |
| prob = torch.nn.functional.softmax(output[0], dim=0) | |
| pred = torch.argmax(prob).item() | |
| category = multi_voice_labels[pred] | |
| probs_dict = {multi_voice_labels[i]: prob[i].item()*100 for i in range(len(multi_voice_labels))} | |
| mode = 'binary' if binary_mode else 'multi' | |
| response = { | |
| 'message': 'Data received!', | |
| 'class': category, | |
| 'mode' : mode, | |
| 'probs': probs_dict, | |
| 'status':200 | |
| } | |
| return jsonify(response), 201 | |
| except KeyError as e: | |
| return jsonify({'error': f'Missing key: {str(e)}','status':400}), 400 | |
| except Exception as e: | |
| return jsonify({'error': str(e),'status':400}), 500 | |
| def post_api_face(): | |
| try: | |
| binary_mode = request.args.get('binary', 'False').lower() == 'true' | |
| model_name = request.args.get('model', 'convnext').lower() | |
| data = request.json | |
| if not data or 'base64' not in data: | |
| return jsonify({'error': 'Invalid input. No base64 data provided.'}), 400 | |
| encoded_image = data['base64'] | |
| # Process the image | |
| processsed_image, is_face_detected, bbox = process_image(encoded_image) | |
| if not is_face_detected: | |
| return jsonify({'error': 'No Face Detected.','status':400}), 400 | |
| with torch.no_grad(): | |
| if binary_mode: | |
| if model_name=='transformer': | |
| output, _ = vit_binary_model(processsed_image) | |
| else: | |
| output= convnext_binary_model(processsed_image) | |
| model_name = 'convnext' | |
| prob = torch.nn.functional.softmax(output[0], dim=0) | |
| pred = torch.argmax(prob).item() | |
| category = binary_labels[pred] | |
| probs_dict = {binary_labels[i]: prob[i].item()*100 for i in range(len(binary_labels))} | |
| else: | |
| if model_name=='transformer': | |
| output, _ = vit_multi_model(processsed_image) | |
| else: | |
| output= convnext_multi_model(processsed_image) | |
| model_name = 'convnext' | |
| prob = torch.nn.functional.softmax(output[0], dim=0) | |
| pred = torch.argmax(prob).item() | |
| category = multi_face_labels[pred] | |
| probs_dict = {multi_face_labels[i]: prob[i].item()*100 for i in range(len(multi_face_labels))} | |
| mode = 'binary' if binary_mode else 'multi' | |
| response = { | |
| 'message': 'Data received!', | |
| 'class': category, | |
| 'probs': probs_dict, | |
| 'model': model_name, | |
| 'mode' : mode, | |
| 'bbox' : bbox, | |
| 'status':200 | |
| } | |
| return jsonify(response), 201 | |
| except KeyError as e: | |
| return jsonify({'error': f'Missing key: {str(e)}','status':400}), 400 | |
| except Exception as e: | |
| return jsonify({'error': str(e),'status':400}), 500 | |
| def post_api_fingerprint(): | |
| try: | |
| data = request.json | |
| binary_mode = request.args.get('binary', 'False').lower() == 'true' | |
| if not data or 'base64' not in data: | |
| return jsonify({'error': 'Invalid input. No base64 data provided.','status':400}), 400 | |
| encoded_image = data['base64'] | |
| # Process the image | |
| processsed_image, is_detected = process_fingerprint_image(encoded_image) | |
| if is_detected: | |
| with torch.no_grad(): | |
| if binary_mode: | |
| output, _ = vit_binary_model(processsed_image) | |
| prob = torch.nn.functional.softmax(output[0], dim=0) | |
| pred = torch.argmax(prob).item() | |
| category = binary_labels[pred] | |
| probs_dict = {binary_labels[i]: prob[i].item()*100 for i in range(len(binary_labels))} | |
| else: | |
| output, _ = vit_multi_model(processsed_image) | |
| prob = torch.nn.functional.softmax(output[0], dim=0) | |
| pred = torch.argmax(prob).item() | |
| category = multi_finger_print_labels[pred] | |
| probs_dict = {multi_finger_print_labels[i]: prob[i].item()*100 for i in range(len(multi_finger_print_labels))} | |
| response = { | |
| 'message': 'Data received!', | |
| 'class': category, | |
| 'probs': probs_dict, | |
| 'status':200 | |
| } | |
| return jsonify(response), 201 | |
| else: | |
| return jsonify({'error': f'No Fingerprint Detected','status':400}), 400 | |
| except KeyError as e: | |
| return jsonify({'error': f'Missing key: {str(e)}','status':400}), 400 | |
| except Exception as e: | |
| return jsonify({'error': str(e),'status':400}), 500 |