Spaces:

mkunz7
/

ctf

Sleeping

ctf / host.py

m kunz

test

45a4975 about 2 years ago

5.37 kB

	from flask import Flask, render_template, request, jsonify
	from transformers import Wav2Vec2FeatureExtractor, UniSpeechSatForXVector
	import torchaudio
	import torch
	import io
	import librosa
	from scipy.spatial.distance import cosine
	import numpy as np
	import os
	# brew install ffmpeg
	# pip install flask transformers librosa torch torchaudio

	# Flask application; static assets are served under /static.
	app = Flask(__name__, static_url_path="/static")

	# Reference clip for /compare_audio.
	# https://www.youtube.com/watch?v=NjR6TyHgAho first 30s
	mp3_file_path = "arnold.mp3"

	# Reference clip for /compare_audio2.
	# https://neets.ai/ "With great power comes great responsibility"
	mp3_file_path2 = "arnold2.wav"

	# The flags are read once at import time; a missing file aborts startup.
	with open("flag1.txt") as flag_file:
	    flag1 = flag_file.read()

	with open("flag2.txt") as flag_file:
	    flag2 = flag_file.read()

	# Load the feature extractor and the speaker-verification model. A local
	# "model" directory takes precedence over the published checkpoint id.
	themodel = "model" if os.path.exists("model") else "microsoft/unispeech-sat-large-sv"
	feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(themodel)
	model = UniSpeechSatForXVector.from_pretrained(themodel)

	# Preprocess audio: convert the input to a mono, 16 kHz waveform
	def preprocess_audio(audio_data):
	    """Load audio and return it as a mono, 16 kHz NumPy waveform.

	    audio_data is handed directly to torchaudio.load; callers in this file
	    pass either a file path or an uploaded file object.
	    """
	    target_rate = 16000
	    signal, source_rate = torchaudio.load(audio_data)

	    # Down-mix multi-channel audio by averaging across the channel axis.
	    channel_count = signal.shape[0]
	    if channel_count > 1:
	        signal = signal.mean(dim=0, keepdim=True)

	    # Bring everything to the common 16 kHz rate.
	    if source_rate != target_rate:
	        resampler = torchaudio.transforms.Resample(source_rate, target_rate)
	        signal = resampler(signal)

	    # Drop the size-1 dimensions and hand back a plain NumPy array.
	    return signal.squeeze().numpy()

	@app.route("/")
	def index():
	    """Render and return the index.html template."""
	    page = render_template("index.html")
	    return page

	@app.route("/chal2")
	def chal2():
	    """Render and return the chal2.html template."""
	    page = render_template("chal2.html")
	    return page

	# Hugging Face no longer runs a hosted API for this model, so audio has to be processed against the model locally now
	# https://www.ktskumar.com/2021/12/introduction-to-voice-authentication-using-javascript/
	# https://hf.space/gradioiframe/microsoft/unispeech-speaker-verification/api/predict
	# You can buy something similar through Azure; perhaps Microsoft just wanted to commercialize this
	# Cache of reference speaker embeddings, keyed by file path. The reference
	# clip does not change while the server runs, so it only needs to go through
	# the model once instead of on every request.
	_reference_embeddings = {}


	def _speaker_embedding(audio_source):
	    """Return the L2-normalized speaker embedding for one audio source.

	    audio_source is whatever preprocess_audio() accepts (this file passes
	    an uploaded file object and a file path).
	    """
	    waveform = preprocess_audio(audio_source)
	    # preprocess_audio() resamples to 16 kHz; stating the rate explicitly lets
	    # the feature extractor reject a mismatch instead of silently accepting it.
	    inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
	    # Inference only: do not record autograd state for the forward pass.
	    with torch.no_grad():
	        embeddings = model(**inputs).embeddings
	    return torch.nn.functional.normalize(embeddings, dim=-1).cpu()


	def _reference_embedding():
	    """Return the embedding of the reference clip, computing it on first use."""
	    if mp3_file_path not in _reference_embeddings:
	        _reference_embeddings[mp3_file_path] = _speaker_embedding(mp3_file_path)
	    return _reference_embeddings[mp3_file_path]


	@app.route('/compare_audio', methods=['POST'])
	def compare_audio():
	    """Compare an uploaded recording against the reference speaker clip.

	    Expects a multipart POST with the recording in the 'audio_data' file
	    field. Returns JSON with a 'result' HTML snippet (which includes flag1
	    when the cosine similarity reaches the threshold), or an 'error' message
	    if anything goes wrong while processing the request.
	    """
	    threshold = 0.89  # Adjust the threshold as needed
	    try:
	        # Get the recorded audio file from the frontend
	        recorded_audio = request.files['audio_data']

	        recorded_embedding = _speaker_embedding(recorded_audio)
	        reference_embedding = _reference_embedding()

	        # Calculate cosine similarity
	        cosine_sim = torch.nn.CosineSimilarity(dim=-1)
	        similarity = cosine_sim(recorded_embedding, reference_embedding).item()
	        similarity = round(similarity, 3)

	        # Test for success rather than failure so that a NaN score (every
	        # comparison with NaN is False) is rejected instead of being accepted.
	        if similarity >= threshold:
	            result = (
	                f"Good job! Match: {similarity}<br>{flag1}"
	                "<br><a href='/chal2'>Click here to open the next challenge</a>"
	            )
	        else:
	            # The displayed limit is derived from the threshold so the two
	            # cannot drift apart when the threshold is tuned.
	            result = (
	                f"Authorization Failed! {similarity} < {threshold:.3f}"
	                "<br>Do your best Terminator impression"
	            )

	        return jsonify({'result': result})
	    except Exception as e:
	        # Top-level request boundary: log the cause, keep the reply generic.
	        print("Caught: " + str(e))
	        return jsonify({'error': 'An error occurred during audio comparison. Im fragile please dont abuse.'})

	def extract_mfcc(audio_bytes):
	    """Return the MFCC matrix (13 coefficients) for encoded audio bytes.

	    The bytes are first run through preprocess_audio2(), and the result is
	    analysed at a 16 kHz sample rate.
	    """
	    samples = preprocess_audio2(audio_bytes)
	    return librosa.feature.mfcc(y=samples, sr=16000, n_mfcc=13)

	def preprocess_audio2(audio_bytes):
	    """Decode audio bytes to a mono, 16 kHz NumPy waveform with silence trimmed.

	    audio_bytes is the raw content of an audio file. Decoding, down-mixing
	    and resampling are delegated to preprocess_audio() so both challenges
	    share one implementation of those steps.
	    """
	    # Wrap the bytes in a file-like object so torchaudio can decode them.
	    waveform = preprocess_audio(io.BytesIO(audio_bytes))

	    # Trim silence at beginning and end. This is done on the NumPy array:
	    # librosa documents np.ndarray input, whereas the previous code handed it
	    # a torch.Tensor and relied on implicit conversion inside librosa.
	    trimmed, _ = librosa.effects.trim(waveform, top_db=20)

	    return trimmed

	@app.route('/compare_audio2', methods=['POST'])
	def compare_audio2():
	    """Compare an uploaded recording against the second reference clip by MFCC.

	    Expects a multipart POST with the recording in the 'audio_data' file
	    field. Each clip is reduced to its per-coefficient mean MFCC vector and
	    the two vectors are compared by cosine similarity. Returns JSON with a
	    'result' HTML snippet (which includes flag2 when the similarity reaches
	    the threshold), or an 'error' message if anything goes wrong.
	    """
	    threshold = 0.94
	    try:
	        recorded_audio = request.files['audio_data'].read()
	        # Use a context manager so the reference file handle is always closed.
	        with open(mp3_file_path2, 'rb') as reference_file:
	            reference_audio = reference_file.read()

	        # Compare similarity between audio
	        mfcc1 = extract_mfcc(recorded_audio)
	        mfcc2 = extract_mfcc(reference_audio)
	        similarity = 1 - cosine(np.mean(mfcc1, axis=1), np.mean(mfcc2, axis=1))
	        similarity = round(similarity, 3)

	        # Test for success rather than failure so that a NaN score (every
	        # comparison with NaN is False) is rejected instead of being accepted.
	        if similarity >= threshold:
	            result = "Good job! Match: " + str(similarity) + "<br>" + flag2
	        else:
	            # The displayed limit is derived from the threshold so the two
	            # cannot drift apart when the threshold is tuned.
	            limit_text = f"{threshold:.3f}"
	            result = (
	                "Authorization Failed! " + str(similarity) + " < " + limit_text
	                + "<br>Say: 'With great power comes great responsibility' as Arnold Schwarzenegger"
	            )

	        return jsonify({'result': result})
	    except Exception as e:
	        # Top-level request boundary: log the cause, keep the reply generic.
	        print("Caught: " + str(e))
	        return jsonify({'error': 'An error occurred during audio comparison. Im fragile please dont abuse.'})

	if __name__ == '__main__':
	    # The server listens on all interfaces, so the interactive debugger must
	    # not be enabled by default: it lets a client run code on the host.
	    # Set FLASK_DEBUG=1 to opt in during local development.
	    debug_enabled = os.environ.get("FLASK_DEBUG", "0") == "1"
	    app.run(host="0.0.0.0", port=8080, debug=debug_enabled)