Spaces:

jvcanavarro
/

speaker-identification-demo

Runtime error

App Files Files Community

speaker-identification-demo / app.py

jvcanavarro

Add app v0.1

8b843d9 over 2 years ago

raw

history blame contribute delete

2.39 kB

	import gradio as gr
	import torch
	import librosa
	import time
	import pandas as pd
	from datetime import datetime
	from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

	# Text shown under each tab of the demo (rendered as Markdown by Gradio).
	DESCRIPTION = (
	    "Store a record of previous calls in order to verify if the client already called or not. "
	    "Pretrained on `https://huggingface.co/datasets/superb` using "
	    "[S3PRL recipe](https://github.com/s3prl/s3prl/tree/master/s3prl/downstream/voxceleb1)."
	)

	# Columns of the call log CSV: ["call_id", "date", "client_id", "duration", "new"]

	# Both the classifier and its feature extractor come from the same checkpoint.
	_CHECKPOINT = "superb/wav2vec2-large-superb-sid"
	model = Wav2Vec2ForSequenceClassification.from_pretrained(_CHECKPOINT)
	feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(_CHECKPOINT)

	def file_to_array(path):
	    """Load an audio file as a 16 kHz mono waveform and measure its length.

	    Args:
	        path: Location of the audio file to decode.

	    Returns:
	        A ``(speech, duration)`` tuple: the waveform returned by
	        ``librosa.load`` and its duration in seconds.
	    """
	    speech, sample_rate = librosa.load(path, sr=16000, mono=True)
	    # get_duration falls back to 22050 Hz when no rate is given, which would
	    # under-report the length of audio that was resampled to 16 kHz.
	    duration = librosa.get_duration(y=speech, sr=sample_rate)
	    return speech, duration


	def _load_call_records(path):
	    """Read the call log at *path*, or start an empty one if it is missing or blank."""
	    try:
	        return pd.read_csv(path)
	    except (FileNotFoundError, pd.errors.EmptyDataError):
	        # First run: no log yet, so begin with the expected columns only.
	        return pd.DataFrame(columns=["call_id", "date", "client_id", "duration", "new"])


	def handler(audio_path):
	    """Identify the speaker in a recording, log the call, and report the result.

	    The predicted speaker label is used as the client id. A row describing
	    this call is appended to ``call_records.csv``, which is rewritten on
	    every invocation.

	    Args:
	        audio_path: Path to the audio file to classify.

	    Returns:
	        A message saying whether this is the client's first recorded call
	        or how many earlier calls were found for them.
	    """
	    records_path = "call_records.csv"
	    calls = _load_call_records(records_path)
	    speech, duration = file_to_array(audio_path)

	    # compute attention masks and normalize the waveform if needed
	    inputs = feature_extractor(speech, sampling_rate=16000, padding=True, return_tensors="pt")

	    # Inference only: skip autograd bookkeeping for the forward pass.
	    with torch.no_grad():
	        logits = model(**inputs).logits
	    predicted_ids = torch.argmax(logits, dim=-1)
	    labels = [model.config.id2label[_id] for _id in predicted_ids.tolist()]

	    client_id = labels[0]
	    call_id = str(int(time.time()))

	    date = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

	    # Count earlier calls before this one is added to the log.
	    n_of_calls = int((calls["client_id"] == client_id).sum())
	    new = n_of_calls == 0

	    # add new call record
	    record = [call_id, date, client_id, duration, new]
	    calls.loc[len(calls)] = record

	    calls.to_csv(records_path, index=False)

	    if new:
	        return f"New client call: Client ID {client_id}"

	    return f"Client {client_id} calling again: {n_of_calls} previous calls"


	def _make_tab(audio_input):
	    """Build an Interface that feeds *audio_input* to ``handler``."""
	    return gr.Interface(
	        fn=handler,
	        inputs=audio_input,
	        outputs=gr.Text(label="Output", value="..."),
	        description=DESCRIPTION,
	    )


	# The two tabs share one handler and differ only in how audio is captured.
	first = _make_tab(gr.Audio(label="Speech Audio", type="filepath"))

	# NOTE(review): newer Gradio releases appear to expect `sources=[...]` rather
	# than `source=` -- confirm against the Gradio version this Space pins.
	second = _make_tab(
	    gr.Audio(label="Microphone Input", source="microphone", type="filepath")
	)

	app = gr.TabbedInterface(
	    [first, second],
	    tab_names=["Audio Upload", "Microphone"],
	    title="Speaker Call Verification 🎤",
	)
	app.launch()