# linkup-asr / app.py
# Source: Hugging Face Space by Dougsworth (commit d829e3c, "Update app.py")
import gradio as gr
import torch
import librosa
import numpy as np
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from peft import PeftModel
# Load the Whisper processor (feature extractor + tokenizer) and the base
# model once at import time; these module-level globals are used by
# transcribe() below.
print("Loading model...")
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
# float32 weights; low_cpu_mem_usage streams weights in to reduce peak RAM
# during loading (presumably this runs on a CPU-only Space — TODO confirm).
base_model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-large-v3",
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True
)
# Wrap the base model with the fine-tuned PEFT/LoRA adapter weights.
model = PeftModel.from_pretrained(base_model, "Dougsworth/caribbean-whisper-asr")
print("Model loaded!")
def enhance_phone_audio(audio):
    """Gently enhance phone audio: DC-offset removal, peak normalization,
    and mild dynamic compression. No spectral filtering is applied.

    Args:
        audio: 1-D numpy array of audio samples.

    Returns:
        float32 numpy array normalized to [-1, 1]. An empty input is
        returned unchanged (as float32); an all-zero input stays all-zero.
    """
    audio = np.asarray(audio)
    # Guard: on an empty array np.max raises ValueError and np.mean emits
    # NaN with a runtime warning — short-circuit instead.
    if audio.size == 0:
        return audio.astype(np.float32)
    # Remove DC offset
    audio = audio - np.mean(audio)
    # Normalize to [-1, 1]
    max_val = np.max(np.abs(audio))
    if max_val > 0:
        audio = audio / max_val
    # Gentle compression: |x|^0.9 boosts quiet parts while preserving sign
    audio = np.sign(audio) * np.power(np.abs(audio), 0.9)
    # Normalize again after compression
    max_val = np.max(np.abs(audio))
    if max_val > 0:
        audio = audio / max_val
    return audio.astype(np.float32)
def transcribe(audio_path):
    """Transcribe an uploaded audio file with the fine-tuned Whisper model.

    Args:
        audio_path: Filesystem path to the uploaded audio, or None when the
            user submitted without a file.

    Returns:
        The English transcription string, or a prompt asking for a file.
    """
    if audio_path is None:
        return "Please upload an audio file."
    # Decode and resample to 16 kHz, the rate the processor expects.
    waveform, _ = librosa.load(audio_path, sr=16000)
    # Apply gentle enhancement before feature extraction.
    waveform = enhance_phone_audio(waveform)
    features = processor(waveform, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        token_ids = model.generate(
            features.input_features, language="en", task="transcribe"
        )
    return processor.batch_decode(token_ids, skip_special_tokens=True)[0]
# Build the Gradio UI: one audio upload in, one transcription textbox out.
voice_input = gr.Audio(type="filepath", label="Upload Voice Note")
text_output = gr.Textbox(label="Job Listing Transcription")

demo = gr.Interface(
    fn=transcribe,
    inputs=voice_input,
    outputs=text_output,
    title="Linkup - Caribbean Speech to Text",
    description="Upload a voice note describing a job and get it transcribed. Built for Caribbean accents.",
    examples=[],
)
demo.launch()