userdotcs's picture
Update app.py
b88c7ba verified
import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa
import numpy as np
import spaces
# Hugging Face Hub checkpoint used for both the processor and the model.
MODEL_ID = "userdotcs/whisper-small-turkish-phono-finetuned"

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Load the processor and the model once at import time so that every
# request handled by the app reuses the same instances.
processor = WhisperProcessor.from_pretrained(MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
model = model.to(device)
@spaces.GPU
def transcribe(audio):
    """Transcribe one audio clip with the module-level Whisper model.

    Args:
        audio: ``(sample_rate, samples)`` pair, or ``None`` when no audio
            was supplied. ``samples`` is a NumPy array; arrays with more
            than one dimension are averaged over axis 1, i.e. they are
            assumed to be laid out as ``(samples, channels)``.

    Returns:
        The decoded transcription of the clip, or the message
        ``"Ses dosyası algılanamadı."`` when ``audio`` is ``None`` or
        contains no samples.
    """
    if audio is None:
        return "Ses dosyası algılanamadı."

    target_sr = 16000
    sr, y = audio

    # 1. Work in float32 (integer PCM would otherwise break the division
    #    below). The division is done out of place so the caller's array
    #    is never mutated.
    y = np.asarray(y, dtype=np.float32)

    # 2. Downmix to mono *before* normalising: averaging channels after
    #    normalisation would leave the final signal below full scale.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # An empty clip has no peak (np.max would raise) and nothing to decode.
    if y.size == 0:
        return "Ses dosyası algılanamadı."

    # 3. Peak-normalise to [-1, 1]; pure silence is left untouched to
    #    avoid dividing by zero.
    peak = float(np.max(np.abs(y)))
    if peak > 0:
        y = y / peak

    # 4. Resample to the rate passed to the processor below.
    if sr != target_sr:
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)

    # 5. Build the model input features and move them to the model's device.
    input_features = processor(
        y, sampling_rate=target_sr, return_tensors="pt"
    ).input_features.to(device)

    # 6. Inference (no gradients needed).
    with torch.no_grad():
        # Optionally, forced_decoder_ids can be used to force the model
        # to Turkish.
        predicted_ids = model.generate(input_features, max_new_tokens=128)

    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
# User interface: one audio input wired to `transcribe`, plain-text output.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="numpy"),  # keep this if you are using Gradio 4+
    outputs="text",
    # Title spelling corrected ("Trancription" -> "Transcription").
    title="Whisper Small Turkish Phonetic Transcription",
)
demo.launch()