File size: 1,556 Bytes
8864fd4
 
 
d7d90f3
 
b88c7ba
8864fd4
3eb3019
7833c3a
8864fd4
7833c3a
8864fd4
7833c3a
8864fd4
f8741c8
8864fd4
7833c3a
 
 
8864fd4
d7d90f3
7833c3a
 
 
 
 
d7d90f3
7833c3a
d7d90f3
7833c3a
d7d90f3
 
 
 
7833c3a
 
8864fd4
7833c3a
8864fd4
7833c3a
 
8864fd4
7833c3a
 
8864fd4
7833c3a
 
8864fd4
7833c3a
8864fd4
7833c3a
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa
import numpy as np
import spaces

# Hugging Face model id of the fine-tuned Turkish phonetic Whisper checkpoint.
MODEL_ID = "userdotcs/whisper-small-turkish-phono-finetuned"
# Prefer GPU when available; used for both the model weights and input tensors.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor (feature extractor + tokenizer) and the model once at startup.
processor = WhisperProcessor.from_pretrained(MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID).to(device)

@spaces.GPU
def transcribe(audio):
    """Transcribe a Gradio audio input with the fine-tuned Whisper model.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | None
        ``(sample_rate, samples)`` pair as produced by ``gr.Audio(type="numpy")``,
        or ``None`` when no recording was provided.

    Returns
    -------
    str
        The decoded transcription, or a Turkish error message when the
        input is missing or empty.
    """
    if audio is None:
        return "Ses dosyası algılanamadı."

    sr, y = audio

    # Guard against an empty recording: np.max on an empty array raises ValueError.
    if y.size == 0:
        return "Ses dosyası algılanamadı."

    # 1. Float32 conversion and peak normalization (critical step — Whisper
    #    expects float waveforms; Gradio typically delivers integer PCM).
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))  # compute once instead of twice
    if peak > 0:
        y /= peak

    # 2. Downmix to mono — Whisper expects a single channel.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # 3. Resample to Whisper's fixed 16 kHz input rate.
    if sr != 16000:
        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        sr = 16000

    # 4. Build model input features (log-mel spectrogram) and move to device.
    input_features = processor(y, sampling_rate=sr, return_tensors="pt").input_features.to(device)

    # 5. Inference; max_new_tokens bounds the generated sequence length.
    with torch.no_grad():
        # forced_decoder_ids could be used to force Turkish output (optional).
        predicted_ids = model.generate(input_features, max_new_tokens=128)

    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription

# UI: a single audio input mapped to a text output.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="numpy"),  # numpy mode delivers (sample_rate, samples) to transcribe()
    outputs="text",
    title="Whisper Small Turkish Phonetic Transcription",  # fixed typo: was "Trancription"
)

demo.launch()