File size: 2,180 Bytes
c3fbcde
2bc8f15
f0e9bad
0918b24
c27f348
707f539
c871a9c
1170a88
c871a9c
cde6c6f
707f539
 
 
cde6c6f
707f539
 
 
1feadc6
c3fbcde
cde6c6f
 
 
 
 
c3fbcde
 
 
 
707f539
 
 
c3fbcde
cde6c6f
 
cc6ae2a
6161422
cc6ae2a
6161422
1feadc6
cde6c6f
cc6ae2a
c3fbcde
 
cc6ae2a
 
 
707f539
cc6ae2a
 
6161422
cc6ae2a
707f539
cde6c6f
cc6ae2a
 
 
6161422
707f539
cc6ae2a
707f539
cc6ae2a
 
1feadc6
cc6ae2a
 
1feadc6
ca5b750
cc6ae2a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# app.py – FIXED: no_timestamps_token_id added (no more ValueError)
import os
import gradio as gr
import spaces
from transformers import pipeline
import torch

MODEL_NAME = "palli23/whisper-small-sam_spjall"

print("Loading model once at startup...")
# Build the ASR pipeline once at module import so every request reuses the
# already-loaded weights (fp16 on GPU device 0). HF_TOKEN is read from the
# environment because the model repo may be private/gated.
pipe = pipeline(
    "automatic-speech-recognition",
    model=MODEL_NAME,
    torch_dtype=torch.float16,
    device=0,
    token=os.getenv("HF_TOKEN")
)

# Fix old Whisper config completely (including timestamps token)
# Older fine-tuned checkpoints can ship a generation_config that predates the
# lang/task token maps newer transformers versions expect; patch the missing
# attributes in place so generate() does not raise a ValueError.
# NOTE(review): 50359 (<|transcribe|>) and 50363 (<|notimestamps|>) match the
# standard multilingual Whisper vocab, but 50259 is <|en|> there — confirm the
# Icelandic <|is|> token id against this checkpoint's tokenizer.
if not hasattr(pipe.model.generation_config, "lang_to_id"):
    pipe.model.generation_config.lang_to_id = {"is": 50259}
    pipe.model.generation_config.task_to_id = {"transcribe": 50359}
    pipe.model.generation_config.forced_decoder_ids = None

# ←←← THIS FIXES THE TIMESTAMP ERROR
if not hasattr(pipe.model.generation_config, "no_timestamps_token_id"):
    pipe.model.generation_config.no_timestamps_token_id = 50363

# Force Icelandic transcription for every call (no language auto-detect,
# no translation task).
pipe.model.generation_config.language = "is"
pipe.model.generation_config.task = "transcribe"

print("Model ready – fully fixed for timestamps!")

@spaces.GPU(duration=120)
def transcribe_safe(audio_path):
    """Transcribe an uploaded audio file to Icelandic text.

    Args:
        audio_path: Filesystem path to the uploaded audio file, or a falsy
            value when nothing was uploaded.

    Returns:
        The transcription string, or an Icelandic status message when input
        is missing ("Hladdu upp hljóðskrá") or yields no text ("Ekkert heyrt").
    """
    if not audio_path:
        return "Hladdu upp hljóðskrá"

    # Let the pipeline do the long-form chunking itself: with chunk_length_s /
    # stride_length_s it merges overlapping windows via token alignment, so
    # overlap regions are decoded once.  The manual 20 s / 2 s slicing this
    # replaces concatenated raw chunk texts, which duplicated the words inside
    # every 2 s overlap and silently dropped trailing audio shorter than 1 s.
    # Passing the path also lets the pipeline handle decoding/resampling,
    # removing the librosa dependency from the hot path.
    result = pipe(
        audio_path,
        chunk_length_s=20,
        stride_length_s=2,
        batch_size=16,
    )

    return result["text"].strip() or "Ekkert heyrt"

# Minimal single-page UI: one audio upload, one button, one output textbox.
with gr.Blocks(title="Íslenskt ASR – 3 mín T4 Paid") as demo:
    # Header and short spec line.
    gr.Markdown("# Íslenskt ASR – 3 mín hljóð")
    gr.Markdown("**~4 % WER · 15–25 sek · T4 Paid**")

    audio_input = gr.Audio(
        type="filepath",
        label="Hladdu upp .mp3 / .wav (allt að 3 mín)",
    )
    transcribe_btn = gr.Button(
        "Transcribe (15–25 sek)",
        variant="primary",
        size="lg",
    )
    transcript_box = gr.Textbox(lines=30, label="Útskrift")

    # Wire the button to the GPU-backed transcription function.
    transcribe_btn.click(transcribe_safe, inputs=audio_input, outputs=transcript_box)

# Basic-auth gate for the beta deployment.
demo.launch(auth=("beta", "beta2025"))