File size: 3,141 Bytes
7708e2e
364e491
7708e2e
364e491
7708e2e
364e491
024de1d
364e491
83e303c
364e491
 
 
abd1c03
364e491
 
 
 
7708e2e
 
 
364e491
83e303c
364e491
abd1c03
83e303c
 
 
 
7708e2e
 
83e303c
abd1c03
 
 
 
7708e2e
abd1c03
83e303c
 
 
abd1c03
7708e2e
 
364e491
83e303c
364e491
 
 
83e303c
abd1c03
 
 
024de1d
364e491
 
024de1d
83e303c
024de1d
83e303c
024de1d
abd1c03
83e303c
024de1d
83e303c
024de1d
 
abd1c03
024de1d
abd1c03
024de1d
83e303c
024de1d
83e303c
364e491
abd1c03
83e303c
 
364e491
 
83e303c
abd1c03
024de1d
 
364e491
 
024de1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364e491
83e303c
 
 
7708e2e
83e303c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import gradio as gr
import torch
from transformers import pipeline
import spaces

# =========================================
# LOAD MODEL
# =========================================
# Build the shared speech-recognition pipeline once at import time.
# It is created on the CPU in bfloat16 to keep startup memory low; the
# request handler is responsible for moving it to the GPU when needed.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="uzair0/Katib-ASR",
    device="cpu",
    torch_dtype=torch.bfloat16,
)

@spaces.GPU(duration=60)
def transcribe_audio(audio_filepath):
    """Transcribe a recorded audio file with the module-level ASR pipeline.

    The shared ``pipe`` is moved to CUDA for the duration of the call and is
    always moved back to the CPU afterwards, including when transcription
    raises, so a failed request does not leave the pipeline on the GPU.

    Args:
        audio_filepath: Path of the audio file to transcribe, or ``None``
            when no audio was provided.

    Returns:
        The ``"text"`` entry of the pipeline result, or a warning message
        when ``audio_filepath`` is ``None``.
    """
    if audio_filepath is None:
        return "⚠️ Please record some audio first!"

    gpu = torch.device("cuda")
    cpu = torch.device("cpu")

    try:
        # Move the model to the GPU and keep the pipeline's own device
        # attribute in step with the model's placement for this call.
        pipe.model.to(gpu)
        pipe.device = gpu

        result = pipe(
            audio_filepath,
            chunk_length_s=30,  # Helps with longer recordings
            generate_kwargs={
                "language": "pashto",
                "task": "transcribe",
            },
        )
    finally:
        # Restore CPU placement even on failure. Per the original author's
        # note this lets ZeroGPU release its hook — NOTE(review): confirm
        # against the ZeroGPU documentation.
        pipe.model.to(cpu)
        pipe.device = cpu

    return result["text"]

# =========================================
# UI DESIGN (Dark Reference Layout)
# =========================================

# Stylesheet handed to gr.Blocks(css=...) below. It sets a dark page
# background and styles the elements tagged with the "transcription-box",
# "submit-btn" and "clear-btn" classes (assigned via elem_classes in the UI).
# The transcription textarea is rendered right-to-left and right-aligned.
custom_css = """
.gradio-container { background-color: #0b0f19 !important; border: none !important; }
h2, p { color: white !important; }

/* Transcription box styling */
.transcription-box textarea { 
    direction: rtl !important; 
    text-align: right !important; 
    font-size: 1.2em !important;
    background-color: #161b22 !important;
    color: white !important;
    border: 1px solid #30363d !important;
}

/* Orange Submit Button */
.submit-btn {
    background: #ff5722 !important;
    color: white !important;
    font-weight: bold !important;
    border: none !important;
}

.clear-btn {
    background-color: #21262d !important;
    color: white !important;
    border: 1px solid #30363d !important;
}

/* Make audio player look better in dark mode */
audio { filter: invert(1) hue-rotate(180deg); }
"""

# Page layout: a heading, then two equal columns — the microphone recorder
# with its Clear/Submit buttons on the left, the transcript box on the right.
with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
    # Heading and short usage hint
    with gr.Column():
        gr.Markdown("## 🎙️ Katib ASR: Pashto Speech Recognition")
        gr.Markdown("Click the Record button below, speak Pashto into your microphone, and see the result!")

    with gr.Row():
        # Left column: recorder and action buttons
        with gr.Column(scale=1):
            audio_input = gr.Audio(label="Record Pashto", type="filepath", sources=["microphone"])
            with gr.Row():
                clear_btn = gr.Button("Clear", elem_classes="clear-btn")
                submit_btn = gr.Button("Submit", elem_classes="submit-btn")

        # Right column: transcription output
        with gr.Column(scale=1):
            output_text = gr.Textbox(elem_classes="transcription-box", lines=8, label="Katib ASR Transcription")

    # Event wiring: Submit runs the transcription on the recorded file;
    # Clear resets the recorder to None and the transcript to an empty string.
    submit_btn.click(transcribe_audio, audio_input, output_text)
    clear_btn.click(lambda: [None, ""], None, [audio_input, output_text])

demo.launch(ssr_mode=False)