File size: 3,083 Bytes
ff44794
b23bcf3
ff44794
778f5dc
3debdab
ff44794
 
 
 
6d44df0
ff44794
 
 
 
 
 
778f5dc
 
 
c147ba9
b7809d1
 
 
 
778f5dc
 
3debdab
778f5dc
 
b23bcf3
778f5dc
 
3debdab
778f5dc
3debdab
 
b7809d1
778f5dc
b23bcf3
b7809d1
778f5dc
b7809d1
 
 
 
 
778f5dc
b7809d1
 
778f5dc
b23bcf3
b7809d1
 
 
 
 
 
778f5dc
3debdab
778f5dc
 
 
 
b23bcf3
778f5dc
b23bcf3
ff44794
 
 
 
 
 
 
 
 
b7809d1
 
 
ff44794
 
b7809d1
ff44794
b7809d1
ff44794
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b7809d1
 
ff44794
 
 
 
 
 
 
 
 
b7809d1
ff44794
 
 
 
b7809d1
ff44794
b7809d1
 
 
 
 
 
 
 
ff44794
 
 
 
 
 
 
 
 
 
 
 
3debdab
ff44794
 
 
 
 
3debdab
b7809d1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import os
import re
import requests
import gradio as gr

# =========================
# ElevenLabs Config
# =========================

# SECURITY: the API key must never be committed to source control. It is read
# from the ELEVENLABS_API_KEY environment variable (e.g. a deployment secret).
# When the variable is unset the key is an empty string, and the transcription
# request is sent with an empty key.
ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY", "")
STT_URL = "https://api.elevenlabs.io/v1/speech-to-text"

# =========================
# Regex Cleaning
# =========================

# All patterns below are meant to be used with the replacement r'\1'.

# A word followed by one or more repetitions of itself ("the the the" -> "the").
REPEAT_WORD = re.compile(r'\b(\w+)(?:\s+\1\b)+', re.IGNORECASE)
# A character repeated three or more times in a row ("sooo" -> "so").
# Digits are excluded so that numbers such as "1000" are not turned into "10".
CHAR_STRETCH = re.compile(r'([^\d\n])\1{2,}')
# Same shape as REPEAT_WORD, restricted to tokens of one to three characters.
REPEAT_SYLLABLE = re.compile(r'\b(\w{1,3})(?:\s+\1\b)+', re.IGNORECASE)

# A stuttered leading letter joined by hyphens or tatweel, e.g. "ب-ب-بالشيء".
# The match ends on the first letter of the real word, so replacing it with
# group 1 yields "بالشيء" rather than a doubled letter. Only letters are
# accepted, so numeric forms such as "1-1" are left alone.
STUTTER = re.compile(r'\b([^\W\d_])[\-ـ]+(?:\1[\-ـ]+)*\1')
# A word repeated right after a comma ("yes, yes" -> "yes"). The trailing \b
# stops a prefix match from eating part of a longer word ("the, then").
REPEAT_AFTER_COMMA = re.compile(r'(\b\w+\b)[،,]\s+\1\b')
# Whitespace that precedes a Latin or Arabic comma.
COMMA_SPACES = re.compile(r'\s+([،,])')

def is_filler(word):
    """Return True when *word* should be dropped from a cleaned transcript.

    The comparison is done on the lower-cased word. A word counts as a
    filler when either of these holds:

    * it has at most two characters (the empty string included), or
    * it has at most four characters and they are all the same character.

    NOTE(review): the length rule also drops genuine one- and two-letter
    words; confirm this is intended.
    """
    token = word.lower()
    size = len(token)

    very_short = size <= 2
    single_char_run = size <= 4 and len(set(token)) == 1

    return very_short or single_char_run



def clean_transcript(text):
    """Return *text* with stretches, stutters, repeats and fillers removed.

    The module-level patterns are applied one after another, each replacing
    its match with its first capture group. The result is then split on
    whitespace, every word for which ``is_filler`` is true is discarded, and
    the remaining words are joined with single spaces.
    """
    # Order matters: each pattern works on the output of the previous one.
    pipeline = (
        CHAR_STRETCH,        # collapse stretched sounds
        STUTTER,             # fix stutter like ب-ب-بالشيء
        REPEAT_WORD,         # repeated words
        REPEAT_SYLLABLE,     # repeated short syllables
        REPEAT_AFTER_COMMA,  # repeated after comma
        COMMA_SPACES,        # fix spaces before comma
    )
    for pattern in pipeline:
        text = pattern.sub(r'\1', text)

    return " ".join(token for token in text.split() if not is_filler(token))

# =========================
# Speech To Text
# =========================

def transcribe_audio(audio_file):
    """Transcribe an audio file with ElevenLabs and clean the transcript.

    Args:
        audio_file: Path of the audio file to upload, or ``None`` when no
            file was provided.

    Returns:
        A ``(raw_text, cleaned_text)`` tuple. On any failure (no file,
        unreadable file, network error, non-200 response, non-JSON body)
        the first element is a human-readable message and the second is
        an empty string; no exception is raised for those cases.
    """
    if audio_file is None:
        return "No audio uploaded", ""

    headers = {"xi-api-key": ELEVENLABS_API_KEY}
    data = {
        "model_id": "scribe_v2",
        "enable_logging": "false",
    }

    try:
        with open(audio_file, "rb") as f:
            response = requests.post(
                STT_URL,
                headers=headers,
                files={"file": f},
                data=data,
                # (connect, read) seconds. Without a timeout a stalled
                # connection would block this handler indefinitely; the read
                # limit is generous because long recordings take a while.
                timeout=(10, 600),
            )
    # RequestException derives from OSError, so it must be caught first.
    except requests.RequestException as exc:
        return f"Error: request failed: {exc}", ""
    except OSError as exc:
        return f"Error: could not read audio file: {exc}", ""

    if response.status_code != 200:
        return f"Error: {response.text}", ""

    try:
        result = response.json()
    except ValueError as exc:
        return f"Error: invalid JSON in response: {exc}", ""

    if "segments" in result:
        # Each segment's text is followed by one space, so the raw transcript
        # keeps its trailing space.
        text = "".join(
            segment.get("text", "") + " " for segment in result["segments"]
        )
    else:
        text = result.get("text", "")

    cleaned = clean_transcript(text)

    return text, cleaned


# =========================
# Gradio Interface
# =========================

# Build the UI: one audio upload, two text outputs, and a button that runs
# transcribe_audio on the uploaded file.
with gr.Blocks() as demo:

    gr.Markdown("# Arabic Speech Cleaner")

    gr.Markdown(
        "Upload audio → convert to text using ElevenLabs → remove fillers and stuttering"
    )

    # type="filepath" hands the handler a path on disk rather than raw samples.
    audio_input = gr.Audio(
        type="filepath",
        label="Upload Audio"
    )

    raw_text = gr.Textbox(
        label="Original Transcript",
        lines=8
    )

    cleaned_text = gr.Textbox(
        label="Cleaned Transcript",
        lines=8
    )

    btn = gr.Button("Transcribe")

    # The handler returns (raw, cleaned), matching the order of the outputs.
    btn.click(
        fn=transcribe_audio,
        inputs=audio_input,
        outputs=[raw_text, cleaned_text]
    )

# Start the server only when run as a script, so importing this module
# (e.g. to reuse clean_transcript) does not launch the app.
if __name__ == "__main__":
    demo.launch()