Sammaali committed on
Commit
b23bcf3
·
verified ·
1 Parent(s): b61fa99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -67
app.py CHANGED
@@ -1,69 +1,47 @@
1
  import gradio as gr
 
2
  import requests
3
- import os
4
- from transformers import AutoModelForCausalLM, AutoTokenizer
5
- import torch
6
 
7
  # =========================
8
  # ElevenLabs Configuration
9
  # =========================
10
 
11
  ELEVENLABS_API_KEY = "c92a87a2ebb5f51ee9fe90cc421e836e32780c188f4e0056d77ce69803008ae9"
12
- STT_URL = "https://api.elevenlabs.io/v1/speech-to-text"
13
-
14
- # =========================
15
- # Load Gemma Model
16
- # =========================
17
-
18
-
19
- model_id = "Sammaali/gemma-3-4b"
20
 
21
- tokenizer = AutoTokenizer.from_pretrained(model_id)
22
-
23
- model = AutoModelForCausalLM.from_pretrained(
24
- model_id,
25
- torch_dtype=torch.float32
26
- )
27
 
28
- device = "cuda" if torch.cuda.is_available() else "cpu"
29
- model.to(device)
30
 
31
  # =========================
32
- # Clean Text Using Gemma
33
  # =========================
34
- def clean_text(text):
35
 
36
- text = text[:1500]
 
 
37
 
38
- prompt = f"""
39
- Clean this Arabic speech transcript.
 
40
 
41
- Remove filler words like:
42
- ุงู…ู…ู…ุŒ ุขุขุขุŒ ูŠุนู†ูŠ
43
 
44
- Remove repeated words.
45
- Keep the same meaning.
46
 
47
- Transcript:
48
- {text}
49
- """
50
 
51
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
52
 
53
- with torch.no_grad():
 
54
 
55
- outputs = model.generate(
56
- **inputs,
57
- max_new_tokens=120,
58
- do_sample=True,
59
- temperature=0.7,
60
- top_p=0.9,
61
- repetition_penalty=1.2
62
- )
63
 
64
- result = tokenizer.decode(outputs[0], skip_special_tokens=True)
65
 
66
- return result# =========================
67
  # ElevenLabs Speech To Text
68
  # =========================
69
 
@@ -76,61 +54,78 @@ def transcribe_audio(audio_file):
76
  "xi-api-key": ELEVENLABS_API_KEY
77
  }
78
 
79
- with open(audio_file, "rb") as f:
80
-
81
- files = {
82
- "file": f
83
- }
84
 
85
- data = {
86
- "model_id": "scribe_v2",
87
- "enable_logging": "false"
88
- }
89
 
90
- response = requests.post(
91
- STT_URL,
92
- headers=headers,
93
- files=files,
94
- data=data
95
- )
96
 
97
  if response.status_code != 200:
98
  return "Error: " + response.text, ""
99
 
100
  result = response.json()
101
 
 
102
  text = ""
103
 
104
  if "segments" in result:
105
  for segment in result["segments"]:
106
- text += segment.get("text", "") + " "
 
 
107
  else:
108
  text = result.get("text", "")
109
 
110
- cleaned = clean_text(text)
111
 
112
  return text, cleaned
113
 
 
114
  # =========================
115
  # Gradio UI
116
  # =========================
 
117
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
118
 
119
- gr.Markdown("# ElevenLabs Speech To Text + Gemma Cleaner")
120
- gr.Markdown("ุงุฑูุน ู…ู„ู ุตูˆุชูŠ ูˆุณูŠุชู… ุชุญูˆูŠู„ู‡ ุฅู„ู‰ ู†ุต ุนุฑุจูŠ ุซู… ุชู†ุธูŠูู‡ ุจุงุณุชุฎุฏุงู… Gemma.")
 
 
 
121
 
122
- audio_input = gr.Audio(type="filepath", label="Upload Audio")
123
- raw_text = gr.Textbox(label="Original Text", lines=8)
124
- clean_text_box = gr.Textbox(label="Cleaned Text", lines=8)
 
 
 
 
 
 
 
 
 
 
 
125
 
126
  btn = gr.Button("Transcribe")
127
 
128
  btn.click(
129
  fn=transcribe_audio,
130
  inputs=audio_input,
131
- outputs=[raw_text, clean_text_box]
132
  )
133
 
134
 
135
  if __name__ == "__main__":
136
- demo.launch()
 
1
  import gradio as gr
2
+ import re
3
  import requests
 
 
 
4
 
5
  # =========================
6
  # ElevenLabs Configuration
7
  # =========================
8
 
9
  ELEVENLABS_API_KEY = "c92a87a2ebb5f51ee9fe90cc421e836e32780c188f4e0056d77ce69803008ae9"
 
 
 
 
 
 
 
 
10
 
11
+ STT_URL = "https://api.elevenlabs.io/v1/speech-to-text"
 
 
 
 
 
12
 
 
 
13
 
14
  # =========================
15
+ # Arabic Post Processing
16
  # =========================
 
17
 
18
# Patterns are compiled once at import time instead of on every call.
# Arabic letters are written as \u escapes so the rules cannot be silently
# broken again by a mis-encoded copy of this file.

# Tashkeel (diacritics): U+0617-U+061A and U+064B-U+0652.
_TASHKEEL_RE = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
# Alef carrying a hamza or madda: U+0623, U+0625, U+0622.
_HAMZA_ALEF_RE = re.compile(r'[\u0623\u0625\u0622]')
# Teh marbuta (U+0629) at the end of a word.
_FINAL_TEH_MARBUTA_RE = re.compile(r'\u0629\b')
# Alef maksura (U+0649) at the end of a word.
_FINAL_ALEF_MAKSURA_RE = re.compile(r'\u0649\b')
# Anything that is neither a word character nor whitespace.
_SYMBOL_RE = re.compile(r'[^\w\s]')


def clean_arabic_text(text):
    """Normalize a transcript for display as "cleaned" text.

    The steps run in this order:

    1. Strip tashkeel (diacritics).
    2. Replace the hamza/madda forms of alef (U+0623, U+0625, U+0622)
       with bare alef (U+0627).
    3. Replace a word-final teh marbuta (U+0629) with heh (U+0647).
    4. Replace a word-final alef maksura (U+0649) with yeh (U+064A).
    5. Drop every character that is neither a word character nor
       whitespace (punctuation and symbols; underscores are kept).
    6. Collapse runs of whitespace to single spaces and trim both ends.

    Args:
        text: The transcript to clean. Any falsy value (``None`` or an
            empty string) is accepted.

    Returns:
        The normalized string, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    # Diacritics go first so that a letter followed only by diacritics is
    # seen as word-final by the \b checks below.
    text = _TASHKEEL_RE.sub('', text)

    # Replacement strings are plain (non-raw) literals: the re module does
    # not accept \u escapes inside a replacement template.
    text = _HAMZA_ALEF_RE.sub('\u0627', text)
    text = _FINAL_TEH_MARBUTA_RE.sub('\u0647', text)
    text = _FINAL_ALEF_MAKSURA_RE.sub('\u064A', text)

    text = _SYMBOL_RE.sub('', text)

    return " ".join(text.split())
 
 
 
 
 
 
 
42
 
 
43
 
44
+ # =========================
45
  # ElevenLabs Speech To Text
46
  # =========================
47
 
 
54
  "xi-api-key": ELEVENLABS_API_KEY
55
  }
56
 
57
+ files = {
58
+ "file": open(audio_file, "rb")
59
+ }
 
 
60
 
61
+ data = {
62
+ "model_id": "scribe_v2",
63
+ "enable_logging": "false"
64
+ }
65
 
66
+ response = requests.post(
67
+ STT_URL,
68
+ headers=headers,
69
+ files=files,
70
+ data=data
71
+ )
72
 
73
  if response.status_code != 200:
74
  return "Error: " + response.text, ""
75
 
76
  result = response.json()
77
 
78
+ # Extract speaker_0 text
79
  text = ""
80
 
81
  if "segments" in result:
82
  for segment in result["segments"]:
83
+ if segment.get("speaker") == "speaker_0":
84
+ text += segment.get("text", "") + " "
85
+
86
  else:
87
  text = result.get("text", "")
88
 
89
+ cleaned = clean_arabic_text(text)
90
 
91
  return text, cleaned
92
 
93
+
94
  # =========================
95
  # Gradio UI
96
  # =========================
97
# Gradio front end: one audio upload, one button, and two text boxes that
# receive the raw and the cleaned transcript.
with gr.Blocks(theme=gr.themes.Soft()) as demo:

    gr.Markdown("# ElevenLabs Speech To Text + Post Process")

    # Arabic instructions shown under the title. Roughly: "Upload an audio
    # file (wav); it will be converted to Arabic or English text and the
    # text will be cleaned."
    gr.Markdown(
        "ارفع ملف صوتي (wav) وسيتم تحويله إلى نص عربي أو إنجليزي مع تنظيف النص."
    )

    # type="filepath": the callback is handed a path, which it opens itself.
    audio_input = gr.Audio(
        type="filepath",
        label="Upload audio.wav"
    )

    raw_text = gr.Textbox(
        label="Original Text",
        lines=8
    )

    clean_text = gr.Textbox(
        label="Cleaned Text",
        lines=8
    )

    btn = gr.Button("Transcribe")

    # transcribe_audio returns a (text, cleaned) pair; the two values are
    # routed to the output boxes in the order listed here.
    btn.click(
        fn=transcribe_audio,
        inputs=audio_input,
        outputs=[raw_text, clean_text]
    )


if __name__ == "__main__":
    demo.launch()