Spaces:
Sleeping
Sleeping
Abid Ali Awan
committed on
Commit
·
8fa375c
1
Parent(s):
75f4da9
Add Urdu punctuation support in app.py by implementing a function to add full stops and optional commas, enhancing text output from the transcription process.
Browse files
app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import os
|
|
|
|
| 2 |
import warnings
|
| 3 |
|
| 4 |
import gradio as gr
|
|
@@ -46,6 +47,32 @@ transcriber = pipeline(
|
|
| 46 |
)
|
| 47 |
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
def transcribe(audio):
|
| 50 |
if audio is None:
|
| 51 |
return "No audio provided. Please record or upload an audio file."
|
|
@@ -64,7 +91,10 @@ def transcribe(audio):
|
|
| 64 |
# Inference under no_grad
|
| 65 |
with torch.no_grad():
|
| 66 |
result = transcriber({"sampling_rate": sr, "raw": y})
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
|
| 70 |
# —— Gradio UI ——
|
|
|
|
| 1 |
import os
|
| 2 |
+
import re
|
| 3 |
import warnings
|
| 4 |
|
| 5 |
import gradio as gr
|
|
|
|
| 47 |
)
|
| 48 |
|
| 49 |
|
| 50 |
+
def add_urdu_punctuation(text: str) -> str:
    """Add basic Urdu punctuation to raw transcription text.

    The text is split on newlines; each non-empty, stripped segment is
    treated as one sentence. Within a segment an Urdu comma (،) is inserted
    after common conjunctions, and an Urdu full stop (۔) is appended unless
    the segment already ends with terminal punctuation. The segments are
    then joined with single spaces.

    This is a simple heuristic and will not be correct for every sentence.

    Args:
        text: Text to punctuate.

    Returns:
        The punctuated text, or an empty string when ``text`` contains no
        non-whitespace content.
    """
    # Common Urdu conjunctions that receive a trailing comma.
    conjunctions = ("اور", "لیکن", "مگر", "یا", "پھر", "جبکہ", "کیونکہ", "تاہم")
    # Punctuation that already closes a sentence (Urdu and Latin forms), so
    # that no redundant full stop is appended after it.
    terminators = ("۔", "؟", "!", ".", "?")

    # One alternation handles every conjunction in a single pass. The
    # negative lookahead skips conjunctions that are followed (after optional
    # whitespace) by existing punctuation or by the end of the segment, which
    # would otherwise yield sequences such as "،۔" or a doubled comma.
    conjunction_pattern = re.compile(
        r"\b("
        + "|".join(re.escape(conj) for conj in conjunctions)
        + r")\b(?!\s*(?:[،,؛;:۔؟!?.]|$))"
    )

    processed = []
    for segment in text.split("\n"):
        segment = segment.strip()
        if not segment:
            continue
        segment = conjunction_pattern.sub(r"\1،", segment)
        # str.endswith accepts a tuple, so all terminators are checked at once.
        if not segment.endswith(terminators):
            segment += "۔"
        processed.append(segment)
    return " ".join(processed)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
def transcribe(audio):
|
| 77 |
if audio is None:
|
| 78 |
return "No audio provided. Please record or upload an audio file."
|
|
|
|
| 91 |
# Inference under no_grad
|
| 92 |
with torch.no_grad():
|
| 93 |
result = transcriber({"sampling_rate": sr, "raw": y})
|
| 94 |
+
text = result.get("text", "")
|
| 95 |
+
# Add Urdu punctuation
|
| 96 |
+
text = add_urdu_punctuation(text)
|
| 97 |
+
return text
|
| 98 |
|
| 99 |
|
| 100 |
# —— Gradio UI ——
|