Abid Ali Awan committed on
Commit
8fa375c
·
1 Parent(s): 75f4da9

Add Urdu punctuation support in app.py by implementing a function to add full stops and optional commas, enhancing text output from the transcription process.

Browse files
Files changed (1) hide show
  1. app.py +31 -1
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
  import warnings
3
 
4
  import gradio as gr
@@ -46,6 +47,32 @@ transcriber = pipeline(
46
  )
47
 
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  def transcribe(audio):
50
  if audio is None:
51
  return "No audio provided. Please record or upload an audio file."
@@ -64,7 +91,10 @@ def transcribe(audio):
64
  # Inference under no_grad
65
  with torch.no_grad():
66
  result = transcriber({"sampling_rate": sr, "raw": y})
67
- return result.get("text", "")
 
 
 
68
 
69
 
70
  # —— Gradio UI ——
 
1
  import os
2
+ import re
3
  import warnings
4
 
5
  import gradio as gr
 
47
  )
48
 
49
 
50
# Common Urdu conjunctions that receive a trailing Urdu comma (،).
_URDU_CONJUNCTIONS = ("اور", "لیکن", "مگر", "یا", "پھر", "جبکہ", "کیونکہ", "تاہم")

# Marks that already terminate a sentence; no Urdu full stop is appended after them.
_SENTENCE_TERMINATORS = ("۔", "؟", "!", "?", ".")

# Dangling separators removed from the end of a sentence before the full stop is added.
_TRAILING_SEPARATORS = "،, "

# One precompiled alternation for all conjunctions (whole-word matches only).
# The negative lookahead skips a conjunction that is already followed by
# punctuation (optionally after whitespace) or that is the last word of the
# sentence, so the result never contains "،،" or "،۔".
_CONJUNCTION_RE = re.compile(
    r"\b(" + "|".join(_URDU_CONJUNCTIONS) + r")\b(?!\s*(?:[،,;:۔؟!?.]|\Z))"
)


def add_urdu_punctuation(text):
    """Add basic Urdu punctuation to unpunctuated text.

    This is a simple heuristic and will not be correct for every sentence:

    * the text is split into sentences on newlines only;
    * an Urdu comma (،) is inserted after each known conjunction, unless the
      conjunction is already followed by punctuation or ends the sentence;
    * an Urdu full stop (۔) is appended to every sentence that does not
      already end with a terminating mark (۔ ؟ ! ? .). A dangling trailing
      comma is dropped before the full stop is added.

    Args:
        text: The text to punctuate. ``None`` or an empty string is accepted.

    Returns:
        The punctuated sentences joined by single spaces, or ``""`` when
        there is nothing to punctuate.
    """
    # Guard against None/empty input instead of failing inside the regex calls.
    if not text:
        return ""

    processed = []
    for line in re.split(r"\n+", text):
        sentence = line.strip()
        if not sentence:
            continue

        # Insert the comma per sentence so that "end of sentence" (\Z) is
        # meaningful for the lookahead in _CONJUNCTION_RE.
        sentence = _CONJUNCTION_RE.sub(r"\1،", sentence)

        if not sentence.endswith(_SENTENCE_TERMINATORS):
            # Avoid producing "،۔" when the sentence ends with a comma.
            sentence = sentence.rstrip(_TRAILING_SEPARATORS)
            if not sentence:
                continue
            sentence += "۔"

        processed.append(sentence)

    return " ".join(processed)
74
+
75
+
76
  def transcribe(audio):
77
  if audio is None:
78
  return "No audio provided. Please record or upload an audio file."
 
91
  # Inference under no_grad
92
  with torch.no_grad():
93
  result = transcriber({"sampling_rate": sr, "raw": y})
94
+ text = result.get("text", "")
95
+ # Add Urdu punctuation
96
+ text = add_urdu_punctuation(text)
97
+ return text
98
 
99
 
100
  # —— Gradio UI ——