Fatitommy committed on
Commit
f9aa059
·
verified ·
1 Parent(s): 646911f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -10
app.py CHANGED
@@ -80,15 +80,32 @@ def download_file(url: str, path: str):
80
 
81
  def detokenize(text: str) -> str:
82
  """
83
- Fairseq BPE output fix:
84
- '▁ساڈا ▁گھر ▁راولپنڈی ▁وچ ▁ہے' → 'ساڈا گھر راولپنڈی وچ ہے'
85
- ▁ = word boundary marker — replace with space, then clean up
 
 
 
 
 
 
86
  """
87
- # ▁ at start of word = space before it
88
- result = text.replace(" ▁", " ").replace("▁", "")
89
- # Clean extra spaces
90
- result = " ".join(result.split())
91
- return result.strip()
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  def load_model(pair: str):
94
  cfg = MODELS_CONFIG[pair]
@@ -143,10 +160,21 @@ def translate(req: Req):
143
  model = load_model(pair)
144
  raw = model.translate(req.text.strip())
145
 
146
- print(f"[DEBUG] raw output [{pair}]: {raw}") # debug log
 
 
 
147
 
148
  result = detokenize(raw) if cfg["detokenize"] else raw
149
- return {"success": True, "translation": result, "pair": pair, "raw": raw}
 
 
 
 
 
 
 
 
150
 
151
  except Exception as e:
152
  print(f"Error [{pair}]: {e}")
 
80
 
81
  def detokenize(text: str) -> str:
82
  """
83
+ Fairseq sentencepiece/BPE output clean karo.
84
+
85
+ Possible formats:
86
+ 1. "▁ساڈا ▁گھر ▁راولپنڈی" → spaces ke saath
87
+ 2. "ت ُس ِیں ک ِو ے ہ ِو" → characters alag alag
88
+ 3. "▁ت▁و▁س▁ی▁ں" → chipke hue
89
+
90
+ Fix: pehle ▁ ko space se replace karo, phir
91
+ sirf word boundaries pe space rakho
92
  """
93
+ # Step 1: ko space se replace karo
94
+ text = text.replace("▁", " ")
95
+
96
+ # Step 2: multiple spaces ko single space karo
97
+ text = " ".join(text.split())
98
+
99
+ # Step 3: characters ke darmiyan wali extra spaces hatao
100
+ # (jab BPE ne har character ko alag token banaya ho)
101
+ # Unicode ranges: Shahmukhi (0600-06FF), Gurmukhi (0A00-0A7F)
102
+ import re
103
+ # Shahmukhi characters ke beech space hatao
104
+ text = re.sub(r'(?<=[\u0600-\u06FF])\s(?=[\u0600-\u06FF\u0610-\u061A\u064B-\u065F])', '', text)
105
+ # Gurmukhi characters ke beech space hatao
106
+ text = re.sub(r'(?<=[\u0A00-\u0A7F])\s(?=[\u0A00-\u0A7F])', '', text)
107
+
108
+ return text.strip()
109
 
110
  def load_model(pair: str):
111
  cfg = MODELS_CONFIG[pair]
 
160
  model = load_model(pair)
161
  raw = model.translate(req.text.strip())
162
 
163
+ # Debug logs mein dikhega
164
+ print(f"[DEBUG] pair={pair}")
165
+ print(f"[DEBUG] input={req.text}")
166
+ print(f"[DEBUG] raw={repr(raw)}")
167
 
168
  result = detokenize(raw) if cfg["detokenize"] else raw
169
+
170
+ print(f"[DEBUG] final={result}")
171
+
172
+ return {
173
+ "success": True,
174
+ "translation": result,
175
+ "pair": pair,
176
+ "raw": raw, # debug ke liye
177
+ }
178
 
179
  except Exception as e:
180
  print(f"Error [{pair}]: {e}")