piyazon committed on
Commit
87acc82
·
1 Parent(s): 233b82f

change model, fix string

Browse files
Files changed (2) hide show
  1. app.py +303 -3
  2. requirements.txt +3 -1
app.py CHANGED
@@ -8,6 +8,13 @@ import soundfile as sf
8
  from pydantic import BaseModel
9
  import string
10
  import unicodedata
 
 
 
 
 
 
 
11
 
12
  import os
13
  # Access the secret named "MY_API_KEY"
@@ -90,10 +97,301 @@ def fix_string(batch):
90
  return batch
91
 
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  # model = VitsModel.from_pretrained("facebook/mms-tts-uig-script_arabic")
94
  # tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-uig-script_arabic")
95
- model_ug = VitsModel.from_pretrained("piyazon/TTS-CV-Unique-Ug", token=hf_token)
96
- tokenizer_ug = AutoTokenizer.from_pretrained("piyazon/TTS-CV-Unique-Ug", token=hf_token)
 
97
  # model_ug = VitsModel.from_pretrained("piyazon/qutadgu_bilik")
98
  # tokenizer_ug = AutoTokenizer.from_pretrained("piyazon/qutadgu_bilik")
99
 
@@ -117,7 +415,9 @@ async def generate_tts(input: TextInput):
117
  if input.lang=="ug":
118
  model = model_ug
119
  tokenizer = tokenizer_ug
120
- inputs = tokenizer(fix_string(input.text), return_tensors="pt")
 
 
121
  else:
122
  model = model_ru
123
  tokenizer = tokenizer_ru
 
8
  from pydantic import BaseModel
9
  import string
10
  import unicodedata
11
+ from pypinyin import pinyin, Style
12
+ import re
13
+ from umsc import UgMultiScriptConverter
14
+
15
+ # Initialize uyghur script converter
16
+ ug_arab_to_latn = UgMultiScriptConverter('UAS', 'ULS')
17
+ ug_latn_to_arab = UgMultiScriptConverter('ULS', 'UAS')
18
 
19
  import os
20
  # Access the secret named "MY_API_KEY"
 
97
  return batch
98
 
99
 
100
def number_to_uyghur_arabic_script(number_str):
    """
    Convert a numeric string to its Uyghur pronunciation in Arabic script.

    Supported forms: integers ('123'), decimals ('0.001'), single-digit
    fractions ('1/4'), percentages ('25%') and ordinals marked by a trailing
    '_' or '-' ('1968_', '5-'). Commas and spaces inside the input are ignored.
    Integer and decimal parts are each limited to 9 digits.

    Args:
        number_str (str): Number as a string.

    Returns:
        str: Uyghur pronunciation in Arabic script. Inputs that exceed the
        supported range (more than 9 digits, multi-digit fractions, negative
        ordinals) are returned as the cleaned input text instead of being
        spelled out. Ordinal results always end with exactly one space so
        that text written directly after the ordinal marker (e.g. '5-ئاي')
        stays a separate word when the caller concatenates it.

    Raises:
        ValueError: If a part that must be numeric cannot be parsed by int().
    """
    max_digits = 9
    max_value = 10 ** max_digits - 1

    # Uyghur number words in Arabic script
    digits = {
        0: 'نۆل', 1: 'بىر', 2: 'ئىككى', 3: 'ئۈچ', 4: 'تۆت', 5: 'بەش',
        6: 'ئالتە', 7: 'يەتتە', 8: 'سەككىز', 9: 'توققۇز'
    }
    ordinals = {
        1: 'بىرىنجى', 2: 'ئىككىنجى', 3: 'ئۈچىنجى', 4: 'تۆتىنجى', 5: 'بەشىنجى',
        6: 'ئالتىنجى', 7: 'يەتتىنجى', 8: 'سەككىزىنجى', 9: 'توققۇزىنجى'
    }
    tens = {
        10: 'ئون', 20: 'يىگىرمە', 30: 'ئوتتۇز', 40: 'قىرىق', 50: 'ئەللىك',
        60: 'ئاتمىش', 70: 'يەتمىش', 80: 'سەكسەن', 90: 'توقسان'
    }
    units = [
        (1000000000, 'مىليارد'),  # billion
        (1000000, 'مىليون'),      # million
        (1000, 'مىڭ'),            # thousand
        (100, 'يۈز')              # hundred
    ]
    fractions = {
        1: 'ئوندا',         # tenths
        2: 'يۈزدە',         # hundredths
        3: 'مىڭدە',         # thousandths
        4: 'ئون مىڭدە',     # ten-thousandths
        5: 'يۈز مىڭدە',     # hundred-thousandths
        6: 'مىليوندا',      # millionths
        7: 'ئون مىليوندا',  # ten-millionths
        8: 'يۈز مىليوندا',  # hundred-millionths
        9: 'مىليارددا'      # billionths
    }

    def integer_to_words(num):
        """Spell out a non-negative integer."""
        num = int(num)
        if num == 0:
            return digits[0]

        result = []
        # Large units first (billion, million, thousand, hundred)
        for value, unit_name in units:
            if num >= value:
                count = num // value
                if count == 1:  # e.g., 100 → "يۈز", not "بىر يۈز"
                    result.append(unit_name)
                else:
                    result.append(integer_to_words(count) + ' ' + unit_name)
                num %= value

        # What is left is below 100: tens and ones
        if num >= 10:
            ten = (num // 10) * 10
            one = num % 10
            if one == 0:
                result.append(tens[ten])
            else:
                result.append(tens[ten] + ' ' + digits[one])
        elif num > 0:
            result.append(digits[num])

        return ' '.join(result)

    def to_ordinal(word):
        """Attach the ordinal suffix to a number word that is not a unit digit."""
        # A stem-final 'ە' is raised to 'ى' before the suffix, exactly as the
        # single-digit table does (ئالتە → ئالتىنجى); so يىگىرمە → يىگىرمىنجى.
        if word.endswith('ە'):
            word = word[:-1]
        return word + 'ىنجى'

    # Clean the input (remove commas or spaces)
    number_str = number_str.replace(',', '').replace(' ', '')

    # Ordinal: trailing '_' or '-'
    if number_str.endswith(('_', '-')):
        number_str = number_str[:-1]  # Drop the ordinal marker
        num = int(number_str)
        if num < 0 or num > max_value:
            # Out of the supported range: hand the digits back unchanged
            return number_str

        words = integer_to_words(num).split()
        last_digit = num % 10
        if last_digit:
            # Number ends in 1-9: swap the final digit word for its ordinal form
            words[-1] = ordinals[last_digit]
        else:
            # Number ends in a tens word, a unit word (hundred, thousand, ...)
            # or is zero: suffix the final word
            words[-1] = to_ordinal(words[-1])
        # One trailing space keeps directly following text a separate word
        return ' '.join(words) + ' '

    # Percentage: remember it and drop the sign
    is_percentage = number_str.endswith('%')
    if is_percentage:
        number_str = number_str[:-1]

    # Fraction: only single-digit numerator and denominator are spelled out
    if '/' in number_str:
        numerator, denominator = map(int, number_str.split('/'))
        if numerator in digits and denominator in digits:
            return f"{digits[denominator]}دە {digits[numerator]}"
        return number_str

    # Split into integer and decimal parts
    parts = number_str.split('.')
    integer_part = parts[0]
    decimal_part = parts[1] if len(parts) > 1 else None

    # Both parts are limited to 9 digits; longer input is returned as text
    if len(integer_part) > max_digits:
        return number_str
    if decimal_part and len(decimal_part) > max_digits:
        return number_str

    pronunciation = integer_to_words(int(integer_part))

    # The decimal part is read as a whole number preceded by its fractional term
    if decimal_part:
        pronunciation += ' پۈتۈن'
        # Trailing zeros carry no value; an all-zero part ('0', '00', ...)
        # leaves nothing further to pronounce.
        significant = decimal_part.rstrip('0')
        if significant:
            fraction_term = fractions.get(len(significant), 'مىليارددا')
            pronunciation += ' ' + fraction_term + ' ' + integer_to_words(int(significant))

    if is_percentage:
        pronunciation += ' پىرسەنت'

    return pronunciation.strip()
252
+
253
+
254
def process_uyghur_text_with_numbers(text):
    """
    Replace the numbers inside a Uyghur text with their spoken form.

    Every '%' is first replaced by the Uyghur word for percent (surrounded by
    spaces). The text is then scanned for runs made of ASCII digits and the
    symbols '/', '.', '_' and '-'. A run that forms a fraction ('1/4'), an
    ordinal ('5_' or '5-'), a decimal ('3.14') or an integer ('123') is passed
    to number_to_uyghur_arabic_script; everything else is copied unchanged.

    Trailing symbols that merely follow a number (a sentence-final '.', for
    example in '25.') are split off and kept as text so the number itself is
    still converted.

    Args:
        text (str): Input string with Uyghur text and numbers
            (e.g., '1/4 كىلو 25% تەملىك').

    Returns:
        str: The text with recognised numbers spelled out and all other
        characters preserved.
    """
    text = text.replace('%', ' پىرسەنت ')

    digit_chars = '0123456789'
    # '%' needs no entry here: none is left after the replacement above
    number_chars = digit_chars + '/._-'
    ordinal_marks = ('_', '-')

    def is_valid_number(token):
        """Return True when token is a fraction, ordinal, decimal or integer."""
        if token.count('/') == 1:
            numerator, denominator = token.split('/')
            return numerator.isdigit() and denominator.isdigit()
        if token.endswith(ordinal_marks):
            return token[:-1].isdigit()
        if token.count('.') == 1:
            whole, frac = token.split('.')
            return whole.isdigit() and frac.isdigit()
        return token.isdigit()

    result = []
    length = len(text)
    i = 0
    while i < length:
        if text[i] not in number_chars:
            # Ordinary character (including whitespace): copy as is
            result.append(text[i])
            i += 1
            continue

        # Collect the whole run of number characters in one slice
        start = i
        while i < length and text[i] in number_chars:
            i += 1
        run = text[start:i]

        # Peel trailing symbols until the remainder is a valid number; a run
        # that ends in a digit and is still invalid (e.g. '1.2.3') is kept raw.
        token = run
        while token and token[-1] not in digit_chars and not is_valid_number(token):
            token = token[:-1]
        if not token or not is_valid_number(token):
            result.append(run)
            continue
        tail = run[len(token):]

        try:
            converted = number_to_uyghur_arabic_script(token)
        except ValueError:
            # If conversion fails, keep the original number string
            converted = token

        if token.endswith(ordinal_marks):
            # The marker is consumed by the conversion, so make sure whatever
            # follows directly (e.g. '5-ئاي') does not fuse with the ordinal.
            converted = converted.rstrip()
            following = tail[:1] or text[i:i + 1]
            if following and not following.isspace():
                converted += ' '

        result.append(converted)
        result.append(tail)

    return ''.join(result)
334
+
335
def fix_pauctuations(batch):
    """
    Reduce a text to the character set listed in the vocabulary below.

    The text is lower-cased, NFKC-normalised, four look-alike letters are
    mapped to the forms present in the vocabulary, and every character that is
    not in the vocabulary (punctuation, digits, Latin letters, ...) is
    replaced by a single space.

    Args:
        batch (str): Text to clean.

    Returns:
        str: Cleaned text containing only vocabulary characters and spaces.
    """
    cleaned = unicodedata.normalize('NFKC', batch.lower())

    # Map look-alike letters onto the forms present in the vocabulary
    letter_fixes = (
        ('ژ', 'ج'),
        ('ک', 'ك'),
        ('ی', 'ى'),
        ('ه', 'ە'),
    )
    for variant, canonical in letter_fixes:
        cleaned = cleaned.replace(variant, canonical)

    allowed = frozenset([" ", "ئ", "ا", "ب", "ت", "ج", "خ", "د", "ر", "ز", "س", "ش", "غ", "ف", "ق", "ك", "ل", "م", "ن", "و", "ى", "ي", "پ", "چ", "ڭ", "گ", "ھ", "ۆ", "ۇ", "ۈ", "ۋ", "ې", "ە"])

    # Sentence punctuation and every other out-of-vocabulary character all
    # become one space.
    return ''.join(ch if ch in allowed else ' ' for ch in cleaned)
363
+
364
def chinese_to_pinyin(mixed_text):
    """
    Romanise the Chinese parts of a mixed-language string.

    Each run of characters in the range U+4E00-U+9FFF is replaced by its
    Pinyin syllables (Style.NORMAL, i.e. without tone marks) joined by single
    spaces. All other text is left untouched.

    Args:
        mixed_text (str): Text that may contain Chinese characters next to
            other languages (e.g., English, Uyghur).

    Returns:
        str: The text with every Chinese run replaced by Pinyin.
    """
    def _romanize(run):
        # pinyin() yields one list per character; its first entry is used
        syllables = pinyin(run.group(0), style=Style.NORMAL)
        return ' '.join(entry[0] for entry in syllables)

    return re.sub(r'[\u4e00-\u9fff]+', _romanize, mixed_text)
387
+
388
+
389
+
390
  # model = VitsModel.from_pretrained("facebook/mms-tts-uig-script_arabic")
391
  # tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-uig-script_arabic")
392
+ uy_model_name = "piyazon/TTS-CV-Radio-RVC-Alikurban-Ug"
393
+ model_ug = VitsModel.from_pretrained(uy_model_name, token=hf_token)
394
+ tokenizer_ug = AutoTokenizer.from_pretrained(uy_model_name, token=hf_token)
395
  # model_ug = VitsModel.from_pretrained("piyazon/qutadgu_bilik")
396
  # tokenizer_ug = AutoTokenizer.from_pretrained("piyazon/qutadgu_bilik")
397
 
 
415
  if input.lang=="ug":
416
  model = model_ug
417
  tokenizer = tokenizer_ug
418
+ fixted_text = fix_pauctuations(process_uyghur_text_with_numbers(ug_latn_to_arab(chinese_to_pinyin(text))))
419
+ print(fixted_text)
420
+ inputs = tokenizer(fixted_text, return_tensors="pt")
421
  else:
422
  model = model_ru
423
  tokenizer = tokenizer_ru
requirements.txt CHANGED
@@ -13,4 +13,6 @@ torchcodec
13
  flask
14
  flask-cors
15
  pydantic
16
- soundfile
 
 
 
13
  flask
14
  flask-cors
15
  pydantic
16
+ soundfile
17
+ umsc
18
+ pypinyin