AlimOmar committed on
Commit
883d9c7
·
1 Parent(s): 5006f55

change to ug

Browse files
Files changed (1) hide show
  1. app.py +65 -1
app.py CHANGED
@@ -6,6 +6,8 @@ from fastapi.middleware.cors import CORSMiddleware
6
  import io
7
  import soundfile as sf
8
  from pydantic import BaseModel
 
 
9
 
10
 
11
  app = FastAPI()
@@ -19,6 +21,68 @@ app.add_middleware(
19
  allow_headers=["*"],
20
  )
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  model = VitsModel.from_pretrained("facebook/mms-tts-uig-script_arabic")
23
  tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-uig-script_arabic")
24
 
@@ -36,7 +100,7 @@ curl -X POST https://piyazon-tts-piyazon.hf.space/generate-tts \
36
  async def generate_tts(input: TextInput):
37
  try:
38
  # Tokenize input text
39
- inputs = tokenizer(input.text, return_tensors="pt")
40
 
41
  # Generate waveform
42
  with torch.no_grad():
 
6
  import io
7
  import soundfile as sf
8
  from pydantic import BaseModel
9
+ import string
10
+ import unicodedata
11
 
12
 
13
  app = FastAPI()
 
21
  allow_headers=["*"],
22
  )
23
 
24
# Look-alike letters that are rewritten to the letter form used for Uyghur text.
_UYGHUR_LETTER_FIXES = {
    'ژ': 'ج',
    'ک': 'ك',
    'ی': 'ى',
    'ه': 'ە',
}

# ASCII digits spelled out as Uyghur words. The surrounding spaces keep
# adjacent digits from fusing into one word; fix_string collapses the extras.
_DIGIT_NAMES = {
    '0': ' نۆل ',
    '1': ' بىر ',
    '2': ' ئىككى ',
    '3': ' ئۈچ ',
    '4': ' تۆت ',
    '5': ' بەش ',
    '6': ' ئالتە ',
    '7': ' يەتتە ',
    '8': ' سەككىز ',
    '9': ' توققۇز ',
}

# Lowercase Latin letters spelled out by their English letter names in
# Uyghur script.
_LATIN_LETTER_NAMES = {
    'a': ' ئېي ',
    'b': ' بى ',
    'c': ' سى ',
    'd': ' دى ',
    'e': ' ئى ',
    'f': ' ئەف ',
    'g': ' جى ',
    'h': ' ئېچ ',
    'i': ' ئاي ',
    'j': ' جېي ',
    # Spelled with 'ك', the same kaf that _UYGHUR_LETTER_FIXES normalizes to;
    # the previous spelling re-introduced the letter that table removes.
    'k': ' كېي ',
    'l': ' ئەل ',
    'm': ' ئەم ',
    'n': ' ئېن ',
    'o': ' ئو ',
    'p': ' پى ',
    'q': ' كىيۇ ',
    'r': ' ئار ',
    's': ' ئەس ',
    't': ' تى ',
    'u': ' يۇ ',
    'v': ' ۋى ',
    'w': ' دابىلىيۇ ',
    'x': ' ئېكىس ',
    'y': ' ۋاي ',
    'z': ' زى ',
}

# One translation table so every substitution happens in a single pass.
# No replacement text contains a character that is itself a key, so a single
# pass gives the same result as applying the three maps one after another.
_FIX_STRING_TABLE = str.maketrans(
    {**_UYGHUR_LETTER_FIXES, **_DIGIT_NAMES, **_LATIN_LETTER_NAMES}
)


def fix_string(batch):
    """Normalize text into Uyghur Arabic script before tokenization.

    Steps, in order:

    1. Apply Unicode NFKC normalization, then lowercase. Normalizing first
       means compatibility characters that expand to uppercase ASCII
       (for example U+2121, which becomes "TEL") are lowercased too and
       therefore spelled out in step 2.
    2. Replace look-alike letters with their Uyghur forms, ASCII digits with
       Uyghur number words, and Latin letters a-z with their spelled-out
       letter names.
    3. Collapse every run of whitespace into a single space and strip the
       ends.

    Args:
        batch: The input text (a single ``str``).

    Returns:
        The normalized string; an empty string if nothing but whitespace
        remains.
    """
    normalized = unicodedata.normalize('NFKC', batch).lower()
    return ' '.join(normalized.translate(_FIX_STRING_TABLE).split())
84
+
85
+
86
  model = VitsModel.from_pretrained("facebook/mms-tts-uig-script_arabic")
87
  tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-uig-script_arabic")
88
 
 
100
  async def generate_tts(input: TextInput):
101
  try:
102
  # Tokenize input text
103
+ inputs = tokenizer(fix_string(input.text), return_tensors="pt")
104
 
105
  # Generate waveform
106
  with torch.no_grad():