jahongirtech commited on
Commit
983579c
ยท
verified ยท
1 Parent(s): ff08343

Update matn_sozlovchi.py

Browse files
Files changed (1) hide show
  1. matn_sozlovchi.py +268 -0
matn_sozlovchi.py CHANGED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ O'zbek TTS uchun metadata.csv matnini normalizatsiya qiluvchi skript.
3
+ Vocabulary da yo'q belgilarni avtomatik aniqlaydi va almashtiradi.
4
+ """
5
+
6
import os
import re
import shutil
import unicodedata
from collections import Counter, defaultdict
10
+
11
# ─────────────────────────────────────────
# CONFIG — adjust to your own setup
# ─────────────────────────────────────────
14
# Directory that contains the metadata file.
DATASET_PATH = "/content/drive/MyDrive/tts/dataset_final"
# File name of the metadata file inside DATASET_PATH.
METADATA_FILE = "metadata.csv"
SEPARATOR = "|"  # column delimiter used in metadata.csv
17
+
18
# ─────────────────────────────────────────
# VOCABULARY — matches `characters` in the config
# ─────────────────────────────────────────
21
# Every character that may appear in normalized text. Anything outside this
# set is either mapped through REPLACEMENTS or removed.
ALLOWED_CHARS = set(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    "O'o'G'g'ShshChch'"  # Uzbek digraphs; only the apostrophe is new to the set
    "0123456789"
    "!,.? "  # punctuation and the space
    "\n"
)
28
+
29
# ─────────────────────────────────────────
# NORMALIZATION TABLE
# Each "invalid" character → its Uzbek (Latin) equivalent
# ─────────────────────────────────────────
33
# Single-character substitutions applied before the vocabulary filter.
# Every key is exactly one code point; a value of "" removes the character.
REPLACEMENTS = {
    # ── Apostrophe variants → plain apostrophe ──
    "\u2018": "'",  # ‘ left single quotation mark
    "\u2019": "'",  # ’ right single quotation mark
    "\u02bc": "'",  # ʼ modifier letter apostrophe
    "\u02b9": "'",  # ʹ modifier letter prime
    "\u0060": "'",  # ` grave accent
    "\u00b4": "'",  # ´ acute accent
    "\u2032": "'",  # ′ prime

    # ── Double quotes → removed ──
    "\u201c": "",  # “ left double quotation mark
    "\u201d": "",  # ” right double quotation mark
    "\u00ab": "",  # «
    "\u00bb": "",  # »
    '"': "",

    # ── Cyrillic letters → Latin ──
    "а": "a", "б": "b", "в": "v", "г": "g", "д": "d",
    "е": "e", "ё": "yo", "ж": "j", "з": "z", "и": "i",
    "й": "y", "к": "k", "л": "l", "м": "m", "н": "n",
    "о": "o", "п": "p", "р": "r", "с": "s", "т": "t",
    "у": "u", "ф": "f", "х": "x", "ц": "ts", "ч": "ch",
    "ш": "sh", "щ": "sh", "ъ": "'", "ы": "i", "ь": "",
    "э": "e", "ю": "yu", "я": "ya",
    "А": "A", "Б": "B", "В": "V", "Г": "G", "Д": "D",
    "Е": "E", "Ё": "Yo", "Ж": "J", "З": "Z", "И": "I",
    "Й": "Y", "К": "K", "Л": "L", "М": "M", "Н": "N",
    "О": "O", "П": "P", "Р": "R", "С": "S", "Т": "T",
    "У": "U", "Ф": "F", "Х": "X", "Ц": "Ts", "Ч": "Ch",
    "Ш": "Sh", "Щ": "Sh", "Ъ": "'", "Ы": "I", "Ь": "",
    "Э": "E", "Ю": "Yu", "Я": "Ya",
    # Uzbek-specific Cyrillic letters
    "ў": "o'", "қ": "q", "ғ": "g'", "ҳ": "h", "ң": "ng",
    "Ў": "O'", "Қ": "Q", "Ғ": "G'", "Ҳ": "H", "Ң": "Ng",

    # ── Accented Latin letters → plain Latin ──
    "á": "a", "à": "a", "â": "a", "ä": "a", "ā": "a",
    "é": "e", "è": "e", "ê": "e", "ë": "e", "ě": "e",
    "í": "i", "ì": "i", "î": "i", "ï": "i", "ı": "i",
    "ó": "o", "ò": "o", "ô": "o", "ö": "o", "ő": "o",
    "ú": "u", "ù": "u", "û": "u", "ü": "u", "ű": "u",
    "ñ": "n", "ç": "ch", "ž": "zh", "š": "sh", "č": "ch",
    "ř": "r", "ý": "y", "ķ": "k", "ļ": "l", "ņ": "n",
    "ģ": "g", "ħ": "h", "ĵ": "j", "ŵ": "w",
    "Á": "A", "É": "E", "Í": "I", "Ó": "O", "Ú": "U",
    "Ñ": "N", "Ç": "Ch", "Ž": "Zh", "Š": "Sh", "Č": "Ch",

    # ── Numeric / special symbols → words ──
    "%": " foiz",
    "°": " daraja",
    # Padded with spaces so the word does not glue to a following number.
    "№": " raqam ",
    "§": "",
    "©": "",
    "®": "",
    "™": "",
    "…": "...",  # ellipsis → three dots (dots are in the vocabulary)
    # "-" is not in the vocabulary, so dashes become a space; deleting them
    # would glue the neighbouring words or numbers together.
    "–": " ",  # en dash
    "—": " ",  # em dash
    "-": " ",  # hyphen-minus
    "\u00ad": "",  # soft hyphen → removed
    "\u200b": "",  # zero-width space
    "\u200c": "",  # zero-width non-joiner
    "\u200d": "",  # zero-width joiner
    "\u00a0": " ",  # non-breaking space → plain space
    "\t": " ",  # tab → space

    # ── Brackets, slashes and other ASCII symbols ──
    "(": "",
    ")": "",
    "[": "",
    "]": "",
    "{": "",
    "}": "",
    "/": " ",
    "\\": " ",
    "|": " ",  # must never survive: it is the metadata column separator
    "_": " ",
    "@": " at ",
    "#": "",
    "$": "",
    "^": "",
    "*": "",
    "+": " va ",
    "=": " teng ",
    "<": "",
    ">": "",
    "~": "",
    "&": " va ",
    ";": ",",
    ":": ",",
}
125
+
126
# ─────────────────────────────────────────
# MAIN FUNCTIONS
# ─────────────────────────────────────────
129
+
130
def normalize_text(text: str) -> str:
    """Normalize one transcript so it contains only vocabulary characters.

    Steps:
      1. Compose the input to NFC so a letter stored as base + combining
         mark (for example "и" + U+0306 for "й") matches the
         single-code-point keys of ``REPLACEMENTS``.
      2. Apply every ``REPLACEMENTS`` entry.
      3. Decompose what is left (NFD) and keep only ``ALLOWED_CHARS``. An
         accented letter that has no table entry (for example "Ö") is thereby
         reduced to its base letter instead of being deleted; every other
         unknown character is dropped.
      4. Collapse runs of spaces and trim both ends.

    Digits are kept as they are; spelling numbers out is not implemented.

    Args:
        text: Raw transcript text.

    Returns:
        The normalized text. It may be empty when nothing survives filtering.
    """
    text = unicodedata.normalize("NFC", text)

    for bad_char, good_char in REPLACEMENTS.items():
        text = text.replace(bad_char, good_char)

    # NFD leaves ASCII untouched, so already-valid characters pass through.
    text = "".join(
        char
        for char in unicodedata.normalize("NFD", text)
        if char in ALLOWED_CHARS
    )

    # One pass suffices here: removals above may have left doubled spaces.
    return re.sub(r" +", " ", text).strip()
152
+
153
+
154
def analyze_metadata(filepath: str) -> dict:
    """Count the out-of-vocabulary characters in a metadata file.

    Only the second column (the text) of each non-empty line is inspected.
    Lines with fewer than two columns count towards the total but are not
    inspected.

    Args:
        filepath: Path to the UTF-8 encoded metadata file.

    Returns:
        A dict with three keys:
          - "total": number of non-empty lines.
          - "bad_lines": number of lines whose text holds at least one
            character outside ``ALLOWED_CHARS``.
          - "bad_chars": mapping of character to occurrence count, ordered
            from most to least frequent.

    Raises:
        OSError: If the file cannot be opened.
    """
    bad_chars = Counter()
    total_lines = 0
    bad_lines = 0

    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            total_lines += 1
            cols = line.split(SEPARATOR)
            if len(cols) < 2:
                continue
            offenders = [char for char in cols[1] if char not in ALLOWED_CHARS]
            if offenders:
                bad_chars.update(offenders)
                # Count the line once, however many bad characters it holds.
                bad_lines += 1

    return {
        "total": total_lines,
        "bad_lines": bad_lines,
        # most_common() sorts by descending count and keeps first-seen order
        # for ties.
        "bad_chars": dict(bad_chars.most_common()),
    }
180
+
181
+
182
def process_metadata(dataset_path: str):
    """Normalize the text column of the dataset's metadata file in place.

    Workflow:
      1. Print the out-of-vocabulary characters reported by
         ``analyze_metadata`` and return early when there are none.
      2. Copy the file to ``<file>.backup``.
      3. Rewrite the file with ``normalize_text`` applied to the second
         column of every row. Columns after the second are written back
         unchanged.
      4. Run the analysis again on the rewritten file and print the result.

    Rows with fewer than two columns, and rows whose text is empty after
    normalization, are left out of the output and counted as skipped.

    Args:
        dataset_path: Directory that contains ``METADATA_FILE``.

    Raises:
        OSError: If the metadata file cannot be read, copied or written.
    """
    filepath = os.path.join(dataset_path, METADATA_FILE)
    backup_path = filepath + ".backup"

    # ── Analysis ──
    print("=" * 55)
    print("📊 TAHLIL — Noto'g'ri belgilar:")
    stats = analyze_metadata(filepath)
    print(f"   Jami qatorlar  : {stats['total']}")
    print(f"   Xato qatorlar  : {stats['bad_lines']}")
    print("\n   Belgi | Soni | Almashtiriladi")
    print(f"   {'─' * 40}")
    for char, count in stats["bad_chars"].items():
        replacement = REPLACEMENTS.get(char, "❌ O'CHIRILADI")
        print(f"   '{char}' (U+{ord(char):04X}) | {count:4d} | → '{replacement}'")

    if not stats["bad_chars"]:
        print("   ✅ Hamma belgilar to'g'ri! Normalizatsiya shart emas.")
        return

    # ── Backup ──
    print(f"\n💾 Backup saqlanmoqda: {backup_path}")
    shutil.copy2(filepath, backup_path)

    # ── Normalization ──
    print("🔄 Normalizatsiya boshlanmoqda...")
    fixed_lines = 0
    skipped_lines = 0
    output_lines = []

    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            cols = line.split(SEPARATOR)
            if len(cols) < 2:
                skipped_lines += 1
                continue

            wav_id = cols[0].strip()
            original_text = cols[1].strip()
            text = normalize_text(original_text)

            # Nothing left to speak: leave the row out of the output.
            if not text:
                skipped_lines += 1
                continue

            if text != original_text:
                fixed_lines += 1

            # Keep any columns after the text instead of dropping them.
            output_lines.append(SEPARATOR.join([wav_id, text, *cols[2:]]))

    # ── Save ──
    with open(filepath, "w", encoding="utf-8") as f:
        f.write("\n".join(output_lines) + "\n")

    # ── Result ──
    print(f"\n{'=' * 55}")
    print("✅ NATIJA:")
    print(f"   Tuzatilgan qatorlar  : {fixed_lines}")
    print(f"   O'tkazilgan (bo'sh)  : {skipped_lines}")
    print(f"   Saqlangan qatorlar   : {len(output_lines)}")
    print(f"   Backup               : {backup_path}")

    # ── Verification ──
    print("\n🔍 Normalizatsiyadan keyin tekshiruv:")
    stats_after = analyze_metadata(filepath)
    if stats_after["bad_chars"]:
        print("   ⚠️ Hali ham noto'g'ri belgilar bor:")
        for char, count in stats_after["bad_chars"].items():
            print(f"   '{char}' (U+{ord(char):04X}) → {count} marta")
        print("\n   💡 Ularni REPLACEMENTS jadvaliga qo'shing va qayta ishga tushiring.")
    else:
        print("   ✅ Barcha belgilar to'g'ri! Metadata tayyor.")
    print("=" * 55)
262
+
263
+
264
# ─────────────────────────────────────────
# ENTRY POINT
# ─────────────────────────────────────────
267
# Run the normalization on the configured dataset when executed as a script.
if __name__ == "__main__":
    process_metadata(DATASET_PATH)