Dionyssos committed on
Commit
2049895
·
1 Parent(s): 53e868b
Files changed (1) hide show
  1. textual.py +8 -80
textual.py CHANGED
@@ -3,18 +3,8 @@ import unicodedata
3
  from num2words import num2words
4
  from num2word_greek.numbers2words import convert_numbers
5
 
6
- def only_greek_or_only_latin(text, lang='grc'):
7
- '''
8
- str: The converted string in the specified target script.
9
- Characters not found in any mapping are preserved as is.
10
- Latin accented characters in the input (e.g., 'É', 'ü') will
11
- be preserved in their lowercase form (e.g., 'é', 'ü') if
12
- converting to Latin.
13
- '''
14
-
15
- # --- Mapping Dictionaries ---
16
- # Keys are in lowercase as input text is case-folded.
17
- # If the output needs to maintain original casing, additional logic is required.
18
 
19
  latin_to_greek_map = {
20
  'a': 'α', 'b': 'β', 'g': 'γ', 'd': 'δ', 'e': 'ε',
@@ -147,10 +137,7 @@ def only_greek_or_only_latin(text, lang='grc'):
147
  current_index += 1
148
 
149
  return ''.join(output_chars)
150
-
151
-
152
- # =====================================================
153
- #
154
 
155
  def fix_vocals(text, lang='ron'):
156
 
@@ -177,20 +164,15 @@ def fix_vocals(text, lang='ron'):
177
  '<': ' mai mic decât ',
178
  '>': ' mai mare decât',
179
  '%': ' la sută ', # percent (from previous)
180
- '≠': ' nu este egal cu ',
181
- '≤': ' mai mic sau egal cu ',
182
- '≥': ' mai mare sau egal cu ',
183
- '≈': ' aproximativ ',
184
- '∞': ' infinit ',
185
  '€': ' euro ',
186
  '$': ' dolar ',
187
  '£': ' liră ',
188
  '&': ' și ', # and
189
- '@': ' la ', # at
190
- '#': ' diez ', # hash
191
  '∑': ' sumă ',
192
  '∫': ' integrală ',
193
- '√': ' rădăcina pătrată a ', # more generic square root
194
  }
195
 
196
  eng_replacements = {
@@ -211,14 +193,6 @@ def fix_vocals(text, lang='ron'):
211
  '>': ' greater than ',
212
  # Additional common math symbols from previous list
213
  '%': ' percent ',
214
- '∑': ' sum ',
215
- '∫': ' integral ',
216
- '√': ' square root of ',
217
- '≠': ' not equals ',
218
- '≤': ' less than or equals ',
219
- '≥': ' greater than or equals ',
220
- '≈': ' approximately ',
221
- '∞': ' infinity ',
222
  '€': ' euro ',
223
  '$': ' dollar ',
224
  '£': ' pound ',
@@ -249,20 +223,9 @@ def fix_vocals(text, lang='ron'):
249
  '<': ' manje od ',
250
  '>': ' veće od ',
251
  '%': ' procenat ',
252
- '∑': ' suma ',
253
- '∫': ' integral ',
254
- '√': ' kvadratni koren ',
255
- '≠': ' nije jednako ',
256
- '≤': ' manje ili jednako od ',
257
- '≥': ' veće ili jednako od ',
258
- '≈': ' približno ',
259
- '∞': ' beskonačnost ',
260
  '€': ' evro ',
261
  '$': ' dolar ',
262
  '£': ' funta ',
263
- '&': ' i ',
264
- '@': ' et ',
265
- '#': ' taraba ',
266
  # Others
267
  # 'rn': 'rrn',
268
  # 'ć': 'č',
@@ -312,14 +275,6 @@ def fix_vocals(text, lang='ron'):
312
  '>': ' größer als',
313
  # Additional common math symbols from previous list
314
  '%': ' prozent ',
315
- '∑': ' Summe ',
316
- '∫': ' Integral ',
317
- '√': ' Quadratwurzel ',
318
- '≠': ' ungleich ',
319
- '≤': ' kleiner oder gleich ',
320
- '≥': ' größer oder gleich ',
321
- '≈': ' ungefähr ',
322
- '∞': ' unendlich ',
323
  '€': ' euro ',
324
  '$': ' dollar ',
325
  '£': ' pfund ',
@@ -345,20 +300,11 @@ def fix_vocals(text, lang='ron'):
345
  '>': ' supérieur à ',
346
  # Add more common math symbols as needed for French
347
  '%': ' pour cent ',
348
- '∑': ' somme ',
349
- '∫': ' intégrale ',
350
- '√': ' racine carrée ',
351
- '≠': ' n\'égale pas ',
352
- '≤': ' inférieur ou égal à ',
353
- '≥': ' supérieur ou égal à ',
354
- '≈': ' approximativement ',
355
- '∞': ' infini ',
356
  '€': ' euro ',
357
  '$': ' dollar ',
358
  '£': ' livre ',
359
  '&': ' et ',
360
  '@': ' arobase ',
361
- '#': ' dièse ',
362
  }
363
 
364
  hun_replacements = {
@@ -380,16 +326,7 @@ def fix_vocals(text, lang='ron'):
380
  'pi': ' pi ',
381
  '<': ' kisebb mint ',
382
  '>': ' nagyobb mint ',
383
- # Add more common math symbols as needed for Hungarian
384
  '%': ' százalék ',
385
- '∑': ' szumma ',
386
- '∫': ' integrál ',
387
- '√': ' négyzetgyök ',
388
- '≠': ' nem egyenlő ',
389
- '≤': ' kisebb vagy egyenlő ',
390
- '≥': ' nagyobb vagy egyenlő ',
391
- '≈': ' körülbelül ',
392
- '∞': ' végtelen ',
393
  '€': ' euró ',
394
  '$': ' dollár ',
395
  '£': ' font ',
@@ -406,22 +343,13 @@ def fix_vocals(text, lang='ron'):
406
  '^': ' εἰς τὴν δύναμιν ',
407
  '+': ' σὺν ',
408
  ' - ': ' χωρὶς ',
409
- '*': ' πολλάκις ',
410
  ' / ': ' διαιρέω ',
411
  '=': ' ἴσον ',
412
  'pi': ' πῖ ',
413
  '<': ' ἔλαττον ',
414
  '>': ' μεῖζον ',
415
- # Add more common math symbols as needed for Ancient Greek
416
  '%': ' τοῖς ἑκατόν ', # tois hekaton - 'of the hundred'
417
- '∑': ' ἄθροισμα ',
418
- '∫': ' ὁλοκλήρωμα ',
419
- '√': ' τετραγωνικὴ ῥίζα ',
420
- '≠': ' οὐκ ἴσον ',
421
- '≤': ' ἔλαττον ἢ ἴσον ',
422
- '≥': ' μεῖζον ἢ ἴσον ',
423
- '≈': ' περίπου ',
424
- '∞': ' ἄπειρον ',
425
  '€': ' εὐρώ ',
426
  '$': ' δολάριον ',
427
  '£': ' λίρα ',
@@ -512,4 +440,4 @@ def transliterate_number(number_string,
512
  return match.group(0) # Return original if conversion fails
513
 
514
  pattern = r'([^\d]*)(\d+(\.\d+)?([Ee][+-]?\d+)?)([^\d]*)'
515
- return re.sub(pattern, replace_number, number_string)
 
3
  from num2words import num2words
4
  from num2word_greek.numbers2words import convert_numbers
5
 
6
+ def only_greek_or_only_latin(text,
7
+ lang='grc'):
 
 
 
 
 
 
 
 
 
 
8
 
9
  latin_to_greek_map = {
10
  'a': 'α', 'b': 'β', 'g': 'γ', 'd': 'δ', 'e': 'ε',
 
137
  current_index += 1
138
 
139
  return ''.join(output_chars)
140
+
 
 
 
141
 
142
  def fix_vocals(text, lang='ron'):
143
 
 
164
  '<': ' mai mic decât ',
165
  '>': ' mai mare decât',
166
  '%': ' la sută ', # percent (from previous)
 
 
 
 
 
167
  '€': ' euro ',
168
  '$': ' dolar ',
169
  '£': ' liră ',
170
  '&': ' și ', # and
171
+ #'@': ' la ', # at
172
+ #'#': ' diez ', # hash
173
  '∑': ' sumă ',
174
  '∫': ' integrală ',
175
+ #'√': ' rădăcina pătrată a ', # more generic square root
176
  }
177
 
178
  eng_replacements = {
 
193
  '>': ' greater than ',
194
  # Additional common math symbols from previous list
195
  '%': ' percent ',
 
 
 
 
 
 
 
 
196
  '€': ' euro ',
197
  '$': ' dollar ',
198
  '£': ' pound ',
 
223
  '<': ' manje od ',
224
  '>': ' veće od ',
225
  '%': ' procenat ',
 
 
 
 
 
 
 
 
226
  '€': ' evro ',
227
  '$': ' dolar ',
228
  '£': ' funta ',
 
 
 
229
  # Others
230
  # 'rn': 'rrn',
231
  # 'ć': 'č',
 
275
  '>': ' größer als',
276
  # Additional common math symbols from previous list
277
  '%': ' prozent ',
 
 
 
 
 
 
 
 
278
  '€': ' euro ',
279
  '$': ' dollar ',
280
  '£': ' pfund ',
 
300
  '>': ' supérieur à ',
301
  # Add more common math symbols as needed for French
302
  '%': ' pour cent ',
 
 
 
 
 
 
 
 
303
  '€': ' euro ',
304
  '$': ' dollar ',
305
  '£': ' livre ',
306
  '&': ' et ',
307
  '@': ' arobase ',
 
308
  }
309
 
310
  hun_replacements = {
 
326
  'pi': ' pi ',
327
  '<': ' kisebb mint ',
328
  '>': ' nagyobb mint ',
 
329
  '%': ' százalék ',
 
 
 
 
 
 
 
 
330
  '€': ' euró ',
331
  '$': ' dollár ',
332
  '£': ' font ',
 
343
  '^': ' εἰς τὴν δύναμιν ',
344
  '+': ' σὺν ',
345
  ' - ': ' χωρὶς ',
346
+ ' * ': ' πολλάκις ',
347
  ' / ': ' διαιρέω ',
348
  '=': ' ἴσον ',
349
  'pi': ' πῖ ',
350
  '<': ' ἔλαττον ',
351
  '>': ' μεῖζον ',
 
352
  '%': ' τοῖς ἑκατόν ', # tois hekaton - 'of the hundred'
 
 
 
 
 
 
 
 
353
  '€': ' εὐρώ ',
354
  '$': ' δολάριον ',
355
  '£': ' λίρα ',
 
440
  return match.group(0) # Return original if conversion fails
441
 
442
  pattern = r'([^\d]*)(\d+(\.\d+)?([Ee][+-]?\d+)?)([^\d]*)'
443
+ return re.sub(pattern, replace_number, number_string)