text rules
Browse files
- textual.py +8 -80
textual.py
CHANGED
|
@@ -3,18 +3,8 @@ import unicodedata
|
|
| 3 |
from num2words import num2words
|
| 4 |
from num2word_greek.numbers2words import convert_numbers
|
| 5 |
|
| 6 |
-
def only_greek_or_only_latin(text,
|
| 7 |
-
|
| 8 |
-
str: The converted string in the specified target script.
|
| 9 |
-
Characters not found in any mapping are preserved as is.
|
| 10 |
-
Latin accented characters in the input (e.g., 'É', 'ü') will
|
| 11 |
-
be preserved in their lowercase form (e.g., 'é', 'ü') if
|
| 12 |
-
converting to Latin.
|
| 13 |
-
'''
|
| 14 |
-
|
| 15 |
-
# --- Mapping Dictionaries ---
|
| 16 |
-
# Keys are in lowercase as input text is case-folded.
|
| 17 |
-
# If the output needs to maintain original casing, additional logic is required.
|
| 18 |
|
| 19 |
latin_to_greek_map = {
|
| 20 |
'a': 'α', 'b': 'β', 'g': 'γ', 'd': 'δ', 'e': 'ε',
|
|
@@ -147,10 +137,7 @@ def only_greek_or_only_latin(text, lang='grc'):
|
|
| 147 |
current_index += 1
|
| 148 |
|
| 149 |
return ''.join(output_chars)
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
# =====================================================
|
| 153 |
-
#
|
| 154 |
|
| 155 |
def fix_vocals(text, lang='ron'):
|
| 156 |
|
|
@@ -177,20 +164,15 @@ def fix_vocals(text, lang='ron'):
|
|
| 177 |
'<': ' mai mic decât ',
|
| 178 |
'>': ' mai mare decât',
|
| 179 |
'%': ' la sută ', # percent (from previous)
|
| 180 |
-
'≠': ' nu este egal cu ',
|
| 181 |
-
'≤': ' mai mic sau egal cu ',
|
| 182 |
-
'≥': ' mai mare sau egal cu ',
|
| 183 |
-
'≈': ' aproximativ ',
|
| 184 |
-
'∞': ' infinit ',
|
| 185 |
'€': ' euro ',
|
| 186 |
'$': ' dolar ',
|
| 187 |
'£': ' liră ',
|
| 188 |
'&': ' și ', # and
|
| 189 |
-
'@': ' la ', # at
|
| 190 |
-
'#': ' diez ', # hash
|
| 191 |
'∑': ' sumă ',
|
| 192 |
'∫': ' integrală ',
|
| 193 |
-
'√': ' rădăcina pătrată a ', # more generic square root
|
| 194 |
}
|
| 195 |
|
| 196 |
eng_replacements = {
|
|
@@ -211,14 +193,6 @@ def fix_vocals(text, lang='ron'):
|
|
| 211 |
'>': ' greater than ',
|
| 212 |
# Additional common math symbols from previous list
|
| 213 |
'%': ' percent ',
|
| 214 |
-
'∑': ' sum ',
|
| 215 |
-
'∫': ' integral ',
|
| 216 |
-
'√': ' square root of ',
|
| 217 |
-
'≠': ' not equals ',
|
| 218 |
-
'≤': ' less than or equals ',
|
| 219 |
-
'≥': ' greater than or equals ',
|
| 220 |
-
'≈': ' approximately ',
|
| 221 |
-
'∞': ' infinity ',
|
| 222 |
'€': ' euro ',
|
| 223 |
'$': ' dollar ',
|
| 224 |
'£': ' pound ',
|
|
@@ -249,20 +223,9 @@ def fix_vocals(text, lang='ron'):
|
|
| 249 |
'<': ' manje od ',
|
| 250 |
'>': ' veće od ',
|
| 251 |
'%': ' procenat ',
|
| 252 |
-
'∑': ' suma ',
|
| 253 |
-
'∫': ' integral ',
|
| 254 |
-
'√': ' kvadratni koren ',
|
| 255 |
-
'≠': ' nije jednako ',
|
| 256 |
-
'≤': ' manje ili jednako od ',
|
| 257 |
-
'≥': ' veće ili jednako od ',
|
| 258 |
-
'≈': ' približno ',
|
| 259 |
-
'∞': ' beskonačnost ',
|
| 260 |
'€': ' evro ',
|
| 261 |
'$': ' dolar ',
|
| 262 |
'£': ' funta ',
|
| 263 |
-
'&': ' i ',
|
| 264 |
-
'@': ' et ',
|
| 265 |
-
'#': ' taraba ',
|
| 266 |
# Others
|
| 267 |
# 'rn': 'rrn',
|
| 268 |
# 'ć': 'č',
|
|
@@ -312,14 +275,6 @@ def fix_vocals(text, lang='ron'):
|
|
| 312 |
'>': ' größer als',
|
| 313 |
# Additional common math symbols from previous list
|
| 314 |
'%': ' prozent ',
|
| 315 |
-
'∑': ' Summe ',
|
| 316 |
-
'∫': ' Integral ',
|
| 317 |
-
'√': ' Quadratwurzel ',
|
| 318 |
-
'≠': ' ungleich ',
|
| 319 |
-
'≤': ' kleiner oder gleich ',
|
| 320 |
-
'≥': ' größer oder gleich ',
|
| 321 |
-
'≈': ' ungefähr ',
|
| 322 |
-
'∞': ' unendlich ',
|
| 323 |
'€': ' euro ',
|
| 324 |
'$': ' dollar ',
|
| 325 |
'£': ' pfund ',
|
|
@@ -345,20 +300,11 @@ def fix_vocals(text, lang='ron'):
|
|
| 345 |
'>': ' supérieur à ',
|
| 346 |
# Add more common math symbols as needed for French
|
| 347 |
'%': ' pour cent ',
|
| 348 |
-
'∑': ' somme ',
|
| 349 |
-
'∫': ' intégrale ',
|
| 350 |
-
'√': ' racine carrée ',
|
| 351 |
-
'≠': ' n\'égale pas ',
|
| 352 |
-
'≤': ' inférieur ou égal à ',
|
| 353 |
-
'≥': ' supérieur ou égal à ',
|
| 354 |
-
'≈': ' approximativement ',
|
| 355 |
-
'∞': ' infini ',
|
| 356 |
'€': ' euro ',
|
| 357 |
'$': ' dollar ',
|
| 358 |
'£': ' livre ',
|
| 359 |
'&': ' et ',
|
| 360 |
'@': ' arobase ',
|
| 361 |
-
'#': ' dièse ',
|
| 362 |
}
|
| 363 |
|
| 364 |
hun_replacements = {
|
|
@@ -380,16 +326,7 @@ def fix_vocals(text, lang='ron'):
|
|
| 380 |
'pi': ' pi ',
|
| 381 |
'<': ' kisebb mint ',
|
| 382 |
'>': ' nagyobb mint ',
|
| 383 |
-
# Add more common math symbols as needed for Hungarian
|
| 384 |
'%': ' százalék ',
|
| 385 |
-
'∑': ' szumma ',
|
| 386 |
-
'∫': ' integrál ',
|
| 387 |
-
'√': ' négyzetgyök ',
|
| 388 |
-
'≠': ' nem egyenlő ',
|
| 389 |
-
'≤': ' kisebb vagy egyenlő ',
|
| 390 |
-
'≥': ' nagyobb vagy egyenlő ',
|
| 391 |
-
'≈': ' körülbelül ',
|
| 392 |
-
'∞': ' végtelen ',
|
| 393 |
'€': ' euró ',
|
| 394 |
'$': ' dollár ',
|
| 395 |
'£': ' font ',
|
|
@@ -406,22 +343,13 @@ def fix_vocals(text, lang='ron'):
|
|
| 406 |
'^': ' εἰς τὴν δύναμιν ',
|
| 407 |
'+': ' σὺν ',
|
| 408 |
' - ': ' χωρὶς ',
|
| 409 |
-
'*': ' πολλάκις ',
|
| 410 |
' / ': ' διαιρέω ',
|
| 411 |
'=': ' ἴσον ',
|
| 412 |
'pi': ' πῖ ',
|
| 413 |
'<': ' ἔλαττον ',
|
| 414 |
'>': ' μεῖζον ',
|
| 415 |
-
# Add more common math symbols as needed for Ancient Greek
|
| 416 |
'%': ' τοῖς ἑκατόν ', # tois hekaton - 'of the hundred'
|
| 417 |
-
'∑': ' ἄθροισμα ',
|
| 418 |
-
'∫': ' ὁλοκλήρωμα ',
|
| 419 |
-
'√': ' τετραγωνικὴ ῥίζα ',
|
| 420 |
-
'≠': ' οὐκ ἴσον ',
|
| 421 |
-
'≤': ' ἔλαττον ἢ ἴσον ',
|
| 422 |
-
'≥': ' μεῖζον ἢ ἴσον ',
|
| 423 |
-
'≈': ' περίπου ',
|
| 424 |
-
'∞': ' ἄπειρον ',
|
| 425 |
'€': ' εὐρώ ',
|
| 426 |
'$': ' δολάριον ',
|
| 427 |
'£': ' λίρα ',
|
|
@@ -512,4 +440,4 @@ def transliterate_number(number_string,
|
|
| 512 |
return match.group(0) # Return original if conversion fails
|
| 513 |
|
| 514 |
pattern = r'([^\d]*)(\d+(\.\d+)?([Ee][+-]?\d+)?)([^\d]*)'
|
| 515 |
-
return re.sub(pattern, replace_number, number_string)
|
|
|
|
| 3 |
from num2words import num2words
|
| 4 |
from num2word_greek.numbers2words import convert_numbers
|
| 5 |
|
| 6 |
+
def only_greek_or_only_latin(text,
|
| 7 |
+
lang='grc'):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
latin_to_greek_map = {
|
| 10 |
'a': 'α', 'b': 'β', 'g': 'γ', 'd': 'δ', 'e': 'ε',
|
|
|
|
| 137 |
current_index += 1
|
| 138 |
|
| 139 |
return ''.join(output_chars)
|
| 140 |
+
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
def fix_vocals(text, lang='ron'):
|
| 143 |
|
|
|
|
| 164 |
'<': ' mai mic decât ',
|
| 165 |
'>': ' mai mare decât',
|
| 166 |
'%': ' la sută ', # percent (from previous)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
'€': ' euro ',
|
| 168 |
'$': ' dolar ',
|
| 169 |
'£': ' liră ',
|
| 170 |
'&': ' și ', # and
|
| 171 |
+
#'@': ' la ', # at
|
| 172 |
+
#'#': ' diez ', # hash
|
| 173 |
'∑': ' sumă ',
|
| 174 |
'∫': ' integrală ',
|
| 175 |
+
#'√': ' rădăcina pătrată a ', # more generic square root
|
| 176 |
}
|
| 177 |
|
| 178 |
eng_replacements = {
|
|
|
|
| 193 |
'>': ' greater than ',
|
| 194 |
# Additional common math symbols from previous list
|
| 195 |
'%': ' percent ',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
'€': ' euro ',
|
| 197 |
'$': ' dollar ',
|
| 198 |
'£': ' pound ',
|
|
|
|
| 223 |
'<': ' manje od ',
|
| 224 |
'>': ' veće od ',
|
| 225 |
'%': ' procenat ',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
'€': ' evro ',
|
| 227 |
'$': ' dolar ',
|
| 228 |
'£': ' funta ',
|
|
|
|
|
|
|
|
|
|
| 229 |
# Others
|
| 230 |
# 'rn': 'rrn',
|
| 231 |
# 'ć': 'č',
|
|
|
|
| 275 |
'>': ' größer als',
|
| 276 |
# Additional common math symbols from previous list
|
| 277 |
'%': ' prozent ',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
'€': ' euro ',
|
| 279 |
'$': ' dollar ',
|
| 280 |
'£': ' pfund ',
|
|
|
|
| 300 |
'>': ' supérieur à ',
|
| 301 |
# Add more common math symbols as needed for French
|
| 302 |
'%': ' pour cent ',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
'€': ' euro ',
|
| 304 |
'$': ' dollar ',
|
| 305 |
'£': ' livre ',
|
| 306 |
'&': ' et ',
|
| 307 |
'@': ' arobase ',
|
|
|
|
| 308 |
}
|
| 309 |
|
| 310 |
hun_replacements = {
|
|
|
|
| 326 |
'pi': ' pi ',
|
| 327 |
'<': ' kisebb mint ',
|
| 328 |
'>': ' nagyobb mint ',
|
|
|
|
| 329 |
'%': ' százalék ',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
'€': ' euró ',
|
| 331 |
'$': ' dollár ',
|
| 332 |
'£': ' font ',
|
|
|
|
| 343 |
'^': ' εἰς τὴν δύναμιν ',
|
| 344 |
'+': ' σὺν ',
|
| 345 |
' - ': ' χωρὶς ',
|
| 346 |
+
' * ': ' πολλάκις ',
|
| 347 |
' / ': ' διαιρέω ',
|
| 348 |
'=': ' ἴσον ',
|
| 349 |
'pi': ' πῖ ',
|
| 350 |
'<': ' ἔλαττον ',
|
| 351 |
'>': ' μεῖζον ',
|
|
|
|
| 352 |
'%': ' τοῖς ἑκατόν ', # tois hekaton - 'of the hundred'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
'€': ' εὐρώ ',
|
| 354 |
'$': ' δολάριον ',
|
| 355 |
'£': ' λίρα ',
|
|
|
|
| 440 |
return match.group(0) # Return original if conversion fails
|
| 441 |
|
| 442 |
pattern = r'([^\d]*)(\d+(\.\d+)?([Ee][+-]?\d+)?)([^\d]*)'
|
| 443 |
+
return re.sub(pattern, replace_number, number_string)
|