|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import print_function |
|
|
import sys, getopt, codecs, os, re |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Buckwalter symbol -> Arabic Unicode character.
# Kept as (symbol, character) pairs, listed in ascending code-point order,
# and turned into a dictionary in one step.
buck2uni = dict([
    # U+0621 .. U+063A
    ("'", u"\u0621"),
    ("|", u"\u0622"),
    (">", u"\u0623"),
    ("&", u"\u0624"),
    ("<", u"\u0625"),
    ("}", u"\u0626"),
    ("A", u"\u0627"),
    ("b", u"\u0628"),
    ("p", u"\u0629"),
    ("t", u"\u062A"),
    ("v", u"\u062B"),
    ("j", u"\u062C"),
    ("H", u"\u062D"),
    ("x", u"\u062E"),
    ("d", u"\u062F"),
    ("*", u"\u0630"),
    ("r", u"\u0631"),
    ("z", u"\u0632"),
    ("s", u"\u0633"),
    ("$", u"\u0634"),
    ("S", u"\u0635"),
    ("D", u"\u0636"),
    ("T", u"\u0637"),
    ("Z", u"\u0638"),
    ("E", u"\u0639"),
    ("g", u"\u063A"),
    # U+0640 .. U+064A
    ("_", u"\u0640"),
    ("f", u"\u0641"),
    ("q", u"\u0642"),
    ("k", u"\u0643"),
    ("l", u"\u0644"),
    ("m", u"\u0645"),
    ("n", u"\u0646"),
    ("h", u"\u0647"),
    ("w", u"\u0648"),
    ("Y", u"\u0649"),
    ("y", u"\u064A"),
    # U+064B .. U+0652
    ("F", u"\u064B"),
    ("N", u"\u064C"),
    ("K", u"\u064D"),
    ("a", u"\u064E"),
    ("u", u"\u064F"),
    ("i", u"\u0650"),
    ("~", u"\u0651"),
    ("o", u"\u0652"),
    # U+0670, U+0671
    ("`", u"\u0670"),
    ("{", u"\u0671"),
])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Reverse lookup (Arabic Unicode character -> Buckwalter symbol), derived
# from buck2uni by swapping every pair.
uni2buck = dict((arabic, symbol) for (symbol, arabic) in buck2uni.items())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run-time settings.  An empty string (or 0 for the flag) means "not given";
# the command-line handling later in this script overwrites them.
reverse = 0
inFilename = outFilename = ""
inEnc = outEnc = ""
ignoreChars = columnRange = delimiter = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def usage():
    """Print the command-line help text to standard output.

    The program name shown in the synopsis lines is taken from sys.argv[0].
    """
    prog = sys.argv[0]
    help_lines = [
        "Usage: {} -i INFILE -o OUTFILE [-g CHARS -c RANGE -d CHAR".format(prog),
        " -r -e INPUT_ENCODING, -E OUTPUT ENCODING]",
        " {} -l".format(prog),
        " {} -h".format(prog),
        "",
        " -i INFILE, --input=INFILE:",
        " Path to text file to be transliterated to Unicode.",
        " -o OUTFILE, --output=OUTFILE:",
        " Path of file to output the newly transliterated text.",
        " -e ENC, --input-encoding=ENC:",
        " Specify the text encoding of the source file. Default: latin_1.",
        " -E ENC, --output-encoding=ENC:",
        " Specify the text encoding of the target file. Default: utf_8.",
        " -g CHARS, --ignore-lines=CHARS:",
        " Will not transliterate lines that start with any of the CHARS",
        " given. E.g., -g #; will not alter lines starting with # or ;.",
        # The backslashes are written as "\\" on purpose: "\#" and "\;" are
        # not valid escape sequences and trigger warnings on newer Pythons.
        # The printed text is unchanged.
        " (May need to be -g \\#\\; on some platforms. See README.txt.)",
        " -c RANGE, --columns=RANGE:",
        " If in columns, select columns to apply transliteration. Can be",
        " comma separated numbers, or a range. E.g., -c 1, -c 1-3, -c 1,3.",
        " -d CHAR, --delimiter=CHAR:",
        " Specify the delimiter that defines the column if using the -c",
        " option above. Default is ' ' (space).",
        " -r, --reverse:",
        " Reverses the transliteration, i.e., Arabic to Buckwalter.",
        " When used, it will change the default input encoding to utf_8 and",
        " output encoding to latin_1",
        " -l, --list-encodings:",
        " Displays all supported file encodings.",
        " -h, --help:",
        " Displays this page.",
        "",
    ]
    for text in help_lines:
        print(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def displayEncodings():
    """Print the table of supported encodings to standard output.

    The first row is the header (codec, aliases, languages); each further
    row describes one codec.
    """
    rows = (
        "Codec Aliases Languages",
        "ascii 646, us-ascii English",
        "cp037 IBM037, IBM039 English",
        "cp424 EBCDIC-CP-HE, IBM424 Hebrew",
        "cp437 437, IBM437 English",
        "cp500 EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500 Western Europe",
        "cp737 Greek",
        "cp775 IBM775 Baltic languages",
        "cp850 850, IBM850 Western Europe",
        "cp852 852, IBM852 Central and Eastern Europe",
        "cp855 855, IBM855 Bulgarian, Byelorussian, Macedonian, Russian, Serbian",
        "cp856 Hebrew",
        "cp857 857, IBM857 Turkish",
        "cp860 860, IBM860 Portuguese",
        "cp861 861, CP-IS, IBM861 Icelandic",
        "cp862 862, IBM862 Hebrew",
        "cp863 863, IBM863 Canadian",
        "cp864 IBM864 Arabic",
        "cp865 865, IBM865 Danish, Norwegian",
        "cp869 869, CP-GR, IBM869 Greek",
        "cp874 Thai",
        "cp875 Greek",
        "cp1006 Urdu",
        "cp1026 ibm1026 Turkish",
        "cp1140 ibm1140 Western Europe",
        "cp1250 windows-1250 Central and Eastern Europe",
        "cp1251 windows-1251 Bulgarian, Byelorussian, Macedonian, Russian, Serbian",
        "cp1252 windows-1252 Western Europe",
        "cp1253 windows-1253 Greek",
        "cp1254 windows-1254 Turkish",
        "cp1255 windows-1255 Hebrew",
        "cp1256 windows-1256 Arabic",
        "cp1257 windows-1257 Baltic languages",
        "cp1258 windows-1258 Vietnamese",
        "latin_1 iso-8859-1, iso8859-1, 8859, cp819, latin, latin1, L1 West Europe",
        "iso8859_2 iso-8859-2, latin2, L2 Central and Eastern Europe",
        "iso8859_3 iso-8859-3, latin3, L3 Esperanto, Maltese",
        "iso8859_4 iso-8859-4, latin4, L4 Baltic languagues",
        "iso8859_5 iso-8859-5, cyrillic Bulgarian, Byelorussian, Macedonian, Russian, Serbian",
        "iso8859_6 iso-8859-6, arabic Arabic",
        "iso8859_7 iso-8859-7, greek, greek8 Greek",
        "iso8859_8 iso-8859-8, hebrew Hebrew",
        "iso8859_9 iso-8859-9, latin5, L5 Turkish",
        "iso8859_10 iso-8859-10, latin6, L6 Nordic languages",
        "iso8859_13 iso-8859-13 Baltic languages",
        "iso8859_14 iso-8859-14, latin8, L8 Celtic languages",
        "iso8859_15 iso-8859-15 Western Europe",
        "koi8_r Russian",
        "koi8_u Ukrainian",
        "mac_cyrillic maccyrillic Bulgarian, Byelorussian, Macedonian, Russian, Serbian",
        "mac_greek macgreek Greek",
        "mac_iceland maciceland Icelandic",
        "mac_latin2 maclatin2, maccentraleurope Central and Eastern Europe",
        "mac_roman macroman Western Europe",
        "mac_turkish macturkish Turkish",
        "utf_16 U16, utf16 all languages",
        "utf_16_be UTF-16BE all languages (BMP only)",
        "utf_16_le UTF-16LE all languages (BMP only)",
        "utf_7 U7 all languages",
        "utf_8 U8, UTF, utf8 all languages",
    )
    for row in rows:
        print(row)
|
|
|
|
|
def parseIgnoreString(string):
    """Return the characters of *string* as a list, one entry per character.

    Order and duplicates are kept; an empty string gives an empty list.
    """
    return list(string)
|
|
|
|
|
|
|
|
|
|
|
# Parse the command line.  Short options followed by ":" and long options
# ending in "=" take a value.
try:
    (options, args) = getopt.getopt(
        sys.argv[1:],
        "i:o:e:E:g:c:d:rlh",
        [
            "input=", "output=", "input-encoding=", "output-encoding=",
            "ignore-lines=", "columns=",
            # The comma after "delimiter=" is essential.  Without it Python
            # joins the two adjacent literals into the single bogus option
            # "delimiter=reverse", and both --delimiter=X and --reverse are
            # rejected.
            "delimiter=", "reverse", "list-encodings",
            "help",
        ],
    )
except getopt.GetoptError:
    # Unknown option or missing value: show the help text and fail.
    usage()
    sys.exit(1)
|
|
|
|
|
|
|
|
# Copy every parsed option into its module-level setting.  -h and -l are
# informational: they print their text and end the program successfully.
for (opt, arg) in options:
    if opt in ("-h", "--help"):
        usage()
        sys.exit(0)
    elif opt in ("-l", "--list-encodings"):
        displayEncodings()
        sys.exit(0)
    elif opt in ("-i", "--input"):
        inFilename = arg
    elif opt in ("-o", "--output"):
        outFilename = arg
    elif opt in ("-e", "--input-encoding"):
        inEnc = arg
    elif opt in ("-E", "--output-encoding"):
        outEnc = arg
    elif opt in ("-r", "--reverse"):
        reverse = 1
    elif opt in ("-g", "--ignore-lines"):
        ignoreChars = arg
    elif opt in ("-c", "--columns"):
        columnRange = arg
    elif opt in ("-d", "--delimiter"):
        delimiter = arg
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Let the user ask for a tab by typing the two characters backslash + t.
delimiter = delimiter.replace("\\t", "\t")

# Only one character can act as the column separator; keep the first and
# warn about the rest.
if len(delimiter) > 1:
    print("Delimiter should only be a single character. Using first character: "
          + delimiter[0], file=sys.stderr)
    delimiter = delimiter[0]

# A separator that is itself a Buckwalter symbol cannot be told apart from
# the text to transliterate, so refuse it.
if delimiter in buck2uni:
    print("Invalid delimiter. \"" + delimiter + "\" is part of the Buckwalter character set.", file=sys.stderr)
    print("This will obviously cause much confusion as a delimiter!", file=sys.stderr)
    print("Please try again. Aborting...", file=sys.stderr)
    sys.exit(1)

# No delimiter given: columns are separated by a single space.
if not delimiter:
    delimiter = " "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Default input encoding when none was given: utf_8 for reverse mode,
# latin_1 otherwise.
if not inEnc:
    inEnc = "utf_8" if reverse else "latin_1"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Default output encoding when none was given: latin_1 for reverse mode,
# utf_8 otherwise.
if not outEnc:
    outEnc = "latin_1" if reverse else "utf_8"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Open the output file for writing in the requested encoding.
if not outFilename:
    print("Must specify a file to use store the output! Aborting...", file=sys.stderr)
    sys.exit(1)

try:
    outFile = codecs.open(outFilename, "w", outEnc)
except (IOError, LookupError) as msg:
    # IOError: the path cannot be opened for writing.
    # LookupError: the encoding name is unknown to the codecs module; without
    # catching it a mistyped -E value would end in a raw traceback.
    print(msg, file=sys.stderr)
    sys.exit(1)
|
|
|
|
|
|
|
|
# Open the input file for reading in the requested encoding.
if not inFilename:
    print("Must specify a file to use as input! Aborting...", file=sys.stderr)
    sys.exit(1)

try:
    inFile = codecs.open(inFilename, "r", inEnc)
except (IOError, LookupError) as msg:
    # IOError: the file is missing or unreadable.
    # LookupError: the encoding name is unknown to the codecs module; without
    # catching it a mistyped -e value would end in a raw traceback.
    print(msg, file=sys.stderr)
    sys.exit(1)
|
|
|
|
|
def getColsFromRange(cRange):
    """Expand a column specification into a list of 0-based column indices.

    *cRange* is a comma-separated list whose items are either a single
    1-based column number ("3") or an inclusive 1-based range ("2-4").
    Items are expanded in the order given; nothing is sorted or de-duplicated.

    Raises ValueError when an item is not a number or a "start-end" pair.
    """
    indices = []
    for item in cRange.split(","):
        if "-" in item:
            first, last = item.split("-")
            # 1-based inclusive [first, last] -> 0-based half-open range.
            indices.extend(range(int(first) - 1, int(last)))
        else:
            indices.append(int(item) - 1)
    return indices
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def transliterate(inString, lineNumber):
    """Transliterate one line, honouring the column selection if there is one.

    With no column range configured the whole of *inString* is converted.
    Otherwise the line is split on the configured delimiter, only the
    selected columns are converted, and the pieces are joined back together
    with the same delimiter.

    *lineNumber* is accepted but not used by this function.
    """
    if not columnRange:
        return transliterateString(inString)

    selected = getColsFromRange(columnRange)
    pieces = []
    for position, field in enumerate(inString.split(delimiter)):
        if position in selected:
            pieces.append(transliterateString(field))
        else:
            pieces.append(field)
    return delimiter.join(pieces)
|
|
|
|
|
def transliterateString(inString):
    """Map every character of *inString* through the active lookup table.

    The table is buck2uni normally and uni2buck when the reverse flag is
    set.  Characters with no entry in the table are copied unchanged.
    """
    table = uni2buck if reverse else buck2uni
    return "".join(table.get(symbol, symbol) for symbol in inString)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Convert the input line by line and write the result.
# Reading happens inside the try block so that decoding errors are reported
# the same way as encoding errors, and the finally clause closes both files
# even when the script leaves through sys.exit().
try:
    lines = inFile.readlines()

    # Characters that mark a line as "copy through untouched"; worked out
    # once rather than for every line.
    ignoredStarts = parseIgnoreString(ignoreChars)

    currentLineNumber = 1

    for line in lines:
        line = line.strip()

        # The "line and" guard matters: a blank line has no first character,
        # and indexing it would raise IndexError.
        if line and line[0] in ignoredStarts:
            outFile.write(line + " " + os.linesep)
        else:
            outFile.write(transliterate(line, currentLineNumber) + " " + os.linesep)

        currentLineNumber = currentLineNumber + 1

except UnicodeError as msg:
    print(msg, file=sys.stderr)
    sys.exit(1)

finally:
    inFile.close()
    outFile.close()
|
|
|
|
|
|
|
|
|