# Corpus preprocessing: normalization, tokenization and (for Indic scripts)
# optional transliteration to Devanagari.
# Locations of the Indic NLP library checkout and its resource files.
INDIC_NLP_LIB_HOME = "indic_nlp_library"
INDIC_NLP_RESOURCES = "indic_nlp_resources"

import sys

sys.path.append(r"{}".format(INDIC_NLP_LIB_HOME))

from indicnlp import common

# Resources must be registered before loader.load() is called.
common.set_resources_path(INDIC_NLP_RESOURCES)

from indicnlp import loader

loader.load()

from collections import defaultdict

from sacremoses import MosesPunctNormalizer, MosesTokenizer, MosesDetokenizer
from tqdm import tqdm
from joblib import Parallel, delayed

from indicnlp.tokenize import indic_tokenize, indic_detokenize
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate

# Module-level Moses tooling for English, shared by the parallel workers.
en_tok = MosesTokenizer(lang="en")
en_normalizer = MosesPunctNormalizer()
def preprocess_line(line, normalizer, lang, transliterate=False):
    """Normalize and tokenize one sentence; optionally transliterate Indic text.

    Args:
        line: raw input sentence.
        normalizer: Indic normalizer for ``lang`` (unused when ``lang == "en"``).
        lang: language code; ``"en"`` selects the Moses pipeline.
        transliterate: when True, convert the Indic script to Devanagari ("hi").

    Returns:
        The processed sentence as a single space-joined token string.
    """
    text = line.strip()

    if lang == "en":
        # English path: Moses punctuation normalization + tokenization.
        tokens = en_tok.tokenize(en_normalizer.normalize(text), escape=False)
        return " ".join(tokens)

    tokenized = " ".join(
        indic_tokenize.trivial_tokenize(normalizer.normalize(text), lang)
    )
    if not transliterate:
        # we only need to transliterate for joint training
        return tokenized

    # line = indic_detokenize.trivial_detokenize(line.strip(), lang)
    converted = unicode_transliterate.UnicodeIndicTransliterator.transliterate(
        tokenized, lang, "hi"
    )
    # Re-attach viramas that transliteration leaves space-separated.
    return converted.replace(" ् ", "्")
def preprocess(infname, outfname, lang, transliterate=False):
    """
    Normalize, tokenize and script convert (for Indic) an input corpus file.

    Args:
        infname: path to the input file, one sentence per line (UTF-8).
        outfname: path of the processed output file (UTF-8).
        lang: language code; "en" uses the Moses pipeline, otherwise Indic.
        transliterate: for Indic languages, also convert the script to
            Devanagari (used for joint training).

    Returns:
        Number of sentences written to the output file.
    """
    # Count lines up front so tqdm can show progress. Use a context manager:
    # the original `open()` inside the genexp leaked the handle, and omitted
    # an explicit encoding.
    with open(infname, "r", encoding="utf-8") as infile:
        num_lines = sum(1 for _ in infile)

    # The two original branches duplicated the whole read/Parallel/write
    # sequence; only the normalizer differs (English needs none).
    if lang == "en":
        normalizer = None
    else:
        normfactory = indic_normalize.IndicNormalizerFactory()
        normalizer = normfactory.get_normalizer(lang)

    with open(infname, "r", encoding="utf-8") as infile, open(
        outfname, "w", encoding="utf-8"
    ) as outfile:
        # Fan the per-line work out across all cores.
        out_lines = Parallel(n_jobs=-1, backend="multiprocessing")(
            delayed(preprocess_line)(line, normalizer, lang, transliterate)
            for line in tqdm(infile, total=num_lines)
        )
        for line in out_lines:
            outfile.write(line + "\n")

    return len(out_lines)
def old_preprocess(infname, outfname, lang):
    """
    Preparing each corpus file (sequential, pre-joblib version):
      - Normalization
      - Tokenization
      - Script conversion to Devanagari for Indic scripts

    Args:
        infname: path to the input file, one sentence per line (UTF-8).
        outfname: path of the processed output file (UTF-8).
        lang: language code; "en" uses the Moses pipeline, otherwise Indic.

    Returns:
        Number of sentences written to the output file.
    """
    n = 0
    # Count lines for tqdm. Use a context manager so the handle is closed
    # (the original `open()` in a genexp leaked it) and read as UTF-8.
    with open(infname, "r", encoding="utf-8") as f:
        num_lines = sum(1 for _ in f)

    with open(infname, "r", encoding="utf-8") as infile, open(
        outfname, "w", encoding="utf-8"
    ) as outfile:
        if lang == "en":
            en_tok = MosesTokenizer(lang="en")
            en_normalizer = MosesPunctNormalizer()
            for line in tqdm(infile, total=num_lines):
                outline = " ".join(
                    en_tok.tokenize(en_normalizer.normalize(line.strip()), escape=False)
                )
                outfile.write(outline + "\n")
                n += 1
        else:
            normfactory = indic_normalize.IndicNormalizerFactory()
            normalizer = normfactory.get_normalizer(lang)
            for line in tqdm(infile, total=num_lines):
                # Normalize, tokenize, transliterate to Devanagari, then
                # re-attach the space-separated viramas.
                outline = (
                    unicode_transliterate.UnicodeIndicTransliterator.transliterate(
                        " ".join(
                            indic_tokenize.trivial_tokenize(
                                normalizer.normalize(line.strip()), lang
                            )
                        ),
                        lang,
                        "hi",
                    ).replace(" ् ", "्")
                )
                outfile.write(outline + "\n")
                n += 1
    return n
if __name__ == "__main__":
    # CLI: preprocess.py <infile> <outfile> <lang> [transliterate]
    # Validate the argument count BEFORE indexing sys.argv: the original read
    # sys.argv[1..3] first, so too few arguments crashed with IndexError
    # instead of printing the friendly message below.
    if len(sys.argv) not in (4, 5):
        print(f"Invalid arguments: {sys.argv}")
        sys.exit(1)  # non-zero exit so callers/scripts can detect misuse

    infname = sys.argv[1]
    outfname = sys.argv[2]
    lang = sys.argv[3]

    # Optional 4th argument: the literal string "true" (case-insensitive)
    # enables script conversion to Devanagari; anything else disables it.
    transliterate = False
    if len(sys.argv) == 5:
        transliterate = sys.argv[4].lower() == "true"

    # Prints the number of sentences processed.
    print(preprocess(infname, outfname, lang, transliterate))