tyfsadik committed
Commit 88cda3e · verified · 1 Parent(s): e59b6d4

Upload 4 files

utils/filter_brackets.py ADDED
@@ -0,0 +1,35 @@
import argparse
import re

from helpers import write_lines


def filter_line(line):
    if "-LRB-" in line and "-RRB-" in line:
        rep = re.sub(r'\-.*?LRB.*?\-.*?\-.*?RRB.*?\-', '', line)
        line_cleaned = rep
    elif ("-LRB-" in line and "-RRB-" not in line) or (
            "-LRB-" not in line and "-RRB-" in line):
        line_cleaned = line.replace("-LRB-", '"').replace("-RRB-", '"')
    else:
        line_cleaned = line
    return line_cleaned


def main(args):
    with open(args.source) as f:
        data = [row.rstrip() for row in f]

    write_lines(args.output, [filter_line(row) for row in data])


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--source',
                        help='Path to the source file',
                        required=True)
    parser.add_argument('-o', '--output',
                        help='Path to the output file',
                        required=True)
    args = parser.parse_args()
    main(args)
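
A quick illustrative check of filter_line (from the module above) on Penn Treebank-style bracket tokens; the sample strings are made up:

    filter_line("He smiled -LRB- briefly -RRB- and left .")
    # -> 'He smiled  and left .'   (a matched -LRB- ... -RRB- span is removed, leaving a double space)
    filter_line("He said -LRB- hello .")
    # -> 'He said " hello .'       (an unmatched bracket token becomes a double quote)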
utils/helpers.py ADDED
@@ -0,0 +1,233 @@
import os
from pathlib import Path


VOCAB_DIR = Path(__file__).resolve().parent.parent / "data"
PAD = "@@PADDING@@"
UNK = "@@UNKNOWN@@"
START_TOKEN = "$START"
SEQ_DELIMETERS = {"tokens": " ",
                  "labels": "SEPL|||SEPR",
                  "operations": "SEPL__SEPR"}
REPLACEMENTS = {
    "''": '"',
    '--': '—',
    '`': "'",
    "'ve": "' ve",
}


def get_verb_form_dicts():
    path_to_dict = os.path.join(VOCAB_DIR, "verb-form-vocab.txt")
    encode, decode = {}, {}
    with open(path_to_dict, encoding="utf-8") as f:
        for line in f:
            words, tags = line.split(":")
            word1, word2 = words.split("_")
            tag1, tag2 = tags.split("_")
            decode_key = f"{word1}_{tag1}_{tag2.strip()}"
            if decode_key not in decode:
                encode[words] = tags
                decode[decode_key] = word2
    return encode, decode


ENCODE_VERB_DICT, DECODE_VERB_DICT = get_verb_form_dicts()


def get_target_sent_by_edits(source_tokens, edits):
    target_tokens = source_tokens[:]
    shift_idx = 0
    for edit in edits:
        start, end, label, _ = edit
        target_pos = start + shift_idx
        source_token = target_tokens[target_pos] \
            if len(target_tokens) > target_pos >= 0 else ''
        if label == "":
            del target_tokens[target_pos]
            shift_idx -= 1
        elif start == end:
            word = label.replace("$APPEND_", "")
            target_tokens[target_pos: target_pos] = [word]
            shift_idx += 1
        elif label.startswith("$TRANSFORM_"):
            word = apply_reverse_transformation(source_token, label)
            if word is None:
                word = source_token
            target_tokens[target_pos] = word
        elif start == end - 1:
            word = label.replace("$REPLACE_", "")
            target_tokens[target_pos] = word
        elif label.startswith("$MERGE_"):
            target_tokens[target_pos + 1: target_pos + 1] = [label]
            shift_idx += 1

    return replace_merge_transforms(target_tokens)


def replace_merge_transforms(tokens):
    if all(not x.startswith("$MERGE_") for x in tokens):
        return tokens

    target_line = " ".join(tokens)
    target_line = target_line.replace(" $MERGE_HYPHEN ", "-")
    target_line = target_line.replace(" $MERGE_SPACE ", "")
    return target_line.split()


def convert_using_case(token, smart_action):
    if not smart_action.startswith("$TRANSFORM_CASE_"):
        return token
    if smart_action.endswith("LOWER"):
        return token.lower()
    elif smart_action.endswith("UPPER"):
        return token.upper()
    elif smart_action.endswith("CAPITAL"):
        return token.capitalize()
    elif smart_action.endswith("CAPITAL_1"):
        return token[0] + token[1:].capitalize()
    elif smart_action.endswith("UPPER_-1"):
        return token[:-1].upper() + token[-1]
    else:
        return token


def convert_using_verb(token, smart_action):
    key_word = "$TRANSFORM_VERB_"
    if not smart_action.startswith(key_word):
        raise Exception(f"Unknown action type {smart_action}")
    encoding_part = f"{token}_{smart_action[len(key_word):]}"
    decoded_target_word = decode_verb_form(encoding_part)
    return decoded_target_word


def convert_using_split(token, smart_action):
    key_word = "$TRANSFORM_SPLIT"
    if not smart_action.startswith(key_word):
        raise Exception(f"Unknown action type {smart_action}")
    target_words = token.split("-")
    return " ".join(target_words)


def convert_using_plural(token, smart_action):
    if smart_action.endswith("PLURAL"):
        return token + "s"
    elif smart_action.endswith("SINGULAR"):
        return token[:-1]
    else:
        raise Exception(f"Unknown action type {smart_action}")


def apply_reverse_transformation(source_token, transform):
    if transform.startswith("$TRANSFORM"):
        # deal with equal
        if transform == "$KEEP":
            return source_token
        # deal with case
        if transform.startswith("$TRANSFORM_CASE"):
            return convert_using_case(source_token, transform)
        # deal with verb
        if transform.startswith("$TRANSFORM_VERB"):
            return convert_using_verb(source_token, transform)
        # deal with split
        if transform.startswith("$TRANSFORM_SPLIT"):
            return convert_using_split(source_token, transform)
        # deal with single/plural
        if transform.startswith("$TRANSFORM_AGREEMENT"):
            return convert_using_plural(source_token, transform)
        # raise an exception if no matching transform type was found
        raise Exception(f"Unknown action type {transform}")
    else:
        return source_token


def read_parallel_lines(fn1, fn2):
    lines1 = read_lines(fn1, skip_strip=True)
    lines2 = read_lines(fn2, skip_strip=True)
    assert len(lines1) == len(lines2)
    out_lines1, out_lines2 = [], []
    for line1, line2 in zip(lines1, lines2):
        if not line1.strip() or not line2.strip():
            continue
        else:
            out_lines1.append(line1)
            out_lines2.append(line2)
    return out_lines1, out_lines2


def read_lines(fn, skip_strip=False):
    if not os.path.exists(fn):
        return []
    with open(fn, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    return [s.strip() for s in lines if s.strip() or skip_strip]


def write_lines(fn, lines, mode='w'):
    if mode == 'w' and os.path.exists(fn):
        os.remove(fn)
    with open(fn, encoding='utf-8', mode=mode) as f:
        f.writelines(['%s\n' % s for s in lines])


def decode_verb_form(original):
    return DECODE_VERB_DICT.get(original)


def encode_verb_form(original_word, corrected_word):
    decoding_request = original_word + "_" + corrected_word
    decoding_response = ENCODE_VERB_DICT.get(decoding_request, "").strip()
    if original_word and decoding_response:
        answer = decoding_response
    else:
        answer = None
    return answer


def get_weights_name(transformer_name, lowercase):
    if transformer_name == 'bert' and lowercase:
        return 'bert-base-uncased'
    if transformer_name == 'bert' and not lowercase:
        return 'bert-base-cased'
    if transformer_name == 'bert-large' and not lowercase:
        return 'bert-large-cased'
    if transformer_name == 'distilbert':
        if not lowercase:
            print('Warning! This model was trained only on uncased sentences.')
        return 'distilbert-base-uncased'
    if transformer_name == 'albert':
        if not lowercase:
            print('Warning! This model was trained only on uncased sentences.')
        return 'albert-base-v1'
    if lowercase:
        print('Warning! This model was trained only on cased sentences.')
    if transformer_name == 'roberta':
        return 'roberta-base'
    if transformer_name == 'roberta-large':
        return 'roberta-large'
    if transformer_name == 'gpt2':
        return 'gpt2'
    if transformer_name == 'transformerxl':
        return 'transfo-xl-wt103'
    if transformer_name == 'xlnet':
        return 'xlnet-base-cased'
    if transformer_name == 'xlnet-large':
        return 'xlnet-large-cased'


def remove_double_tokens(sent):
    tokens = sent.split(' ')
    deleted_idx = []
    for i in range(len(tokens) - 1):
        if tokens[i] == tokens[i + 1]:
            deleted_idx.append(i + 1)
    if deleted_idx:
        tokens = [tokens[i] for i in range(len(tokens)) if i not in deleted_idx]
    return ' '.join(tokens)


def normalize(sent):
    sent = remove_double_tokens(sent)
    for fr, to in REPLACEMENTS.items():
        sent = sent.replace(fr, to)
    return sent.lower()
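
To illustrate the edit format that get_target_sent_by_edits consumes, here is a minimal, hand-checked sketch (hypothetical sentence and edits; each edit is a (start, end, label, extra) tuple):

    source = "he go to school yesterday".split()
    edits = [(0, 1, "$TRANSFORM_CASE_CAPITAL", None),  # he -> He
             (1, 2, "$REPLACE_went", None),            # go -> went
             (5, 5, "$APPEND_.", None)]                # insert "." after the last token
    print(" ".join(get_target_sent_by_edits(source, edits)))
    # -> He went to school yesterday .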
utils/prepare_clc_fce_data.py ADDED
@@ -0,0 +1,123 @@
#!/usr/bin/env python
"""
Convert CLC-FCE dataset (The Cambridge Learner Corpus) to the parallel sentences format.
"""

import argparse
import glob
import os
import re
from xml.etree import cElementTree

from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm


def annotate_fce_doc(xml):
    """Takes an FCE xml document and returns its text with errors annotated inline."""
    result = []
    doc = cElementTree.fromstring(xml)
    paragraphs = doc.findall('head/text/*/coded_answer/p')
    for p in paragraphs:
        text = _get_formatted_text(p)
        result.append(text)

    return '\n'.join(result)


def _get_formatted_text(elem, ignore_tags=None):
    text = elem.text or ''
    ignore_tags = [tag.upper() for tag in (ignore_tags or [])]
    correct = None
    mistake = None

    for child in elem.getchildren():
        tag = child.tag.upper()
        if tag == 'NS':
            text += _get_formatted_text(child)

        elif tag == 'UNKNOWN':
            text += ' UNKNOWN '

        elif tag == 'C':
            assert correct is None
            correct = _get_formatted_text(child)

        elif tag == 'I':
            assert mistake is None
            mistake = _get_formatted_text(child)

        elif tag in ignore_tags:
            pass

        else:
            raise ValueError(f"Unknown tag `{child.tag}`", text)

    if correct or mistake:
        correct = correct or ''
        mistake = mistake or ''
        if '=>' not in mistake:
            text += f'{{{mistake}=>{correct}}}'
        else:
            text += mistake

    text += elem.tail or ''
    return text


def convert_fce(fce_dir):
    """Processes the whole FCE directory. Returns a list of annotated documents (strings)."""

    # Ensure we got a valid dataset path
    if not os.path.isdir(fce_dir):
        raise UserWarning(
            f"{fce_dir} is not a valid path")

    dataset_dir = os.path.join(fce_dir, 'dataset')
    if not os.path.exists(dataset_dir):
        raise UserWarning(
            f"{fce_dir} doesn't point to a dataset's root dir")

    # Convert XML docs to the corpora format
    filenames = sorted(glob.glob(os.path.join(dataset_dir, '*/*.xml')))

    docs = []
    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            doc = annotate_fce_doc(f.read())
            docs.append(doc)
    return docs


def main():
    fce = convert_fce(args.fce_dataset_path)
    with open(args.output + "/fce-original.txt", 'w', encoding='utf-8') as out_original, \
            open(args.output + "/fce-applied.txt", 'w', encoding='utf-8') as out_applied:
        for doc in tqdm(fce, unit='doc'):
            sents = re.split(r"\n +\n", doc)
            for sent in sents:
                tokenized_sents = sent_tokenize(sent)
                for i in range(len(tokenized_sents)):
                    if re.search(r"[{>][.?!]$", tokenized_sents[i]):
                        tokenized_sents[i + 1] = tokenized_sents[i] + " " + tokenized_sents[i + 1]
                        tokenized_sents[i] = ""
                    regexp = r'{([^{}]*?)=>([^{}]*?)}'
                    original = re.sub(regexp, r"\1", tokenized_sents[i])
                    applied = re.sub(regexp, r"\2", tokenized_sents[i])
                    # filter out nested alerts
                    if original != "" and applied != "" and not re.search(r"[{}=]", original) \
                            and not re.search(r"[{}=]", applied):
                        out_original.write(" ".join(word_tokenize(original)) + "\n")
                        out_applied.write(" ".join(word_tokenize(applied)) + "\n")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=(
        "Convert CLC-FCE dataset to the parallel sentences format."))
    parser.add_argument('fce_dataset_path',
                        help='Path to the folder with the FCE dataset')
    parser.add_argument('--output',
                        help='Path to the output folder')
    args = parser.parse_args()

    main()
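
For orientation, annotate_fce_doc marks each error span inline as {mistake=>correction}, and main() then splits that annotation into the two parallel files. A made-up illustration:

    # Hypothetical annotated sentence produced by annotate_fce_doc:
    #   I {goed=>went} to the market .
    # main() applies re.sub(r'{([^{}]*?)=>([^{}]*?)}', ...) twice, keeping group 1
    # for the source side and group 2 for the corrected side:
    #   fce-original.txt : I goed to the market .
    #   fce-applied.txt  : I went to the market .

One portability note: _get_formatted_text iterates with elem.getchildren(), which was removed in Python 3.9, so on newer interpreters the loop needs to iterate over the element directly (list(elem)).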
utils/preprocess_data.py ADDED
@@ -0,0 +1,488 @@
import argparse
import os
from difflib import SequenceMatcher

import Levenshtein
import numpy as np
from tqdm import tqdm

from helpers import write_lines, read_parallel_lines, encode_verb_form, \
    apply_reverse_transformation, SEQ_DELIMETERS, START_TOKEN


def perfect_align(t, T, insertions_allowed=0,
                  cost_function=Levenshtein.distance):
    # dp[i, j, k] is a minimal cost of matching first `i` tokens of `t` with
    # first `j` tokens of `T`, after making `k` insertions after last match of
    # token from `t`. In other words t[:i] aligned with T[:j].

    # Initialize with INFINITY (unknown)
    shape = (len(t) + 1, len(T) + 1, insertions_allowed + 1)
    dp = np.ones(shape, dtype=int) * int(1e9)
    come_from = np.ones(shape, dtype=int) * int(1e9)
    come_from_ins = np.ones(shape, dtype=int) * int(1e9)

    dp[0, 0, 0] = 0  # The only known starting point. Nothing matched to nothing.
    for i in range(len(t) + 1):  # Go inclusive
        for j in range(len(T) + 1):  # Go inclusive
            for q in range(insertions_allowed + 1):  # Go inclusive
                if i < len(t):
                    # Given matched sequence of t[:i] and T[:j], match token
                    # t[i] with following tokens T[j:k].
                    for k in range(j, len(T) + 1):
                        transform = \
                            apply_transformation(t[i], ' '.join(T[j:k]))
                        if transform:
                            cost = 0
                        else:
                            cost = cost_function(t[i], ' '.join(T[j:k]))
                        current = dp[i, j, q] + cost
                        if dp[i + 1, k, 0] > current:
                            dp[i + 1, k, 0] = current
                            come_from[i + 1, k, 0] = j
                            come_from_ins[i + 1, k, 0] = q
                if q < insertions_allowed:
                    # Given matched sequence of t[:i] and T[:j], create
                    # insertion with following tokens T[j:k].
                    for k in range(j, len(T) + 1):
                        cost = len(' '.join(T[j:k]))
                        current = dp[i, j, q] + cost
                        if dp[i, k, q + 1] > current:
                            dp[i, k, q + 1] = current
                            come_from[i, k, q + 1] = j
                            come_from_ins[i, k, q + 1] = q

    # Solution is in the dp[len(t), len(T), *]. Backtracking from there.
    alignment = []
    i = len(t)
    j = len(T)
    q = dp[i, j, :].argmin()
    while i > 0 or q > 0:
        is_insert = (come_from_ins[i, j, q] != q) and (q != 0)
        j, k, q = come_from[i, j, q], j, come_from_ins[i, j, q]
        if not is_insert:
            i -= 1

        if is_insert:
            alignment.append(['INSERT', T[j:k], (i, i)])
        else:
            alignment.append([f'REPLACE_{t[i]}', T[j:k], (i, i + 1)])

    assert j == 0

    return dp[len(t), len(T)].min(), list(reversed(alignment))

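# Illustrative note (hypothetical inputs): with insertions_allowed=0,
#   perfect_align(["a", "cat"], ["a", "cat", "sat"])
# pairs each source token with a (possibly multi-word) span of the target and
# returns (4, [['REPLACE_a', ['a'], (0, 1)], ['REPLACE_cat', ['cat', 'sat'], (1, 2)]]),
# the 4 being the Levenshtein cost of "cat" -> "cat sat"; the extra word of a
# multi-word span is later emitted as an $APPEND_ edit by convert_alignments_into_edits.
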
def _split(token):
    if not token:
        return []
    parts = token.split()
    return parts or [token]


def apply_merge_transformation(source_tokens, target_words, shift_idx):
    edits = []
    if len(source_tokens) > 1 and len(target_words) == 1:
        # check merge
        transform = check_merge(source_tokens, target_words)
        if transform:
            for i in range(len(source_tokens) - 1):
                edits.append([(shift_idx + i, shift_idx + i + 1), transform])
            return edits

    if len(source_tokens) == len(target_words) == 2:
        # check swap
        transform = check_swap(source_tokens, target_words)
        if transform:
            edits.append([(shift_idx, shift_idx + 1), transform])
    return edits


def is_sent_ok(sent, delimeters=SEQ_DELIMETERS):
    for del_val in delimeters.values():
        if del_val in sent and del_val != delimeters["tokens"]:
            return False
    return True


def check_casetype(source_token, target_token):
    if source_token.lower() != target_token.lower():
        return None
    if source_token.lower() == target_token:
        return "$TRANSFORM_CASE_LOWER"
    elif source_token.capitalize() == target_token:
        return "$TRANSFORM_CASE_CAPITAL"
    elif source_token.upper() == target_token:
        return "$TRANSFORM_CASE_UPPER"
    elif source_token[1:].capitalize() == target_token[1:] and source_token[0] == target_token[0]:
        return "$TRANSFORM_CASE_CAPITAL_1"
    elif source_token[:-1].upper() == target_token[:-1] and source_token[-1] == target_token[-1]:
        return "$TRANSFORM_CASE_UPPER_-1"
    else:
        return None


def check_equal(source_token, target_token):
    if source_token == target_token:
        return "$KEEP"
    else:
        return None


def check_split(source_token, target_tokens):
    if source_token.split("-") == target_tokens:
        return "$TRANSFORM_SPLIT_HYPHEN"
    else:
        return None


def check_merge(source_tokens, target_tokens):
    if "".join(source_tokens) == "".join(target_tokens):
        return "$MERGE_SPACE"
    elif "-".join(source_tokens) == "-".join(target_tokens):
        return "$MERGE_HYPHEN"
    else:
        return None


def check_swap(source_tokens, target_tokens):
    if source_tokens == [x for x in reversed(target_tokens)]:
        return "$MERGE_SWAP"
    else:
        return None


def check_plural(source_token, target_token):
    if source_token.endswith("s") and source_token[:-1] == target_token:
        return "$TRANSFORM_AGREEMENT_SINGULAR"
    elif target_token.endswith("s") and source_token == target_token[:-1]:
        return "$TRANSFORM_AGREEMENT_PLURAL"
    else:
        return None


def check_verb(source_token, target_token):
    encoding = encode_verb_form(source_token, target_token)
    if encoding:
        return f"$TRANSFORM_VERB_{encoding}"
    else:
        return None


def apply_transformation(source_token, target_token):
    target_tokens = target_token.split()
    if len(target_tokens) > 1:
        # check split
        transform = check_split(source_token, target_tokens)
        if transform:
            return transform
    checks = [check_equal, check_casetype, check_verb, check_plural]
    for check in checks:
        transform = check(source_token, target_token)
        if transform:
            return transform
    return None


def align_sequences(source_sent, target_sent):
    # check if sent is OK
    if not is_sent_ok(source_sent) or not is_sent_ok(target_sent):
        return None
    source_tokens = source_sent.split()
    target_tokens = target_sent.split()
    matcher = SequenceMatcher(None, source_tokens, target_tokens)
    diffs = list(matcher.get_opcodes())
    all_edits = []
    for diff in diffs:
        tag, i1, i2, j1, j2 = diff
        source_part = _split(" ".join(source_tokens[i1:i2]))
        target_part = _split(" ".join(target_tokens[j1:j2]))
        if tag == 'equal':
            continue
        elif tag == 'delete':
            # delete all words separately
            for j in range(i2 - i1):
                edit = [(i1 + j, i1 + j + 1), '$DELETE']
                all_edits.append(edit)
        elif tag == 'insert':
            # append to the previous word
            for target_token in target_part:
                edit = ((i1 - 1, i1), f"$APPEND_{target_token}")
                all_edits.append(edit)
        else:
            # check merge first of all
            edits = apply_merge_transformation(source_part, target_part,
                                               shift_idx=i1)
            if edits:
                all_edits.extend(edits)
                continue

            # normalize alignments if needed (make them singletons)
            _, alignments = perfect_align(source_part, target_part,
                                          insertions_allowed=0)
            for alignment in alignments:
                new_shift = alignment[2][0]
                edits = convert_alignments_into_edits(alignment,
                                                      shift_idx=i1 + new_shift)
                all_edits.extend(edits)

    # get labels
    labels = convert_edits_into_labels(source_tokens, all_edits)
    # match tags to source tokens
    sent_with_tags = add_labels_to_the_tokens(source_tokens, labels)
    return sent_with_tags


def convert_edits_into_labels(source_tokens, all_edits):
    # make sure that edits are flat
    flat_edits = []
    for edit in all_edits:
        (start, end), edit_operations = edit
        if isinstance(edit_operations, list):
            for operation in edit_operations:
                new_edit = [(start, end), operation]
                flat_edits.append(new_edit)
        elif isinstance(edit_operations, str):
            flat_edits.append(edit)
        else:
            raise Exception("Unknown operation type")
    all_edits = flat_edits[:]
    labels = []
    total_labels = len(source_tokens) + 1
    if not all_edits:
        labels = [["$KEEP"] for x in range(total_labels)]
    else:
        for i in range(total_labels):
            edit_operations = [x[1] for x in all_edits if x[0][0] == i - 1
                               and x[0][1] == i]
            if not edit_operations:
                labels.append(["$KEEP"])
            else:
                labels.append(edit_operations)
    return labels


def convert_alignments_into_edits(alignment, shift_idx):
    edits = []
    action, target_tokens, new_idx = alignment
    source_token = action.replace("REPLACE_", "")

    # check if delete
    if not target_tokens:
        edit = [(shift_idx, 1 + shift_idx), "$DELETE"]
        return [edit]

    # check splits
    for i in range(1, len(target_tokens)):
        target_token = " ".join(target_tokens[:i + 1])
        transform = apply_transformation(source_token, target_token)
        if transform:
            edit = [(shift_idx, shift_idx + 1), transform]
            edits.append(edit)
            target_tokens = target_tokens[i + 1:]
            for target in target_tokens:
                edits.append([(shift_idx, shift_idx + 1), f"$APPEND_{target}"])
            return edits

    transform_costs = []
    transforms = []
    for target_token in target_tokens:
        transform = apply_transformation(source_token, target_token)
        if transform:
            cost = 0
            transforms.append(transform)
        else:
            cost = Levenshtein.distance(source_token, target_token)
            transforms.append(None)
        transform_costs.append(cost)
    min_cost_idx = transform_costs.index(min(transform_costs))
    # append to the previous word
    for i in range(0, min_cost_idx):
        target = target_tokens[i]
        edit = [(shift_idx - 1, shift_idx), f"$APPEND_{target}"]
        edits.append(edit)
    # replace/transform target word
    transform = transforms[min_cost_idx]
    target = transform if transform is not None \
        else f"$REPLACE_{target_tokens[min_cost_idx]}"
    edit = [(shift_idx, 1 + shift_idx), target]
    edits.append(edit)
    # append to this word
    for i in range(min_cost_idx + 1, len(target_tokens)):
        target = target_tokens[i]
        edit = [(shift_idx, 1 + shift_idx), f"$APPEND_{target}"]
        edits.append(edit)
    return edits


def add_labels_to_the_tokens(source_tokens, labels, delimeters=SEQ_DELIMETERS):
    tokens_with_all_tags = []
    source_tokens_with_start = [START_TOKEN] + source_tokens
    for token, label_list in zip(source_tokens_with_start, labels):
        all_tags = delimeters['operations'].join(label_list)
        comb_record = token + delimeters['labels'] + all_tags
        tokens_with_all_tags.append(comb_record)
    return delimeters['tokens'].join(tokens_with_all_tags)


def convert_data_from_raw_files(source_file, target_file, output_file, chunk_size):
    tagged = []
    source_data, target_data = read_parallel_lines(source_file, target_file)
    print(f"The size of raw dataset is {len(source_data)}")
    cnt_total, cnt_all, cnt_tp = 0, 0, 0
    for source_sent, target_sent in tqdm(zip(source_data, target_data)):
        try:
            aligned_sent = align_sequences(source_sent, target_sent)
        except Exception:
            aligned_sent = align_sequences(source_sent, target_sent)
        if source_sent != target_sent:
            cnt_tp += 1
        alignments = [aligned_sent]
        cnt_all += len(alignments)
        try:
            check_sent = convert_tagged_line(aligned_sent)
        except Exception:
            # debug mode
            aligned_sent = align_sequences(source_sent, target_sent)
            check_sent = convert_tagged_line(aligned_sent)

        if "".join(check_sent.split()) != "".join(
                target_sent.split()):
            # do it again for debugging
            aligned_sent = align_sequences(source_sent, target_sent)
            check_sent = convert_tagged_line(aligned_sent)
            print(f"Incorrect pair: \n{target_sent}\n{check_sent}")
            continue
        if alignments:
            cnt_total += len(alignments)
            tagged.extend(alignments)
        if len(tagged) > chunk_size:
            write_lines(output_file, tagged, mode='a')
            tagged = []

    print(f"Overall extracted {cnt_total}. "
          f"Original TP {cnt_tp}."
          f" Original TN {cnt_all - cnt_tp}")
    if tagged:
        write_lines(output_file, tagged, 'a')


def convert_labels_into_edits(labels):
    all_edits = []
    for i, label_list in enumerate(labels):
        if label_list == ["$KEEP"]:
            continue
        else:
            edit = [(i - 1, i), label_list]
            all_edits.append(edit)
    return all_edits


def get_target_sent_by_levels(source_tokens, labels):
    relevant_edits = convert_labels_into_edits(labels)
    target_tokens = source_tokens[:]
    leveled_target_tokens = {}
    if not relevant_edits:
        target_sentence = " ".join(target_tokens)
        return leveled_target_tokens, target_sentence
    max_level = max([len(x[1]) for x in relevant_edits])
    for level in range(max_level):
        rest_edits = []
        shift_idx = 0
        for edits in relevant_edits:
            (start, end), label_list = edits
            label = label_list[0]
            target_pos = start + shift_idx
            source_token = target_tokens[target_pos] if target_pos >= 0 else START_TOKEN
            if label == "$DELETE":
                del target_tokens[target_pos]
                shift_idx -= 1
            elif label.startswith("$APPEND_"):
                word = label.replace("$APPEND_", "")
                target_tokens[target_pos + 1: target_pos + 1] = [word]
                shift_idx += 1
            elif label.startswith("$REPLACE_"):
                word = label.replace("$REPLACE_", "")
                target_tokens[target_pos] = word
            elif label.startswith("$TRANSFORM"):
                word = apply_reverse_transformation(source_token, label)
                if word is None:
                    word = source_token
                target_tokens[target_pos] = word
            elif label.startswith("$MERGE_"):
                # apply merge only on last stage
                if level == (max_level - 1):
                    target_tokens[target_pos + 1: target_pos + 1] = [label]
                    shift_idx += 1
                else:
                    rest_edit = [(start + shift_idx, end + shift_idx), [label]]
                    rest_edits.append(rest_edit)
            rest_labels = label_list[1:]
            if rest_labels:
                rest_edit = [(start + shift_idx, end + shift_idx), rest_labels]
                rest_edits.append(rest_edit)

        leveled_tokens = target_tokens[:]
        # update next step
        relevant_edits = rest_edits[:]
        if level == (max_level - 1):
            leveled_tokens = replace_merge_transforms(leveled_tokens)
        leveled_labels = convert_edits_into_labels(leveled_tokens,
                                                   relevant_edits)
        leveled_target_tokens[level + 1] = {"tokens": leveled_tokens,
                                            "labels": leveled_labels}

    target_sentence = " ".join(leveled_target_tokens[max_level]["tokens"])
    return leveled_target_tokens, target_sentence


def replace_merge_transforms(tokens):
    if all(not x.startswith("$MERGE_") for x in tokens):
        return tokens
    target_tokens = tokens[:]
    allowed_range = (1, len(tokens) - 1)
    for i in range(len(tokens)):
        target_token = tokens[i]
        if target_token.startswith("$MERGE"):
            if target_token.startswith("$MERGE_SWAP") and i in allowed_range:
                target_tokens[i - 1] = tokens[i + 1]
                target_tokens[i + 1] = tokens[i - 1]
                target_tokens[i: i + 1] = []
    target_line = " ".join(target_tokens)
    target_line = target_line.replace(" $MERGE_HYPHEN ", "-")
    target_line = target_line.replace(" $MERGE_SPACE ", "")
    return target_line.split()


def convert_tagged_line(line, delimeters=SEQ_DELIMETERS):
    label_del = delimeters['labels']
    source_tokens = [x.split(label_del)[0]
                     for x in line.split(delimeters['tokens'])][1:]
    labels = [x.split(label_del)[1].split(delimeters['operations'])
              for x in line.split(delimeters['tokens'])]
    assert len(source_tokens) + 1 == len(labels)
    levels_dict, target_line = get_target_sent_by_levels(source_tokens, labels)
    return target_line


def main(args):
    convert_data_from_raw_files(args.source, args.target, args.output_file, args.chunk_size)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--source',
                        help='Path to the source file',
                        required=True)
    parser.add_argument('-t', '--target',
                        help='Path to the target file',
                        required=True)
    parser.add_argument('-o', '--output_file',
                        help='Path to the output file',
                        required=True)
    parser.add_argument('--chunk_size',
                        type=int,
                        help='Dump each chunk size.',
                        default=1000000)
    args = parser.parse_args()
    main(args)
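
To make the produced format concrete, here is a small hand-traced sketch of align_sequences on a hypothetical sentence pair (it assumes data/verb-form-vocab.txt contains the go/goes entry for the VB_VBZ form; without it the verb label falls back to $REPLACE_goes):

    tagged = align_sequences("he go to school", "He goes to school")
    # Each source token, prefixed by the $START token, is glued to its labels with
    # the SEQ_DELIMETERS separators ("SEPL|||SEPR" between token and labels):
    #   $START -> $KEEP
    #   he     -> $TRANSFORM_CASE_CAPITAL
    #   go     -> $TRANSFORM_VERB_VB_VBZ
    #   to     -> $KEEP
    #   school -> $KEEP
    # convert_tagged_line(tagged) rebuilds "He goes to school", which is the
    # round-trip check performed in convert_data_from_raw_files.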