| |
| """ |
| Convert CLC-FCE dataset (The Cambridge Learner Corpus) to the parallel sentences format. |
| """ |
|
|
import argparse
import glob
import os
import re
from xml.etree import ElementTree

try:
    # xml.etree.cElementTree was removed in Python 3.9; since 3.3 it has been
    # an alias of ElementTree anyway, so fall back to that.
    from xml.etree import cElementTree
except ImportError:
    from xml.etree import ElementTree as cElementTree

from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
|
|
|
|
def annotate_fce_doc(xml):
    """Take an FCE XML document and return its error-annotated text.

    Each paragraph of the coded answers is flattened by
    ``_get_formatted_text``, which renders every error span as
    ``{mistake=>correction}``.

    Args:
        xml: Raw XML of a single FCE document, as a string.

    Returns:
        One string with one annotated paragraph per line.
    """
    doc = ElementTree.fromstring(xml)
    paragraphs = doc.findall('head/text/*/coded_answer/p')
    return '\n'.join(_get_formatted_text(p) for p in paragraphs)
|
|
|
|
| def _get_formatted_text(elem, ignore_tags=None): |
| text = elem.text or '' |
| ignore_tags = [tag.upper() for tag in (ignore_tags or [])] |
| correct = None |
| mistake = None |
|
|
| for child in elem.getchildren(): |
| tag = child.tag.upper() |
| if tag == 'NS': |
| text += _get_formatted_text(child) |
|
|
| elif tag == 'UNKNOWN': |
| text += ' UNKNOWN ' |
|
|
| elif tag == 'C': |
| assert correct is None |
| correct = _get_formatted_text(child) |
|
|
| elif tag == 'I': |
| assert mistake is None |
| mistake = _get_formatted_text(child) |
|
|
| elif tag in ignore_tags: |
| pass |
|
|
| else: |
| raise ValueError(f"Unknown tag `{child.tag}`", text) |
|
|
| if correct or mistake: |
| correct = correct or '' |
| mistake = mistake or '' |
| if '=>' not in mistake: |
| text += f'{{{mistake}=>{correct}}}' |
| else: |
| text += mistake |
|
|
| text += elem.tail or '' |
| return text |
|
|
|
|
def convert_fce(fce_dir):
    """Process the whole FCE directory.

    Walks ``<fce_dir>/dataset/*/*.xml`` in sorted order and annotates each
    document with :func:`annotate_fce_doc`.

    Args:
        fce_dir: Path to the root of the extracted FCE dataset.

    Returns:
        A list of annotated documents (one string per XML file).

    Raises:
        UserWarning: If *fce_dir* is not a directory or lacks a
            ``dataset`` subdirectory.
    """
    if not os.path.isdir(fce_dir):
        raise UserWarning(f"{fce_dir} is not a valid path")

    dataset_dir = os.path.join(fce_dir, 'dataset')
    if not os.path.exists(dataset_dir):
        raise UserWarning(f"{fce_dir} doesn't point to a dataset's root dir")

    pattern = os.path.join(dataset_dir, '*/*.xml')
    annotated_docs = []
    for path in sorted(glob.glob(pattern)):
        with open(path, encoding='utf-8') as xml_file:
            annotated_docs.append(annotate_fce_doc(xml_file.read()))
    return annotated_docs
|
|
|
|
def main():
    """Convert the FCE dataset referenced by the global CLI ``args``.

    Writes two line-aligned files into ``args.output``:
    ``fce-original.txt`` (learner sentences) and ``fce-applied.txt``
    (the same sentences with corrections applied).
    """
    # {mistake=>correction} spans emitted by annotate_fce_doc.
    edit_regexp = re.compile(r'{([^{}]*?)=>([^{}]*?)}')
    # Sentence boundary detected right after a span character, e.g. "...}." —
    # such a split lands inside an annotation and must be undone.
    merge_regexp = re.compile(r"[{>][.?!]$")
    leftover_regexp = re.compile(r"[{}=]")

    fce = convert_fce(args.fce_dataset_path)
    original_path = os.path.join(args.output, "fce-original.txt")
    applied_path = os.path.join(args.output, "fce-applied.txt")
    with open(original_path, 'w', encoding='utf-8') as out_original, \
            open(applied_path, 'w', encoding='utf-8') as out_applied:
        for doc in tqdm(fce, unit='doc'):
            sents = re.split(r"\n +\n", doc)
            for sent in sents:
                tokenized_sents = sent_tokenize(sent)
                for i in range(len(tokenized_sents)):
                    # Re-attach a sentence split inside an annotation span to
                    # the following one.  The bounds check fixes an IndexError
                    # when the pattern matched the last sentence.
                    if (i + 1 < len(tokenized_sents)
                            and merge_regexp.search(tokenized_sents[i])):
                        tokenized_sents[i + 1] = (
                            tokenized_sents[i] + " " + tokenized_sents[i + 1])
                        tokenized_sents[i] = ""
                    original = edit_regexp.sub(r"\1", tokenized_sents[i])
                    applied = edit_regexp.sub(r"\2", tokenized_sents[i])
                    # Drop empty lines and sentences with unbalanced
                    # annotation characters left over after substitution.
                    if (original and applied
                            and not leftover_regexp.search(original)
                            and not leftover_regexp.search(applied)):
                        out_original.write(" ".join(word_tokenize(original)) + "\n")
                        out_applied.write(" ".join(word_tokenize(applied)) + "\n")
|
|
|
|
if __name__ == '__main__':
    # Build the CLI and expose the parsed options as the module-level `args`
    # global that main() reads.
    parser = argparse.ArgumentParser(
        description="Convert CLC-FCE dataset to the parallel sentences format.")
    parser.add_argument('fce_dataset_path',
                        help='Path to the folder with the FCE dataset')
    parser.add_argument('--output',
                        help='Path to the output folder')
    args = parser.parse_args()
    main()
|
|