|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
This script can be used to preprocess the Spoken Wikipedia corpus before running ctc-segmentation.
The input folder consists of subfolders with the following structure:
|
|
    ├── <Name of Wikipedia article>
    │   ├── aligned.swc
    │   ├── audiometa.txt
    │   ├── audio.ogg
    │   ├── info.json
    │   ├── wiki.html
    │   ├── wiki.txt
    │   └── wiki.xml
|
|
|
|
|
|
|
|
## The destination folder will contain enumerated .ogg and .txt files like this:
    ├── audio
    │   ├── 1.ogg
    │   ├── 2.ogg
    │   ...
    └── text
        ├── 1.txt
        ├── 2.txt
        ...
|
|
""" |
|
|
|
|
|
import argparse
import os
import re
import shutil
import subprocess
import tempfile
|
|
|
|
|
# Command-line interface: both folders are mandatory string options.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--input_folder",
    type=str,
    required=True,
    help="Input folder in which each subfolder contains an article",
)
parser.add_argument(
    "--destination_folder",
    type=str,
    required=True,
    help="Destination folder with audio and text subfolder",
)
# Parsed at module level; the functions in this file read the folders from `args`.
args = parser.parse_args()
|
|
|
|
|
|
|
|
# (accented characters, ASCII replacement) pairs. Only lowercase letters are listed,
# and no character appears in more than one group.
_DIACRITIC_GROUPS = (
    ("éèëēêęěė", "e"),
    ("ãâāáäăâàąåạả", "a"),
    ("úūüùưûů", "u"),
    ("ôōóöõòő", "o"),
    ("ćçč", "c"),
    ("ïīíîıì", "i"),
    ("ñńňņ", "n"),
    ("țť", "t"),
    ("łľ", "l"),
    ("żžź", "z"),
    ("ğ", "g"),
    ("ř", "r"),
    ("ý", "y"),
    ("æ", "ae"),
    ("œ", "oe"),
    ("șşšś", "s"),
)

# Translation table for str.translate: code point -> replacement string.
_DIACRITIC_TABLE = {
    ord(char): replacement for chars, replacement in _DIACRITIC_GROUPS for char in chars
}


def replace_diacritics(text):
    """
    Replaces lowercase accented Latin letters with plain ASCII letters.

    The ligatures "æ" and "œ" become "ae" and "oe"; every other listed character becomes a single
    letter. Characters that are not listed (including uppercase accented letters) are left untouched.

    Args:
        text - string to process

    Returns:
        the string with the listed characters replaced
    """
    # One pass over the string instead of one regex substitution per replacement letter.
    return text.translate(_DIACRITIC_TABLE)
|
|
|
|
|
|
|
|
def get_audio(name, n):
    """
    Copies the article's .ogg file to <destination_folder>/audio/<n>.ogg.

    If the article folder has no audio.ogg, numbered parts audio1.ogg, audio2.ogg, ... are looked up
    (at most four, stopping at the first missing number). A single part is copied to audio.ogg inside
    the article folder; several parts are concatenated into audio.ogg with ffmpeg. If there is no part
    at all, or the concatenation fails, nothing is written to the destination folder.

    Files are copied with shutil and ffmpeg is started without a shell, so folder names containing
    quotes, spaces or "$" are handled correctly.

    Args:
        name - name of folder within Spoken Wikipedia
        n - integer that will serve as output file name, e.g. if n=1, file 1.ogg will be created
    """
    article_dir = os.path.join(args.input_folder, name)
    audio_path = os.path.join(article_dir, "audio.ogg")
    if not os.path.exists(audio_path):
        # Some folders hold the recording in several numbered files instead of a single audio.ogg.
        multiple_ogg_files = []
        for i in range(1, 5):
            path = os.path.join(article_dir, "audio" + str(i) + ".ogg")
            if not os.path.exists(path):
                break
            multiple_ogg_files.append(path)
        if not multiple_ogg_files:
            return
        if len(multiple_ogg_files) == 1:
            shutil.copy(multiple_ogg_files[0], audio_path)
        elif not _concat_ogg_files(multiple_ogg_files, audio_path):
            return

    output_dir = os.path.join(args.destination_folder, "audio")
    os.makedirs(output_dir, exist_ok=True)
    shutil.copy(audio_path, os.path.join(output_dir, str(n) + ".ogg"))


def _concat_ogg_files(parts, output_path):
    """
    Concatenates .ogg files into output_path with ffmpeg's concat demuxer (stream copy).

    Args:
        parts - list of paths to the .ogg files, in playback order
        output_path - path of the file to create

    Returns:
        True if ffmpeg succeeded, False otherwise (a partially written output is removed)
    """
    list_file = tempfile.NamedTemporaryFile("w", suffix=".txt", encoding="utf-8", delete=False)
    try:
        with list_file:
            for part in parts:
                # Absolute paths, because the list file lives in the temp directory; a single quote
                # inside a quoted ffmpeg path has to be written as '\''.
                escaped = os.path.abspath(part).replace("'", "'\\''")
                list_file.write("file '" + escaped + "'\n")
        # "-safe 0" lets the concat demuxer accept absolute paths.
        cmd = ["ffmpeg", "-f", "concat", "-safe", "0", "-i", list_file.name, "-c", "copy", output_path]
        print(" ".join(cmd))
        try:
            subprocess.run(cmd, check=True)
        except (OSError, subprocess.CalledProcessError) as err:
            print("ffmpeg failed to create " + output_path + ": " + str(err))
            # Do not leave a truncated audio.ogg behind: a later run would treat it as complete.
            if os.path.exists(output_path):
                os.remove(output_path)
            return False
        return True
    finally:
        os.remove(list_file.name)
|
|
|
|
|
|
|
|
def get_text(name, n):
    """
    Cleans wiki.txt and writes the result to <destination_folder>/text/<n>.txt.

    Every input line is split in front of each "<ref" and each piece is cleaned and written on its own
    output line. Pieces that consist only of markup (images, files, tables, templates, lines starting
    with ":") are skipped. Processing stops at the first level-2 heading named Notes, References,
    External links or See also; nothing from that heading onwards is written.

    Args:
        name - name of folder within Spoken Wikipedia
        n - integer that will serve as output file name, e.g. if n=1, file 1.txt will be created
    """
    input_path = os.path.join(args.input_folder, name, "wiki.txt")
    output_dir = os.path.join(args.destination_folder, "text")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, str(n) + ".txt")

    with open(input_path, "r", encoding="utf-8") as f:
        with open(output_path, "w", encoding="utf-8") as out_text:
            for line in f:
                for idx, s in enumerate(line.strip().split("<ref")):
                    if idx != 0:
                        # split() dropped the separator; put it back so the <ref ...> patterns can match.
                        s = "<ref" + s

                    # Pieces that are nothing but markup are dropped entirely.
                    if s.startswith(("[[Image", "[[File")) and s.endswith("]]"):
                        continue
                    if s.startswith((":", "{|", "|}", "|", "!")):
                        continue
                    if s.startswith("{{") and (s.endswith("}}") or "}}" not in s):
                        continue
                    if s.startswith("{{pp-move"):
                        continue

                    # Inline images, files, external links, formulas, comments, references and tags.
                    s = re.sub(r"\[\[Image\:[^\]]+\]\]", r"", s)
                    s = re.sub(r"\[\[File\:[^\]]+\]\]", r"", s)
                    s = re.sub(r"\[http[^\]]+\]", r"", s)
                    s = re.sub(r"<math>[^<>]+</math>", r"", s)
                    s = re.sub(r"<!\-\-.+\-\->", r"", s)
                    s = re.sub(r"<ref>.+</ref>", r"", s)
                    s = re.sub(r"<ref .+</ref>", r"", s)
                    s = re.sub(r"<ref[^<>]+/>", r"", s)
                    s = re.sub(r"<[^ <>]+>", r"", s)

                    if re.match(r"== *(?:Notes|References|External links|See also) *==", s):
                        # Leaving the function closes both files and discards the rest of the article.
                        return

                    # Templates: keep number and unit of {{convert|...}}, keep the body of {{cquote|...}},
                    # drop every other template, then drop unbalanced braces that are left over.
                    s = re.sub(r"{{convert\|(\d+)\|(\w+)\|[^}]+}}", r"\g<1> \g<2>", s)
                    s = re.sub(r"{{cquote\|", r"", s)
                    s = re.sub(r"{{[^{}]+}}", r"", s)
                    s = s.replace("{{", "").replace("}}", "")
                    # Leftovers of nested language templates such as "lang-ru|..." or "lang|ru|...".
                    # Requiring the template's "|" keeps ordinary words like "language" intact; the
                    # previous pattern r"(lang[^()]+)" deleted text starting at any "lang".
                    s = re.sub(r"\blang(?:-[\w-]+)?\|[^()]*", r"", s)

                    # Heading markers, bold/italic quotes, stray apostrophes and special characters.
                    s = re.sub(r"==+", r"", s)
                    s = re.sub(r"''+", r" ", s)
                    s = re.sub(r" '", r" ", s)
                    s = re.sub(r"' ", r" ", s)
                    s = re.sub(r"[…\*]", r" ", s)
                    # Literal "\uXXXX" escape sequences and HTML entities such as "&nbsp;".
                    s = re.sub(r"\\u....", r" ", s)
                    s = re.sub(r"&[^ ;&]+;", r"", s)

                    s = replace_diacritics(s)

                    # Wiki links: [[target|label]] -> label, [[target]] -> target.
                    s = re.sub(r"\[\[[^\]]+\|([^\]]+)\]\]", r"\g<1>", s)
                    s = re.sub(r"\[\[([^\]]+)\]\]", r"\g<1>", s)

                    out_text.write(s + "\n")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # os.listdir returns entries in arbitrary order; sorting keeps the mapping from article folder
    # to output number the same on every run.
    for n, name in enumerate(sorted(os.listdir(args.input_folder)), start=1):
        # n advances for every entry, so the number of a skipped entry is simply left unused.
        if not os.path.exists(os.path.join(args.input_folder, name, "wiki.txt")):
            print("wiki.txt does not exist in " + name)
            continue
        get_audio(name, n)
        get_text(name, n)
|
|
|