Upload prepare_data.py
Browse files- scripts/prepare_data.py +186 -0
scripts/prepare_data.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
import os
|
| 3 |
+
import glob
|
| 4 |
+
import re
|
| 5 |
+
import random
|
| 6 |
+
import logging
|
| 7 |
+
import argparse
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from lxml import etree
|
| 10 |
+
import numpy as np
|
| 11 |
+
|
| 12 |
+
# --- Logging configuration ---
# Emit INFO-and-above records through a stream handler; every record is
# prefixed with a timestamp and its level name.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()],
    level=logging.INFO,
)
|
| 18 |
+
|
| 19 |
+
def extract_text_from_paragraph_xml(paragraph_element):
    """Return the text of a <paragraph> element, leaving out <annotation> content.

    Args:
        paragraph_element: Object exposing an lxml-style ``xpath()`` method
            (the caller in this file passes ``<paragraph>`` elements).

    Returns:
        The descendant text nodes that have no ``annotation`` ancestor,
        concatenated, with every run of whitespace collapsed to one space and
        both ends stripped. If extraction raises, a warning is logged and an
        empty string is returned.
    """
    try:
        # Select every descendant text node that is not inside <annotation>.
        fragments = paragraph_element.xpath(
            "descendant::text()[not(ancestor::annotation)]"
        )
        # Normalize whitespace so each paragraph ends up on a single line.
        return re.sub(r'\s+', ' ', ''.join(fragments)).strip()
    except Exception as e:
        # Best effort: a failing paragraph is reported and yields "", it does
        # not abort the run.
        logging.warning(f"텍스트 추출 중 오류 발생 (extract_text_from_paragraph_xml): {e}")
        return ""
|
| 33 |
+
|
| 34 |
+
def check_xml_structure(paragraph_element):
    """Tell whether the element sits directly under ``level5 > text > content``.

    Args:
        paragraph_element: Object exposing an lxml-style ``getparent()``
            method; each ancestor is expected to expose ``tag``.

    Returns:
        True only when the parent's tag is ``content``, the grandparent's is
        ``text`` and the great-grandparent's is ``level5``. False when any
        ancestor is missing, has a different tag, or an ``AttributeError`` is
        raised while walking up.
    """
    try:
        node = paragraph_element
        # Walk upwards one level per expected tag, nearest ancestor first.
        for expected_tag in ('content', 'text', 'level5'):
            node = node.getparent()
            if node is None or node.tag != expected_tag:
                return False
        return True
    except AttributeError:
        return False
|
| 49 |
+
|
| 50 |
+
def save_text_data_to_file(filepath, data_list, description="Saving file"):
    """Write every string in ``data_list`` to ``filepath``, one per line.

    Args:
        filepath: Destination path. It is opened in write mode as UTF-8, so an
            existing file is overwritten.
        data_list: Sized collection of strings; a newline is appended to each.
        description: Label shown on the tqdm progress bar.

    Returns:
        None. Errors are not propagated: any exception raised while writing is
        logged at ERROR level and the function returns normally.
    """
    try:
        with open(filepath, 'w', encoding='utf-8') as out_file:
            for line in tqdm(data_list, desc=description):
                out_file.write(line + '\n')
        logging.info(f" 총 {len(data_list):,} 줄을 {filepath}에 저장했습니다.")
    except Exception as e:
        # Deliberately best effort: report the failure and let the caller go on.
        logging.error(f" 파일 저장 중 오류 발생 {filepath}: {e}")
|
| 61 |
+
|
| 62 |
+
def calculate_text_stats(data_list):
    """Summarize the lengths of the items in ``data_list``.

    Args:
        data_list: Sequence of strings (any items supporting ``len()`` work).

    Returns:
        A dict with the keys ``count``, ``min_len``, ``max_len``, ``avg_len``
        and ``median_len``. A falsy (empty) input yields zeros for every key;
        otherwise ``count`` is an int and the other values are whatever
        ``np.min`` / ``np.max`` / ``np.mean`` / ``np.median`` return.
    """
    if not data_list:
        return {"count": 0, "min_len": 0, "max_len": 0, "avg_len": 0.0, "median_len": 0.0}

    # Non-empty from here on, so the NumPy reductions need no further guard.
    lengths = [len(s) for s in data_list]
    return {
        "count": len(lengths),
        "min_len": np.min(lengths),
        "max_len": np.max(lengths),
        "avg_len": np.mean(lengths),
        "median_len": np.median(lengths),
    }
|
| 76 |
+
|
| 77 |
+
def prepare_sillok_data(xml_dir, output_dir, min_len, markers_pattern, val_ratio, test_ratio, seed_val):
|
| 78 |
+
"""
|
| 79 |
+
๋ฉ์ธ ๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ ํจ์.
|
| 80 |
+
XML ๋๋ ํ ๋ฆฌ์์ ๋ฐ์ดํฐ๋ฅผ ์ฝ์ด ์ ์ ํ๊ณ , ํ์ต/๊ฒ์ฆ/ํ
์คํธ ์ฉ์ผ๋ก ๋ถํ ํ์ฌ ์ ์ฅํฉ๋๋ค.
|
| 81 |
+
"""
|
| 82 |
+
logging.info("--- ๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ ์์ ---")
|
| 83 |
+
logging.info(f"1. XML ์๋ณธ ๊ฒฝ๋ก: {xml_dir}")
|
| 84 |
+
logging.info(f"2. ์ ์ฒ๋ฆฌ ๊ฒฐ๊ณผ๋ฌผ ์ ์ฅ ๊ฒฝ๋ก: {output_dir}")
|
| 85 |
+
logging.info(f"3. ์ต์ ๋ฌธ๋จ ๊ธธ์ด ํํฐ: {min_len}")
|
| 86 |
+
logging.info(f"4. ์ ๊ฑฐํ ๋ฌธ๋จ ์์ ๊ธฐํธ (์ ๊ท์): '{markers_pattern}'")
|
| 87 |
+
logging.info(f"5. ๋ถํ ๋น์จ (ํ์ต/๊ฒ์ฆ/ํ
์คํธ): {1 - val_ratio - test_ratio:.2f}/{val_ratio:.2f}/{test_ratio:.2f}")
|
| 88 |
+
logging.info(f"6. ๋ถํ ์ ์ฌ์ฉํ ๋๋ค ์๋: {seed_val}")
|
| 89 |
+
|
| 90 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 91 |
+
|
| 92 |
+
# ์ง์ ๋ ๋๋ ํ ๋ฆฌ ํ์์ ๋ชจ๋ .xml ํ์ผ ๊ฒ์
|
| 93 |
+
xml_files = glob.glob(os.path.join(xml_dir, '**', '*.xml'), recursive=True)
|
| 94 |
+
if not xml_files:
|
| 95 |
+
logging.error(f"์ค๋ฅ: {xml_dir} ๊ฒฝ๋ก์์ XML ํ์ผ์ ์ฐพ์ ์ ์์ต๋๋ค.")
|
| 96 |
+
return
|
| 97 |
+
logging.info(f"\n์ด {len(xml_files)}๊ฐ์ XML ํ์ผ์ ๋ฐ๊ฒฌํ์ต๋๋ค.")
|
| 98 |
+
|
| 99 |
+
all_valid_paragraphs = []
|
| 100 |
+
|
| 101 |
+
logging.info("\nXML ํ์ผ์ ์ฒ๋ฆฌํ์ฌ ๋ฌธ๋จ์ ์ถ์ถํ๊ณ ํํฐ๋งํฉ๋๋ค...")
|
| 102 |
+
for xml_file_path in tqdm(xml_files, desc="XML ํ์ผ ์ฒ๋ฆฌ ์ค"):
|
| 103 |
+
try:
|
| 104 |
+
tree = etree.parse(xml_file_path)
|
| 105 |
+
root = tree.getroot()
|
| 106 |
+
paragraphs = root.xpath('//level5//paragraph')
|
| 107 |
+
|
| 108 |
+
for para_element in paragraphs:
|
| 109 |
+
if not check_xml_structure(para_element):
|
| 110 |
+
continue
|
| 111 |
+
|
| 112 |
+
text = extract_text_from_paragraph_xml(para_element)
|
| 113 |
+
if not text or len(text) < min_len:
|
| 114 |
+
continue
|
| 115 |
+
|
| 116 |
+
processed_text = re.sub(markers_pattern, '', text).strip()
|
| 117 |
+
|
| 118 |
+
if processed_text:
|
| 119 |
+
all_valid_paragraphs.append(processed_text)
|
| 120 |
+
|
| 121 |
+
except Exception as e:
|
| 122 |
+
logging.warning(f"\nํ์ผ ์ฒ๋ฆฌ ์ค ์์์น ๋ชปํ ์ค๋ฅ ๋ฐ์ ({xml_file_path}): {e}")
|
| 123 |
+
continue
|
| 124 |
+
|
| 125 |
+
logging.info(f"\nXML ์ฒ๋ฆฌ ์๋ฃ. ์ด {len(all_valid_paragraphs):,}๊ฐ์ ์ ํจํ ๋ฌธ๋จ์ ์ถ์ถํ์ต๋๋ค.")
|
| 126 |
+
|
| 127 |
+
if not all_valid_paragraphs:
|
| 128 |
+
logging.error("์ ํจํ ๋ฌธ๋จ์ด ์์ด ์ฒ๋ฆฌ๋ฅผ ์ค๋จํฉ๋๋ค.")
|
| 129 |
+
return
|
| 130 |
+
|
| 131 |
+
logging.info("\n๋ฐ์ดํฐ๋ฅผ ํ์ต, ๊ฒ์ฆ, ํ
์คํธ ์ธํธ๋ก ๋ถํ ํฉ๋๋ค...")
|
| 132 |
+
random.seed(seed_val)
|
| 133 |
+
random.shuffle(all_valid_paragraphs)
|
| 134 |
+
|
| 135 |
+
total_count = len(all_valid_paragraphs)
|
| 136 |
+
test_idx = int(total_count * test_ratio)
|
| 137 |
+
valid_idx = test_idx + int(total_count * val_ratio)
|
| 138 |
+
|
| 139 |
+
test_data = all_valid_paragraphs[:test_idx]
|
| 140 |
+
valid_data = all_valid_paragraphs[test_idx:valid_idx]
|
| 141 |
+
train_data = all_valid_paragraphs[valid_idx:]
|
| 142 |
+
|
| 143 |
+
logging.info("\n--- ๋ฐ์ดํฐ ๋ถํ ๊ฒฐ๊ณผ ๋ฐ ํต๊ณ (๊ธ์ ์ ๊ธฐ์ค) ---")
|
| 144 |
+
datasets_for_stats = {"ํ์ต": train_data, "๊ฒ์ฆ": valid_data, "ํ
์คํธ": test_data}
|
| 145 |
+
for name, data in datasets_for_stats.items():
|
| 146 |
+
stats = calculate_text_stats(data)
|
| 147 |
+
percentage_of_total = (stats['count'] / total_count if total_count > 0 else 0.0)
|
| 148 |
+
log_msg = (f" - {name} ๋ฐ์ดํฐ: {stats['count']:,} ๋ฌธ๋จ ({percentage_of_total:.1%}) | "
|
| 149 |
+
f"Min: {stats['min_len']}, Max: {stats['max_len']}, "
|
| 150 |
+
f"Avg: {stats['avg_len']:.1f}, Median: {stats['median_len']:.1f}")
|
| 151 |
+
logging.info(log_msg)
|
| 152 |
+
|
| 153 |
+
logging.info("\n๋ถํ ๋ ๋ฐ์ดํฐ์
์ ํ
์คํธ ํ์ผ๋ก ์ ์ฅํฉ๋๋ค...")
|
| 154 |
+
train_filepath = os.path.join(output_dir, "train.txt")
|
| 155 |
+
valid_filepath = os.path.join(output_dir, "validation.txt")
|
| 156 |
+
test_filepath = os.path.join(output_dir, "test.txt")
|
| 157 |
+
|
| 158 |
+
save_text_data_to_file(train_filepath, train_data, "train.txt ์ ์ฅ ์ค")
|
| 159 |
+
save_text_data_to_file(valid_filepath, valid_data, "validation.txt ์ ์ฅ ์ค")
|
| 160 |
+
save_text_data_to_file(test_filepath, test_data, "test.txt ์ ์ฅ ์ค")
|
| 161 |
+
|
| 162 |
+
logging.info("\n--- ๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ ์๋ฃ ---")
|
| 163 |
+
logging.info(f"๊ฒฐ๊ณผ๋ฌผ์ด ๋ค์ ๊ฒฝ๋ก์ ์ ์ฅ๋์์ต๋๋ค: {output_dir}")
|
| 164 |
+
|
| 165 |
+
if __name__ == '__main__':
    # Command-line entry point: parse the options and run the preprocessing.
    parser = argparse.ArgumentParser(description="조선왕조실록 XML 데이터를 전처리하여 텍스트 파일로 변환합니다.")
    parser.add_argument("--xml_dir", type=str, required=True, help="XML 파일들이 있는 원본 디렉토리 경로")
    parser.add_argument("--output_dir", type=str, required=True, help="전처리된 텍스트 파일을 저장할 디렉토리 경로")
    parser.add_argument("--min_len", type=int, default=10, help="필터링할 문단의 최소 글자 수")
    parser.add_argument("--val_ratio", type=float, default=0.05, help="검증 데이터셋의 비율")
    parser.add_argument("--test_ratio", type=float, default=0.05, help="테스트 데이터셋의 비율")
    parser.add_argument("--seed", type=int, default=85, help="데이터 분할 시 사용할 랜덤 시드값")

    args = parser.parse_args()

    # Leading marker symbols removed from the start of each paragraph.
    # NOTE(review): the character class below is copied exactly as it appears
    # in this copy of the file, where the marker symbols look mis-decoded
    # (bytes were lost, so they cannot be reconstructed from here). Restore
    # the intended symbols from the upstream UTF-8 source before relying on it.
    LEADING_MARKERS_PATTERN = r"^[โโฒโโโโโทโถโฝโผโฃโ โกโชโซโโจ]+"

    prepare_sillok_data(
        xml_dir=args.xml_dir,
        output_dir=args.output_dir,
        min_len=args.min_len,
        markers_pattern=LEADING_MARKERS_PATTERN,
        val_ratio=args.val_ratio,
        test_ratio=args.test_ratio,
        seed_val=args.seed,
    )
|