|
|
|
|
|
import os |
|
|
import glob |
|
|
import re |
|
|
import random |
|
|
import logging |
|
|
import argparse |
|
|
from tqdm import tqdm |
|
|
from lxml import etree |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
# Configure the root logger once for the whole script:
# timestamped INFO-level messages emitted through a stream handler (stderr).
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=[logging.StreamHandler()],
)
|
|
|
|
|
def extract_text_from_paragraph_xml(paragraph_element):
    """Extract the plain text of a <paragraph> XML element.

    Text nodes nested inside <annotation> tags are excluded. Runs of
    whitespace are collapsed to single spaces and the result is trimmed.

    Returns the cleaned text, or "" if extraction fails for any reason
    (a warning is logged in that case).
    """
    try:
        # Every descendant text node that is NOT inside an <annotation>.
        fragments = paragraph_element.xpath(
            "descendant::text()[not(ancestor::annotation)]"
        )
        raw = ''.join(fragments)
        # Normalize whitespace: collapse runs, strip the ends.
        return re.sub(r'\s+', ' ', raw).strip()
    except Exception as e:
        logging.warning(f"ํ์คํธ ์ถ์ถ ์ค ์ค๋ฅ ๋ฐ์ (extract_text_from_paragraph_xml): {e}")
        return ""
|
|
|
|
|
def check_xml_structure(paragraph_element):
    """Return True if the <paragraph> lies inside a
    'level5 > text > content' ancestor chain, False otherwise.

    Any missing ancestor, tag mismatch, or AttributeError while walking
    up the tree yields False.
    """
    # Expected ancestor tags, nearest first: parent, grandparent,
    # great-grandparent.
    expected_chain = ('content', 'text', 'level5')
    try:
        node = paragraph_element
        for expected_tag in expected_chain:
            node = node.getparent()
            if node is None or node.tag != expected_tag:
                return False
        return True
    except AttributeError:
        return False
|
|
|
|
|
def save_text_data_to_file(filepath, data_list, description="Saving file"):
    """Write each string in data_list as one line of a UTF-8 text file.

    A tqdm progress bar (labelled with *description*) tracks the writes.
    Errors are logged rather than raised, so a failed save never aborts
    the caller.
    """
    try:
        with open(filepath, 'w', encoding='utf-8') as out:
            for entry in tqdm(data_list, desc=description):
                out.write(entry + '\n')
        logging.info(f" ์ด {len(data_list):,} ์ค์ {filepath}์ ์ ์ฅํ์ต๋๋ค.")
    except Exception as e:
        logging.error(f" ํ์ผ ์ ์ฅ ์ค ์ค๋ฅ ๋ฐ์ {filepath}: {e}")
|
|
|
|
|
def calculate_text_stats(data_list):
    """Compute character-length statistics for a list of strings.

    Args:
        data_list: List of strings to measure.

    Returns:
        Dict with keys "count", "min_len", "max_len", "avg_len",
        "median_len". An empty input yields all-zero statistics.
    """
    if not data_list:
        return {"count": 0, "min_len": 0, "max_len": 0, "avg_len": 0.0, "median_len": 0.0}
    lengths = [len(s) for s in data_list]
    # The empty case already returned above, so the former per-field
    # `if lengths else ...` guards were dead code and are removed.
    # Cast numpy scalars to native Python types so callers get plain data.
    return {
        "count": len(lengths),
        "min_len": int(np.min(lengths)),
        "max_len": int(np.max(lengths)),
        "avg_len": float(np.mean(lengths)),
        "median_len": float(np.median(lengths)),
    }
|
|
|
|
|
def prepare_sillok_data(xml_dir: str, output_dir: str, min_len: int, markers_pattern: str, val_ratio: float, test_ratio: float, seed_val: int) -> None:
    """Main data-preprocessing routine.

    Recursively reads *.xml files under ``xml_dir``, extracts structurally
    valid paragraphs, filters and cleans them, then shuffles and splits the
    result into train/validation/test text files under ``output_dir``.

    Args:
        xml_dir: Root directory containing the source XML files.
        output_dir: Directory where the output text files are written
            (created if missing).
        min_len: Minimum paragraph length in characters (measured before
            marker removal); shorter paragraphs are dropped.
        markers_pattern: Regex matching leading marker symbols to strip.
        val_ratio: Fraction of paragraphs assigned to the validation split.
        test_ratio: Fraction of paragraphs assigned to the test split.
        seed_val: Random seed for the deterministic pre-split shuffle.
    """
    # Echo the run configuration to the log (messages are in Korean).
    logging.info("--- ๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ ์์ ---")
    logging.info(f"1. XML ์๋ณธ ๊ฒฝ๋ก: {xml_dir}")
    logging.info(f"2. ์ ์ฒ๋ฆฌ ๊ฒฐ๊ณผ๋ฌผ ์ ์ฅ ๊ฒฝ๋ก: {output_dir}")
    logging.info(f"3. ์ต์ ๋ฌธ๋จ ๊ธธ์ด ํํฐ: {min_len}")
    logging.info(f"4. ์ ๊ฑฐํ ๋ฌธ๋จ ์์ ๊ธฐํธ (์ ๊ท์): '{markers_pattern}'")
    logging.info(f"5. ๋ถํ ๋น์จ (ํ์ต/๊ฒ์ฆ/ํ์คํธ): {1 - val_ratio - test_ratio:.2f}/{val_ratio:.2f}/{test_ratio:.2f}")
    logging.info(f"6. ๋ถํ ์ ์ฌ์ฉํ ๋๋ค ์๋: {seed_val}")

    os.makedirs(output_dir, exist_ok=True)

    # Find every XML file at any depth under xml_dir.
    xml_files = glob.glob(os.path.join(xml_dir, '**', '*.xml'), recursive=True)
    if not xml_files:
        logging.error(f"์ค๋ฅ: {xml_dir} ๊ฒฝ๋ก์์ XML ํ์ผ์ ์ฐพ์ ์ ์์ต๋๋ค.")
        return
    logging.info(f"\n์ด {len(xml_files)}๊ฐ์ XML ํ์ผ์ ๋ฐ๊ฒฌํ์ต๋๋ค.")

    all_valid_paragraphs = []

    # Extract, validate, and clean paragraphs from each XML file.
    logging.info("\nXML ํ์ผ์ ์ฒ๋ฆฌํ์ฌ ๋ฌธ๋จ์ ์ถ์ถํ๊ณ ํํฐ๋งํฉ๋๋ค...")
    for xml_file_path in tqdm(xml_files, desc="XML ํ์ผ ์ฒ๋ฆฌ ์ค"):
        try:
            tree = etree.parse(xml_file_path)
            root = tree.getroot()
            paragraphs = root.xpath('//level5//paragraph')

            for para_element in paragraphs:
                # Keep only paragraphs inside the expected
                # level5 > text > content hierarchy.
                if not check_xml_structure(para_element):
                    continue

                # Drop empty or too-short paragraphs; length is measured
                # before the leading markers are stripped.
                text = extract_text_from_paragraph_xml(para_element)
                if not text or len(text) < min_len:
                    continue

                # Remove leading marker symbols (bullet/section glyphs).
                processed_text = re.sub(markers_pattern, '', text).strip()

                if processed_text:
                    all_valid_paragraphs.append(processed_text)

        except Exception as e:
            # A single malformed file should not abort the whole run.
            logging.warning(f"\nํ์ผ ์ฒ๋ฆฌ ์ค ์์์น ๋ชปํ ์ค๋ฅ ๋ฐ์ ({xml_file_path}): {e}")
            continue

    logging.info(f"\nXML ์ฒ๋ฆฌ ์๋ฃ. ์ด {len(all_valid_paragraphs):,}๊ฐ์ ์ ํจํ ๋ฌธ๋จ์ ์ถ์ถํ์ต๋๋ค.")

    if not all_valid_paragraphs:
        logging.error("์ ํจํ ๋ฌธ๋จ์ด ์์ด ์ฒ๋ฆฌ๋ฅผ ์ค๋จํฉ๋๋ค.")
        return

    # Deterministic shuffle, then split as [test | validation | train].
    logging.info("\n๋ฐ์ดํฐ๋ฅผ ํ์ต, ๊ฒ์ฆ, ํ์คํธ ์ธํธ๋ก ๋ถํ ํฉ๋๋ค...")
    random.seed(seed_val)
    random.shuffle(all_valid_paragraphs)

    total_count = len(all_valid_paragraphs)
    test_idx = int(total_count * test_ratio)
    valid_idx = test_idx + int(total_count * val_ratio)

    test_data = all_valid_paragraphs[:test_idx]
    valid_data = all_valid_paragraphs[test_idx:valid_idx]
    train_data = all_valid_paragraphs[valid_idx:]

    # Report per-split statistics (character-length based).
    logging.info("\n--- ๋ฐ์ดํฐ ๋ถํ ๊ฒฐ๊ณผ ๋ฐ ํต๊ณ (๊ธ์ ์ ๊ธฐ์ค) ---")
    datasets_for_stats = {"ํ์ต": train_data, "๊ฒ์ฆ": valid_data, "ํ์คํธ": test_data}
    for name, data in datasets_for_stats.items():
        stats = calculate_text_stats(data)
        percentage_of_total = (stats['count'] / total_count if total_count > 0 else 0.0)
        log_msg = (f" - {name} ๋ฐ์ดํฐ: {stats['count']:,} ๋ฌธ๋จ ({percentage_of_total:.1%}) | "
                   f"Min: {stats['min_len']}, Max: {stats['max_len']}, "
                   f"Avg: {stats['avg_len']:.1f}, Median: {stats['median_len']:.1f}")
        logging.info(log_msg)

    # Persist the three splits, one paragraph per line.
    logging.info("\n๋ถํ ๋ ๋ฐ์ดํฐ์ ์ ํ์คํธ ํ์ผ๋ก ์ ์ฅํฉ๋๋ค...")
    train_filepath = os.path.join(output_dir, "train.txt")
    valid_filepath = os.path.join(output_dir, "validation.txt")
    test_filepath = os.path.join(output_dir, "test.txt")

    save_text_data_to_file(train_filepath, train_data, "train.txt ์ ์ฅ ์ค")
    save_text_data_to_file(valid_filepath, valid_data, "validation.txt ์ ์ฅ ์ค")
    save_text_data_to_file(test_filepath, test_data, "test.txt ์ ์ฅ ์ค")

    logging.info("\n--- ๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ ์๋ฃ ---")
    logging.info(f"๊ฒฐ๊ณผ๋ฌผ์ด ๋ค์ ๊ฒฝ๋ก์ ์ ์ฅ๋์์ต๋๋ค: {output_dir}")
|
|
|
|
|
if __name__ == '__main__':
    # CLI entry point: parse arguments, then run the preprocessing pipeline.
    parser = argparse.ArgumentParser(description="์กฐ์ ์์กฐ์ค๋ก XML ๋ฐ์ดํฐ๋ฅผ ์ ์ฒ๋ฆฌํ์ฌ ํ์คํธ ํ์ผ๋ก ๋ณํํฉ๋๋ค.")
    parser.add_argument("--xml_dir", type=str, required=True, help="XML ํ์ผ๋ค์ด ์๋ ์๋ณธ ๋๋ ํ ๋ฆฌ ๊ฒฝ๋ก")
    parser.add_argument("--output_dir", type=str, required=True, help="์ ์ฒ๋ฆฌ๋ ํ์คํธ ํ์ผ์ ์ ์ฅํ ๋๋ ํ ๋ฆฌ ๊ฒฝ๋ก")
    parser.add_argument("--min_len", type=int, default=10, help="ํํฐ๋งํ ๋ฌธ๋จ์ ์ต์ ๊ธ์ ์")
    parser.add_argument("--val_ratio", type=float, default=0.05, help="๊ฒ์ฆ ๋ฐ์ดํฐ์ ์ ๋น์จ")
    parser.add_argument("--test_ratio", type=float, default=0.05, help="ํ์คํธ ๋ฐ์ดํฐ์ ์ ๋น์จ")
    parser.add_argument("--seed", type=int, default=85, help="๋ฐ์ดํฐ ๋ถํ ์ ์ฌ์ฉํ ๋๋ค ์๋๊ฐ")

    args = parser.parse_args()

    # Regex matching one or more marker glyphs at the start of a paragraph;
    # these are decorative bullets/section symbols stripped from the text.
    LEADING_MARKERS_PATTERN = r"^[โโฒโโโโโทโถโฝโผโฃโ โกโชโซโโจ]+"

    prepare_sillok_data(
        xml_dir=args.xml_dir,
        output_dir=args.output_dir,
        min_len=args.min_len,
        markers_pattern=LEADING_MARKERS_PATTERN,
        val_ratio=args.val_ratio,
        test_ratio=args.test_ratio,
        seed_val=args.seed
    )
|
|
|