ddokbaro commited on
Commit
3c309fa
ยท
verified ยท
1 Parent(s): 5521115

Upload prepare_data.py

Browse files
Files changed (1) hide show
  1. scripts/prepare_data.py +186 -0
scripts/prepare_data.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import os
3
+ import glob
4
+ import re
5
+ import random
6
+ import logging
7
+ import argparse
8
+ from tqdm import tqdm
9
+ from lxml import etree
10
+ import numpy as np
11
+
12
# --- Logging setup: INFO-level messages with timestamps, emitted to stderr ---
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()]
)
18
+
19
def extract_text_from_paragraph_xml(paragraph_element):
    """
    Return the plain text of a <paragraph> XML element.

    Text nodes that live under an <annotation> tag are skipped, and all
    runs of whitespace are collapsed to single spaces. On any error an
    empty string is returned (best-effort extraction).
    """
    try:
        # XPath: every descendant text node that has no <annotation> ancestor.
        fragments = paragraph_element.xpath("descendant::text()[not(ancestor::annotation)]")
        joined = "".join(fragments)
        # Normalize whitespace and trim the edges.
        return re.sub(r'\s+', ' ', joined).strip()
    except Exception as e:
        logging.warning(f"ํ…์ŠคํŠธ ์ถ”์ถœ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ (extract_text_from_paragraph_xml): {e}")
        return ""
33
+
34
def check_xml_structure(paragraph_element):
    """
    Return True when the <paragraph> element sits inside a
    level5 > text > content ancestor chain (checked nearest-first),
    False otherwise or on any attribute error.
    """
    try:
        node = paragraph_element
        # Walk upward and require each ancestor tag in order.
        for expected_tag in ('content', 'text', 'level5'):
            node = node.getparent()
            if node is None or node.tag != expected_tag:
                return False
        return True
    except AttributeError:
        return False
49
+
50
def save_text_data_to_file(filepath, data_list, description="Saving file"):
    """
    Write each string in data_list as one line of a UTF-8 text file.

    Progress is shown via tqdm using *description*; failures are logged
    as errors rather than raised.
    """
    try:
        with open(filepath, 'w', encoding='utf-8') as out_file:
            out_file.writelines(f"{line}\n" for line in tqdm(data_list, desc=description))
        logging.info(f" ์ด {len(data_list):,} ์ค„์„ {filepath}์— ์ €์žฅํ–ˆ์Šต๋‹ˆ๋‹ค.")
    except Exception as e:
        logging.error(f" ํŒŒ์ผ ์ €์žฅ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ {filepath}: {e}")
61
+
62
def calculate_text_stats(data_list):
    """
    Compute character-length statistics for a list of strings.

    Parameters:
        data_list: list of strings to measure.

    Returns:
        dict with keys "count", "min_len", "max_len", "avg_len",
        "median_len". An empty input yields all-zero statistics.
    """
    if not data_list:
        return {"count": 0, "min_len": 0, "max_len": 0, "avg_len": 0.0, "median_len": 0.0}
    lengths = [len(s) for s in data_list]
    # `lengths` is guaranteed non-empty here (early return above), so the
    # former per-field `... if lengths else 0` fallbacks were dead code.
    return {
        "count": len(lengths),
        "min_len": np.min(lengths),
        "max_len": np.max(lengths),
        "avg_len": np.mean(lengths),
        "median_len": np.median(lengths),
    }
76
+
77
def prepare_sillok_data(xml_dir, output_dir, min_len, markers_pattern, val_ratio, test_ratio, seed_val):
    """
    Main data-preprocessing entry point.

    Reads every .xml file under xml_dir (recursively), extracts valid
    <paragraph> texts, filters and cleans them, then shuffles and splits
    the corpus into train/validation/test sets written to output_dir as
    train.txt / validation.txt / test.txt.

    Parameters:
        xml_dir: root directory searched recursively for .xml files.
        output_dir: directory for the output text files (created if missing).
        min_len: minimum paragraph length (characters) to keep.
        markers_pattern: regex removed from the start of each paragraph.
        val_ratio, test_ratio: fractions of the corpus for validation/test;
            the remainder becomes training data.
        seed_val: random seed used before shuffling, for reproducible splits.
    """
    logging.info("--- ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ ์‹œ์ž‘ ---")
    logging.info(f"1. XML ์›๋ณธ ๊ฒฝ๋กœ: {xml_dir}")
    logging.info(f"2. ์ „์ฒ˜๋ฆฌ ๊ฒฐ๊ณผ๋ฌผ ์ €์žฅ ๊ฒฝ๋กœ: {output_dir}")
    logging.info(f"3. ์ตœ์†Œ ๋ฌธ๋‹จ ๊ธธ์ด ํ•„ํ„ฐ: {min_len}")
    logging.info(f"4. ์ œ๊ฑฐํ•  ๋ฌธ๋‹จ ์‹œ์ž‘ ๊ธฐํ˜ธ (์ •๊ทœ์‹): '{markers_pattern}'")
    logging.info(f"5. ๋ถ„ํ•  ๋น„์œจ (ํ•™์Šต/๊ฒ€์ฆ/ํ…Œ์ŠคํŠธ): {1 - val_ratio - test_ratio:.2f}/{val_ratio:.2f}/{test_ratio:.2f}")
    logging.info(f"6. ๋ถ„ํ•  ์‹œ ์‚ฌ์šฉํ•  ๋žœ๋ค ์‹œ๋“œ: {seed_val}")

    os.makedirs(output_dir, exist_ok=True)

    # Find every .xml file anywhere under xml_dir.
    xml_files = glob.glob(os.path.join(xml_dir, '**', '*.xml'), recursive=True)
    if not xml_files:
        logging.error(f"์˜ค๋ฅ˜: {xml_dir} ๊ฒฝ๋กœ์—์„œ XML ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
        return
    logging.info(f"\n์ด {len(xml_files)}๊ฐœ์˜ XML ํŒŒ์ผ์„ ๋ฐœ๊ฒฌํ–ˆ์Šต๋‹ˆ๋‹ค.")

    all_valid_paragraphs = []

    logging.info("\nXML ํŒŒ์ผ์„ ์ฒ˜๋ฆฌํ•˜์—ฌ ๋ฌธ๋‹จ์„ ์ถ”์ถœํ•˜๊ณ  ํ•„ํ„ฐ๋งํ•ฉ๋‹ˆ๋‹ค...")
    for xml_file_path in tqdm(xml_files, desc="XML ํŒŒ์ผ ์ฒ˜๋ฆฌ ์ค‘"):
        try:
            tree = etree.parse(xml_file_path)
            root = tree.getroot()
            paragraphs = root.xpath('//level5//paragraph')

            for para_element in paragraphs:
                # Keep only paragraphs in the level5 > text > content chain.
                if not check_xml_structure(para_element):
                    continue

                text = extract_text_from_paragraph_xml(para_element)
                if not text or len(text) < min_len:
                    continue

                # Strip leading decorative markers from the paragraph.
                # NOTE(review): min_len is checked BEFORE markers are removed,
                # so a kept paragraph may end up shorter than min_len after
                # stripping — confirm this ordering is intended.
                processed_text = re.sub(markers_pattern, '', text).strip()

                if processed_text:
                    all_valid_paragraphs.append(processed_text)

        # Best-effort: a broken file is logged and skipped, not fatal.
        except Exception as e:
            logging.warning(f"\nํŒŒ์ผ ์ฒ˜๋ฆฌ ์ค‘ ์˜ˆ์ƒ์น˜ ๋ชปํ•œ ์˜ค๋ฅ˜ ๋ฐœ์ƒ ({xml_file_path}): {e}")
            continue

    logging.info(f"\nXML ์ฒ˜๋ฆฌ ์™„๋ฃŒ. ์ด {len(all_valid_paragraphs):,}๊ฐœ์˜ ์œ ํšจํ•œ ๋ฌธ๋‹จ์„ ์ถ”์ถœํ–ˆ์Šต๋‹ˆ๋‹ค.")

    if not all_valid_paragraphs:
        logging.error("์œ ํšจํ•œ ๋ฌธ๋‹จ์ด ์—†์–ด ์ฒ˜๋ฆฌ๋ฅผ ์ค‘๋‹จํ•ฉ๋‹ˆ๋‹ค.")
        return

    # Shuffle once with a fixed seed so the split is reproducible.
    logging.info("\n๋ฐ์ดํ„ฐ๋ฅผ ํ•™์Šต, ๊ฒ€์ฆ, ํ…Œ์ŠคํŠธ ์„ธํŠธ๋กœ ๋ถ„ํ• ํ•ฉ๋‹ˆ๋‹ค...")
    random.seed(seed_val)
    random.shuffle(all_valid_paragraphs)

    # Slice order: [0, test_idx) test, [test_idx, valid_idx) validation,
    # the rest training.
    total_count = len(all_valid_paragraphs)
    test_idx = int(total_count * test_ratio)
    valid_idx = test_idx + int(total_count * val_ratio)

    test_data = all_valid_paragraphs[:test_idx]
    valid_data = all_valid_paragraphs[test_idx:valid_idx]
    train_data = all_valid_paragraphs[valid_idx:]

    logging.info("\n--- ๋ฐ์ดํ„ฐ ๋ถ„ํ•  ๊ฒฐ๊ณผ ๋ฐ ํ†ต๊ณ„ (๊ธ€์ž ์ˆ˜ ๊ธฐ์ค€) ---")
    datasets_for_stats = {"ํ•™์Šต": train_data, "๊ฒ€์ฆ": valid_data, "ํ…Œ์ŠคํŠธ": test_data}
    for name, data in datasets_for_stats.items():
        stats = calculate_text_stats(data)
        percentage_of_total = (stats['count'] / total_count if total_count > 0 else 0.0)
        log_msg = (f" - {name} ๋ฐ์ดํ„ฐ: {stats['count']:,} ๋ฌธ๋‹จ ({percentage_of_total:.1%}) | "
                   f"Min: {stats['min_len']}, Max: {stats['max_len']}, "
                   f"Avg: {stats['avg_len']:.1f}, Median: {stats['median_len']:.1f}")
        logging.info(log_msg)

    logging.info("\n๋ถ„ํ• ๋œ ๋ฐ์ดํ„ฐ์…‹์„ ํ…์ŠคํŠธ ํŒŒ์ผ๋กœ ์ €์žฅํ•ฉ๋‹ˆ๋‹ค...")
    train_filepath = os.path.join(output_dir, "train.txt")
    valid_filepath = os.path.join(output_dir, "validation.txt")
    test_filepath = os.path.join(output_dir, "test.txt")

    save_text_data_to_file(train_filepath, train_data, "train.txt ์ €์žฅ ์ค‘")
    save_text_data_to_file(valid_filepath, valid_data, "validation.txt ์ €์žฅ ์ค‘")
    save_text_data_to_file(test_filepath, test_data, "test.txt ์ €์žฅ ์ค‘")

    logging.info("\n--- ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ ์™„๋ฃŒ ---")
    logging.info(f"๊ฒฐ๊ณผ๋ฌผ์ด ๋‹ค์Œ ๊ฒฝ๋กœ์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค: {output_dir}")
164
+
165
if __name__ == '__main__':
    # Decorative bullet/marker characters stripped from paragraph starts.
    LEADING_MARKERS_PATTERN = r"^[โ—‹โ–ฒโ—โ—Žโ—‡โ—ˆโ–ทโ–ถโ–ฝโ–ผโ–ฃโ– โ–กโ–ชโ–ซโ˜žโ‡จ]+"

    arg_parser = argparse.ArgumentParser(description="์กฐ์„ ์™•์กฐ์‹ค๋ก XML ๋ฐ์ดํ„ฐ๋ฅผ ์ „์ฒ˜๋ฆฌํ•˜์—ฌ ํ…์ŠคํŠธ ํŒŒ์ผ๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค.")
    arg_parser.add_argument("--xml_dir", type=str, required=True, help="XML ํŒŒ์ผ๋“ค์ด ์žˆ๋Š” ์›๋ณธ ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ")
    arg_parser.add_argument("--output_dir", type=str, required=True, help="์ „์ฒ˜๋ฆฌ๋œ ํ…์ŠคํŠธ ํŒŒ์ผ์„ ์ €์žฅํ•  ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ")
    arg_parser.add_argument("--min_len", type=int, default=10, help="ํ•„ํ„ฐ๋งํ•  ๋ฌธ๋‹จ์˜ ์ตœ์†Œ ๊ธ€์ž ์ˆ˜")
    arg_parser.add_argument("--val_ratio", type=float, default=0.05, help="๊ฒ€์ฆ ๋ฐ์ดํ„ฐ์…‹์˜ ๋น„์œจ")
    arg_parser.add_argument("--test_ratio", type=float, default=0.05, help="ํ…Œ์ŠคํŠธ ๋ฐ์ดํ„ฐ์…‹์˜ ๋น„์œจ")
    arg_parser.add_argument("--seed", type=int, default=85, help="๋ฐ์ดํ„ฐ ๋ถ„ํ•  ์‹œ ์‚ฌ์šฉํ•  ๋žœ๋ค ์‹œ๋“œ๊ฐ’")

    cli_args = arg_parser.parse_args()

    prepare_sillok_data(
        xml_dir=cli_args.xml_dir,
        output_dir=cli_args.output_dir,
        min_len=cli_args.min_len,
        markers_pattern=LEADING_MARKERS_PATTERN,
        val_ratio=cli_args.val_ratio,
        test_ratio=cli_args.test_ratio,
        seed_val=cli_args.seed
    )