File size: 1,400 Bytes
01b7e90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b5869a
01b7e90
 
 
 
 
 
 
4b5869a
01b7e90
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import os
import re


def is_jang(line):
    """Return *line* if it contains a chapter heading, otherwise ``None``.

    A chapter heading is the Korean legal marker "제N장" ("Chapter N"),
    where N is one or two digits. The marker may appear anywhere in the
    line.

    Args:
        line: A single line of text.

    Returns:
        The unmodified ``line`` (including any trailing newline) when the
        marker is present, else ``None``.
    """
    # The literal used to hold mojibake (UTF-8 bytes decoded with a legacy
    # code page), which can never match properly decoded Korean text.
    if re.search(r"제\d{1,2}장", line):
        return line
    return None


def is_jo(line):
    """Return the article marker found in *line*, otherwise ``None``.

    An article marker is the Korean legal marker "제N조" ("Article N"),
    where N is one or two digits. The marker may appear anywhere in the
    line; only the first occurrence is reported.

    Args:
        line: A single line of text.

    Returns:
        The matched marker text (e.g. ``"제3조"``), or ``None`` when the
        line contains no article marker.
    """
    # The literal used to hold mojibake (UTF-8 bytes decoded with a legacy
    # code page), which can never match properly decoded Korean text.
    found = re.search(r"제\d{1,2}조", line)
    if found:
        return found.group()
    return None


def process_text_document(filepath):
    """Split a legal text file into per-article chunks and save the result.

    The file is read line by line. The most recent chapter heading line
    (as detected by ``is_jang``) is remembered, and every line that starts
    a new article (as detected by ``is_jo``) is preceded by a delimiter
    line followed by that chapter heading. Text enclosed in ``<...>`` or
    ``[...]`` is stripped from every line.

    The output is written to ``docs/processed/<name>_processed.txt``,
    relative to the current working directory, where ``<name>`` is the
    input file name without its final extension.

    Args:
        filepath: Path of the UTF-8 encoded text file to process.

    Raises:
        OSError: If the input cannot be read or the output cannot be
            written.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    delim = '*****\n'
    # splitext drops only the last extension, so "law.v2.txt" keeps its
    # "law.v2" stem instead of colliding with "law.v1.txt".
    filename = os.path.splitext(os.path.basename(filepath))[0]
    output_dir = os.path.join('docs', 'processed')

    jang_info = ''
    # Collect pieces and join once at the end instead of growing a string.
    pieces = ['***']

    # Explicit encoding: the platform default is not reliable for Korean.
    with open(filepath, 'r', encoding='utf-8') as src:
        for line in src:
            # Remember a new chapter heading so it can be attached in
            # front of each following article.
            chapter_line = is_jang(line)
            if chapter_line:
                jang_info = chapter_line
            # A new article starts a new chunk: delimiter first, then the
            # current chapter heading.
            if is_jo(line):
                pieces.append(delim + jang_info)
            # Drop bracketed annotations, which would only add noise.
            pieces.append(re.sub(r"<.*?>|\[.*?\]", "", line))

    # Create the output folder on first use rather than failing.
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, filename + '_processed.txt')
    with open(output_path, 'w', encoding='utf-8') as dst:
        dst.write(''.join(pieces))


if __name__ == '__main__':
    # NOTE: the input folder is resolved against the current working
    # directory, so the script only works when launched from the folder
    # that contains ``docs`` (the potential path bug).
    docs_dir = os.getcwd() + '/docs'
    text_files = [name for name in os.listdir(docs_dir) if name.endswith('.txt')]
    for name in text_files:
        print(name)
        process_text_document(os.path.join(docs_dir, name))