Spaces:

subwayman
/

btc-chat-bot

Runtime error

File size: 1,400 Bytes

import os
import re


def is_jang(line):
    """Return *line* itself when it contains a chapter marker, else ``None``.

    A chapter marker is the character "제", one or two digits, then "장"
    (for example "제3장"), appearing anywhere in the line.
    """
    if re.search(r"제\d{1,2}장", line) is None:
        return None
    # The whole line (not just the matched marker) is handed back.
    return line


def is_jo(line):
    """Return the first article marker found in *line*, else ``None``.

    An article marker is the character "제", one or two digits, then "조"
    (for example "제12조"). Only the matched marker text is returned, not
    the whole line.
    """
    found = re.search(r"제\d{1,2}조", line)
    return found.group() if found else None


def process_text_document(filepath):
    """Split a text document into article sections and write a processed copy.

    The file at *filepath* is read line by line. Whenever a line contains an
    article marker (as detected by ``is_jo``), a ``*****`` delimiter line and
    the most recently seen chapter line (as detected by ``is_jang``) are
    inserted in front of it. Text enclosed in ``<...>`` or ``[...]`` is
    stripped from every line. The output starts with the literal ``***``.

    The result is written to ``docs/processed/<name>_processed.txt``, resolved
    against the current working directory, where ``<name>`` is the part of the
    input file's base name before its first dot. The output directory is
    created if it does not exist yet.

    Args:
        filepath: Path of the source text file; it is read as UTF-8.

    Returns:
        None. The processed text is written to disk.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    delim = '*****\n'
    # NOTE: everything after the first dot is dropped ("a.b.txt" -> "a").
    filename = os.path.basename(filepath).split('.')[0]
    out_dir = os.path.join('docs', 'processed')

    jang_info = ''
    # Collect the pieces in a list and join once instead of growing a string.
    pieces = ['***']

    # Explicit encoding: the documents contain Korean text, so do not depend
    # on the platform's default locale encoding.
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            # If this line starts a new chapter, remember it so it can be
            # prepended to every following article.
            jang_info = is_jang(line) or jang_info
            # If this line starts a new article, insert the delimiter and the
            # remembered chapter line in front of it.
            if is_jo(line):
                pieces.append(delim + jang_info)
            # Strip <...> and [...] annotations, which would only be noise.
            pieces.append(re.sub(r"<.*?>|\[.*?\]", "", line))

    # Make sure the target directory exists before writing into it.
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, filename + '_processed.txt')
    with open(out_path, 'w', encoding="utf-8") as f:
        f.write(''.join(pieces))


if __name__ == '__main__':
    # NOTE: the input directory is resolved against the current working
    # directory, so the script has to be launched from the folder that
    # contains 'docs'.
    docs_dir = os.getcwd() + '/docs'
    txt_names = [name for name in os.listdir(docs_dir) if name.endswith('.txt')]
    for name in txt_names:
        print(name)
        process_text_document(os.path.join(docs_dir, name))