# btc-chat-bot / document_processor.py
# Last commit: "create: history tab" (4b5869a) by atoye1
import os
import re
def is_jang(line):
    """Return *line* unchanged if it contains a chapter heading, else ``None``.

    A chapter ("jang") heading is the Korean marker ``제N장`` where ``N`` is a
    one- or two-digit number. The marker may appear anywhere in the line.

    Args:
        line: One line of text from the source document.

    Returns:
        The whole input line (not just the matched marker) when the marker is
        present, otherwise ``None``.
    """
    # The literal was previously stored as mis-decoded bytes and therefore
    # could never match real Korean text; this is the intended pattern.
    if re.search(r"제\d{1,2}장", line):
        return line
    return None
def is_jo(line):
    """Return the article marker found in *line*, or ``None`` if there is none.

    An article ("jo") marker is the Korean text ``제N조`` where ``N`` is a one-
    or two-digit number. The marker may appear anywhere in the line.

    Args:
        line: One line of text from the source document.

    Returns:
        The matched marker text only (e.g. ``"제3조"``), or ``None`` when the
        line contains no such marker.
    """
    # The literal was previously stored as mis-decoded bytes and therefore
    # could never match real Korean text; this is the intended pattern.
    # NOTE(review): article numbers of three or more digits are not matched;
    # confirm that the documents never exceed 99 articles.
    match = re.search(r"제\d{1,2}조", line)
    return match.group() if match else None
def process_text_document(filepath):
    """Split a text document into article chunks and write the result to disk.

    The file is read line by line. The most recent chapter heading (as
    reported by ``is_jang``) is remembered. Whenever a line contains an
    article marker (as reported by ``is_jo``), a delimiter line followed by
    the remembered chapter heading is emitted in front of it. Every input
    line is then emitted with ``<...>`` and ``[...]`` spans removed.

    The output goes to ``docs/processed/<stem>_processed.txt``, relative to
    the current working directory, where ``<stem>`` is the part of the input
    file's base name before its first ``'.'``.

    Args:
        filepath: Path of the text file to process.

    Returns:
        None. The processed text is written to the output file.
    """
    chapter_heading = ''
    delim = '*****\n'
    # The output always begins with this '***' prefix; chunks are collected in
    # a list and joined once instead of growing a string with repeated +=.
    pieces = ['***']
    filename = os.path.basename(filepath).split('.')[0]
    out_dir = os.path.join('docs', 'processed')

    # Explicit UTF-8 so Korean text is decoded the same way on every platform
    # (assumes the documents are stored as UTF-8 -- TODO confirm).
    with open(filepath, "r", encoding="utf-8") as src:
        for line in src:
            # Remember a new chapter heading so it can be placed in front of
            # each following article.
            heading = is_jang(line)
            if heading:
                chapter_heading = heading
            # When this line starts a new article, insert the delimiter and
            # the current chapter heading before appending the line itself.
            if is_jo(line):
                pieces.append(delim + chapter_heading)
            # Drop angle- and square-bracketed annotations, which would
            # otherwise add noise to the text.
            pieces.append(re.sub(r"<.*?>|\[.*?\]", "", line))

    # Create the output directory on first use instead of failing with
    # FileNotFoundError.
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, filename + '_processed.txt')
    with open(out_path, 'w', encoding="utf-8") as dst:
        dst.write(''.join(pieces))
if __name__ == '__main__':
    # Both the input directory used here and the output directory used by
    # process_text_document are resolved against the current working
    # directory, so run this script from the directory that contains `docs`.
    docs_dir = os.path.join(os.getcwd(), 'docs')
    # os.listdir returns entries in arbitrary order; sort them so that runs
    # are reproducible.
    for name in sorted(os.listdir(docs_dir)):
        if name.endswith('.txt'):
            print(name)
            process_text_document(os.path.join(docs_dir, name))