import os
import time
import torch
import requests
import fitz
import olefile
import zlib
import re
import traceback
import chromadb
import argparse
import zipfile
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from docx import Document
from urllib.parse import urljoin
from pathlib import Path
from chromadb.utils import embedding_functions
def make_db(download_dir: str, db_dir: str, collection_name: str, *, embedf_name: str = "") -> chromadb.Collection:
    """
    Create (or load) a chromadb collection object and return it.

    input
        download_dir : folder where each post's attachments are downloaded (absolute path)
        db_dir : local folder where the chroma db is stored (absolute path)
        collection_name : name of the chromadb collection
        embedf_name (optional) : name of the embedding model used to embed the text

    output
        collection object

    ***
    To use anything other than DefaultEmbeddingFunction,
    first install sentence-transformers: pip install -q sentence-transformers
    """
    print("Creating a local chromadb collection object.")
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
        print(f"Created the folder for downloaded attachments: {download_dir}")
    else:
        print(f"The folder for downloaded attachments already exists: {download_dir}")
    if not os.path.exists(db_dir):
        os.makedirs(db_dir)
        print(f"Created the folder for the chromadb collection: {db_dir}")
    else:
        print(f"The folder for the chromadb collection already exists: {db_dir}")
    # Create the chroma db client; data is persisted locally.
    chroma_client = chromadb.PersistentClient(path=db_dir)
    print("chromadb client created.")
    # Choose the embedding function; the list of available functions is at
    # https://docs.trychroma.com/docs/embeddings/embedding-functions
    if embedf_name:
        embed_fun = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=embedf_name,
            device="cuda" if torch.cuda.is_available() else "cpu"
        )
        print(f"Using {embedf_name} as the embedding function.")
    else:
        embed_fun = embedding_functions.DefaultEmbeddingFunction()
        print("Using the default embedding function.")
    # Check whether the collection exists, then load or create it.
    existing_collections = [c.name for c in chroma_client.list_collections()]
    if collection_name in existing_collections:
        # Collection already exists -> load it.
        print(f"Found an existing database. Loading the '{collection_name}' collection.")
        collection = chroma_client.get_collection(
            name=collection_name,
            embedding_function=embed_fun
        )
        print(f"Collection object loaded successfully: {db_dir}")
    else:
        # Collection does not exist -> create it.
        print(f"No existing database found. Creating the '{collection_name}' collection.")
        collection = chroma_client.create_collection(
            name=collection_name,
            embedding_function=embed_fun
        )
        print(f"Collection object created successfully: {db_dir}")
    return collection
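
# A minimal retrieval sketch (illustrative only; nothing in this script calls it):
# the collection returned by make_db can be searched with Chroma's query API.
# The question is embedded with the collection's embedding function and the
# closest stored documents come back together with their metadata.
def _example_query(collection: chromadb.Collection, question: str) -> None:
    results = collection.query(query_texts=[question], n_results=3)
    # query() returns lists-of-lists, one inner list per query text.
    for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
        print(meta["title"], meta["source_url"])
        print(doc[:80])  # preview of the matched text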
def get_post_urls(current_url: str) -> list[str]:
    """
    Collect the URLs of the individual posts on a board page into a list.

    input
        current_url : link to the board's current page; by default
                      {args.base_url}?page=1 (the SeoulTech notice board url)
    output
        post_urls : list of links to the posts on the current page
    """
    print(f"Collecting the posts on {current_url}.")
    post_urls = []
    try:
        res = requests.get(current_url)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")
        links = soup.select("td.tit a")
        if not links:
            links = soup.select(".board_list a")
        for link in links:
            href = link.get("href")
            if href and "javascript" not in href:
                full_url = urljoin(current_url, href)
                post_urls.append(full_url)
        output = list(set(post_urls))
        print(f"Collected {len(output)} post links.")
        return output
    except Exception as e:
        print(f"An error occurred while collecting post links: {e}")
        return []
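
# Sketch of the urljoin behaviour relied on above, using a hypothetical
# relative href (the exact href format is an assumption for illustration):
# a query-only reference keeps the page path and replaces the query string.
def _example_urljoin() -> None:
    resolved = urljoin("https://www.seoultech.ac.kr/service/info/notice?page=1",
                       "?ma=view&no=1")
    assert resolved == "https://www.seoultech.ac.kr/service/info/notice?ma=view&no=1"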
def download_file(url: str, filename: str, download_dir: str) -> None:
    """
    Download a file from the given download link to local storage.

    input
        url : download link
        filename : name to save the file under
        download_dir : folder to save the file in (absolute path)
    output
        -
    """
    print(f"Downloading '{filename}' from {url} into {download_dir}.")
    try:
        r = requests.get(url, stream=True)
        r.raise_for_status()
        # Strip any characters that are not allowed in Windows file names.
        valid_name = "".join(c for c in filename if c not in '<>:"/\\|?*')
        save_path = os.path.join(download_dir, valid_name)
        with open(save_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Download succeeded: {filename}")
    except Exception as e:
        print(f"Download failed ({filename}): {e}")
class TextParser:
    """
    Extracts text from a file according to its extension.

    Meaning of @staticmethod:
        The method lives inside the class but needs no class state;
        it never touches instance attributes via self.xxx.
        It merely belongs to the class while remaining usable on its own.
    Why it is used here: to group several similar-purpose functions in one class.
    """
    @staticmethod
    def parse_pdf(file_path: str) -> list[str]:
        """
        Extract text from a PDF file.

        input
            file_path : path to a locally saved .pdf file
        output
            texts : list of text strings extracted from the pdf file
        """
        texts = []
        try:
            doc = fitz.open(file_path)
            for page in doc:
                text = page.get_text().strip()
                if text:
                    texts.append(text)
            doc.close()
            print(f"Successfully parsed text from {os.path.basename(file_path)}.")
            return texts
        except Exception as e:
            err_msg = f"An error occurred while parsing {os.path.basename(file_path)}: {str(e)}"
            print(err_msg)
            return [err_msg]
    # PDF parsing works without issues.
    @staticmethod
    def parse_docx(file_path: str) -> list[str]:
        """
        Extract text from a Word file.

        input
            file_path : path to a locally saved .docx file
        output
            texts : list of text strings extracted from the docx file
        """
        texts = []
        try:
            doc = Document(file_path)
            for para in doc.paragraphs:
                text = para.text.strip()
                if text:
                    texts.append(text)
            print(f"Successfully parsed text from {os.path.basename(file_path)}.")
            return texts
        except Exception as e:
            err_msg = f"An error occurred while parsing {os.path.basename(file_path)}: {str(e)}"
            print(err_msg)
            return [err_msg]
    @staticmethod
    def parse_hwp(file_path: str) -> list[str]:
        """
        Extract text from an HWP (Hangul) file. ### Consider tools such as hwp5-txt?
        Currently taken as-is from Gemini-generated code -> needs optimization.

        input
            file_path : path to a locally saved .hwp file
        output
            texts : list of text strings extracted from the hwp file
        """
        texts = []
        try:
            # 1. Check that this really is a .hwp (OLE) file.
            if not olefile.isOleFile(file_path):
                err_msg = f"{os.path.basename(file_path)} is not a valid OLE file"
                print(err_msg)
                return [err_msg]
            f = olefile.OleFileIO(file_path)
            # 2. Get the list of body (BodyText) sections.
            # Inside an HWP file the body is stored as 'BodyText/Section0', 'BodyText/Section1', ...
            dirs = f.listdir()
            # Keep only streams under 'BodyText' whose name starts with 'Section'.
            body_sections = [d for d in dirs if d[0] == "BodyText" and d[1].startswith("Section")]
            # Sort by the number after 'Section' (Section0, Section1, ...).
            body_sections.sort(key=lambda x: int(x[1].replace("Section", "")))
            # 3. Extract the text of each section.
            for section in body_sections:
                # Read the stream.
                body_stream = f.openstream(section)
                data = body_stream.read()
                # From HWP 5.0 the body is zlib-compressed (-15 means a raw stream without header).
                try:
                    unpacked_data = zlib.decompress(data, -15)
                except Exception:
                    # Fall back for the (very rare) uncompressed case.
                    unpacked_data = data
                # Decode as UTF-16LE.
                extracted_text = unpacked_data.decode('utf-16le', errors='ignore')
                # 4. Clean the text.
                # The HWP binary mixes control characters and tag data in with the text,
                # so keep only Korean, Latin letters, digits, basic punctuation and whitespace.
                cleaned_text = re.sub(r'[^๊ฐ€-ํžฃa-zA-Z0-9\s.,?!()~%+-]', '', extracted_text)
                # Collapse redundant whitespace.
                cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
                if cleaned_text:
                    texts.append(cleaned_text)
            f.close()
            print(f"Successfully parsed text from {os.path.basename(file_path)}.")
            return texts
        except Exception as e:
            err_msg2 = f"An error occurred while parsing {os.path.basename(file_path)}: {str(e)}"
            print(err_msg2)
            return [err_msg2]
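
    # Sketch of the zlib detail used above (illustrative only, never called):
    # wbits=-15 reads a raw DEFLATE stream with no zlib header, which is how
    # HWP 5.0 stores its BodyText sections. Round-tripping a raw stream
    # demonstrates the flag on both the compress and decompress side.
    @staticmethod
    def _example_raw_deflate_roundtrip() -> str:
        compressor = zlib.compressobj(wbits=-15)  # raw DEFLATE, no header
        packed = compressor.compress("์˜ˆ์‹œ ๋ณธ๋ฌธ".encode("utf-16le")) + compressor.flush()
        return zlib.decompress(packed, -15).decode("utf-16le")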
    @staticmethod
    def parse_hwpx(file_path: str) -> list[str]:
        """
        Extract text from an hwpx file.
        hwpx is a zip archive, so the xml inside is parsed via zipfile.

        input
            file_path : path to a locally saved .hwpx file
        output
            texts : list of text strings extracted from the hwpx file
        """
        texts = []
        try:
            # Open as a zip archive.
            with zipfile.ZipFile(file_path, 'r') as zf:
                # List the files inside the archive.
                file_list = zf.namelist()
                # Find the xml files holding the body text (usually Contents/section0.xml, ...),
                # sorted to keep the sections in order.
                section_files = sorted([f for f in file_list if f.startswith('Contents/section') and f.endswith('.xml')])
                if not section_files:
                    return ["No body sections found."]
                for section in section_files:
                    # Read and parse the xml data.
                    xml_data = zf.read(section)
                    root = ET.fromstring(xml_data)
                    # The xml inside hwpx uses namespaces; the text usually lives in <hp:t>
                    # tags, so either ignore the namespace or search with a namespace map.
                    # Searching for hp:t with the paragraph namespace
                    # {http://www.hancom.co.kr/hwpml/2011/paragraph} is the most accurate.
                    ns = {'hp': 'http://www.hancom.co.kr/hwpml/2011/paragraph'}
                    # Find every paragraph (<hp:p>) in this section.
                    for para in root.findall('.//hp:p', ns):
                        para_text = ""
                        # Within a paragraph, collect runs (<hp:run>) -> text (<hp:t>).
                        for text_node in para.findall('.//hp:t', ns):
                            if text_node.text:
                                para_text += text_node.text
                        # Clean the text (same logic as the hwp parser above).
                        cleaned_text = re.sub(r'[^๊ฐ€-ํžฃa-zA-Z0-9\s.,?!()~%+-]', '', para_text)
                        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
                        if cleaned_text:
                            texts.append(cleaned_text)
            print(f"Successfully parsed text from {os.path.basename(file_path)}.")
            return texts
        except Exception as e:
            err_msg = f"An error occurred while parsing {os.path.basename(file_path)}: {str(e)}"
            print(err_msg)
            return [err_msg]
    @staticmethod
    def get_text(file_path: str) -> list[str]:
        """
        Call the parser that matches the file extension.

        input:
            file_path : path to a post's attachment saved locally
        output:
            list[str] : text extracted from the attachment
        """
        ext = os.path.splitext(file_path)[1].lower()
        if ext == ".pdf":
            return TextParser.parse_pdf(file_path)
        elif ext == ".docx":
            return TextParser.parse_docx(file_path)
        elif ext == ".hwp":
            return TextParser.parse_hwp(file_path)
        elif ext == ".hwpx":
            return TextParser.parse_hwpx(file_path)
        else:
            print(f"Unsupported file format: {ext}")
            return [""]
def parse_attachment(filename: str, download_dir: str) -> list[str]:
    """
    Parse an attachment saved locally according to its extension and return its text.

    input
        filename : name of each downloaded attachment
        download_dir : absolute path of the folder holding the downloaded attachments
    output
        texts : text data extracted from the file
    """
    try:
        # Apply the same name sanitization as download_file so the path
        # matches the file that was actually saved.
        valid_name = "".join(c for c in filename if c not in '<>:"/\\|?*')
        file_path = os.path.join(download_dir, valid_name)
        texts = TextParser.get_text(file_path)
        return texts
    except Exception as e:
        err_msg = f"{filename} was not downloaded: {e}"
        print(err_msg)
        return [err_msg]
def parse_post_content(post_url: str, download_dir: str) -> dict[str, str | list[str]] | None:
    """
    Crawl the body text and attachments from a post's detail page.

    input
        post_url : url of the post
        download_dir : absolute path of the folder to save the post's attachments in
    output
        output : dictionary holding the text and metadata extracted from the post
        output = {
            "source_url": post_url,         # url of the post
            "title": title,                 # post title
            "date": date,                   # date the post was written
            "category": sub_title,          # category the post belongs to
            "text": full_text,              # post title + body
            "attachments": attachments_str, # list of the post's attachments (file names)
            "extracted": extracted_str,     # text extracted from the attachments
            "crawl_date": now_time          # date and time of the crawl
        }
    """
    try:
        res = requests.get(post_url)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")
        # Get the post title. "์ œ๋ชฉ" ("title") is the literal table header on the page.
        title_element = soup.find("th", string="์ œ๋ชฉ")
        if title_element:
            title = title_element.find_next_sibling("td").get_text(strip=True)
        else:
            title = "No title"
        # Get the date the post was written. "๋‚ ์งœ" ("date") is the page's table header.
        date_element = soup.find("th", string="๋‚ ์งœ")
        if date_element:
            date = date_element.find_next_sibling("td").get_text(strip=True)
        else:
            date = "No date"
        # Get the post body.
        content_element = soup.select_one(".cont")
        if content_element:
            # Remove unneeded scripts, styles and iframes.
            for script in content_element(["script", "style", "iframe"]):
                script.decompose()
            body_text = content_element.get_text(separator='\n', strip=True)
        else:
            body_text = "No body"
        full_text = f"Title: {title}\nBody: {body_text}"
        # Get the post category
        # (e.g. university notices, academic notices, scholarship notices, graduate school notices, ...).
        sub_title_element = soup.select_one(".sub_title")
        if sub_title_element:
            sub_title = sub_title_element.get_text(strip=True)
        else:
            sub_title = "No category"
        # Get the attachments.
        attachments = []
        extracted_text = []
        file_links = soup.select(".list_attach a")
        if file_links:
            for f_link in file_links:
                href = f_link.get("href")
                if not href:
                    continue
                if "javascript:downloadfile" in href:
                    try:
                        match = re.search(r"downloadfile\(\s*'([^']*)'\s*,\s*'([^']*)'\s*,\s*'([^']*)'\s*\)", href)
                        if match:
                            path = match.group(1)
                            server_fname = match.group(2)  # name stored on the server, e.g. 55E0D06B18CD4638B3515EDCC4D43130_.hwp
                            origin_fname = match.group(3)  # the real file name, e.g. ์—ฐ๊ตฌํ™œ๋™์ข…์‚ฌ์ž ์•ˆ์ „๊ต์œก ์‹ ์ฒญ ๋ฐ ์ด์ˆ˜ ๋ฐฉ๋ฒ•(๋ชจ๋ฐ”์ผ, ์˜จ๋ผ์ธ).hwp
                            base_url = "https://www.seoultech.ac.kr"
                            download_url = f"{base_url}{path}/{server_fname}"
                            # attachments is metadata text only.
                            attachments.append(f"{origin_fname} ({download_url})")
                            # Actually download the attachment locally.
                            download_file(download_url, origin_fname, download_dir)
                            # Extract text from the downloaded attachment.
                            details = parse_attachment(origin_fname, download_dir)
                            # Append the extracted text to extracted_text.
                            for s in details:
                                extracted_text.append(s)
                    except Exception as e:
                        print(f"Javascript parsing error: {e}")
                else:
                    full_url = urljoin(post_url, href)
                    f_name = f_link.get_text(strip=True)
                    attachments.append(f"{f_name} ({full_url})")
        attachments_str = ", ".join(attachments) if attachments else "No attachments"
        extracted_str = ", ".join(extracted_text) if extracted_text else "No attachments"
        # The final record saved to the DB has the following shape.
        now_time = time.strftime("%Y-%m-%d %H:%M:%S")
        output = {
            "source_url": post_url,         # url of the post
            "title": title,                 # post title
            "date": date,                   # date the post was written
            "category": sub_title,          # category the post belongs to
            "text": full_text,              # post title + body
            "attachments": attachments_str, # list of the post's attachments (file names)
            "extracted": extracted_str,     # text extracted from the attachments
            "crawl_date": now_time          # date and time of the crawl
        }
        return output
    except Exception as e:
        err_msg = f"An error occurred while parsing {post_url}: {e}"
        print(err_msg)
        traceback.print_exc()
        return None
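
# Sketch of the javascript:downloadfile(...) link format handled above. The
# href below is made up for illustration; the regex pulls out the server path,
# the stored file name, and the original human-readable file name.
def _example_downloadfile_match() -> tuple[str, str, str] | None:
    href = "javascript:downloadfile('/upload/notice', 'ABC123_.hwp', '์•ˆ๋‚ด๋ฌธ.hwp')"
    match = re.search(r"downloadfile\(\s*'([^']*)'\s*,\s*'([^']*)'\s*,\s*'([^']*)'\s*\)", href)
    if match:
        return match.group(1), match.group(2), match.group(3)
    return None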
def save_to_db(data: dict[str, str | list[str]], collection: chromadb.Collection) -> None:
    """
    Save the result of parse_post_content for each post into ChromaDB.

    input
        data : output of parse_post_content
        collection : chromadb collection object
    output
        -
    What it does: stores each data record in the collection.
        The id is the post's url (each record's unique address).
    Note: storing data in chromadb requires
        documents : the text that is vectorized and searched -> body + attachments
        metadatas : information returned with search results or used for filtering -> title, date, ...
        ids : a unique id per record
    """
    # Text to embed: the post body.
    # If there are attachments, embed their content as well.
    content_to_embed = data["text"]
    if data.get("extracted"):
        content_to_embed += f"\n\n[Attachment content]\n{data['extracted']}"
    metadata_dict = {
        "source_url": data["source_url"],
        "title": data["title"],
        "date": data["date"],
        "category": data["category"],
        "attachments": data["attachments"] if data["attachments"] else "",
        "crawl_date": data["crawl_date"]
    }
    # upsert updates the record if the id already exists, and inserts it otherwise.
    collection.upsert(
        ids=[data["source_url"]],
        documents=[content_to_embed],
        metadatas=[metadata_dict]
    )
    print(f"Saved the following post: {data['title']}")
def crawl_seoultech_notice(download_dir: str, base_url: str, num_pages: int, collection: chromadb.Collection) -> None:
    """
    Crawl num_pages pages of the notice board and store every post in the collection.

    input
        download_dir : folder to download the posts' attachments into (absolute path)
        base_url : url of the main notice page
        num_pages : number of pages to crawl
        collection : chromadb Collection object - the store the data is saved to
    output
        -
    """
    t0 = time.time()
    print(f"Starting to crawl data from {base_url}.")
    print(f"Estimated time: about {num_pages * 20} to {num_pages * 30} seconds")
    for i in range(num_pages):
        current_url = f"{base_url}?page={i+1}"
        print(f"=== Crawling page {i+1}: {current_url} ===")
        post_urls = get_post_urls(current_url)
        for post in post_urls:
            post_data = parse_post_content(post, download_dir)
            if post_data:
                try:
                    save_to_db(post_data, collection)
                except Exception as e:
                    err_msg = f"An error occurred while saving the post's data to the DB: {e}"
                    print(err_msg)
            else:
                print(f"Skipping save because parsing failed: {post}")
        t1 = time.time()
        print(f"=== Page {i+1} collected (elapsed: {t1 - t0:.2f}s) ===")
def main(args):
    abs_download_path = os.path.join(args.base_dir, args.download_dir)
    abs_db_path = os.path.join(args.base_dir, args.db_dir)
    collection = make_db(abs_download_path, abs_db_path, args.collection_name)
    # To use an embedding function other than the default:
    # collection = make_db(abs_download_path, abs_db_path, args.collection_name, embedf_name=args.embedf_name)
    crawl_seoultech_notice(abs_download_path, args.base_url, args.num_page, collection)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_url", type=str, default="https://www.seoultech.ac.kr/service/info/notice")
    parser.add_argument("--base_dir", type=str, default=str(Path(__file__).resolve().parent))  # directory containing this file
    parser.add_argument("--download_dir", type=str, default="seoultech_data_download")
    parser.add_argument("--db_dir", type=str, default="seoultech_data_db")
    # Note: --header is currently defined but never passed to requests.get, so only its default exists.
    parser.add_argument("--header", type=dict, default={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"})
    parser.add_argument("--num_page", type=int, default=1)
    parser.add_argument("--collection_name", type=str, default="seoultech_notices")
    parser.add_argument("--embedf_name", type=str, default="BAAI/bge-m3")
    args = parser.parse_args()
    main(args)
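
# Example invocation (a sketch; the values shown are just the defaults above):
#   python crawl.py --num_page 2 --collection_name seoultech_notices
# This crawls pages 1-2 of the notice board, downloads attachments into
# ./seoultech_data_download, and persists the Chroma collection in ./seoultech_data_db.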