|
|
import os
|
|
|
import time
|
|
|
import torch
|
|
|
import requests
|
|
|
import fitz
|
|
|
import olefile
|
|
|
import zlib
|
|
|
import re
|
|
|
import traceback
|
|
|
import chromadb
|
|
|
import argparse
|
|
|
import zipfile
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
from docx import Document
|
|
|
from urllib.parse import urljoin
|
|
|
from pathlib import Path
|
|
|
from chromadb.utils import embedding_functions
|
|
|
|
|
|
|
|
|
def make_db(download_dir: str, db_dir: str, collection_name: str, *, embedf_name: str = "") -> chromadb.Collection:
    """
    Create (or load) a ChromaDB collection persisted on local disk and return it.

    input
        download_dir : folder where post attachments will be downloaded (absolute path)
        db_dir : local folder where the chroma db is persisted (absolute path)
        collection_name : name of the collection to load or create
        embedf_name (optional) : sentence-transformers model name; empty string
            selects ChromaDB's DefaultEmbeddingFunction

    output
        the chromadb Collection object

    ***
    To use anything other than DefaultEmbeddingFunction, install the model
    backend first: pip install -q sentence-transformers
    """

    print("chromadb collection ๊ฐ์ฒด๋ฅผ ๋ก์ปฌ์ ์์ฑํฉ๋๋ค. ")

    # exist_ok=True removes the check-then-create race of the original
    # pattern; the exists() check is kept only to choose the log message.
    if not os.path.exists(download_dir):
        os.makedirs(download_dir, exist_ok=True)
        print(f"๊ฒ์๊ธ์ ์ฒจ๋ถํ์ผ์ ๋ค์ด๋ก๋ํ ํด๋๊ฐ ์์ฑ๋์์ต๋๋ค : {download_dir}")
    else:
        print(f"๊ฒ์๊ธ์ ์ฒจ๋ถํ์ผ์ ๋ค์ด๋ก๋ํ ํด๋๊ฐ ์ด๋ฏธ ์กด์ฌํฉ๋๋ค. :{download_dir}")

    if not os.path.exists(db_dir):
        os.makedirs(db_dir, exist_ok=True)
        print(f"chromadb collection์ ์ ์ฅํ ํด๋๊ฐ ์์ฑ๋์์ต๋๋ค : {db_dir}")
    else:
        print(f"chromadb collection์ ์ ์ฅํ ํด๋๊ฐ ์ด๋ฏธ ์กด์ฌํฉ๋๋ค. : {db_dir}")

    chroma_client = chromadb.PersistentClient(path=db_dir)
    print("chromadb client๊ฐ ์์ฑ๋์์ต๋๋ค.")

    if embedf_name:
        # Run the embedding model on GPU when one is available.
        embed_fun = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=embedf_name,
            device="cuda" if torch.cuda.is_available() else "cpu"
        )
        print(f"์๋ฒ ๋ฉ ํจ์๋ก {embedf_name} ๋ฅผ ์ฌ์ฉํฉ๋๋ค. ")
    else:
        embed_fun = embedding_functions.DefaultEmbeddingFunction()
        print("์๋ฒ ๋ฉ ํจ์๋ก ๊ธฐ๋ณธ ์๋ฒ ๋ฉ ํจ์๋ฅผ ์ฌ์ฉํฉ๋๋ค. ")

    # Load the collection when it already exists, otherwise create it.
    existing_collections = [c.name for c in chroma_client.list_collections()]

    if collection_name in existing_collections:
        print(f"๊ธฐ์กด ๋ฐ์ดํฐ๋ฒ ์ด์ค๋ฅผ ๋ฐ๊ฒฌํ์ต๋๋ค. '{collection_name}' Collection์ ๋ถ๋ฌ์ต๋๋ค.")
        collection = chroma_client.get_collection(
            name=collection_name,
            embedding_function=embed_fun
        )
        print(f"Collection ๊ฐ์ฒด๋ฅผ ์ฑ๊ณต์ ์ผ๋ก ๋ถ๋ฌ์์ต๋๋ค : {db_dir}")
    else:
        print(f"๊ธฐ์กด ๋ฐ์ดํฐ๋ฒ ์ด์ค๊ฐ ์กด์ฌํ์ง ์์ต๋๋ค. '{collection_name}' Collection์ ์๋ก ์์ฑํฉ๋๋ค.")
        collection = chroma_client.create_collection(
            name=collection_name,
            embedding_function=embed_fun
        )
        print(f"Collection ๊ฐ์ฒด๊ฐ ์ฑ๊ณต์ ์ผ๋ก ์์ฑ๋์์ต๋๋ค : {db_dir}")

    return collection
|
|
|
|
|
|
|
|
|
def get_post_urls(current_url: str) -> list[str]:
    """
    Collect the URLs of the individual posts listed on one board page.

    input
        current_url : current board-page URL, typically f"{base_url}?page=1"
            (Seoultech notice board)

    output
        post_urls : list of per-post links found on the page (deduplicated);
            empty list when the page cannot be fetched or parsed
    """

    print(f"{current_url}์ ๊ฒ์๊ธ์ ์์งํฉ๋๋ค. ")
    post_urls = []

    try:
        # Timeout so a stalled server cannot hang the crawler forever.
        res = requests.get(current_url, timeout=30)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        links = soup.select("td.tit a")
        if not links:
            # Fallback selector for board layouts without td.tit cells.
            links = soup.select(".board_list a")

        for link in links:
            href = link.get("href")
            # Skip empty hrefs and javascript: pseudo-links.
            if href and "javascript" not in href:
                post_urls.append(urljoin(current_url, href))

        # dict.fromkeys dedupes while preserving first-seen order
        # (the original list(set(...)) returned a nondeterministic order).
        output = list(dict.fromkeys(post_urls))
        print(f"{len(output)}๊ฐ์ ๊ฒ์๊ธ ๋งํฌ๋ฅผ ์์ง ์๋ฃํ์ต๋๋ค. ")
        return output

    except Exception as e:
        print(f"๊ฒ์๊ธ ๋งํฌ ์์ง ์ค ์๋ฌ๊ฐ ๋ฐ์ํ์ต๋๋ค : {e}")
        return []
|
|
|
|
|
|
|
|
|
def download_file(url: str, filename: str, download_dir: str) -> None:
    """
    Download a file from the given link and store it locally.

    input
        url : download link
        filename : name to save the file under
        download_dir : folder to save the file into (absolute path)

    output
        -
    """
    # Bug fix: the log messages printed a literal "(unknown)" placeholder
    # instead of the actual filename.
    print(f"{url} ๋ก๋ถํฐ '{filename}' ์(๋ฅผ) {download_dir} ์ ๋ค์ด๋ก๋ํฉ๋๋ค.")

    try:
        # stream=True downloads in chunks; timeout prevents an infinite hang.
        r = requests.get(url, stream=True, timeout=60)
        r.raise_for_status()

        # Strip characters that are invalid in Windows filenames.
        valid_name = "".join(c for c in filename if c not in '<>:"/\\|?*')
        save_path = os.path.join(download_dir, valid_name)

        with open(save_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"๋ค์ด๋ก๋ ์ฑ๊ณต : {filename}")

    except Exception as e:
        print(f"๋ค์ด๋ก๋ ์คํจ ({filename}): {e}")
|
|
|
|
|
|
|
|
|
class TextParser:
|
|
|
"""
|
|
|
๊ฐ ํ์ผ์ ํ์ฅ์์ ๋ง๊ฒ ํ์ผ๋ก๋ถํฐ ํ
์คํธ๋ฅผ ์ถ์ถํ๋ ์ญํ
|
|
|
|
|
|
@staticmethod ์๋ฏธ
|
|
|
: ํด๋น ๋ฉ์๋๊ฐ ํด๋์ค ์์ ์๊ธด ํ์ง๋ง, ํด๋์ค ๋ด๋ถ ์ ๋ณด๋ ํ์ ์๋ค
|
|
|
= self.xxx ์ด๋ฐ์์ผ๋ก ํด๋์ค ๋ด๋ถ ๋ณ์ ๋ฑ ์ฌ์ฉ X
|
|
|
๊ทธ๋ฅ ์์๋ง ์ด ํด๋์ค๊ณ ํด๋์ค๋์ ๋
๋ฆฝ์ ์ผ๋ก ์ฌ์ฉํ๊ฒ ๋ค. ์๋ฐ ์๋ฏผ๊ฐ๋ด
|
|
|
|
|
|
์ฌ์ฉํ๋ ์ด์ : ์ฌ๋ฌ ์ฉ๋๊ฐ ๋น์ทํ ํจ์๋ค์ ํ๋์ ํด๋์ค ์์ ๋ชจ์๋๋ ์ฉ๋
|
|
|
"""
|
|
|
@staticmethod
|
|
|
def parse_pdf(file_path: str) -> list[str]:
|
|
|
"""
|
|
|
PDF ํ์ผ์์ ํ
์คํธ๋ฅผ ์ถ์ถํ๋ ํจ์
|
|
|
|
|
|
input
|
|
|
file_path : (๋ก์ปฌ์) ์ ์ฅ๋ .pdf ํ์ ํ์ผ์ ๊ฒฝ๋ก
|
|
|
output
|
|
|
texts : pdf ํ์ผ์์ ์ถ์ถํ ํ
์คํธ ๋ฆฌ์คํธ
|
|
|
"""
|
|
|
|
|
|
texts = []
|
|
|
|
|
|
try:
|
|
|
doc = fitz.open(file_path)
|
|
|
for page in doc:
|
|
|
text = page.get_text().strip()
|
|
|
if text:
|
|
|
texts.append(text)
|
|
|
doc.close()
|
|
|
print(f"{os.path.basename(file_path)}๋ก๋ถํฐ ํ
์คํธ๋ฅผ ์ฑ๊ณต์ ์ผ๋ก ํ์ฑํ์์ต๋๋ค. ")
|
|
|
return texts
|
|
|
|
|
|
except Exception as e:
|
|
|
err_msg = f"{os.path.basename(file_path)}๋ฅผ ํ์ฑํ๋ ๊ณผ์ ์์ ์๋ฌ๊ฐ ๋ฐ์ํ์ต๋๋ค : {str(e)}]"
|
|
|
print(err_msg)
|
|
|
return [err_msg]
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
def parse_docx(file_path: str) -> list[str]:
|
|
|
"""
|
|
|
์๋ ํ์ผ์์ ํ
์คํธ๋ฅผ ์ถ์ถํ๋ ํจ์
|
|
|
|
|
|
input
|
|
|
- file_path : (๋ก์ปฌ์) ์ ์ฅ๋ .docx ํ์ ํ์ผ์ ๊ฒฝ๋ก
|
|
|
output
|
|
|
- texts : docx ํ์ผ์์ ์ถ์ถํ ํ
์คํธ ๋ฆฌ์คํธ
|
|
|
"""
|
|
|
texts = []
|
|
|
|
|
|
try:
|
|
|
doc = Document(file_path)
|
|
|
for para in doc.paragraphs:
|
|
|
text = para.text.strip()
|
|
|
if text:
|
|
|
texts.append(text)
|
|
|
print(f"{os.path.basename(file_path)}๋ก๋ถํฐ ํ
์คํธ๋ฅผ ์ฑ๊ณต์ ์ผ๋ก ํ์ฑํ์์ต๋๋ค. ")
|
|
|
return texts
|
|
|
|
|
|
except Exception as e:
|
|
|
err_msg = f"{os.path.basename(file_path)}๋ฅผ ํ์ฑํ๋ ๊ณผ์ ์์ ์๋ฌ๊ฐ ๋ฐ์ํ์ต๋๋ค : {str(e)}]"
|
|
|
print(err_msg)
|
|
|
return [err_msg]
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
def parse_hwp(file_path: str) -> list[str]:
|
|
|
"""
|
|
|
ํ๊ธ ํ์ผ์์ ํ
์คํธ๋ฅผ ์ถ์ถํ๋ ํจ์ ### hwp5-txt ๋ฑ์ ๋๊ตฌ ์ด์ฉ?
|
|
|
ํ์ฌ๋ ์ ฌ๋ฏธ๋๊ฐ ์ง์ค ์ฝ๋ ๊ทธ๋๋ก ๊ฐ์ ธ์ด -> ์ต์ ํ ํ์
|
|
|
|
|
|
input
|
|
|
file_path : (๋ก์ปฌ์) ์ ์ฅ๋ .hwp ํ์ ํ์ผ์ ๊ฒฝ๋ก
|
|
|
output
|
|
|
texts : hwp ํ์ผ์์ ์ถ์ถํ ํ
์คํธ ๋ฆฌ์คํธ
|
|
|
"""
|
|
|
texts = []
|
|
|
|
|
|
try:
|
|
|
|
|
|
if not olefile.isOleFile(file_path):
|
|
|
err_msg = f"{os.path.basename}์(๋) ์ ํจํ OLE ํ์ผ์ด ์๋๋๋ค"
|
|
|
print(err_msg)
|
|
|
return [err_msg]
|
|
|
|
|
|
f = olefile.OleFileIO(file_path)
|
|
|
|
|
|
|
|
|
|
|
|
dirs = f.listdir()
|
|
|
|
|
|
|
|
|
body_sections = [d for d in dirs if d[0] == "BodyText" and d[1].startswith("Section")]
|
|
|
|
|
|
body_sections.sort(key=lambda x: int(x[1].replace("Section", "")))
|
|
|
|
|
|
|
|
|
for section in body_sections:
|
|
|
|
|
|
body_stream = f.openstream(section)
|
|
|
data = body_stream.read()
|
|
|
|
|
|
|
|
|
try:
|
|
|
unpacked_data = zlib.decompress(data, -15)
|
|
|
except Exception:
|
|
|
|
|
|
unpacked_data = data
|
|
|
|
|
|
|
|
|
extracted_text = unpacked_data.decode('utf-16le', errors='ignore')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cleaned_text = re.sub(r'[^๊ฐ-ํฃa-zA-Z0-9\s.,?!()~%+-]', '', extracted_text)
|
|
|
|
|
|
|
|
|
cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
|
|
|
|
|
|
if cleaned_text:
|
|
|
texts.append(cleaned_text)
|
|
|
|
|
|
f.close()
|
|
|
print(f"{os.path.basename(file_path)}๋ก๋ถํฐ ํ
์คํธ๋ฅผ ์ฑ๊ณต์ ์ผ๋ก ํ์ฑํ์์ต๋๋ค.")
|
|
|
return texts
|
|
|
|
|
|
except Exception as e:
|
|
|
err_msg2 = f"{os.path.basename(file_path)}๋ฅผ ํ์ฑํ๋ ๊ณผ์ ์์ ์๋ฌ๊ฐ ๋ฐ์ํ์ต๋๋ค : {str(e)}]"
|
|
|
print(err_msg2)
|
|
|
return [err_msg2]
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
def parse_hwpx(file_path: str) -> list[str]:
|
|
|
"""
|
|
|
hwpx ํ์ผ์์ ํ
์คํธ๋ฅผ ์ถ์ถํ๋ ํจ์
|
|
|
hwpx๋ zip ํ์ผ ํ์์ด๋ฏ๋ก zipfile์ ์ด์ฉํด xml์ ํ์ฑํจ
|
|
|
input
|
|
|
file_path : (๋ก์ปฌ์) ์ ์ฅ๋ .hwpx ํ์ ํ์ผ์ ๊ฒฝ๋ก
|
|
|
output
|
|
|
texts : hwpx ํ์ผ์์ ์ถ์ถํ ํ
์คํธ ๋ฆฌ์คํธ
|
|
|
"""
|
|
|
texts = []
|
|
|
|
|
|
try:
|
|
|
|
|
|
with zipfile.ZipFile(file_path, 'r') as zf:
|
|
|
|
|
|
file_list = zf.namelist()
|
|
|
|
|
|
|
|
|
|
|
|
section_files = sorted([f for f in file_list if f.startswith('Contents/section') and f.endswith('.xml')])
|
|
|
|
|
|
if not section_files:
|
|
|
return ["๋ณธ๋ฌธ ์น์
์ ์ฐพ์ ์ ์์ต๋๋ค."]
|
|
|
|
|
|
for section in section_files:
|
|
|
|
|
|
xml_data = zf.read(section)
|
|
|
|
|
|
|
|
|
root = ET.fromstring(xml_data)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ns = {'hp': 'http://www.hancom.co.kr/hwpml/2011/paragraph'}
|
|
|
|
|
|
|
|
|
for para in root.findall('.//hp:p', ns):
|
|
|
para_text = ""
|
|
|
|
|
|
for text_node in para.findall('.//hp:t', ns):
|
|
|
if text_node.text:
|
|
|
para_text += text_node.text
|
|
|
|
|
|
|
|
|
cleaned_text = re.sub(r'[^๊ฐ-ํฃa-zA-Z0-9\s.,?!()~%+-]', '', para_text)
|
|
|
cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
|
|
|
|
|
|
if cleaned_text:
|
|
|
texts.append(cleaned_text)
|
|
|
|
|
|
print(f"{os.path.basename(file_path)}๋ก๋ถํฐ ํ
์คํธ๋ฅผ ์ฑ๊ณต์ ์ผ๋ก ํ์ฑํ์์ต๋๋ค.")
|
|
|
return texts
|
|
|
|
|
|
except Exception as e:
|
|
|
err_msg = f"{os.path.basename(file_path)} ํ์ฑ ์ค ์๋ฌ ๋ฐ์: {str(e)}"
|
|
|
print(err_msg)
|
|
|
return [err_msg]
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
def get_text(file_path: str) -> list[str]:
|
|
|
"""
|
|
|
ํ์ฅ์์ ๋ฐ๋ผ ์ ์ ํ ํ์ ํธ์ถ
|
|
|
input:
|
|
|
file_path : ๋ก์ปฌ์ ์ ์ฅ๋ ๊ฒ์๊ธ์ ์ฒจ๋ถํ์ผ์ ๊ฒฝ๋ก
|
|
|
output:
|
|
|
list[str] : ๊ฒ์๊ธ์์ ์ถ์ถ๋ ํ
์คํธ
|
|
|
"""
|
|
|
ext = os.path.splitext(file_path)[1].lower()
|
|
|
|
|
|
if ext == ".pdf":
|
|
|
return TextParser.parse_pdf(file_path)
|
|
|
elif ext == ".docx":
|
|
|
return TextParser.parse_docx(file_path)
|
|
|
elif ext == ".hwp":
|
|
|
return TextParser.parse_hwp(file_path)
|
|
|
elif ext == ".hwpx":
|
|
|
return TextParser.parse_hwpx(file_path)
|
|
|
else:
|
|
|
print(f"์ง์ํ์ง ์๋ ํ์ผ ํ์์
๋๋ค. : {ext}")
|
|
|
return [""]
|
|
|
|
|
|
|
|
|
def parse_attachment(filename: str, download_dir: str) -> list[str]:
    """
    Parse a locally saved attachment according to its extension and return
    the extracted text.

    input
        filename : name of the downloaded attachment
        download_dir : absolute path of the folder holding the attachment

    output
        texts : text extracted from the file; on failure a one-element
            list containing the error message
    """

    try:
        file_path = os.path.join(download_dir, filename)

        # TextParser picks the right parser from the extension.
        texts = TextParser.get_text(file_path)

        return texts

    except Exception as e:
        # Bug fix: the message printed a literal "(unknown)" placeholder
        # instead of the actual filename.
        err_msg = f"{filename}์ด(๊ฐ) ๋ค์ด๋ก๋๋์ง ์์์ต๋๋ค : {e}"
        print(err_msg)
        return [err_msg]
|
|
|
|
|
|
|
|
|
def parse_post_content(post_url: str, download_dir: str) -> dict[str, str | list[str]] | None:
|
|
|
"""
|
|
|
๊ฐ ๊ฒ์๊ธ ์์ธํ์ด์ง์์ ๋ณธ๋ฌธ ํ
์คํธ์ ์ฒจ๋ถํ์ผ์ ํฌ๋กค๋งํ๋ ํจ์
|
|
|
|
|
|
input
|
|
|
post_url : ๊ฐ ๊ฒ์๊ธ์ url
|
|
|
download_dir : ๊ฒ์๊ธ์ ์ฒจ๋ถํ์ผ์ ์ ์ฅํ ์ ๋ ๊ฒฝ๋ก
|
|
|
output
|
|
|
output : ๊ฒ์๊ธ๋ก๋ถํฐ ์ถ์ถ๋ ํ
์คํธ ๋ฐ ๋ฉํ๋ฐ์ดํฐ๋ฅผ ๋ด๊ณ ์๋ ๋์
๋๋ฆฌ
|
|
|
output = {
|
|
|
"source_url": post_url, # ๊ฒ์๊ธ์ url
|
|
|
"title": title, # ๊ฒ์๊ธ ์ ๋ชฉ
|
|
|
"date": date, # ๊ฒ์๊ธ ์์ฑ ๋ ์ง
|
|
|
"category": sub_title, # ๊ฒ์๊ธ์ด ๋ด๊ธด ์นดํ
๊ณ ๋ฆฌ
|
|
|
"text": full_text, # ๊ฒ์๊ธ ์ ๋ชฉ + ๋ณธ๋ฌธ
|
|
|
"attachments": attachments_str, # ๊ฒ์๊ธ์ ์ฒจ๋ถํ์ผ ๋ฆฌ์คํธ(ํ์ผ๋ช
)
|
|
|
"extracted": extracted_str, # ์ฒจ๋ถํ์ผ๋ก๋ถํฐ ์ถ์ถํ ํ
์คํธ
|
|
|
"crawl_date": now_time # ํฌ๋กค๋ง๋ ๋ ์ง์ ์๊ฐ
|
|
|
}
|
|
|
"""
|
|
|
|
|
|
try:
|
|
|
res = requests.get(post_url)
|
|
|
res.raise_for_status()
|
|
|
soup = BeautifulSoup(res.text, "html.parser")
|
|
|
|
|
|
|
|
|
|
|
|
title_element = soup.find("th", string = "์ ๋ชฉ")
|
|
|
if title_element:
|
|
|
title = title_element.find_next_sibling("td").get_text(strip=True)
|
|
|
else:
|
|
|
title = "์ ๋ชฉ์์"
|
|
|
|
|
|
|
|
|
|
|
|
date_element = soup.find("th", string = "๋ ์ง")
|
|
|
if date_element:
|
|
|
date = date_element.find_next_sibling("td").get_text(strip=True)
|
|
|
else:
|
|
|
date = "๋ ์ง์์"
|
|
|
|
|
|
|
|
|
|
|
|
content_element = soup.select_one(".cont")
|
|
|
|
|
|
if content_element:
|
|
|
|
|
|
for script in content_element(["script", "style", "iframe"]):
|
|
|
script.decompose()
|
|
|
body_text = content_element.get_text(separator='\n', strip=True)
|
|
|
else:
|
|
|
body_text = "๋ณธ๋ฌธ์์"
|
|
|
|
|
|
full_text = f"์ ๋ชฉ: {title}\n๋ณธ๋ฌธ: {body_text}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub_title_element = soup.select_one(".sub_title")
|
|
|
if sub_title_element:
|
|
|
sub_title = sub_title_element.get_text(strip=True)
|
|
|
else:
|
|
|
sub_title = "์นดํ
๊ณ ๋ฆฌ์์"
|
|
|
|
|
|
|
|
|
|
|
|
attachments = []
|
|
|
extracted_text = []
|
|
|
|
|
|
file_links = soup.select(".list_attach a")
|
|
|
|
|
|
if file_links:
|
|
|
for f_link in file_links:
|
|
|
href = f_link.get("href")
|
|
|
|
|
|
if not href:
|
|
|
continue
|
|
|
|
|
|
if "javascript:downloadfile" in href:
|
|
|
try:
|
|
|
match = re.search(r"downloadfile\(\s*'([^']*)'\s*,\s*'([^']*)'\s*,\s*'([^']*)'\s*\)", href)
|
|
|
|
|
|
if match:
|
|
|
path = match.group(1)
|
|
|
server_fname = match.group(2)
|
|
|
origin_fname = match.group(3)
|
|
|
|
|
|
base_url = "https://www.seoultech.ac.kr"
|
|
|
download_url = f"{base_url}{path}/{server_fname}"
|
|
|
|
|
|
|
|
|
attachments.append(f"{origin_fname} ({download_url})")
|
|
|
|
|
|
|
|
|
download_file(download_url, origin_fname, download_dir)
|
|
|
|
|
|
|
|
|
details = parse_attachment(origin_fname, download_dir)
|
|
|
|
|
|
|
|
|
for s in details:
|
|
|
extracted_text.append(s)
|
|
|
|
|
|
except Exception as e:
|
|
|
print(f"Javascript ํ์ฑ ์๋ฌ : {e}")
|
|
|
|
|
|
else:
|
|
|
full_url = urljoin(post_url, href)
|
|
|
f_name = f_link.get_text(strip=True)
|
|
|
attachments.append(f"{f_name} ({full_url})")
|
|
|
|
|
|
|
|
|
attachments_str = ", ".join(attachments) if attachments else "์ฒจ๋ถํ์ผ ์์"
|
|
|
extracted_str = ", ".join(extracted_text) if extracted_text else "์ฒจ๋ถํ์ผ ์์"
|
|
|
|
|
|
|
|
|
|
|
|
now_time = time.strftime("%Y-%m-%d %H:%M:%S")
|
|
|
|
|
|
output = {
|
|
|
"source_url": post_url,
|
|
|
"title": title,
|
|
|
"date": date,
|
|
|
"category": sub_title,
|
|
|
"text": full_text,
|
|
|
"attachments": attachments_str,
|
|
|
"extracted": extracted_str,
|
|
|
"crawl_date": now_time
|
|
|
}
|
|
|
return output
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
err_msg = f"{post_url}์(๋ฅผ) ํ์ฑํ๋ ๊ณผ์ ์์ ์๋ฌ๊ฐ ๋ฐ์ํ์ต๋๋ค : {e}"
|
|
|
print(err_msg)
|
|
|
traceback.print_exc()
|
|
|
return None
|
|
|
|
|
|
|
|
|
def save_to_db(data: dict[str, str | list[str]], collection: chromadb.Collection) -> None:
|
|
|
"""
|
|
|
๊ฐ ๊ฒ์๊ธ๋ง๋ค parse_post_content๋ฅผ ํตํด ์ป์ด๋ธ ๊ฒฐ๊ณผ๋ฌผ์ ChromaDB์ ์ ์ฅํ๋ ํจ์
|
|
|
input
|
|
|
data : parse_post_content์ output
|
|
|
collection : chromadb ํด๋ผ์ด์ธํธ ๊ฐ์ฒด
|
|
|
output
|
|
|
-
|
|
|
๊ธฐ๋ฅ : collection์ ๊ฐ data๋ฅผ ์ ์ฅ
|
|
|
id๋ ๊ฐ ๊ฒ์๊ธ์ url(๊ฐ ๋ฐ์ดํฐ์ ๊ณ ์ ํ ์ฃผ์์ด๋ฏ๋ก)
|
|
|
|
|
|
์ฐธ๊ณ ) chromadb์ ๋ฐ์ดํฐ๋ฅผ ์ ์ฅํ๋ ค๋ฉด
|
|
|
documents : ๋ฒกํฐํ๋์ด ๊ฒ์์ ๋์์ด ๋๋ ํ
์คํธ -> ๋ณธ๋ฌธ + ์ฒจ๋ถํ์ผ
|
|
|
metadatas : ๊ฒ์ ๊ฒฐ๊ณผ์ ํจ๊ป ๋ฐํ๋๊ฑฐ๋, ํํฐ๋ง์ ์ฌ์ฉํ ์ ๋ณด -> ์ ๋ชฉ, ๋ ์ง ๋ฑ
|
|
|
ids : ๊ฐ ๋ฐ์ดํฐ์ ๊ณ ์ ํ id
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
content_to_embed = data["text"]
|
|
|
if data.get("extracted"):
|
|
|
content_to_embed += f"\n\n[์ฒจ๋ถํ์ผ ๋ด์ฉ]\n{data["extracted"]}"
|
|
|
|
|
|
metadata_dict = {
|
|
|
"source_url": data["source_url"],
|
|
|
"title": data["title"],
|
|
|
"date": data["date"],
|
|
|
"category": data["category"],
|
|
|
"attachments": data["attachments"] if data["attachments"] else "",
|
|
|
"crawl_date": data["crawl_date"]
|
|
|
}
|
|
|
|
|
|
|
|
|
collection.upsert(
|
|
|
ids = [data["source_url"]],
|
|
|
documents = [content_to_embed],
|
|
|
metadatas = [metadata_dict]
|
|
|
)
|
|
|
|
|
|
print(f"๋ค์ ๊ฒ์๊ธ์ด ์ ์ฅ๋์์ต๋๋ค. : {data["title"]}")
|
|
|
|
|
|
|
|
|
def crawl_seoultech_notice(download_dir: str, base_url: str, num_pages: int, collection: chromadb.Collection) -> None:
    """
    Crawl `num_pages` board pages and persist every post into the collection.

    input
        download_dir : folder where post attachments are downloaded (absolute path)
        base_url : base URL of the notice board
        num_pages : number of board pages to crawl
            (annotation fixed: this is an int -- it is used with range()
            and multiplied numerically below)
        collection : ChromaDB Collection object that receives the data

    output
        -
    """
    t0 = time.time()
    print(f"{base_url} ๋ก๋ถํฐ ๋ฐ์ดํฐ ํฌ๋กค๋ง์ ์์ํฉ๋๋ค. ")
    print(f"์์ ์์ ์๊ฐ : ์ฝ {num_pages * 20} ~ {num_pages * 30}์ด")

    for i in range(num_pages):
        # Board pages are 1-indexed via the ?page= query parameter.
        current_url = f"{base_url}?page={i+1}"
        print(f"=== {i+1} ํ์ด์ง ํฌ๋กค๋ง ์์ : {current_url} ===")

        post_urls = get_post_urls(current_url)

        for post in post_urls:
            post_data = parse_post_content(post, download_dir)

            if post_data:
                try:
                    save_to_db(post_data, collection)
                except Exception as e:
                    # One failed save should not abort the whole crawl.
                    err_msg = f"๊ฒ์๋ฌผ์ ๋ฐ์ดํฐ๋ฅผ DB์ ์ ์ฅํ๋ ๊ณผ์ ์์ ์๋ฌ๊ฐ ๋ฐ์ํ์์ต๋๋ค : {e}"
                    print(err_msg)
            else:
                print(f"ํ์ฑ ์คํจ๋ก ๋ฐ์ดํฐ ์ ์ฅ์ ๊ฑด๋๋๋๋ค : {post}")

        t1 = time.time()
        print(f"=== {i+1} ํ์ด์ง ์์ง ์๋ฃ (์์ ์๊ฐ : {t1 - t0:.2f}์ด)===")
|
|
|
|
|
|
|
|
|
def main(args):
    """
    Entry point: resolve paths, build/load the ChromaDB collection, then
    crawl the notice board.

    input
        args : argparse.Namespace with base_dir, download_dir, db_dir,
            collection_name, embedf_name, base_url and num_page
    """

    abs_download_path = os.path.join(args.base_dir, args.download_dir)
    abs_db_path = os.path.join(args.base_dir, args.db_dir)

    # Bug fix: args.embedf_name was parsed on the CLI but never forwarded,
    # so the chosen embedding model was silently ignored.
    collection = make_db(
        abs_download_path,
        abs_db_path,
        args.collection_name,
        embedf_name=args.embedf_name,
    )

    crawl_seoultech_notice(abs_download_path, args.base_url, args.num_page, collection)
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # CLI configuration for the Seoultech notice crawler.
    parser = argparse.ArgumentParser()
    # Board URL of the notice page to crawl.
    parser.add_argument("--base_url", type = str, default = "https://www.seoultech.ac.kr/service/info/notice")
    # Relative folders below are resolved against the script's directory.
    parser.add_argument("--base_dir", type = str, default = str(Path(__file__).resolve().parent))
    parser.add_argument("--download_dir", type = str, default = "seoultech_data_download")
    parser.add_argument("--db_dir", type = str, default = "seoultech_data_db")
    # NOTE(review): type=dict is broken for CLI use -- argparse would call
    # dict("<value>") on a user-supplied string and raise. Only the default
    # value works, and this option is not consumed anywhere visible in this
    # file; confirm whether it should be removed or passed to requests.get().
    parser.add_argument("--header", type = dict, default = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"})
    # Number of board pages to crawl.
    parser.add_argument("--num_page", type = int, default = 1)
    parser.add_argument("--collection_name", type = str, default = "seoultech_notices")
    # NOTE(review): verify that main() actually forwards this to make_db();
    # otherwise the default embedding function is always used.
    parser.add_argument("--embedf_name", type = str, default = "BAAI/bge-m3")

    args = parser.parse_args()

    main(args)