#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Political party press release crawler - main entry point.

Supported parties: Democratic Party of Korea, People Power Party, Rebuilding Korea Party, Reform Party, Basic Income Party, Progressive Party

Usage:
    python main.py                                  # incremental update for all parties
    python main.py --party minjoo                   # Democratic Party of Korea only
    python main.py --party ppp                      # People Power Party only
    python main.py --party rebuilding               # Rebuilding Korea Party only
    python main.py --party reform                   # Reform Party only
    python main.py --party basic_income             # Basic Income Party only
    python main.py --party jinbo                    # Progressive Party only
    python main.py --start-date 2024-01-01          # specify a date range
    python main.py --party ppp --start-date 2024-01-01 --end-date 2024-06-30
"""
import asyncio
import argparse
import logging
from datetime import datetime
from minjoo_crawler_async import MinjooAsyncCrawler
from ppp_crawler_async import PPPAsyncCrawler
from rebuilding_crawler_async import RebuildingAsyncCrawler
from reform_crawler_async import ReformAsyncCrawler
from basic_income_crawler_async import BasicIncomeAsyncCrawler
from jinbo_crawler_async import JinboAsyncCrawler
# Configure the root logger once at import time: INFO and above go to both
# a UTF-8 encoded file named main.log and a console stream handler.
logging.basicConfig(
    handlers=[
        logging.FileHandler('main.log', encoding='utf-8'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s [%(levelname)s] %(message)s',
    level=logging.INFO,
)

# Logger named after this module.
logger = logging.getLogger(__name__)
# Maps each value accepted by --party to the label shown in log output.
# The final 'all' entry is a pseudo-party meaning "every party below".
# NOTE(review): the label text looks mis-encoded in this copy of the file;
# confirm the file is stored and read as UTF-8.
PARTY_LABELS = {
    'minjoo': '๋๋ถ์ด๋ฏผ์ฃผ๋น',
    'ppp': '๊ตญ๋ฏผ์ํ',
    'rebuilding': '์กฐ๊ตญํ์ ๋น',
    'reform': '๊ฐํ์ ๋น',
    'basic_income': '๊ธฐ๋ณธ์๋๋น',
    'jinbo': '์ง๋ณด๋น',
    'all': '์ ์ฒด (6๊ฐ ์ ๋น)',
}

# Concrete party codes in declaration order: every key except 'all'.
ALL_PARTIES = [code for code in PARTY_LABELS if code != 'all']
def parse_args(argv=None):
    """Parse the crawler's command-line options.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, so existing zero-argument
            calls behave as before.

    Returns:
        argparse.Namespace with ``party`` (one of the PARTY_LABELS keys,
        default ``'all'``) and ``start_date`` / ``end_date`` (the date
        strings exactly as typed, or ``None`` when omitted).

    Raises:
        SystemExit: raised by argparse after printing a usage error when the
            party code is unknown or a date is not in YYYY-MM-DD form.
    """

    def _iso_date(text):
        # Reject malformed dates at the CLI boundary, but hand back the
        # original string so downstream code still receives a str.
        try:
            datetime.strptime(text, '%Y-%m-%d')
        except ValueError:
            raise argparse.ArgumentTypeError(
                f'invalid date {text!r}; expected YYYY-MM-DD'
            ) from None
        return text

    # NOTE(review): the user-facing text below looks mis-encoded in this copy
    # of the file; confirm the file is stored and read as UTF-8.
    parser = argparse.ArgumentParser(
        description='์ ๋น ๋ณด๋์๋ฃ ํฌ๋กค๋ฌ',
        # Raw formatter keeps the explicit "\n" line breaks in the help text.
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        '--party',
        choices=list(PARTY_LABELS),
        default='all',
        help=(
            'ํฌ๋กค๋งํ ์ ๋น ์ ํ (๊ธฐ๋ณธ๊ฐ: all)\n'
            ' minjoo : ๋๋ถ์ด๋ฏผ์ฃผ๋น\n'
            ' ppp : ๊ตญ๋ฏผ์ํ\n'
            ' rebuilding : ์กฐ๊ตญํ์ ๋น\n'
            ' reform : ๊ฐํ์ ๋น\n'
            ' basic_income : ๊ธฐ๋ณธ์๋๋น\n'
            ' jinbo : ์ง๋ณด๋น\n'
            ' all : ์ ์ฒด ๋์ ํฌ๋กค๋ง'
        ),
    )
    parser.add_argument(
        '--start-date',
        metavar='YYYY-MM-DD',
        type=_iso_date,
        default=None,
        help=(
            '์์ง ์์ ๋ ์ง (์: 2024-01-01)\n'
            '๋ฏธ์๋ ฅ ์ ๋ง์ง๋ง ํฌ๋กค๋ง ์ดํ๋ถํฐ (์ฆ๋ถ ์๋ฐ์ดํธ)'
        ),
    )
    parser.add_argument(
        '--end-date',
        metavar='YYYY-MM-DD',
        type=_iso_date,
        default=None,
        help=(
            '์์ง ์ข๋ฃ ๋ ์ง (์: 2024-12-31)\n'
            '๋ฏธ์๋ ฅ ์ ์ค๋ ๋ ์ง'
        ),
    )
    return parser.parse_args(argv)
def get_crawler(party: str):
    """Return a new crawler instance for the given party code.

    Raises:
        KeyError: if ``party`` is not one of the six codes listed below.
    """
    registry = {
        'minjoo': MinjooAsyncCrawler,
        'ppp': PPPAsyncCrawler,
        'rebuilding': RebuildingAsyncCrawler,
        'reform': ReformAsyncCrawler,
        'basic_income': BasicIncomeAsyncCrawler,
        'jinbo': JinboAsyncCrawler,
    }
    crawler_cls = registry[party]
    return crawler_cls()
async def run_party(party: str, start_date=None, end_date=None):
    """Run the crawl for a single party.

    With neither date given, the crawler's ``run_incremental()`` is awaited.
    Otherwise ``collect_all(start_date, end_date)`` is awaited and, when the
    returned frame is not empty, it is handed to the crawler's
    ``save_local`` and then ``upload_to_huggingface``.

    Args:
        party: Party code understood by ``get_crawler``.
        start_date: Lower bound passed through to ``collect_all``, or None.
        end_date: Upper bound passed through to ``collect_all``, or None.
    """
    crawler = get_crawler(party)

    # No explicit range requested: fall back to the incremental run.
    if not (start_date or end_date):
        await crawler.run_incremental()
        return

    frame = await crawler.collect_all(start_date, end_date)
    if frame.empty:
        # Nothing collected, so there is nothing to save or upload.
        return

    crawler.save_local(frame)
    crawler.upload_to_huggingface(frame)
async def main():
    """Run the crawler(s) selected on the command line and log a summary.

    A single party is awaited directly, so any exception it raises
    propagates to the caller. With ``--party all`` the crawlers run
    concurrently and each outcome is logged on its own line, so one failing
    party does not stop the others.
    """
    args = parse_args()
    start_time = datetime.now()
    target_parties = ALL_PARTIES if args.party == 'all' else [args.party]

    banner = "=" * 60
    logger.info(banner)
    logger.info("์ ๋น ๋ณด๋์๋ฃ ํฌ๋กค๋ฌ ์์")
    logger.info(f"๋์ ์ ๋น : {PARTY_LABELS[args.party]}")
    logger.info(f"์์ง ๊ธฐ๊ฐ : {args.start_date or '์ฆ๋ถ ์๋ฐ์ดํธ'} ~ {args.end_date or '์ค๋'}")
    logger.info(banner)

    if len(target_parties) == 1:
        await run_party(target_parties[0], args.start_date, args.end_date)
    else:
        # return_exceptions=True collects failures as results instead of
        # raising on the first one.
        results = await asyncio.gather(
            *(run_party(p, args.start_date, args.end_date) for p in target_parties),
            return_exceptions=True,
        )
        for party, result in zip(target_parties, results):
            # gather() can also hand back BaseException subclasses such as
            # asyncio.CancelledError; checking only Exception would report
            # those as a successful crawl.
            if isinstance(result, BaseException):
                logger.error(f"{PARTY_LABELS[party]} ํฌ๋กค๋ง ์คํจ: {result}")
            else:
                logger.info(f"{PARTY_LABELS[party]} ํฌ๋กค๋ง ์๋ฃ")

    duration = (datetime.now() - start_time).total_seconds()
    logger.info(banner)
    logger.info(f"์ ์ฒด ์๋ฃ! ์์ ์๊ฐ: {duration:.1f}์ด ({duration / 60:.1f}๋ถ)")
    logger.info(banner)
# Start the event loop only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())
|