#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""

์ •๋‹น ๋ณด๋„์ž๋ฃŒ ํฌ๋กค๋Ÿฌ - ๋ฉ”์ธ ์ง„์ž…์ 

์ง€์› ์ •๋‹น: ๋”๋ถˆ์–ด๋ฏผ์ฃผ๋‹น, ๊ตญ๋ฏผ์˜ํž˜, ์กฐ๊ตญํ˜์‹ ๋‹น, ๊ฐœํ˜์‹ ๋‹น, ๊ธฐ๋ณธ์†Œ๋“๋‹น, ์ง„๋ณด๋‹น



์‚ฌ์šฉ๋ฒ•:

  python main.py                                    # ์ „์ฒด ์ •๋‹น ์ฆ๋ถ„ ์—…๋ฐ์ดํŠธ

  python main.py --party minjoo                     # ๋”๋ถˆ์–ด๋ฏผ์ฃผ๋‹น๋งŒ

  python main.py --party ppp                        # ๊ตญ๋ฏผ์˜ํž˜๋งŒ

  python main.py --party rebuilding                 # ์กฐ๊ตญํ˜์‹ ๋‹น๋งŒ

  python main.py --party reform                     # ๊ฐœํ˜์‹ ๋‹น๋งŒ

  python main.py --party basic_income               # ๊ธฐ๋ณธ์†Œ๋“๋‹น๋งŒ

  python main.py --party jinbo                      # ์ง„๋ณด๋‹น๋งŒ

  python main.py --start-date 2024-01-01            # ๋‚ ์งœ ๋ฒ”์œ„ ์ง€์ •

  python main.py --party ppp --start-date 2024-01-01 --end-date 2024-06-30

"""

import asyncio
import argparse
import logging
from datetime import datetime

from minjoo_crawler_async import MinjooAsyncCrawler
from ppp_crawler_async import PPPAsyncCrawler
from rebuilding_crawler_async import RebuildingAsyncCrawler
from reform_crawler_async import ReformAsyncCrawler
from basic_income_crawler_async import BasicIncomeAsyncCrawler
from jinbo_crawler_async import JinboAsyncCrawler

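# Log to a UTF-8 file (main.log) and mirror everything to the console.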
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler('main.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

PARTY_LABELS = {
    'minjoo':       'Democratic Party of Korea',
    'ppp':          'People Power Party',
    'rebuilding':   'Rebuilding Korea Party',
    'reform':       'Reform Party',
    'basic_income': 'Basic Income Party',
    'jinbo':        'Progressive Party',
    'all':          'all (6 parties)',
}

ALL_PARTIES = ['minjoo', 'ppp', 'rebuilding', 'reform', 'basic_income', 'jinbo']


def parse_args():
    parser = argparse.ArgumentParser(
        description='Party press-release crawler',
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument(
        '--party',
        choices=list(PARTY_LABELS.keys()),
        default='all',
        help=(
            'Party to crawl (default: all)\n'
            '  minjoo       : Democratic Party of Korea\n'
            '  ppp          : People Power Party\n'
            '  rebuilding   : Rebuilding Korea Party\n'
            '  reform       : Reform Party\n'
            '  basic_income : Basic Income Party\n'
            '  jinbo        : Progressive Party\n'
            '  all          : crawl all parties concurrently'
        )
    )
    parser.add_argument(
        '--start-date',
        metavar='YYYY-MM-DD',
        default=None,
        help='Collection start date (e.g. 2024-01-01)\nIf omitted, resume from the last crawl (incremental update)'
    )
    parser.add_argument(
        '--end-date',
        metavar='YYYY-MM-DD',
        default=None,
        help='Collection end date (e.g. 2024-12-31)\nIf omitted, defaults to today'
    )
    return parser.parse_args()
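

# A minimal sketch, assuming malformed dates should be rejected at parse time:
# argparse does not validate the YYYY-MM-DD strings above on its own, so a
# `type=` callable like this one could be passed to the two date arguments.
# Illustrative only; it is deliberately not wired in, to leave the original
# CLI behavior unchanged.
def _parse_date_arg(value: str) -> str:
    """Return `value` unchanged if it is a valid YYYY-MM-DD string."""
    try:
        datetime.strptime(value, '%Y-%m-%d')
    except ValueError:
        raise argparse.ArgumentTypeError(f'invalid date (expected YYYY-MM-DD): {value!r}')
    return value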


def get_crawler(party: str):
    """์ •๋‹น ์ฝ”๋“œ์— ๋งž๋Š” ํฌ๋กค๋Ÿฌ ์ธ์Šคํ„ด์Šค ๋ฐ˜ํ™˜"""
    return {
        'minjoo':       MinjooAsyncCrawler,
        'ppp':          PPPAsyncCrawler,
        'rebuilding':   RebuildingAsyncCrawler,
        'reform':       ReformAsyncCrawler,
        'basic_income': BasicIncomeAsyncCrawler,
        'jinbo':        JinboAsyncCrawler,
    }[party]()
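

# A documentation-only sketch (assumption): judging from the calls in
# run_party below, the six crawler classes imported above share this
# duck-typed interface. The Protocol is illustrative and not enforced
# anywhere at runtime.
from typing import Protocol


class AsyncPressCrawler(Protocol):
    async def collect_all(self, start_date, end_date): ...  # -> pandas DataFrame
    def save_local(self, df) -> None: ...
    def upload_to_huggingface(self, df) -> None: ...
    async def run_incremental(self) -> None: ...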


async def run_party(party: str, start_date=None, end_date=None):
    """๋‹จ์ผ ์ •๋‹น ํฌ๋กค๋ง ์‹คํ–‰"""
    crawler = get_crawler(party)
    if start_date or end_date:
        df = await crawler.collect_all(start_date, end_date)
        if not df.empty:
            crawler.save_local(df)
            crawler.upload_to_huggingface(df)
    else:
        await crawler.run_incremental()
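
# Illustrative usage: a one-off backfill for a single party could be run as
#   asyncio.run(run_party('ppp', '2024-01-01', '2024-06-30'))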


async def main():
    args = parse_args()
    start_time = datetime.now()

    target_parties = ALL_PARTIES if args.party == 'all' else [args.party]

    logger.info("=" * 60)
    logger.info("์ •๋‹น ๋ณด๋„์ž๋ฃŒ ํฌ๋กค๋Ÿฌ ์‹œ์ž‘")
    logger.info(f"๋Œ€์ƒ ์ •๋‹น : {PARTY_LABELS[args.party]}")
    logger.info(f"์ˆ˜์ง‘ ๊ธฐ๊ฐ„ : {args.start_date or '์ฆ๋ถ„ ์—…๋ฐ์ดํŠธ'} ~ {args.end_date or '์˜ค๋Š˜'}")
    logger.info("=" * 60)

    if len(target_parties) == 1:
        await run_party(target_parties[0], args.start_date, args.end_date)
    else:
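        # return_exceptions=True lets a failing party surface as an Exception
        # object in `results` instead of cancelling the other crawls.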
        results = await asyncio.gather(
            *[run_party(p, args.start_date, args.end_date) for p in target_parties],
            return_exceptions=True
        )
        for party, result in zip(target_parties, results):
            if isinstance(result, Exception):
                logger.error(f"{PARTY_LABELS[party]} crawl failed: {result}")
            else:
                logger.info(f"{PARTY_LABELS[party]} ํฌ๋กค๋ง ์™„๋ฃŒ")

    duration = (datetime.now() - start_time).total_seconds()
    logger.info("=" * 60)
    logger.info(f"์ „์ฒด ์™„๋ฃŒ! ์†Œ์š” ์‹œ๊ฐ„: {duration:.1f}์ดˆ ({duration / 60:.1f}๋ถ„)")
    logger.info("=" * 60)


if __name__ == "__main__":
    asyncio.run(main())