| from apscheduler.schedulers.asyncio import AsyncIOScheduler |
| from apscheduler.triggers.cron import CronTrigger |
| from datetime import datetime |
| import urllib.parse |
| import feedparser |
| import asyncio |
| import pytz |
| from time import mktime |
|
|
| |
| from core.database import SessionLocal |
| from core.models import NewsEmbedding |
| from core.dependencies import get_embedding_model |
| import os |
| import requests |
| from newspaper import Article |
| from langchain_google_genai import ChatGoogleGenerativeAI |
| from langchain_core.messages import HumanMessage |
| from apscheduler.triggers.interval import IntervalTrigger |
|
|
| |
# Module-level Gemini client shared by all fetch runs; used to (a) reject
# false-positive "gold" articles and (b) produce the 1-2 line summary stored
# in the DB. Low temperature keeps the NOT_GOLD_NEWS classification stable.
# NOTE(review): if GOOGLE_API_KEY is unset, os.getenv returns None and the
# failure surfaces later at invoke() time — confirm this is acceptable.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=0.1,
    google_api_key=os.getenv("GOOGLE_API_KEY")
)
|
|
| |
def is_real_gold_news(title):
    """Cheap keyword pre-filter: does *title* look like gold-market news?

    A blacklist hit (unrelated '-금' compounds, 'golden ...' phrases,
    'marigold') rejects the title outright and takes precedence over the
    whitelist. Otherwise a whitelist hit (market / price / investment
    vocabulary) accepts it. Titles matching neither list are rejected,
    leaving the final call to the LLM stage downstream.
    """
    haystack = title.lower()
    black_list = ["๊ธ์์ผ", "๋ณด์กฐ๊ธ", "์ฅํ๊ธ", "์ง์๊ธ", "๋ฒ๊ธ", "์ถ๊ธ", "์๊ธ", "๊ธ์ง", "์ก๊ธ", "๋์ถ๊ธ", "๋ชจ๊ธ", "๊ธฐ๊ธ", "๊ณผ์ง๊ธ", "golden retriever", "golden state", "golden globe", "golden rule", "marigold"]
    white_list = ["์จ์ค", "๊ณจ๋๋ฐ", "์์ธ", "์๊ธ", "๊ฑฐ๋์", "๋ฌ๋ฌ", "ํฌ์", "๊ธ๊ฐ", "ํ๊ตญ๊ธ๊ฑฐ๋์", "krx", "๊ธํ๋", "ounce", "bullion", "price", "market", "fed", "inflation", "xau", "spot", "invest"]

    # Blacklist first: a single bad token disqualifies the title even if a
    # whitelist token is also present.
    for bad_word in black_list:
        if bad_word in haystack:
            return False
    return any(good_word in haystack for good_word in white_list)
|
|
| |
def fetch_filter_and_save_news(keyword, hl, gl, ceid, db_session, max_news=3):
    """Fetch Google News RSS hits for *keyword*, keep genuine gold news,
    summarize each with Gemini, and persist title/summary embeddings.

    Args:
        keyword: Search term (e.g. "๊ธ" or "gold"); URL-quoted before use.
        hl, gl, ceid: Google News locale query parameters.
        db_session: SQLAlchemy session used for dedupe lookups and inserts.
        max_news: Stop after this many newly saved articles.

    Returns:
        List of (title, link) tuples for the articles saved in this run
        (empty list when nothing new was found). Commits only when at least
        one article was added; per-article failures are logged and skipped.
    """
    url_keyword = urllib.parse.quote(keyword)
    rss_url = f"https://news.google.com/rss/search?q={url_keyword}&hl={hl}&gl={gl}&ceid={ceid}"

    feed = feedparser.parse(rss_url)
    valid_news = []

    embedder = get_embedding_model()

    # Loop-invariant setup hoisted out of the per-article loop (was rebuilt —
    # and `Config` re-imported — on every iteration).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
    }
    from newspaper import Config
    config = Config()
    config.browser_user_agent = headers['User-Agent']
    config.request_timeout = 15

    for entry in feed.entries:
        if len(valid_news) >= max_news:
            break

        # Cheap keyword pre-filter before any network/LLM cost.
        if not is_real_gold_news(entry.title):
            continue

        # Dedupe by exact title against previously stored articles.
        exists = db_session.query(NewsEmbedding).filter(NewsEmbedding.title == entry.title).first()
        if exists:
            continue

        # Prefer the feed's publish time; fall back to "now". Both branches
        # produce a timezone-aware UTC datetime.
        if hasattr(entry, 'published_parsed') and entry.published_parsed:
            pub_date = datetime.fromtimestamp(mktime(entry.published_parsed), pytz.UTC)
        else:
            pub_date = datetime.now(pytz.UTC)

        try:
            # Follow Google News redirects to reach the publisher's real URL.
            response = requests.get(entry.link, timeout=15, headers=headers, allow_redirects=True)
            real_url = response.url

            article = Article(real_url, config=config)
            article.download()
            article.parse()

            article_text = article.text.strip()

            # newspaper sometimes returns the Google News shell page instead
            # of the article body; fall back to scraping <p> tags from the
            # raw HTML we already downloaded.
            if len(article_text) < 100 or "Google News" in article_text[:100]:
                # BUGFIX: BeautifulSoup was used here without ever being
                # imported, so this fallback always raised NameError. Local
                # import matches the file's existing style (`from newspaper
                # import Config` was also function-local).
                from bs4 import BeautifulSoup
                soup = BeautifulSoup(response.text, 'html.parser')
                article_text = ' '.join([p.text for p in soup.find_all('p') if len(p.text) > 20])

            if len(article_text) < 100:
                print(f"⚠️ 본문 추출 실패 (내용 부족): {entry.title}")
                continue

            # Debug trace of what was extracted.
            print(f"\n[기사 제목]: {entry.title}")
            print(f"[실제 주소]: {real_url}")
            print(f"[본문 미리보기]:\n{article_text[:300]}...\n")
            print("-" * 50)

            # BUGFIX: removed a stray unconditional `continue` that sat here
            # after the debug prints — it skipped the Gemini summarization
            # and DB save on every iteration, making everything below dead
            # code (nothing was ever persisted).

            prompt = f"""
            다음은 뉴스 기사 원문입니다:
            {article_text[:3000]}

            이 뉴스가 금(Gold, 귀금속/투자자산/금값)과 관련된 실제 뉴스인지 판별하고,
            맞다면 기사의 핵심 내용을 1~2줄로 요약해주세요.
            만약 금과 전혀 관련이 없는 뉴스라면 (예: 금요일, 벌금, 장학금, 출금, 송금, 보조금, 단순히 '금'이 포함된 단어만 있는 기사 등)
            'NOT_GOLD_NEWS' 라고만 정확히 답변하세요.

            요약 결과:
            """
            response = llm.invoke([HumanMessage(content=prompt)])
            summary = response.content.strip()

            # Second-stage LLM filter: drop false positives the keyword
            # filter let through.
            if summary == "NOT_GOLD_NEWS":
                print(f"❌ [Gemini 필터링] 가짜 금 뉴스 스킵: {entry.title}")
                continue

            content_text = summary
            print(f"✅ [Gemini 요약 성공] 요약문: {content_text}")

        except Exception as e:
            # Best-effort per article: log and move on so one bad page/LLM
            # call doesn't abort the whole run.
            print(f"⚠️ 기사 본문 추출 또는 요약 실패 ({entry.title}): {e}")
            continue

        # Embed both the title and the Gemini summary for similarity search.
        title_emb = embedder.embed_query(entry.title)
        content_emb = embedder.embed_query(content_text)

        new_article = NewsEmbedding(
            title=entry.title,
            title_embedding=title_emb,
            content=content_text,
            content_embedding=content_emb,
            created_at=pub_date
        )
        db_session.add(new_article)

        valid_news.append((entry.title, entry.link))
        print(f"✅ DB 추가 예약: {entry.title}")

    # Single commit for the whole batch; skipped when nothing new was added.
    if valid_news:
        db_session.commit()

    return valid_news
|
|
| |
async def search_gold_news():
    """Scheduled job: fetch, filter, summarize and store new gold news.

    Runs one Korean-locale pass ("๊ธ") and one US/English pass ("gold"),
    each saving up to 3 new articles, inside a single DB session that is
    always closed so repeated scheduler runs do not leak connections.
    """
    print(f"\n=== ๐ [{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] ์๋ ๊ฒ์ ๋ฐ DB ์ ์ฅ ์์ ===")

    # One session per run; committed inside fetch_filter_and_save_news,
    # rolled back here on any error, closed in `finally` regardless.
    db = SessionLocal()
    try:
        # Korean-locale Google News pass.
        print("\n[๊ตญ๋ด ๋ด์ค ๊ฒ์ ์ค...]")
        kr_news = fetch_filter_and_save_news("๊ธ", "ko", "KR", "KR:ko", db, max_news=3)
        if not kr_news:
            print("์๋ก์ด ๊ตญ๋ด ๋ด์ค๊ฐ ์๊ฑฐ๋ ๋ชจ๋ ์ด๋ฏธ ์ ์ฅ๋ ๊ธฐ์ฌ์ผ.")

        # US/English-locale Google News pass.
        print("\n[ํด์ธ ๋ด์ค ๊ฒ์ ์ค...]")
        en_news = fetch_filter_and_save_news("gold", "en", "US", "US:en", db, max_news=3)
        if not en_news:
            print("์๋ก์ด ํด์ธ ๋ด์ค๊ฐ ์๊ฑฐ๋ ๋ชจ๋ ์ด๋ฏธ ์ ์ฅ๋ ๊ธฐ์ฌ์ผ.")

    except Exception as e:
        # Undo any uncommitted inserts from a partially failed run.
        print(f"โ DB ์ ์ฅ ์ค ์๋ฌ ๋ฐ์: {e}")
        db.rollback()
    finally:
        db.close()

    print("==========================================\n")
|
|
| |
def create_scheduler():
    """Build the AsyncIO scheduler that polls for gold news once a minute."""
    trigger = IntervalTrigger(seconds=60)
    sched = AsyncIOScheduler(timezone="Asia/Seoul")
    sched.add_job(search_gold_news, trigger)
    return sched


# Module-level singleton; the app entry point imports and starts this.
news_scheduler = create_scheduler()