# RAG_HF / core/cronjob.py (commit d2100e7)
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger
from datetime import datetime
import urllib.parse
import feedparser
import asyncio
import pytz
from time import mktime
# ์ถ”๊ฐ€๋œ ์ž„ํฌํŠธ (DB ๋ฐ ๋ชจ๋ธ, ์ž„๋ฒ ๋”ฉ)
from core.database import SessionLocal
from core.models import NewsEmbedding
from core.dependencies import get_embedding_model
import os
import requests
from newspaper import Article
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
from apscheduler.triggers.interval import IntervalTrigger
# Initialize Gemini
# Module-level LLM client used by fetch_filter_and_save_news to confirm that a
# scraped article is genuinely about gold and to produce a short summary.
# Low temperature (0.1) keeps classification/summary output near-deterministic.
# NOTE(review): if GOOGLE_API_KEY is unset, os.getenv returns None and every
# later llm.invoke call fails at runtime — consider failing fast here instead.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=0.1,
    google_api_key=os.getenv("GOOGLE_API_KEY")
)
# (is_real_gold_news is kept functionally identical to the previous revision)
def is_real_gold_news(title):
    """Keyword heuristic: decide whether a headline is genuine gold-market news.

    The blacklist (Korean words that merely contain the syllable for "gold",
    plus unrelated English "golden ..." phrases) is checked first and always
    wins. Otherwise the headline must contain at least one market/finance
    whitelist term to pass. Matching is case-insensitive substring search.
    """
    lowered = title.lower()
    blocked_terms = ("๊ธˆ์š”์ผ", "๋ณด์กฐ๊ธˆ", "์žฅํ•™๊ธˆ", "์ง€์›๊ธˆ", "๋ฒŒ๊ธˆ", "์ถœ๊ธˆ", "์ž…๊ธˆ", "๊ธˆ์ง€", "์†ก๊ธˆ", "๋Œ€์ถœ๊ธˆ", "๋ชจ๊ธˆ", "๊ธฐ๊ธˆ", "๊ณผ์ง•๊ธˆ", "golden retriever", "golden state", "golden globe", "golden rule", "marigold")
    allowed_terms = ("์˜จ์Šค", "๊ณจ๋“œ๋ฐ”", "์‹œ์„ธ", "์ˆœ๊ธˆ", "๊ฑฐ๋ž˜์†Œ", "๋‹ฌ๋Ÿฌ", "ํˆฌ์ž", "๊ธˆ๊ฐ’", "ํ•œ๊ตญ๊ธˆ๊ฑฐ๋ž˜์†Œ", "krx", "๊ธˆํŽ€๋“œ", "ounce", "bullion", "price", "market", "fed", "inflation", "xau", "spot", "invest")
    # Blacklist has priority: any hit rejects the headline outright.
    for term in blocked_terms:
        if term in lowered:
            return False
    # Accept only if some whitelist term appears; otherwise reject by default.
    return any(term in lowered for term in allowed_terms)
# 2. Upgraded into a function that fetches the RSS feed and saves results to the DB
def fetch_filter_and_save_news(keyword, hl, gl, ceid, db_session, max_news=3):
    """Search Google News RSS for *keyword*, filter for genuine gold news,
    summarize via Gemini, embed, and stage rows on *db_session*.

    Args:
        keyword: search term; URL-quoted into the RSS query string.
        hl, gl, ceid: Google News locale parameters (language, country, edition).
        db_session: open SQLAlchemy session; new rows are added and committed here.
        max_news: stop once this many new articles have been staged.

    Returns:
        List of (title, link) tuples for the articles staged in this call.

    NOTE(review): the bare ``continue`` after the debug prints below (kept by
    the author to block Gemini API calls during testing) makes everything from
    the prompt construction through the DB write unreachable, so this function
    currently always returns an empty list.
    """
    url_keyword = urllib.parse.quote(keyword)
    rss_url = f"https://news.google.com/rss/search?q={url_keyword}&hl={hl}&gl={gl}&ceid={ceid}"
    feed = feedparser.parse(rss_url)
    valid_news = []
    # Embedding-model instance shared via dependencies.py
    embedder = get_embedding_model()
    for entry in feed.entries:
        if len(valid_news) >= max_news:
            break
        if is_real_gold_news(entry.title):
            # [Core] Duplicate check: is an article with this exact title already in the DB?
            exists = db_session.query(NewsEmbedding).filter(NewsEmbedding.title == entry.title).first()
            if exists:
                continue  # already stored — skip to the next entry
            # [Core] Publication date: convert the feed's published_parsed to an aware UTC datetime
            if hasattr(entry, 'published_parsed') and entry.published_parsed:
                pub_date = datetime.fromtimestamp(mktime(entry.published_parsed), pytz.UTC)
            else:
                pub_date = datetime.now(pytz.UTC)
            # Confirm real gold news and summarize it via Gemini
            try:
                # [Improvement] Stronger browser-impersonation headers
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
                    'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
                    'Cache-Control': 'no-cache',
                    'Pragma': 'no-cache',
                }
                # 1. Follow redirects to resolve the final article URL.
                response = requests.get(entry.link, timeout=15, headers=headers, allow_redirects=True)
                real_url = response.url
                # 2. Apply newspaper3k configuration (local import keeps top-of-file imports unchanged).
                from newspaper import Config
                config = Config()
                config.browser_user_agent = headers['User-Agent']
                config.request_timeout = 15
                article = Article(real_url, config=config)
                article.download()
                article.parse()
                article_text = article.text.strip()
                # [Validation] Catch empty bodies or pages that only scraped "Google News" boilerplate.
                if len(article_text) < 100 or "Google News" in article_text[:100]:
                    # If newspaper failed, retry with BeautifulSoup as a last resort.
                    # NOTE(review): BeautifulSoup is never imported in this file, so
                    # reaching this branch raises NameError — needs
                    # 'from bs4 import BeautifulSoup' at the top of the file.
                    soup = BeautifulSoup(response.text, 'html.parser')
                    # Pull text from the <p> tags news sites typically use for body copy.
                    article_text = ' '.join([p.text for p in soup.find_all('p') if len(p.text) > 20])
                    if len(article_text) < 100:
                        print(f"โš ๏ธ ๋ณธ๋ฌธ ์ถ”์ถœ ์‹คํŒจ (๋‚ด์šฉ ๋ถ€์กฑ): {entry.title}")
                        continue
                # Debug output
                print(f"\n[๊ธฐ์‚ฌ ์ œ๋ชฉ]: {entry.title}")
                print(f"[์‹ค์ œ ์ฃผ์†Œ]: {real_url}")
                print(f"[๋ณธ๋ฌธ ๋ฏธ๋ฆฌ๋ณด๊ธฐ]:\n{article_text[:300]}...\n")
                print("-" * 50)
                # Kept (per the author) to block API calls until testing is complete.
                # NOTE(review): this unconditional continue makes the Gemini call,
                # the embeddings, and the DB write below unreachable — remove it to
                # enable the full pipeline.
                continue
                prompt = f"""
๋‹ค์Œ์€ ๋‰ด์Šค ๊ธฐ์‚ฌ ์›๋ฌธ์ž…๋‹ˆ๋‹ค:
{article_text[:3000]}
์ด ๋‰ด์Šค๊ฐ€ ๊ธˆ(Gold, ๊ท€๊ธˆ์†/ํˆฌ์ž์ž์‚ฐ/๊ธˆ๊ฐ’)๊ณผ ๊ด€๋ จ๋œ ์‹ค์ œ ๋‰ด์Šค์ธ์ง€ ํŒ๋ณ„ํ•˜๊ณ ,
๋งž๋‹ค๋ฉด ๊ธฐ์‚ฌ์˜ ํ•ต์‹ฌ ๋‚ด์šฉ์„ 1~2์ค„๋กœ ์š”์•ฝํ•ด์ฃผ์„ธ์š”.
๋งŒ์•ฝ ๊ธˆ๊ณผ ์ „ํ˜€ ๊ด€๋ จ์ด ์—†๋Š” ๋‰ด์Šค๋ผ๋ฉด (์˜ˆ: ๊ธˆ์š”์ผ, ๋ฒŒ๊ธˆ, ์žฅํ•™๊ธˆ, ์ถœ๊ธˆ, ์†ก๊ธˆ, ๋ณด์กฐ๊ธˆ, ๋‹จ์ˆœํžˆ '๊ธˆ'์ด ํฌํ•จ๋œ ๋‹จ์–ด๋งŒ ์žˆ๋Š” ๊ธฐ์‚ฌ ๋“ฑ)
'NOT_GOLD_NEWS' ๋ผ๊ณ ๋งŒ ์ •ํ™•ํžˆ ๋‹ต๋ณ€ํ•˜์„ธ์š”.
์š”์•ฝ ๊ฒฐ๊ณผ:
"""
                # NOTE(review): 'response' here shadows the requests response above.
                response = llm.invoke([HumanMessage(content=prompt)])
                summary = response.content.strip()
                if summary == "NOT_GOLD_NEWS":
                    print(f"โŒ [Gemini ํ•„ํ„ฐ๋ง] ๊ฐ€์งœ ๊ธˆ ๋‰ด์Šค ์Šคํ‚ต: {entry.title}")
                    continue
                content_text = summary
                print(f"โœ… [Gemini ์š”์•ฝ ์„ฑ๊ณต] ์š”์•ฝ๋ฌธ: {content_text}")
            except Exception as e:
                print(f"โš ๏ธ ๊ธฐ์‚ฌ ๋ณธ๋ฌธ ์ถ”์ถœ ๋˜๋Š” ์š”์•ฝ ์‹คํŒจ ({entry.title}): {e}")
                # On failure, ignore this entry and move on to the next article.
                continue
            # Generate embeddings (embed_query returns List[float]).
            # The content column stores the Gemini summary, not the full body.
            title_emb = embedder.embed_query(entry.title)
            content_emb = embedder.embed_query(content_text)
            # Build the ORM row and stage it on the session.
            new_article = NewsEmbedding(
                title=entry.title,
                title_embedding=title_emb,
                content=content_text,
                content_embedding=content_emb,
                created_at=pub_date
            )
            db_session.add(new_article)
            valid_news.append((entry.title, entry.link))
            print(f"โœ… DB ์ถ”๊ฐ€ ์˜ˆ์•ฝ: {entry.title}")
    # Finally flush the staged changes to the DB (commit only if something was added).
    if valid_news:
        db_session.commit()
    return valid_news
# 3. Main search entry point (manages its own DB session)
async def search_gold_news():
    """Scheduled job: crawl the domestic and international Google News feeds
    for gold-related articles and persist any new ones.

    Opens a dedicated SQLAlchemy session so each run is self-contained; rolls
    back on any error and always closes the session.

    Fix: fetch_filter_and_save_news does blocking network and DB I/O (HTTP
    requests with 15s timeouts, synchronous SQLAlchemy queries). Running it
    directly in this coroutine would freeze the AsyncIOScheduler event loop
    for the whole crawl, so it is dispatched to a worker thread with
    asyncio.to_thread instead.
    """
    print(f"\n=== ๐ŸŒŸ [{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] ์ž๋™ ๊ฒ€์ƒ‰ ๋ฐ DB ์ €์žฅ ์‹œ์ž‘ ===")
    # Open the DB session for this run.
    db = SessionLocal()
    try:
        print("\n[๊ตญ๋‚ด ๋‰ด์Šค ๊ฒ€์ƒ‰ ์ค‘...]")
        # Domestic (Korean) feed — run the blocking crawl off the event loop.
        kr_news = await asyncio.to_thread(
            fetch_filter_and_save_news, "๊ธˆ", "ko", "KR", "KR:ko", db, max_news=3
        )
        if not kr_news:
            print("์ƒˆ๋กœ์šด ๊ตญ๋‚ด ๋‰ด์Šค๊ฐ€ ์—†๊ฑฐ๋‚˜ ๋ชจ๋‘ ์ด๋ฏธ ์ €์žฅ๋œ ๊ธฐ์‚ฌ์•ผ.")
        print("\n[ํ•ด์™ธ ๋‰ด์Šค ๊ฒ€์ƒ‰ ์ค‘...]")
        # International (English) feed.
        en_news = await asyncio.to_thread(
            fetch_filter_and_save_news, "gold", "en", "US", "US:en", db, max_news=3
        )
        if not en_news:
            print("์ƒˆ๋กœ์šด ํ•ด์™ธ ๋‰ด์Šค๊ฐ€ ์—†๊ฑฐ๋‚˜ ๋ชจ๋‘ ์ด๋ฏธ ์ €์žฅ๋œ ๊ธฐ์‚ฌ์•ผ.")
    except Exception as e:
        print(f"โŒ DB ์ €์žฅ ์ค‘ ์—๋Ÿฌ ๋ฐœ์ƒ: {e}")
        db.rollback()
    finally:
        # Always close the DB session, success or failure.
        db.close()
        print("==========================================\n")
# Scheduler setup (behavior unchanged from the previous revision)
def create_scheduler():
    """Build the APScheduler instance that periodically runs search_gold_news.

    Currently configured with a 60-second interval (testing cadence); the
    daily 18:00 KST cron schedule remains available in the commented line.
    """
    sched = AsyncIOScheduler(timezone="Asia/Seoul")
    # Production schedule (disabled):
    #sched.add_job(search_gold_news, CronTrigger(hour=18, minute=0))
    sched.add_job(search_gold_news, IntervalTrigger(seconds=60))
    return sched


# Module-level scheduler instance; expected to be started by the app elsewhere.
news_scheduler = create_scheduler()