# RAG_HF / core/cronjob.py (commit d2100e7)
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger
from datetime import datetime
import urllib.parse
import feedparser
import asyncio
import pytz
from time import mktime
# ์ถ”๊ฐ€๋œ ์ž„ํฌํŠธ (DB ๋ฐ ๋ชจ๋ธ, ์ž„๋ฒ ๋”ฉ)
from core.database import SessionLocal
from core.models import NewsEmbedding
from core.dependencies import get_embedding_model
import os
import requests
from newspaper import Article
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
from apscheduler.triggers.interval import IntervalTrigger
# Initialize Gemini
# Module-level LLM client used by fetch_filter_and_save_news to confirm that a
# scraped article is genuinely about gold and to produce a short summary.
# Low temperature (0.1) keeps classification/summary output near-deterministic.
# NOTE(review): if GOOGLE_API_KEY is unset, os.getenv returns None and every
# later llm.invoke call fails at runtime — consider failing fast here instead.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=0.1,
    google_api_key=os.getenv("GOOGLE_API_KEY")
)
# (is_real_gold_news is kept functionally identical to the previous revision)
def is_real_gold_news(title):
    """Keyword heuristic: decide whether a headline is genuine gold-market news.

    The blacklist (Korean words that merely contain the syllable for "gold",
    plus unrelated English "golden ..." phrases) is checked first and always
    wins. Otherwise the headline must contain at least one market/finance
    whitelist term to pass. Matching is case-insensitive substring search.
    """
    lowered = title.lower()
    blocked_terms = ("๊ธˆ์š”์ผ", "๋ณด์กฐ๊ธˆ", "์žฅํ•™๊ธˆ", "์ง€์›๊ธˆ", "๋ฒŒ๊ธˆ", "์ถœ๊ธˆ", "์ž…๊ธˆ", "๊ธˆ์ง€", "์†ก๊ธˆ", "๋Œ€์ถœ๊ธˆ", "๋ชจ๊ธˆ", "๊ธฐ๊ธˆ", "๊ณผ์ง•๊ธˆ", "golden retriever", "golden state", "golden globe", "golden rule", "marigold")
    allowed_terms = ("์˜จ์Šค", "๊ณจ๋“œ๋ฐ”", "์‹œ์„ธ", "์ˆœ๊ธˆ", "๊ฑฐ๋ž˜์†Œ", "๋‹ฌ๋Ÿฌ", "ํˆฌ์ž", "๊ธˆ๊ฐ’", "ํ•œ๊ตญ๊ธˆ๊ฑฐ๋ž˜์†Œ", "krx", "๊ธˆํŽ€๋“œ", "ounce", "bullion", "price", "market", "fed", "inflation", "xau", "spot", "invest")
    # Blacklist has priority: any hit rejects the headline outright.
    for term in blocked_terms:
        if term in lowered:
            return False
    # Accept only if some whitelist term appears; otherwise reject by default.
    return any(term in lowered for term in allowed_terms)
# 2. Upgraded into a function that fetches the RSS feed and saves results to the DB
def fetch_filter_and_save_news(keyword, hl, gl, ceid, db_session, max_news=3):
    """Search Google News RSS for *keyword*, filter for genuine gold news,
    summarize via Gemini, embed, and stage rows on *db_session*.

    Args:
        keyword: search term; URL-quoted into the RSS query string.
        hl, gl, ceid: Google News locale parameters (language, country, edition).
        db_session: open SQLAlchemy session; new rows are added and committed here.
        max_news: stop once this many new articles have been staged.

    Returns:
        List of (title, link) tuples for the articles staged in this call.

    NOTE(review): the bare ``continue`` after the debug prints below (kept by
    the author to block Gemini API calls during testing) makes everything from
    the prompt construction through the DB write unreachable, so this function
    currently always returns an empty list.
    """
    url_keyword = urllib.parse.quote(keyword)
    rss_url = f"https://news.google.com/rss/search?q={url_keyword}&hl={hl}&gl={gl}&ceid={ceid}"
    feed = feedparser.parse(rss_url)
    valid_news = []
    # Embedding-model instance shared via dependencies.py
    embedder = get_embedding_model()
    for entry in feed.entries:
        if len(valid_news) >= max_news:
            break
        if is_real_gold_news(entry.title):
            # [Core] Duplicate check: is an article with this exact title already in the DB?
            exists = db_session.query(NewsEmbedding).filter(NewsEmbedding.title == entry.title).first()
            if exists:
                continue  # already stored — skip to the next entry
            # [Core] Publication date: convert the feed's published_parsed to an aware UTC datetime
            if hasattr(entry, 'published_parsed') and entry.published_parsed:
                pub_date = datetime.fromtimestamp(mktime(entry.published_parsed), pytz.UTC)
            else:
                pub_date = datetime.now(pytz.UTC)
            # Confirm real gold news and summarize it via Gemini
            try:
                # [Improvement] Stronger browser-impersonation headers
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
                    'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
                    'Cache-Control': 'no-cache',
                    'Pragma': 'no-cache',
                }
                # 1. Follow redirects to resolve the final article URL.
                response = requests.get(entry.link, timeout=15, headers=headers, allow_redirects=True)
                real_url = response.url
                # 2. Apply newspaper3k configuration (local import keeps top-of-file imports unchanged).
                from newspaper import Config
                config = Config()
                config.browser_user_agent = headers['User-Agent']
                config.request_timeout = 15
                article = Article(real_url, config=config)
                article.download()
                article.parse()
                article_text = article.text.strip()
                # [Validation] Catch empty bodies or pages that only scraped "Google News" boilerplate.
                if len(article_text) < 100 or "Google News" in article_text[:100]:
                    # If newspaper failed, retry with BeautifulSoup as a last resort.
                    # NOTE(review): BeautifulSoup is never imported in this file, so
                    # reaching this branch raises NameError — needs
                    # 'from bs4 import BeautifulSoup' at the top of the file.
                    soup = BeautifulSoup(response.text, 'html.parser')
                    # Pull text from the <p> tags news sites typically use for body copy.
                    article_text = ' '.join([p.text for p in soup.find_all('p') if len(p.text) > 20])
                    if len(article_text) < 100:
                        print(f"โš ๏ธ ๋ณธ๋ฌธ ์ถ”์ถœ ์‹คํŒจ (๋‚ด์šฉ ๋ถ€์กฑ): {entry.title}")
                        continue
                # Debug output
                print(f"\n[๊ธฐ์‚ฌ ์ œ๋ชฉ]: {entry.title}")
                print(f"[์‹ค์ œ ์ฃผ์†Œ]: {real_url}")
                print(f"[๋ณธ๋ฌธ ๋ฏธ๋ฆฌ๋ณด๊ธฐ]:\n{article_text[:300]}...\n")
                print("-" * 50)
                # Kept (per the author) to block API calls until testing is complete.
                # NOTE(review): this unconditional continue makes the Gemini call,
                # the embeddings, and the DB write below unreachable — remove it to
                # enable the full pipeline.
                continue
                prompt = f"""
๋‹ค์Œ์€ ๋‰ด์Šค ๊ธฐ์‚ฌ ์›๋ฌธ์ž…๋‹ˆ๋‹ค:
{article_text[:3000]}
์ด ๋‰ด์Šค๊ฐ€ ๊ธˆ(Gold, ๊ท€๊ธˆ์†/ํˆฌ์ž์ž์‚ฐ/๊ธˆ๊ฐ’)๊ณผ ๊ด€๋ จ๋œ ์‹ค์ œ ๋‰ด์Šค์ธ์ง€ ํŒ๋ณ„ํ•˜๊ณ ,
๋งž๋‹ค๋ฉด ๊ธฐ์‚ฌ์˜ ํ•ต์‹ฌ ๋‚ด์šฉ์„ 1~2์ค„๋กœ ์š”์•ฝํ•ด์ฃผ์„ธ์š”.
๋งŒ์•ฝ ๊ธˆ๊ณผ ์ „ํ˜€ ๊ด€๋ จ์ด ์—†๋Š” ๋‰ด์Šค๋ผ๋ฉด (์˜ˆ: ๊ธˆ์š”์ผ, ๋ฒŒ๊ธˆ, ์žฅํ•™๊ธˆ, ์ถœ๊ธˆ, ์†ก๊ธˆ, ๋ณด์กฐ๊ธˆ, ๋‹จ์ˆœํžˆ '๊ธˆ'์ด ํฌํ•จ๋œ ๋‹จ์–ด๋งŒ ์žˆ๋Š” ๊ธฐ์‚ฌ ๋“ฑ)
'NOT_GOLD_NEWS' ๋ผ๊ณ ๋งŒ ์ •ํ™•ํžˆ ๋‹ต๋ณ€ํ•˜์„ธ์š”.
์š”์•ฝ ๊ฒฐ๊ณผ:
"""
                # NOTE(review): 'response' here shadows the requests response above.
                response = llm.invoke([HumanMessage(content=prompt)])
                summary = response.content.strip()
                if summary == "NOT_GOLD_NEWS":
                    print(f"โŒ [Gemini ํ•„ํ„ฐ๋ง] ๊ฐ€์งœ ๊ธˆ ๋‰ด์Šค ์Šคํ‚ต: {entry.title}")
                    continue
                content_text = summary
                print(f"โœ… [Gemini ์š”์•ฝ ์„ฑ๊ณต] ์š”์•ฝ๋ฌธ: {content_text}")
            except Exception as e:
                print(f"โš ๏ธ ๊ธฐ์‚ฌ ๋ณธ๋ฌธ ์ถ”์ถœ ๋˜๋Š” ์š”์•ฝ ์‹คํŒจ ({entry.title}): {e}")
                # On failure, ignore this entry and move on to the next article.
                continue
            # Generate embeddings (embed_query returns List[float]).
            # The content column stores the Gemini summary, not the full body.
            title_emb = embedder.embed_query(entry.title)
            content_emb = embedder.embed_query(content_text)
            # Build the ORM row and stage it on the session.
            new_article = NewsEmbedding(
                title=entry.title,
                title_embedding=title_emb,
                content=content_text,
                content_embedding=content_emb,
                created_at=pub_date
            )
            db_session.add(new_article)
            valid_news.append((entry.title, entry.link))
            print(f"โœ… DB ์ถ”๊ฐ€ ์˜ˆ์•ฝ: {entry.title}")
    # Finally flush the staged changes to the DB (commit only if something was added).
    if valid_news:
        db_session.commit()
    return valid_news
# 3. Main search entry point (manages its own DB session)
async def search_gold_news():
    """Scheduled job: crawl the domestic and international Google News feeds
    for gold-related articles and persist any new ones.

    Opens a dedicated SQLAlchemy session so each run is self-contained; rolls
    back on any error and always closes the session.

    Fix: fetch_filter_and_save_news does blocking network and DB I/O (HTTP
    requests with 15s timeouts, synchronous SQLAlchemy queries). Running it
    directly in this coroutine would freeze the AsyncIOScheduler event loop
    for the whole crawl, so it is dispatched to a worker thread with
    asyncio.to_thread instead.
    """
    print(f"\n=== ๐ŸŒŸ [{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] ์ž๋™ ๊ฒ€์ƒ‰ ๋ฐ DB ์ €์žฅ ์‹œ์ž‘ ===")
    # Open the DB session for this run.
    db = SessionLocal()
    try:
        print("\n[๊ตญ๋‚ด ๋‰ด์Šค ๊ฒ€์ƒ‰ ์ค‘...]")
        # Domestic (Korean) feed — run the blocking crawl off the event loop.
        kr_news = await asyncio.to_thread(
            fetch_filter_and_save_news, "๊ธˆ", "ko", "KR", "KR:ko", db, max_news=3
        )
        if not kr_news:
            print("์ƒˆ๋กœ์šด ๊ตญ๋‚ด ๋‰ด์Šค๊ฐ€ ์—†๊ฑฐ๋‚˜ ๋ชจ๋‘ ์ด๋ฏธ ์ €์žฅ๋œ ๊ธฐ์‚ฌ์•ผ.")
        print("\n[ํ•ด์™ธ ๋‰ด์Šค ๊ฒ€์ƒ‰ ์ค‘...]")
        # International (English) feed.
        en_news = await asyncio.to_thread(
            fetch_filter_and_save_news, "gold", "en", "US", "US:en", db, max_news=3
        )
        if not en_news:
            print("์ƒˆ๋กœ์šด ํ•ด์™ธ ๋‰ด์Šค๊ฐ€ ์—†๊ฑฐ๋‚˜ ๋ชจ๋‘ ์ด๋ฏธ ์ €์žฅ๋œ ๊ธฐ์‚ฌ์•ผ.")
    except Exception as e:
        print(f"โŒ DB ์ €์žฅ ์ค‘ ์—๋Ÿฌ ๋ฐœ์ƒ: {e}")
        db.rollback()
    finally:
        # Always close the DB session, success or failure.
        db.close()
        print("==========================================\n")
# Scheduler setup (behavior unchanged from the previous revision)
def create_scheduler():
    """Build the APScheduler instance that periodically runs search_gold_news.

    Currently configured with a 60-second interval (testing cadence); the
    daily 18:00 KST cron schedule remains available in the commented line.
    """
    sched = AsyncIOScheduler(timezone="Asia/Seoul")
    # Production schedule (disabled):
    #sched.add_job(search_gold_news, CronTrigger(hour=18, minute=0))
    sched.add_job(search_gold_news, IntervalTrigger(seconds=60))
    return sched


# Module-level scheduler instance; expected to be started by the app elsewhere.
news_scheduler = create_scheduler()