Spaces:

ginigen
/

Today

Running

App Files Files Community

Today / app.py

ginipick

Create app.py

6085299 verified 2 months ago

raw

history blame

17.1 kB

	# -*- coding: utf-8 -*-
	"""
	AI 뉴스 & 허깅페이스 트렌딩 분석 시스템
	- AI Times 뉴스 크롤링 및 카테고리 분류
	- 허깅페이스 모델/스페이스 트렌딩 정보 수집
	- Fireworks AI (Qwen) 를 통한 뉴스 분석
	- Brave Search를 통한 팩트 체크
	"""

	import requests
	from bs4 import BeautifulSoup
	import json
	from datetime import datetime
	from typing import List, Dict, Optional
	import time
	import re


	class AINewsAnalyzer:
	    """Build a plain-text digest of AI Times headlines and Hugging Face trending items.

	    The analyzer scrapes article links from AI Times listing pages, tags each
	    headline with a keyword-based category, collects trending models/spaces
	    from Hugging Face, optionally asks a Qwen model hosted on Fireworks AI to
	    explain headlines and looks up related pages through Brave Search, and
	    finally renders everything into one text report.

	    Attributes:
	        fireworks_api_key: Bearer token sent to the Fireworks AI chat API.
	        brave_api_key: Subscription token sent to the Brave Search API.
	        categories: Mapping of category name -> keywords searched in titles.
	        huggingface_data: ``{"models": [...], "spaces": [...]}`` filled by
	            :meth:`fetch_huggingface_trending`.
	        news_data: Initialised to an empty list; not used by the methods of
	            this class.
	    """

	    # Browser-like User-Agent sent with the HTML scraping requests.
	    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

	    # Loose "MM-DD"-style date fragment looked up next to each article link.
	    _DATE_PATTERN = re.compile(r'\d{2}-\d{2}')

	    # Prompt sent together with each headline that gets an LLM analysis.
	    _ANALYSIS_INSTRUCTION = (
	        "이 뉴스를 다음 형식으로 분석해주세요:\n"
	        "1. 핵심 내용 (2-3문장, 초등학생 수준)\n"
	        "2. 왜 중요한가? (1-2문장)\n"
	        "3. 당신이 해야 할 행동 (1-2개 항목)\n"
	        "\n"
	        "간결하고 명확하게 작성해주세요."
	    )

	    def __init__(self, fireworks_api_key: str, brave_api_key: str):
	        """Store the API credentials and set up empty result containers.

	        Args:
	            fireworks_api_key: Fireworks AI API key.
	            brave_api_key: Brave Search API key.
	        """
	        self.fireworks_api_key = fireworks_api_key
	        self.brave_api_key = brave_api_key

	        # News categories; the first category with a keyword found in the
	        # title wins, so the order of this dict matters.
	        self.categories = {
	            "산업동향": ["산업", "기업", "투자", "인수", "파트너십", "시장"],
	            "기술혁신": ["기술", "모델", "알고리즘", "개발", "연구", "논문"],
	            "제품출시": ["출시", "공개", "발표", "서비스", "제품"],
	            "정책규제": ["규제", "정책", "법", "정부", "제재"],
	            "보안이슈": ["보안", "취약점", "해킹", "위험", "프라이버시"],
	        }

	        self.huggingface_data = {
	            "models": [],
	            "spaces": [],
	        }

	        self.news_data = []

	    def fetch_aitimes_news(self, urls: List[str]) -> List[Dict]:
	        """Scrape article links from AI Times listing pages.

	        Every ``<a>`` whose href contains ``/news/articleView.html`` and whose
	        text is longer than 10 characters is treated as a headline. At most
	        10 headlines are kept per listing page, and an article URL that was
	        already collected is not added again.

	        Args:
	            urls: Listing-page URLs to crawl.

	        Returns:
	            A list of dicts with the keys ``title``, ``url``, ``date`` (may be
	            an empty string) and ``source``. A page that fails to load or
	            parse is reported on stdout and skipped.
	        """
	        all_news = []
	        seen_urls = set()

	        for url in urls:
	            try:
	                print(f"📰 뉴스 크롤링 중: {url}")
	                # A timeout keeps one stalled page from blocking the whole run.
	                response = requests.get(
	                    url,
	                    headers={'User-Agent': self._USER_AGENT},
	                    timeout=10,
	                )
	                # Surface HTTP errors instead of silently parsing an error page.
	                response.raise_for_status()
	                soup = BeautifulSoup(response.content, 'html.parser')

	                # Article extraction (may need adjusting to the real markup).
	                articles = []

	                for link in soup.find_all('a', href=True):
	                    href = link['href']
	                    if '/news/articleView.html' not in href:
	                        continue

	                    title = link.get_text(strip=True)
	                    # Short or empty anchors (e.g. thumbnails) are not headlines.
	                    if not title or len(title) <= 10:
	                        continue

	                    article_url = href
	                    if not article_url.startswith('http'):
	                        article_url = 'https://www.aitimes.com' + article_url

	                    # Several anchors may point at the same article; keep the
	                    # first one so duplicates do not use up the per-page quota.
	                    if article_url in seen_urls:
	                        continue
	                    seen_urls.add(article_url)

	                    # Look for a date-like text node under the link's parent.
	                    date_text = ""
	                    parent = link.parent
	                    if parent:
	                        date_elem = parent.find(string=self._DATE_PATTERN)
	                        if date_elem:
	                            date_text = date_elem.strip()

	                    articles.append({
	                        'title': title,
	                        'url': article_url,
	                        'date': date_text,
	                        'source': 'AI Times',
	                    })

	                all_news.extend(articles[:10])  # top 10 per page only
	                time.sleep(1)  # be polite to the crawled site

	            except Exception as e:
	                # Best effort: a failing page must not abort the other pages.
	                print(f"❌ 크롤링 오류: {e}")

	        return all_news

	    def fetch_huggingface_trending(self) -> Dict:
	        """Collect trending Hugging Face models (API) and spaces (HTML scrape).

	        Previously collected entries are discarded first, so calling this
	        method repeatedly does not accumulate duplicates. Failures in either
	        half are printed and leave that list with whatever was gathered.

	        Returns:
	            ``self.huggingface_data`` with up to 30 entries under ``models``
	            and up to 30 under ``spaces``.
	        """
	        print("🤗 허깅페이스 트렌딩 정보 수집 중...")

	        # Start from a clean slate on every call.
	        self.huggingface_data['models'] = []
	        self.huggingface_data['spaces'] = []

	        # Trending models via the JSON API.
	        try:
	            models_url = "https://huggingface.co/api/models"
	            params = {
	                'sort': 'trending',
	                'limit': 30,
	            }

	            response = requests.get(models_url, params=params, timeout=10)
	            if response.status_code == 200:
	                models = response.json()

	                for model in models[:30]:
	                    self.huggingface_data['models'].append({
	                        'name': model.get('id', 'Unknown'),
	                        # "or 0" guards against explicit nulls, which would
	                        # break the thousands-separator format in the report.
	                        'downloads': model.get('downloads') or 0,
	                        'likes': model.get('likes') or 0,
	                        'task': model.get('pipeline_tag') or 'N/A',
	                        'url': f"https://huggingface.co/{model.get('id', '')}",
	                    })

	                print(f"✅ {len(self.huggingface_data['models'])}개 트렌딩 모델 수집 완료")
	            else:
	                print(f"❌ 모델 수집 오류: HTTP {response.status_code}")

	        except Exception as e:
	            print(f"❌ 모델 수집 오류: {e}")

	        # Trending spaces via scraping the public listing page.
	        try:
	            spaces_url = "https://huggingface.co/spaces"
	            response = requests.get(
	                spaces_url,
	                headers={'User-Agent': self._USER_AGENT},
	                timeout=10,
	            )

	            soup = BeautifulSoup(response.content, 'html.parser')

	            prefix = '/spaces/'
	            spaces = self.huggingface_data['spaces']
	            for link in soup.find_all('a', href=True):
	                if len(spaces) >= 30:
	                    break

	                href = link['href']
	                # Only site-relative space links yield a valid absolute URL
	                # when prefixed with the host below.
	                if not href.startswith(prefix):
	                    continue

	                space_name = href[len(prefix):]
	                # Expect an "owner/name" identifier.
	                if '/' not in space_name or len(space_name) <= 3:
	                    continue

	                title = link.get_text(strip=True)
	                if not title:
	                    continue

	                spaces.append({
	                    'name': space_name,
	                    'title': title[:100],
	                    'url': f"https://huggingface.co{href}",
	                })

	            print(f"✅ {len(self.huggingface_data['spaces'])}개 트렌딩 스페이스 수집 완료")

	        except Exception as e:
	            print(f"❌ 스페이스 수집 오류: {e}")

	        return self.huggingface_data

	    def categorize_news(self, news_list: List[Dict]) -> List[Dict]:
	        """Tag every news dict in place with a ``category`` key.

	        The category is the first entry of ``self.categories`` that has a
	        keyword contained in the lower-cased title, or ``"기타"`` when no
	        keyword matches.

	        Args:
	            news_list: News dicts, each with a ``title`` key.

	        Returns:
	            The same list object, with each dict updated.
	        """
	        for news in news_list:
	            title = news['title'].lower()
	            news['category'] = next(
	                (
	                    category
	                    for category, keywords in self.categories.items()
	                    if any(keyword in title for keyword in keywords)
	                ),
	                "기타",
	            )

	        return news_list

	    def analyze_with_qwen(self, text: str, instruction: str) -> str:
	        """Ask the Qwen model on Fireworks AI to analyse a piece of news.

	        Args:
	            text: The news text (the report passes the headline).
	            instruction: Instruction placed before the news in the user turn.

	        Returns:
	            The model's reply. On a non-200 response or any exception a
	            human-readable failure message is returned instead of raising,
	            so the caller can embed it in the report as-is.
	        """
	        url = "https://api.fireworks.ai/inference/v1/chat/completions"

	        payload = {
	            "model": "accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
	            "max_tokens": 4096,
	            "top_p": 1,
	            "top_k": 40,
	            "presence_penalty": 0,
	            "frequency_penalty": 0,
	            "temperature": 0.6,
	            "messages": [
	                {
	                    "role": "system",
	                    "content": "당신은 AI 뉴스를 초등학생도 이해할 수 있게 쉽게 설명하는 전문가입니다.",
	                },
	                {
	                    "role": "user",
	                    "content": f"{instruction}\n\n뉴스: {text}",
	                },
	            ],
	        }

	        headers = {
	            "Accept": "application/json",
	            "Content-Type": "application/json",
	            "Authorization": f"Bearer {self.fireworks_api_key}",
	        }

	        try:
	            response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=30)

	            if response.status_code != 200:
	                return f"분석 실패 (상태 코드: {response.status_code})"

	            result = response.json()
	            return result['choices'][0]['message']['content']

	        except Exception as e:
	            # Covers network errors as well as an unexpected response shape.
	            return f"분석 오류: {str(e)}"

	    def fact_check_with_brave(self, query: str) -> List[Dict]:
	        """Look up pages related to *query* through the Brave Search API.

	        Args:
	            query: Search query (the report passes the headline).

	        Returns:
	            Up to three dicts with ``title``, ``description`` and ``url``.
	            An empty list on a non-200 response or on any exception (the
	            exception is printed).
	        """
	        url = "https://api.search.brave.com/res/v1/web/search"

	        headers = {
	            "Accept": "application/json",
	            "X-Subscription-Token": self.brave_api_key,
	        }

	        params = {
	            "q": query,
	            "count": 5,
	            "text_decorations": False,
	            "search_lang": "ko",
	        }

	        try:
	            response = requests.get(url, headers=headers, params=params, timeout=10)

	            if response.status_code != 200:
	                return []

	            data = response.json()
	            results = []

	            if 'web' in data and 'results' in data['web']:
	                for item in data['web']['results'][:3]:
	                    results.append({
	                        'title': item.get('title', ''),
	                        'description': item.get('description', ''),
	                        'url': item.get('url', ''),
	                    })

	            return results

	        except Exception as e:
	            print(f"❌ Brave Search 오류: {e}")
	            return []

	    def _article_insight_lines(self, article: Dict) -> List[str]:
	        """Return the report lines for one article's LLM analysis and fact check."""
	        lines = []
	        print(f"🤖 LLM 분석 중: {article['title'][:50]}...")

	        analysis = self.analyze_with_qwen(article['title'], self._ANALYSIS_INSTRUCTION)
	        lines.append("\n 🤖 AI 분석:")
	        for line in analysis.split('\n'):
	            if line.strip():
	                lines.append(f" {line.strip()}")

	        # Fact check: list up to two related search hits.
	        fact_check = self.fact_check_with_brave(article['title'][:100])
	        if fact_check:
	            lines.append("\n ✅ 팩트 체크 (Brave Search):")
	            for fc in fact_check[:2]:
	                lines.append(f" • {fc['title']}")
	                lines.append(f" {fc['url']}")

	        time.sleep(2)  # stay under the API rate limits
	        return lines

	    def _news_section_lines(self, categorized_news: Dict, analyze_news: bool) -> List[str]:
	        """Return the per-category news section of the report."""
	        lines = ["📰 === AI TIMES 뉴스 분석 ===", ""]

	        for category, articles in categorized_news.items():
	            lines.append(f"📌 [{category}] ({len(articles)}건)")
	            lines.append("-" * 80)

	            for i, article in enumerate(articles[:5], 1):  # 5 per category
	                lines.append(f"{i}. {article['title']}")
	                lines.append(f" 🔗 {article['url']}")
	                lines.append(f" 📅 {article.get('date', 'N/A')}")

	                # Only the top 2 articles per category get the slow LLM and
	                # search calls.
	                if analyze_news and i <= 2:
	                    lines.extend(self._article_insight_lines(article))

	                lines.append("")

	            lines.append("")

	        return lines

	    def _trending_section_lines(self) -> List[str]:
	        """Return the Hugging Face trending section of the report."""
	        lines = ["🤗 === 허깅페이스 트렌딩 TOP 30 ===", ""]

	        # Models
	        lines.append("🔥 트렌딩 모델 TOP 30")
	        lines.append("-" * 80)
	        for i, model in enumerate(self.huggingface_data['models'][:30], 1):
	            lines.append(f"{i:2d}. {model['name']}")
	            # Plain "|" separator: the former "\|" was an invalid escape
	            # sequence that put a literal backslash into the report.
	            lines.append(f" 📊 다운로드: {model['downloads']:,} | ❤️ 좋아요: {model['likes']:,}")
	            lines.append(f" 🏷️ Task: {model['task']}")
	            lines.append(f" 🔗 {model['url']}")
	            lines.append("")

	        lines.append("")

	        # Spaces
	        lines.append("🚀 트렌딩 스페이스 TOP 30")
	        lines.append("-" * 80)
	        for i, space in enumerate(self.huggingface_data['spaces'][:30], 1):
	            lines.append(f"{i:2d}. {space['name']}")
	            lines.append(f" 📝 {space['title']}")
	            lines.append(f" 🔗 {space['url']}")
	            lines.append("")

	        return lines

	    def generate_report(self, news_list: List[Dict], analyze_news: bool = True) -> str:
	        """Render the combined news / trending report as one string.

	        Args:
	            news_list: News dicts with ``title`` and ``url``; ``date`` and
	                ``category`` are optional (a missing category is shown as
	                ``"기타"``).
	            analyze_news: When true, the first two articles of every category
	                are sent to :meth:`analyze_with_qwen` and
	                :meth:`fact_check_with_brave`, with a 2-second pause after
	                each; when false no network call is made.

	        Returns:
	            The report text, lines joined with ``"\\n"``.
	        """
	        report = []
	        report.append("=" * 80)
	        report.append("📊 AI 뉴스 & 허깅페이스 트렌딩 종합 리포트")
	        report.append(f"📅 생성일시: {datetime.now().strftime('%Y년 %m월 %d일 %H:%M')}")
	        report.append("=" * 80)
	        report.append("")

	        # 1. News grouped by category, in order of first appearance.
	        categorized_news = {}
	        for news in news_list:
	            categorized_news.setdefault(news.get('category', '기타'), []).append(news)

	        report.extend(self._news_section_lines(categorized_news, analyze_news))

	        # 2. Hugging Face trending models and spaces.
	        report.extend(self._trending_section_lines())

	        # 3. Overall summary.
	        report.append("=" * 80)
	        report.append("📈 종합 요약")
	        report.append("=" * 80)
	        report.append(f"• 총 뉴스 수집: {len(news_list)}건")
	        report.append(f"• 카테고리 수: {len(categorized_news)}개")
	        report.append(f"• 트렌딩 모델: {len(self.huggingface_data['models'])}개")
	        report.append(f"• 트렌딩 스페이스: {len(self.huggingface_data['spaces'])}개")
	        report.append("")

	        return '\n'.join(report)

	    def run_full_analysis(self, news_urls: List[str], analyze_with_llm: bool = True) -> str:
	        """Run the whole pipeline: crawl, categorise, collect trending, report.

	        Args:
	            news_urls: AI Times listing pages to crawl.
	            analyze_with_llm: Forwarded to :meth:`generate_report` as
	                ``analyze_news``.

	        Returns:
	            The generated report text.
	        """
	        print("🚀 AI 뉴스 & 허깅페이스 트렌딩 분석 시작...")
	        print("")

	        # 1. Collect news
	        news_list = self.fetch_aitimes_news(news_urls)
	        print(f"✅ 총 {len(news_list)}건의 뉴스 수집 완료")
	        print("")

	        # 2. Categorise news
	        categorized_news = self.categorize_news(news_list)
	        print("✅ 뉴스 카테고리 분류 완료")
	        print("")

	        # 3. Collect Hugging Face trending data
	        self.fetch_huggingface_trending()
	        print("")

	        # 4. Build the report
	        print("📝 리포트 생성 중...")
	        report = self.generate_report(categorized_news, analyze_news=analyze_with_llm)

	        print("")
	        print("✅ 분석 완료!")

	        return report

	    def save_report(self, report: str, filename: Optional[str] = None):
	        """Write the report to a UTF-8 text file.

	        Args:
	            report: Report text to write.
	            filename: Target path. When ``None`` a name of the form
	                ``ai_news_report_<YYYYmmdd_HHMMSS>.txt`` in the current
	                directory is used.
	        """
	        if filename is None:
	            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
	            filename = f"ai_news_report_{timestamp}.txt"

	        with open(filename, 'w', encoding='utf-8') as f:
	            f.write(report)

	        print(f"💾 리포트 저장 완료: {filename}")


	# ==================== 사용 예시 ====================

	def main():
	    """Run the full analysis once, print the report and save it to a file.

	    API keys are read from the ``FIREWORKS_API_KEY`` and ``BRAVE_API_KEY``
	    environment variables so that real credentials never have to be written
	    into the source; when a variable is unset the placeholder value is used.
	    """
	    import os  # local import: only this entry point needs the environment

	    # API keys (environment first, placeholder as fallback)
	    fireworks_api_key = os.environ.get("FIREWORKS_API_KEY", "YOUR_FIREWORKS_API_KEY")
	    brave_api_key = os.environ.get("BRAVE_API_KEY", "YOUR_BRAVE_API_KEY")

	    # AI Times listing pages to crawl
	    news_urls = [
	        "https://www.aitimes.com/news/articleList.html?sc_multi_code=S2&view_type=sm",  # AI industry
	        "https://www.aitimes.com/news/articleList.html?sc_section_code=S1N24&view_type=sm",  # AI technology
	    ]

	    # Set up the analyzer
	    analyzer = AINewsAnalyzer(
	        fireworks_api_key=fireworks_api_key,
	        brave_api_key=brave_api_key,
	    )

	    # Run everything. Pass analyze_with_llm=False to skip the LLM analysis
	    # and only collect data, which is much faster.
	    report = analyzer.run_full_analysis(
	        news_urls=news_urls,
	        analyze_with_llm=True,  # LLM analysis enabled (takes a long time)
	    )

	    # Show the result
	    print("\n" + "=" * 80)
	    print(report)

	    # Persist it to a timestamped file
	    analyzer.save_report(report)


	# Script entry point: run the demo only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()


	# ==================== 사용 팁 ====================
	"""
	1. API 키 설정:
	- Fireworks AI: https://fireworks.ai/
	- Brave Search: https://brave.com/search/api/

	2. 빠른 테스트 (LLM 분석 없이):
	analyzer.run_full_analysis(news_urls, analyze_with_llm=False)

	3. 특정 카테고리만 분석:
	categorized_news에서 원하는 카테고리 필터링

	4. 크롤링 주기 조정:
	time.sleep() 값을 조정하여 속도/안정성 균형

	5. 결과 활용:
	- JSON으로 저장: json.dumps(analyzer.huggingface_data)
	- 데이터베이스 저장
	- 대시보드 연동
	"""