openfree committed on
Commit
a6fc716
ยท
verified ยท
1 Parent(s): 3a4965c

Update news.py

Browse files
Files changed (1) hide show
  1. news.py +85 -143
news.py CHANGED
@@ -1,110 +1,101 @@
1
  """
2
- VDash ๋‰ด์Šค ๋ชจ๋“ˆ
3
  - AI Times + Hacker News ํฌ๋กค๋ง
4
- - ๋น„๋“œ๋ž˜ํ”„ํŠธ ์กฐ์ง ๊ด€์  ์ž๋™ ๋ถ„๋ฅ˜/ํƒœ๊น…
 
5
  - HF Dataset ์˜๊ตฌ ์ €์žฅ
6
  """
7
 
8
- import requests
9
- import json
10
- import re
11
- import time
12
- import os
13
- import tempfile
14
  from datetime import datetime, timedelta
15
  from typing import List, Dict
16
  from bs4 import BeautifulSoup
17
  from huggingface_hub import HfApi, hf_hub_download
18
 
 
 
 
 
 
 
 
 
19
  HF_TOKEN = os.getenv("HF_TOKEN")
20
  SPACE_ID = os.getenv("SPACE_ID", "")
21
  OWNER = SPACE_ID.split("/")[0] if SPACE_ID else "vidraft"
22
  DATASET_REPO = os.getenv("DATASET_REPO", f"{OWNER}/vidraft-dashboard-data")
23
  NEWS_FILE = "news.json"
24
-
25
  hf_api = HfApi(token=HF_TOKEN)
26
 
27
- # ============================================================
28
- # ๋น„๋“œ๋ž˜ํ”„ํŠธ ๊ด€์  ํƒœ๊ทธ ๋ถ„๋ฅ˜ ๊ทœ์น™
29
- # ============================================================
30
-
31
  TAG_RULES = [
32
- # (ํ‚ค์›Œ๋“œ ๋ฆฌ์ŠคํŠธ, ํƒœ๊ทธ๋ช…, ์ƒ‰์ƒ์ฝ”๋“œ)
33
- (["์ •๋ถ€", "๊ณผ์ œ", "๊ณต๋ชจ", "์ง€์›์‚ฌ์—…", "IITP", "NIA", "NIPA", "๊ตญ์ฑ…"], "๐Ÿ›๏ธ ์ •๋ถ€๊ณผ์ œ", "#3b82f6"),
34
- (["ํˆฌ์ž", "ํŽ€๋”ฉ", "์‹œ๋ฆฌ์ฆˆ", "VC", "IPO", "์ธ์ˆ˜", "M&A", "๋ฐธ๋ฅ˜์—์ด์…˜"], "๐Ÿ’ฐ ํˆฌ์ž/IR", "#f59e0b"),
35
- (["์˜์ƒ", "๋น„๋””์˜ค", "video", "์ƒ์„ฑ", "sora", "gen-", "๋™์˜์ƒ", "์ด๋ฏธ์ง€์ƒ์„ฑ", "ํ…์ŠคํŠธํˆฌ๋น„๋””์˜ค"], "๐ŸŽฌ ์˜์ƒAI", "#ef4444"),
36
- (["ํ•œ๊ตญ์–ด", "korean", "multilingual", "๋ฒˆ์—ญ", "๋‹ค๊ตญ์–ด", "ํ•œ๊ธ€"], "๐Ÿ‡ฐ๐Ÿ‡ท ํ•œ๊ตญ์–ดAI", "#8b5cf6"),
37
- (["ํ—ˆ๊น…ํŽ˜์ด์Šค", "hugging", "HF", "spaces", "๋ชจ๋ธ", "์˜คํ”ˆ์†Œ์Šค", "๊นƒํ—ˆ๋ธŒ"], "๐Ÿค— HF/์˜คํ”ˆ์†Œ์Šค", "#10b981"),
38
- (["LLM", "GPT", "Claude", "Gemini", "๊ฑฐ๋Œ€์–ธ์–ด", "ํŒŒ์šด๋ฐ์ด์…˜", "ํŒŒ์ธํŠœ๋‹", "RAG", "์—์ด์ „ํŠธ", "agent"], "๐Ÿง  LLM/์—์ด์ „ํŠธ", "#6366f1"),
39
- (["GPU", "์นฉ", "๋ฐ˜๋„์ฒด", "์—”๋น„๋””์•„", "NVIDIA", "์ธํ”„๋ผ", "์„œ๋ฒ„", "ํด๋ผ์šฐ๋“œ", "๋ฐ์ดํ„ฐ์„ผํ„ฐ"], "๐Ÿ–ฅ๏ธ ์ธํ”„๋ผ/GPU", "#0d9488"),
40
- (["๋ณด์•ˆ", "๊ฐœ์ธ์ •๋ณด", "๊ทœ์ œ", "๋ฒ•์•ˆ", "์œค๋ฆฌ", "์•ˆ์ „", "์ €์ž‘๊ถŒ", "AI๋ฒ•"], "๐Ÿ”’ ๊ทœ์ œ/์œค๋ฆฌ", "#dc2626"),
41
- (["์Šคํƒ€ํŠธ์—…", "์ฐฝ์—…", "์‚ฌ์—…", "์ œํœด", "ํŒŒํŠธ๋„ˆ", "๊ณ„์•ฝ", "๋งค์ถœ", "์ˆ˜์ต"], "๐Ÿ’ผ ๋น„์ฆˆ๋‹ˆ์Šค", "#ea580c"),
42
- (["๊ต์œก", "ํ•™์Šต", "์—ฐ๊ตฌ", "๋…ผ๋ฌธ", "arXiv", "๋ฒค์น˜๋งˆํฌ", "์„ฑ๋Šฅ", "ํ‰๊ฐ€"], "๐Ÿ“š R&D/์—ฐ๊ตฌ", "#059669"),
43
- (["๋งˆ์ผ€ํŒ…", "์ฝ˜ํ…์ธ ", "SNS", "๋ธŒ๋žœ๋”ฉ", "ํ™๋ณด", "PR", "๋ฏธ๋””์–ด"], "๐Ÿ“ข ๋งˆ์ผ€ํŒ…/PR", "#9333ea"),
44
  ]
45
 
46
- RELEVANCE_KEYWORDS = {
47
- "core": ["AI ์˜์ƒ", "๋น„๋””์˜ค ์ƒ์„ฑ", "ํ•œ๊ตญ์–ด", "ํ—ˆ๊น…ํŽ˜์ด์Šค", "์˜คํ”ˆ์†Œ์Šค", "์—์ด์ „ํŠธ", "LLM", "์Šคํƒ€ํŠธ์—…", "์ •๋ถ€๊ณผ์ œ"],
48
- "high": ["GPU", "ํด๋ผ์šฐ๋“œ", "์ธํ”„๋ผ", "ํˆฌ์ž", "์ƒ์„ฑAI", "ํŒŒ์ธํŠœ๋‹", "RAG", "API"],
49
- "medium": ["๊ทœ์ œ", "๊ต์œก", "์—ฐ๊ตฌ", "๋ณด์•ˆ", "๋งˆ์ผ€ํŒ…"],
50
  }
51
 
52
 
53
- def classify_news(title: str, source: str = "") -> Dict:
54
- """๋‰ด์Šค๋ฅผ ๋น„๋“œ๋ž˜ํ”„ํŠธ ๊ด€์ ์—์„œ ๋ถ„๋ฅ˜"""
55
  text = (title + " " + source).lower()
56
- tags = []
57
- colors = {}
58
-
59
- for keywords, tag_name, color in TAG_RULES:
60
  for kw in keywords:
61
  if kw.lower() in text:
62
- if tag_name not in tags:
63
- tags.append(tag_name)
64
- colors[tag_name] = color
65
  break
66
-
67
  if not tags:
68
  tags.append("๐Ÿ“ฐ ์ผ๋ฐ˜AI๋‰ด์Šค")
69
  colors["๐Ÿ“ฐ ์ผ๋ฐ˜AI๋‰ด์Šค"] = "#64748b"
70
-
71
- # ๋น„๋“œ๋ž˜ํ”„ํŠธ ๊ด€๋ จ๋„ ํŒ๋‹จ
72
  relevance = "์ผ๋ฐ˜"
73
- for kw in RELEVANCE_KEYWORDS["core"]:
74
- if kw.lower() in text:
75
- relevance = "ํ•ต์‹ฌ"
76
- break
77
- if relevance == "์ผ๋ฐ˜":
78
- for kw in RELEVANCE_KEYWORDS["high"]:
79
- if kw.lower() in text:
80
- relevance = "์ฃผ๋ชฉ"
81
- break
82
- if relevance == "์ผ๋ฐ˜":
83
- for kw in RELEVANCE_KEYWORDS["medium"]:
84
  if kw.lower() in text:
85
- relevance = "์ฐธ๊ณ "
86
  break
87
-
 
88
  return {"tags": tags, "colors": colors, "relevance": relevance}
89
 
90
 
91
- def generate_summary(title: str) -> str:
92
- """์ œ๋ชฉ ๊ธฐ๋ฐ˜ ํ•œ์ค„ ์š”์•ฝ (LLM ์—†์ด ๊ทœ์น™ ๊ธฐ๋ฐ˜)"""
93
- t = title.strip()
94
- if len(t) > 80:
95
- return t[:77] + "..."
96
- return t
 
 
 
 
 
 
 
97
 
98
 
99
- # ============================================================
100
- # ํฌ๋กค๋Ÿฌ
101
- # ============================================================
 
102
 
103
  UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
104
 
105
 
106
- def fetch_aitimes(max_items: int = 20) -> List[Dict]:
107
- """AI Times ์ตœ์‹  ๋‰ด์Šค ํฌ๋กค๋ง"""
108
  print("๐Ÿ“ฐ AI Times ์ˆ˜์ง‘ ์ค‘...")
109
  urls = [
110
  "https://www.aitimes.com/news/articleList.html?sc_multi_code=S2&view_type=sm",
@@ -113,64 +104,44 @@ def fetch_aitimes(max_items: int = 20) -> List[Dict]:
113
  all_news = []
114
  today = datetime.now().strftime("%m-%d")
115
  yesterday = (datetime.now() - timedelta(days=1)).strftime("%m-%d")
116
-
117
  for url in urls:
118
  try:
119
  r = requests.get(url, timeout=15, headers={"User-Agent": UA})
120
  r.raise_for_status()
121
  r.encoding = "utf-8"
122
  soup = BeautifulSoup(r.text, "html.parser")
123
- articles = soup.find_all("a", href=re.compile(r"/news/articleView\.html\?idxno=\d+"))
124
-
125
- for tag in articles:
126
  title = tag.get_text(strip=True)
127
  link = tag.get("href", "")
128
  if not title or len(title) < 10:
129
  continue
130
  if link and not link.startswith("http"):
131
  link = "https://www.aitimes.com" + link
132
-
133
- parent = tag.parent
134
- date_text = ""
135
- if parent:
136
- m = re.search(r"(\d{2}-\d{2}\s+\d{2}:\d{2})", parent.get_text())
137
  if m:
138
  date_text = m.group(1)
139
- if not date_text:
140
- date_text = today
141
-
142
  if today not in date_text and yesterday not in date_text:
143
  continue
144
-
145
  cls = classify_news(title, "AI Times")
146
- all_news.append({
147
- "title": title, "url": link, "date": date_text,
148
- "source": "AI Times", "summary": generate_summary(title),
149
- **cls,
150
- })
151
  time.sleep(0.5)
152
  except Exception as e:
153
- print(f" โš ๏ธ AI Times ์˜ค๋ฅ˜: {e}")
154
-
155
  seen = set()
156
- unique = []
157
- for n in all_news:
158
- if n["url"] not in seen:
159
- seen.add(n["url"])
160
- unique.append(n)
161
  print(f" โœ… AI Times {len(unique)}๊ฑด")
162
  return unique[:max_items]
163
 
164
 
165
- def fetch_hackernews(limit: int = 15) -> List[Dict]:
166
- """Hacker News ์ตœ์‹  AI ๊ด€๋ จ ๋‰ด์Šค"""
167
  print("๐Ÿ”ฅ Hacker News ์ˆ˜์ง‘ ์ค‘...")
168
  news = []
169
  try:
170
  r = requests.get("https://hacker-news.firebaseio.com/v0/topstories.json", timeout=10)
171
  ids = r.json()[:limit * 3]
172
  cutoff = datetime.utcnow() - timedelta(hours=36)
173
-
174
  for sid in ids:
175
  if len(news) >= limit:
176
  break
@@ -182,94 +153,65 @@ def fetch_hackernews(limit: int = 15) -> List[Dict]:
182
  st = datetime.utcfromtimestamp(s.get("time", 0))
183
  if st < cutoff:
184
  continue
185
- title = s.get("title", "")
186
- cls = classify_news(title, "Hacker News")
 
 
187
  news.append({
188
- "title": title, "url": s["url"],
 
 
189
  "date": st.strftime("%m-%d %H:%M"),
190
  "source": "Hacker News",
191
- "summary": generate_summary(title),
192
  "score": s.get("score", 0),
193
  **cls,
194
  })
195
- time.sleep(0.15)
196
  except Exception:
197
  continue
198
- print(f" โœ… HN {len(news)}๊ฑด")
199
  except Exception as e:
200
- print(f" โš ๏ธ HN ์˜ค๋ฅ˜: {e}")
201
  return news
202
 
203
 
204
- # ============================================================
205
- # HF Dataset ์ €์žฅ/๋กœ๋“œ
206
- # ============================================================
207
-
208
- def load_news_from_hf() -> List[Dict]:
209
  try:
210
- path = hf_hub_download(
211
- repo_id=DATASET_REPO, filename=NEWS_FILE,
212
- repo_type="dataset", token=HF_TOKEN, force_download=True,
213
- )
214
  with open(path, "r", encoding="utf-8") as f:
215
- data = json.load(f)
216
- print(f"[OK] Loaded {len(data)} news from HF")
217
- return data
218
- except Exception as e:
219
- print(f"[INFO] News load: {e}")
220
  return []
221
 
222
 
223
- def save_news_to_hf(news_list: List[Dict]):
224
  try:
225
  tmp = os.path.join(tempfile.gettempdir(), NEWS_FILE)
226
  with open(tmp, "w", encoding="utf-8") as f:
227
  json.dump(news_list, f, ensure_ascii=False, indent=2)
228
- hf_api.upload_file(
229
- path_or_fileobj=tmp, path_in_repo=NEWS_FILE,
230
- repo_id=DATASET_REPO, repo_type="dataset", token=HF_TOKEN,
231
- )
232
- print(f"[OK] Saved {len(news_list)} news to HF")
233
  except Exception as e:
234
  print(f"[ERROR] News save: {e}")
235
 
236
 
237
- # ============================================================
238
- # ๋ฉ”์ธ ์ˆ˜์ง‘ ํ•จ์ˆ˜ (app.py์—์„œ ํ˜ธ์ถœ)
239
- # ============================================================
240
-
241
- def collect_news(force: bool = False) -> List[Dict]:
242
- """๋‰ด์Šค ์ˆ˜์ง‘ + ๋ถ„๋ฅ˜ + ์ €์žฅ"""
243
  if not force:
244
  cached = load_news_from_hf()
245
  if cached:
246
- # ์บ์‹œ๊ฐ€ 6์‹œ๊ฐ„ ์ด๋‚ด๋ฉด ์žฌ์‚ฌ์šฉ
247
  try:
248
  last = cached[0].get("collected_at", "")
249
- if last:
250
- last_dt = datetime.fromisoformat(last)
251
- if (datetime.now() - last_dt).total_seconds() < 21600:
252
- print("[NEWS] Cache fresh, reusing")
253
- return cached
254
  except Exception:
255
  pass
256
-
257
  print("\n[NEWS] Collecting fresh news...")
258
  now_iso = datetime.now().isoformat()
259
-
260
- aitimes = fetch_aitimes(20)
261
- hn = fetch_hackernews(15)
262
- all_news = aitimes + hn
263
-
264
  for n in all_news:
265
  n["collected_at"] = now_iso
266
-
267
- # ๊ด€๋ จ๋„ ์ˆœ ์ •๋ ฌ: ํ•ต์‹ฌ > ์ฃผ๋ชฉ > ์ฐธ๊ณ  > ์ผ๋ฐ˜
268
  order = {"ํ•ต์‹ฌ": 0, "์ฃผ๋ชฉ": 1, "์ฐธ๊ณ ": 2, "์ผ๋ฐ˜": 3}
269
  all_news.sort(key=lambda x: order.get(x.get("relevance", "์ผ๋ฐ˜"), 3))
270
-
271
  if HF_TOKEN and all_news:
272
  save_news_to_hf(all_news)
273
-
274
- print(f"[NEWS] Total: {len(all_news)} articles\n")
275
  return all_news
 
1
  """
2
+ VDash ๋‰ด์Šค ๋ชจ๋“ˆ v2
3
  - AI Times + Hacker News ํฌ๋กค๋ง
4
+ - HN ์˜๋ฌธ ์ œ๋ชฉ โ†’ ํ•œ๊ธ€ ์ž๋™ ๋ฒˆ์—ญ
5
+ - ๋น„๋“œ๋ž˜ํ”„ํŠธ ๊ด€์  ์ž๋™ ๋ถ„๋ฅ˜
6
  - HF Dataset ์˜๊ตฌ ์ €์žฅ
7
  """
8
 
9
+ import requests, json, re, time, os, tempfile
 
 
 
 
 
10
  from datetime import datetime, timedelta
11
  from typing import List, Dict
12
  from bs4 import BeautifulSoup
13
  from huggingface_hub import HfApi, hf_hub_download
14
 
15
+ try:
16
+ from deep_translator import GoogleTranslator
17
+ translator = GoogleTranslator(source='en', target='ko')
18
+ HAS_TRANSLATOR = True
19
+ except Exception:
20
+ HAS_TRANSLATOR = False
21
+ print("[NEWS] deep-translator not available, HN titles will stay English")
22
+
23
  HF_TOKEN = os.getenv("HF_TOKEN")
24
  SPACE_ID = os.getenv("SPACE_ID", "")
25
  OWNER = SPACE_ID.split("/")[0] if SPACE_ID else "vidraft"
26
  DATASET_REPO = os.getenv("DATASET_REPO", f"{OWNER}/vidraft-dashboard-data")
27
  NEWS_FILE = "news.json"
 
28
  hf_api = HfApi(token=HF_TOKEN)
29
 
 
 
 
 
30
  TAG_RULES = [
31
+ (["์ •๋ถ€","๊ณผ์ œ","๊ณต๋ชจ","์ง€์›์‚ฌ์—…","IITP","NIA","NIPA","๊ตญ์ฑ…","government","grant"], "๐Ÿ›๏ธ ์ •๋ถ€๊ณผ์ œ", "#3b82f6"),
32
+ (["ํˆฌ์ž","ํŽ€๋”ฉ","์‹œ๋ฆฌ์ฆˆ","VC","IPO","์ธ์ˆ˜","M&A","๋ฐธ๋ฅ˜์—์ด์…˜","funding","investment","acquisition"], "๐Ÿ’ฐ ํˆฌ์ž/IR", "#f59e0b"),
33
+ (["์˜์ƒ","๋น„๋””์˜ค","video","์ƒ์„ฑ","sora","gen-","๋™์˜์ƒ","์ด๋ฏธ์ง€์ƒ์„ฑ"], "๐ŸŽฌ ์˜์ƒAI", "#ef4444"),
34
+ (["ํ•œ๊ตญ์–ด","korean","multilingual","๋ฒˆ์—ญ","๋‹ค๊ตญ์–ด"], "๐Ÿ‡ฐ๐Ÿ‡ท ํ•œ๊ตญ์–ดAI", "#8b5cf6"),
35
+ (["ํ—ˆ๊น…ํŽ˜์ด์Šค","hugging","HF","spaces","์˜คํ”ˆ์†Œ์Šค","open source","github"], "๐Ÿค— HF/์˜คํ”ˆ์†Œ์Šค", "#10b981"),
36
+ (["LLM","GPT","Claude","Gemini","๊ฑฐ๋Œ€์–ธ์–ด","ํŒŒ์ธํŠœ๋‹","RAG","์—์ด์ „ํŠธ","agent","transformer","llama","mistral"], "๐Ÿง  LLM/์—์ด์ „ํŠธ", "#6366f1"),
37
+ (["GPU","์นฉ","๋ฐ˜๋„์ฒด","์—”๋น„๋””์•„","NVIDIA","์ธํ”„๋ผ","์„œ๋ฒ„","ํด๋ผ์šฐ๋“œ","๋ฐ์ดํ„ฐ์„ผํ„ฐ","chip","server","cloud"], "๐Ÿ–ฅ๏ธ ์ธํ”„๋ผ/GPU", "#0d9488"),
38
+ (["๋ณด์•ˆ","๊ฐœ์ธ์ •๋ณด","๊ทœ์ œ","๋ฒ•์•ˆ","์œค๋ฆฌ","์•ˆ์ „","์ €์ž‘๊ถŒ","AI๋ฒ•","regulation","safety","privacy"], "๐Ÿ”’ ๊ทœ์ œ/์œค๋ฆฌ", "#dc2626"),
39
+ (["์Šคํƒ€ํŠธ์—…","์ฐฝ์—…","์‚ฌ์—…","์ œํœด","ํŒŒํŠธ๋„ˆ","๊ณ„์•ฝ","๋งค์ถœ","startup","business","revenue"], "๐Ÿ’ผ ๋น„์ฆˆ๋‹ˆ์Šค", "#ea580c"),
40
+ (["๊ต์œก","ํ•™์Šต","์—ฐ๊ตฌ","๋…ผ๋ฌธ","arXiv","๋ฒค์น˜๋งˆํฌ","์„ฑ๋Šฅ","ํ‰๊ฐ€","paper","research","benchmark"], "๐Ÿ“š R&D/์—ฐ๊ตฌ", "#059669"),
41
+ (["๋งˆ์ผ€ํŒ…","์ฝ˜ํ…์ธ ","SNS","๋ธŒ๋žœ๋”ฉ","ํ™๋ณด","PR","๋ฏธ๋””์–ด","marketing"], "๐Ÿ“ข ๋งˆ์ผ€ํŒ…/PR", "#9333ea"),
 
42
  ]
43
 
44
+ RELEVANCE_KW = {
45
+ "ํ•ต์‹ฌ": ["AI ์˜์ƒ","๋น„๋””์˜ค ์ƒ์„ฑ","ํ•œ๊ตญ์–ด","ํ—ˆ๊น…ํŽ˜์ด์Šค","์˜คํ”ˆ์†Œ์Šค","์—์ด์ „ํŠธ","LLM","์Šคํƒ€ํŠธ์—…","์ •๋ถ€๊ณผ์ œ","video generation","hugging face"],
46
+ "์ฃผ๋ชฉ": ["GPU","ํด๋ผ์šฐ๋“œ","์ธํ”„๋ผ","ํˆฌ์ž","์ƒ์„ฑAI","ํŒŒ์ธํŠœ๋‹","RAG","API","generative"],
47
+ "์ฐธ๊ณ ": ["๊ทœ์ œ","๊ต์œก","์—ฐ๊ตฌ","๋ณด์•ˆ","๋งˆ์ผ€ํŒ…","benchmark","safety"],
48
  }
49
 
50
 
51
+ def classify_news(title, source=""):
 
52
  text = (title + " " + source).lower()
53
+ tags, colors = [], {}
54
+ for keywords, tag, color in TAG_RULES:
 
 
55
  for kw in keywords:
56
  if kw.lower() in text:
57
+ if tag not in tags:
58
+ tags.append(tag)
59
+ colors[tag] = color
60
  break
 
61
  if not tags:
62
  tags.append("๐Ÿ“ฐ ์ผ๋ฐ˜AI๋‰ด์Šค")
63
  colors["๐Ÿ“ฐ ์ผ๋ฐ˜AI๋‰ด์Šค"] = "#64748b"
 
 
64
  relevance = "์ผ๋ฐ˜"
65
+ for level in ["ํ•ต์‹ฌ", "์ฃผ๋ชฉ", "์ฐธ๊ณ "]:
66
+ for kw in RELEVANCE_KW[level]:
 
 
 
 
 
 
 
 
 
67
  if kw.lower() in text:
68
+ relevance = level
69
  break
70
+ if relevance != "์ผ๋ฐ˜":
71
+ break
72
  return {"tags": tags, "colors": colors, "relevance": relevance}
73
 
74
 
75
def translate_to_korean(text):
    """Best-effort English→Korean translation of *text*.

    Returns the input unchanged when it is empty, when deep-translator is
    unavailable, when the text already contains Hangul, or when the
    translation call fails or returns an empty result.
    """
    if not text or not HAS_TRANSLATOR:
        return text
    # Skip strings that already contain Hangul syllables.
    if re.search(r'[๊ฐ€-ํžฃ]', text):
        return text
    try:
        result = translator.translate(text)
    except Exception as e:
        print(f" ๋ฒˆ์—ญ ์‹คํŒจ: {e}")
        return text
    return result if result else text
88
 
89
 
90
def gen_summary(title):
    """Return a one-line summary of *title* capped at 80 characters.

    Whitespace is stripped; titles longer than 80 chars are truncated to
    77 chars plus "..." so the result never exceeds 80. (The previous
    `t[:80] + "..."` produced 83-character strings despite the 80-char
    length check, exceeding the intended cap.)
    """
    t = title.strip()
    if len(t) > 80:
        return t[:77] + "..."
    return t
93
+
94
 
95
  UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
96
 
97
 
98
+ def fetch_aitimes(max_items=20):
 
99
  print("๐Ÿ“ฐ AI Times ์ˆ˜์ง‘ ์ค‘...")
100
  urls = [
101
  "https://www.aitimes.com/news/articleList.html?sc_multi_code=S2&view_type=sm",
 
104
  all_news = []
105
  today = datetime.now().strftime("%m-%d")
106
  yesterday = (datetime.now() - timedelta(days=1)).strftime("%m-%d")
 
107
  for url in urls:
108
  try:
109
  r = requests.get(url, timeout=15, headers={"User-Agent": UA})
110
  r.raise_for_status()
111
  r.encoding = "utf-8"
112
  soup = BeautifulSoup(r.text, "html.parser")
113
+ for tag in soup.find_all("a", href=re.compile(r"/news/articleView\.html\?idxno=\d+")):
 
 
114
  title = tag.get_text(strip=True)
115
  link = tag.get("href", "")
116
  if not title or len(title) < 10:
117
  continue
118
  if link and not link.startswith("http"):
119
  link = "https://www.aitimes.com" + link
120
+ date_text = today
121
+ if tag.parent:
122
+ m = re.search(r"(\d{2}-\d{2}\s+\d{2}:\d{2})", tag.parent.get_text())
 
 
123
  if m:
124
  date_text = m.group(1)
 
 
 
125
  if today not in date_text and yesterday not in date_text:
126
  continue
 
127
  cls = classify_news(title, "AI Times")
128
+ all_news.append({"title": title, "url": link, "date": date_text, "source": "AI Times", "summary": gen_summary(title), **cls})
 
 
 
 
129
  time.sleep(0.5)
130
  except Exception as e:
131
+ print(f" โš ๏ธ AI Times: {e}")
 
132
  seen = set()
133
+ unique = [n for n in all_news if n["url"] not in seen and not seen.add(n["url"])]
 
 
 
 
134
  print(f" โœ… AI Times {len(unique)}๊ฑด")
135
  return unique[:max_items]
136
 
137
 
138
+ def fetch_hackernews(limit=15):
 
139
  print("๐Ÿ”ฅ Hacker News ์ˆ˜์ง‘ ์ค‘...")
140
  news = []
141
  try:
142
  r = requests.get("https://hacker-news.firebaseio.com/v0/topstories.json", timeout=10)
143
  ids = r.json()[:limit * 3]
144
  cutoff = datetime.utcnow() - timedelta(hours=36)
 
145
  for sid in ids:
146
  if len(news) >= limit:
147
  break
 
153
  st = datetime.utcfromtimestamp(s.get("time", 0))
154
  if st < cutoff:
155
  continue
156
+ title_en = s.get("title", "")
157
+ # ํ•œ๊ธ€ ๋ฒˆ์—ญ
158
+ title_ko = translate_to_korean(title_en)
159
+ cls = classify_news(title_en + " " + title_ko, "Hacker News")
160
  news.append({
161
+ "title": title_ko,
162
+ "title_en": title_en,
163
+ "url": s["url"],
164
  "date": st.strftime("%m-%d %H:%M"),
165
  "source": "Hacker News",
166
+ "summary": gen_summary(title_ko),
167
  "score": s.get("score", 0),
168
  **cls,
169
  })
170
+ time.sleep(0.2)
171
  except Exception:
172
  continue
173
+ print(f" โœ… HN {len(news)}๊ฑด (ํ•œ๊ธ€ ๋ฒˆ์—ญ ์™„๋ฃŒ)")
174
  except Exception as e:
175
+ print(f" โš ๏ธ HN: {e}")
176
  return news
177
 
178
 
179
def load_news_from_hf():
    """Load the cached news list from the HF dataset repo.

    Returns the parsed JSON list, or [] when the file is missing,
    unreachable, or unparseable. The failure is logged instead of being
    silently swallowed (the bare `except Exception: return []` hid
    download/auth errors and made cache misses undiagnosable).
    """
    try:
        path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=NEWS_FILE,
            repo_type="dataset",
            token=HF_TOKEN,
            force_download=True,  # always fetch the latest revision, not a stale local cache
        )
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except Exception as e:
        print(f"[INFO] News load: {e}")
        return []
186
 
187
 
188
def save_news_to_hf(news_list):
    """Persist *news_list* as JSON to the HF dataset repo (best effort).

    Writes to a temp file, then uploads it as NEWS_FILE. Both success and
    failure are logged — the success log was dropped in v2, leaving only
    the error path observable.
    """
    try:
        tmp = os.path.join(tempfile.gettempdir(), NEWS_FILE)
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(news_list, f, ensure_ascii=False, indent=2)
        hf_api.upload_file(
            path_or_fileobj=tmp,
            path_in_repo=NEWS_FILE,
            repo_id=DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        print(f"[OK] Saved {len(news_list)} news to HF")
    except Exception as e:
        print(f"[ERROR] News save: {e}")
196
 
197
 
198
def collect_news(force=False, cache_ttl=21600):
    """Collect, classify, sort, and persist news (entry point for app.py).

    Args:
        force: when True, skip the cache check and always re-crawl.
        cache_ttl: cache freshness window in seconds (default 21600 = 6 h);
            previously a hard-coded constant, now a backward-compatible
            parameter.

    Returns the combined news list sorted by relevance
    (ํ•ต์‹ฌ > ์ฃผ๋ชฉ > ์ฐธ๊ณ  > ์ผ๋ฐ˜).
    """
    if not force:
        cached = load_news_from_hf()
        if cached:
            try:
                last = cached[0].get("collected_at", "")
                if last:
                    age = (datetime.now() - datetime.fromisoformat(last)).total_seconds()
                    if age < cache_ttl:
                        print("[NEWS] Cache fresh, reusing")  # restored v1 diagnostic
                        return cached
            except Exception:
                pass  # malformed timestamp — fall through and re-crawl

    print("\n[NEWS] Collecting fresh news...")
    now_iso = datetime.now().isoformat()
    all_news = fetch_aitimes(20) + fetch_hackernews(15)

    # Stamp every item with one shared collection timestamp (also used
    # as the cache-freshness marker on the next run).
    for item in all_news:
        item["collected_at"] = now_iso

    # Sort by relevance priority; unknown labels sink to the bottom.
    order = {"ํ•ต์‹ฌ": 0, "์ฃผ๋ชฉ": 1, "์ฐธ๊ณ ": 2, "์ผ๋ฐ˜": 3}
    all_news.sort(key=lambda x: order.get(x.get("relevance", "์ผ๋ฐ˜"), 3))

    # Only persist when we have a token and actually collected something.
    if HF_TOKEN and all_news:
        save_news_to_hf(all_news)

    print(f"[NEWS] Total: {len(all_news)} articles\n")  # restored v1 diagnostic
    return all_news