Update news.py
Browse files
news.py
CHANGED
|
@@ -1,110 +1,101 @@
|
|
| 1 |
"""
|
| 2 |
-
VDash ๋ด์ค ๋ชจ๋
|
| 3 |
- AI Times + Hacker News ํฌ๋กค๋ง
|
| 4 |
-
-
|
|
|
|
| 5 |
- HF Dataset ์๊ตฌ ์ ์ฅ
|
| 6 |
"""
|
| 7 |
|
| 8 |
-
import requests
|
| 9 |
-
import json
|
| 10 |
-
import re
|
| 11 |
-
import time
|
| 12 |
-
import os
|
| 13 |
-
import tempfile
|
| 14 |
from datetime import datetime, timedelta
|
| 15 |
from typing import List, Dict
|
| 16 |
from bs4 import BeautifulSoup
|
| 17 |
from huggingface_hub import HfApi, hf_hub_download
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 20 |
SPACE_ID = os.getenv("SPACE_ID", "")
|
| 21 |
OWNER = SPACE_ID.split("/")[0] if SPACE_ID else "vidraft"
|
| 22 |
DATASET_REPO = os.getenv("DATASET_REPO", f"{OWNER}/vidraft-dashboard-data")
|
| 23 |
NEWS_FILE = "news.json"
|
| 24 |
-
|
| 25 |
hf_api = HfApi(token=HF_TOKEN)
|
| 26 |
|
| 27 |
-
# ============================================================
|
| 28 |
-
# ๋น๋๋ํํธ ๊ด์ ํ๊ทธ ๋ถ๋ฅ ๊ท์น
|
| 29 |
-
# ============================================================
|
| 30 |
-
|
| 31 |
TAG_RULES = [
|
| 32 |
-
|
| 33 |
-
(["
|
| 34 |
-
(["
|
| 35 |
-
(["
|
| 36 |
-
(["
|
| 37 |
-
(["
|
| 38 |
-
(["
|
| 39 |
-
(["
|
| 40 |
-
(["
|
| 41 |
-
(["
|
| 42 |
-
(["
|
| 43 |
-
(["๋ง์ผํ
", "์ฝํ
์ธ ", "SNS", "๋ธ๋๋ฉ", "ํ๋ณด", "PR", "๋ฏธ๋์ด"], "๐ข ๋ง์ผํ
/PR", "#9333ea"),
|
| 44 |
]
|
| 45 |
|
| 46 |
-
|
| 47 |
-
"
|
| 48 |
-
"
|
| 49 |
-
"
|
| 50 |
}
|
| 51 |
|
| 52 |
|
| 53 |
-
def classify_news(title
|
| 54 |
-
"""๋ด์ค๋ฅผ ๋น๋๋ํํธ ๊ด์ ์์ ๋ถ๋ฅ"""
|
| 55 |
text = (title + " " + source).lower()
|
| 56 |
-
tags = []
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
for keywords, tag_name, color in TAG_RULES:
|
| 60 |
for kw in keywords:
|
| 61 |
if kw.lower() in text:
|
| 62 |
-
if
|
| 63 |
-
tags.append(
|
| 64 |
-
colors[
|
| 65 |
break
|
| 66 |
-
|
| 67 |
if not tags:
|
| 68 |
tags.append("๐ฐ ์ผ๋ฐAI๋ด์ค")
|
| 69 |
colors["๐ฐ ์ผ๋ฐAI๋ด์ค"] = "#64748b"
|
| 70 |
-
|
| 71 |
-
# ๋น๋๋ํํธ ๊ด๋ จ๋ ํ๋จ
|
| 72 |
relevance = "์ผ๋ฐ"
|
| 73 |
-
for
|
| 74 |
-
|
| 75 |
-
relevance = "ํต์ฌ"
|
| 76 |
-
break
|
| 77 |
-
if relevance == "์ผ๋ฐ":
|
| 78 |
-
for kw in RELEVANCE_KEYWORDS["high"]:
|
| 79 |
-
if kw.lower() in text:
|
| 80 |
-
relevance = "์ฃผ๋ชฉ"
|
| 81 |
-
break
|
| 82 |
-
if relevance == "์ผ๋ฐ":
|
| 83 |
-
for kw in RELEVANCE_KEYWORDS["medium"]:
|
| 84 |
if kw.lower() in text:
|
| 85 |
-
relevance =
|
| 86 |
break
|
| 87 |
-
|
|
|
|
| 88 |
return {"tags": tags, "colors": colors, "relevance": relevance}
|
| 89 |
|
| 90 |
|
| 91 |
-
def
|
| 92 |
-
"""
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
|
|
|
| 102 |
|
| 103 |
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
| 104 |
|
| 105 |
|
| 106 |
-
def fetch_aitimes(max_items
|
| 107 |
-
"""AI Times ์ต์ ๋ด์ค ํฌ๋กค๋ง"""
|
| 108 |
print("๐ฐ AI Times ์์ง ์ค...")
|
| 109 |
urls = [
|
| 110 |
"https://www.aitimes.com/news/articleList.html?sc_multi_code=S2&view_type=sm",
|
|
@@ -113,64 +104,44 @@ def fetch_aitimes(max_items: int = 20) -> List[Dict]:
|
|
| 113 |
all_news = []
|
| 114 |
today = datetime.now().strftime("%m-%d")
|
| 115 |
yesterday = (datetime.now() - timedelta(days=1)).strftime("%m-%d")
|
| 116 |
-
|
| 117 |
for url in urls:
|
| 118 |
try:
|
| 119 |
r = requests.get(url, timeout=15, headers={"User-Agent": UA})
|
| 120 |
r.raise_for_status()
|
| 121 |
r.encoding = "utf-8"
|
| 122 |
soup = BeautifulSoup(r.text, "html.parser")
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
for tag in articles:
|
| 126 |
title = tag.get_text(strip=True)
|
| 127 |
link = tag.get("href", "")
|
| 128 |
if not title or len(title) < 10:
|
| 129 |
continue
|
| 130 |
if link and not link.startswith("http"):
|
| 131 |
link = "https://www.aitimes.com" + link
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
if parent:
|
| 136 |
-
m = re.search(r"(\d{2}-\d{2}\s+\d{2}:\d{2})", parent.get_text())
|
| 137 |
if m:
|
| 138 |
date_text = m.group(1)
|
| 139 |
-
if not date_text:
|
| 140 |
-
date_text = today
|
| 141 |
-
|
| 142 |
if today not in date_text and yesterday not in date_text:
|
| 143 |
continue
|
| 144 |
-
|
| 145 |
cls = classify_news(title, "AI Times")
|
| 146 |
-
all_news.append({
|
| 147 |
-
"title": title, "url": link, "date": date_text,
|
| 148 |
-
"source": "AI Times", "summary": generate_summary(title),
|
| 149 |
-
**cls,
|
| 150 |
-
})
|
| 151 |
time.sleep(0.5)
|
| 152 |
except Exception as e:
|
| 153 |
-
print(f" โ ๏ธ AI Times
|
| 154 |
-
|
| 155 |
seen = set()
|
| 156 |
-
unique = []
|
| 157 |
-
for n in all_news:
|
| 158 |
-
if n["url"] not in seen:
|
| 159 |
-
seen.add(n["url"])
|
| 160 |
-
unique.append(n)
|
| 161 |
print(f" โ
AI Times {len(unique)}๊ฑด")
|
| 162 |
return unique[:max_items]
|
| 163 |
|
| 164 |
|
| 165 |
-
def fetch_hackernews(limit
|
| 166 |
-
"""Hacker News ์ต์ AI ๊ด๋ จ ๋ด์ค"""
|
| 167 |
print("๐ฅ Hacker News ์์ง ์ค...")
|
| 168 |
news = []
|
| 169 |
try:
|
| 170 |
r = requests.get("https://hacker-news.firebaseio.com/v0/topstories.json", timeout=10)
|
| 171 |
ids = r.json()[:limit * 3]
|
| 172 |
cutoff = datetime.utcnow() - timedelta(hours=36)
|
| 173 |
-
|
| 174 |
for sid in ids:
|
| 175 |
if len(news) >= limit:
|
| 176 |
break
|
|
@@ -182,94 +153,65 @@ def fetch_hackernews(limit: int = 15) -> List[Dict]:
|
|
| 182 |
st = datetime.utcfromtimestamp(s.get("time", 0))
|
| 183 |
if st < cutoff:
|
| 184 |
continue
|
| 185 |
-
|
| 186 |
-
|
|
|
|
|
|
|
| 187 |
news.append({
|
| 188 |
-
"title":
|
|
|
|
|
|
|
| 189 |
"date": st.strftime("%m-%d %H:%M"),
|
| 190 |
"source": "Hacker News",
|
| 191 |
-
"summary":
|
| 192 |
"score": s.get("score", 0),
|
| 193 |
**cls,
|
| 194 |
})
|
| 195 |
-
time.sleep(0.
|
| 196 |
except Exception:
|
| 197 |
continue
|
| 198 |
-
print(f" โ
HN {len(news)}๊ฑด")
|
| 199 |
except Exception as e:
|
| 200 |
-
print(f" โ ๏ธ HN
|
| 201 |
return news
|
| 202 |
|
| 203 |
|
| 204 |
-
|
| 205 |
-
# HF Dataset ์ ์ฅ/๋ก๋
|
| 206 |
-
# ============================================================
|
| 207 |
-
|
| 208 |
-
def load_news_from_hf() -> List[Dict]:
|
| 209 |
try:
|
| 210 |
-
path = hf_hub_download(
|
| 211 |
-
repo_id=DATASET_REPO, filename=NEWS_FILE,
|
| 212 |
-
repo_type="dataset", token=HF_TOKEN, force_download=True,
|
| 213 |
-
)
|
| 214 |
with open(path, "r", encoding="utf-8") as f:
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
return data
|
| 218 |
-
except Exception as e:
|
| 219 |
-
print(f"[INFO] News load: {e}")
|
| 220 |
return []
|
| 221 |
|
| 222 |
|
| 223 |
-
def save_news_to_hf(news_list
|
| 224 |
try:
|
| 225 |
tmp = os.path.join(tempfile.gettempdir(), NEWS_FILE)
|
| 226 |
with open(tmp, "w", encoding="utf-8") as f:
|
| 227 |
json.dump(news_list, f, ensure_ascii=False, indent=2)
|
| 228 |
-
hf_api.upload_file(
|
| 229 |
-
path_or_fileobj=tmp, path_in_repo=NEWS_FILE,
|
| 230 |
-
repo_id=DATASET_REPO, repo_type="dataset", token=HF_TOKEN,
|
| 231 |
-
)
|
| 232 |
-
print(f"[OK] Saved {len(news_list)} news to HF")
|
| 233 |
except Exception as e:
|
| 234 |
print(f"[ERROR] News save: {e}")
|
| 235 |
|
| 236 |
|
| 237 |
-
|
| 238 |
-
# ๋ฉ์ธ ์์ง ํจ์ (app.py์์ ํธ์ถ)
|
| 239 |
-
# ============================================================
|
| 240 |
-
|
| 241 |
-
def collect_news(force: bool = False) -> List[Dict]:
|
| 242 |
-
"""๋ด์ค ์์ง + ๋ถ๋ฅ + ์ ์ฅ"""
|
| 243 |
if not force:
|
| 244 |
cached = load_news_from_hf()
|
| 245 |
if cached:
|
| 246 |
-
# ์บ์๊ฐ 6์๊ฐ ์ด๋ด๋ฉด ์ฌ์ฌ์ฉ
|
| 247 |
try:
|
| 248 |
last = cached[0].get("collected_at", "")
|
| 249 |
-
if last:
|
| 250 |
-
|
| 251 |
-
if (datetime.now() - last_dt).total_seconds() < 21600:
|
| 252 |
-
print("[NEWS] Cache fresh, reusing")
|
| 253 |
-
return cached
|
| 254 |
except Exception:
|
| 255 |
pass
|
| 256 |
-
|
| 257 |
print("\n[NEWS] Collecting fresh news...")
|
| 258 |
now_iso = datetime.now().isoformat()
|
| 259 |
-
|
| 260 |
-
aitimes = fetch_aitimes(20)
|
| 261 |
-
hn = fetch_hackernews(15)
|
| 262 |
-
all_news = aitimes + hn
|
| 263 |
-
|
| 264 |
for n in all_news:
|
| 265 |
n["collected_at"] = now_iso
|
| 266 |
-
|
| 267 |
-
# ๊ด๋ จ๋ ์ ์ ๋ ฌ: ํต์ฌ > ์ฃผ๋ชฉ > ์ฐธ๊ณ > ์ผ๋ฐ
|
| 268 |
order = {"ํต์ฌ": 0, "์ฃผ๋ชฉ": 1, "์ฐธ๊ณ ": 2, "์ผ๋ฐ": 3}
|
| 269 |
all_news.sort(key=lambda x: order.get(x.get("relevance", "์ผ๋ฐ"), 3))
|
| 270 |
-
|
| 271 |
if HF_TOKEN and all_news:
|
| 272 |
save_news_to_hf(all_news)
|
| 273 |
-
|
| 274 |
-
print(f"[NEWS] Total: {len(all_news)} articles\n")
|
| 275 |
return all_news
|
|
|
|
| 1 |
"""
|
| 2 |
+
VDash ๋ด์ค ๋ชจ๋ v2
|
| 3 |
- AI Times + Hacker News ํฌ๋กค๋ง
|
| 4 |
+
- HN ์๋ฌธ ์ ๋ชฉ โ ํ๊ธ ์๋ ๋ฒ์ญ
|
| 5 |
+
- ๋น๋๋ํํธ ๊ด์ ์๋ ๋ถ๋ฅ
|
| 6 |
- HF Dataset ์๊ตฌ ์ ์ฅ
|
| 7 |
"""
|
| 8 |
|
| 9 |
+
import requests, json, re, time, os, tempfile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
from datetime import datetime, timedelta
|
| 11 |
from typing import List, Dict
|
| 12 |
from bs4 import BeautifulSoup
|
| 13 |
from huggingface_hub import HfApi, hf_hub_download
|
| 14 |
|
| 15 |
+
# Optional EN->KO translation backend. The module degrades gracefully when
# deep-translator is not installed: HAS_TRANSLATOR gates translate_to_korean(),
# and Hacker News titles then stay in English.
try:
    from deep_translator import GoogleTranslator
    translator = GoogleTranslator(source='en', target='ko')
    HAS_TRANSLATOR = True
except Exception:
    # Broad catch on purpose: any import/initialisation failure just disables translation.
    HAS_TRANSLATOR = False
    print("[NEWS] deep-translator not available, HN titles will stay English")
|
| 22 |
+
|
| 23 |
# --- Configuration (all overridable via environment variables) ---
HF_TOKEN = os.getenv("HF_TOKEN")  # HF write token; collect_news() skips persistence when unset
SPACE_ID = os.getenv("SPACE_ID", "")  # "owner/space" when running inside an HF Space
OWNER = SPACE_ID.split("/")[0] if SPACE_ID else "vidraft"  # fall back to a fixed owner locally
DATASET_REPO = os.getenv("DATASET_REPO", f"{OWNER}/vidraft-dashboard-data")  # dataset used as storage
NEWS_FILE = "news.json"  # file name inside the dataset repo
# Shared client for dataset uploads (see save_news_to_hf).
hf_api = HfApi(token=HF_TOKEN)
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
# Vidraft-perspective tagging rules: (keyword list, tag label, hex color).
# classify_news() does case-insensitive substring matching; within one rule
# the first keyword hit assigns the tag. Keywords mix Korean and English.
TAG_RULES = [
    (["์ ๋ถ","๊ณผ์ ","๊ณต๋ชจ","์ง์์ฌ์","IITP","NIA","NIPA","๊ตญ์ฑ","government","grant"], "๐๏ธ ์ ๋ถ๊ณผ์ ", "#3b82f6"),
    (["ํฌ์","ํ๋ฉ","์๋ฆฌ์ฆ","VC","IPO","์ธ์","M&A","๋ฐธ๋ฅ์์ด์","funding","investment","acquisition"], "๐ฐ ํฌ์/IR", "#f59e0b"),
    (["์์","๋น๋์ค","video","์์ฑ","sora","gen-","๋์์","์ด๋ฏธ์ง์์ฑ"], "๐ฌ ์์AI", "#ef4444"),
    (["ํ๊ตญ์ด","korean","multilingual","๋ฒ์ญ","๋ค๊ตญ์ด"], "๐ฐ๐ท ํ๊ตญ์ดAI", "#8b5cf6"),
    (["ํ๊นํ์ด์ค","hugging","HF","spaces","์คํ์์ค","open source","github"], "๐ค HF/์คํ์์ค", "#10b981"),
    (["LLM","GPT","Claude","Gemini","๊ฑฐ๋์ธ์ด","ํ์ธํ๋","RAG","์์ด์ ํธ","agent","transformer","llama","mistral"], "๐ง  LLM/์์ด์ ํธ", "#6366f1"),
    (["GPU","์นฉ","๋ฐ๋์ฒด","์๋น๋์","NVIDIA","์ธํ๋ผ","์๋ฒ","ํด๋ผ์ฐ๋","๋ฐ์ดํฐ์ผํฐ","chip","server","cloud"], "๐ฅ๏ธ ์ธํ๋ผ/GPU", "#0d9488"),
    (["๋ณด์","๊ฐ์ธ์ ๋ณด","๊ท์ ","๋ฒ์","์ค๋ฆฌ","์์ ","์ ์๊ถ","AI๋ฒ","regulation","safety","privacy"], "๐ ๊ท์ /์ค๋ฆฌ", "#dc2626"),
    (["์คํํธ์","์ฐฝ์","์ฌ์","์ ํด","ํํธ๋","๊ณ์ฝ","๋งค์ถ","startup","business","revenue"], "๐ผ ๋น์ฆ๋์ค", "#ea580c"),
    (["๊ต์ก","ํ์ต","์ฐ๊ตฌ","๋ผ๋ฌธ","arXiv","๋ฒค์น๋งํฌ","์ฑ๋ฅ","ํ๊ฐ","paper","research","benchmark"], "๐ R&D/์ฐ๊ตฌ", "#059669"),
    (["๋ง์ผํ","์ฝํ์ธ ","SNS","๋ธ๋๋ฉ","ํ๋ณด","PR","๋ฏธ๋์ด","marketing"], "๐ข ๋ง์ผํ/PR", "#9333ea"),
]
|
| 43 |
|
| 44 |
+
# Relevance tiers, checked in declaration order by classify_news();
# the first tier with a keyword hit wins, anything else falls back to the
# default tier assigned there.
RELEVANCE_KW = {
    "ํต์ฌ": ["AI ์์","๋น๋์ค ์์ฑ","ํ๊ตญ์ด","ํ๊นํ์ด์ค","์คํ์์ค","์์ด์ ํธ","LLM","์คํํธ์","์ ๋ถ๊ณผ์ ","video generation","hugging face"],
    "์ฃผ๋ชฉ": ["GPU","ํด๋ผ์ฐ๋","์ธํ๋ผ","ํฌ์","์์ฑAI","ํ์ธํ๋","RAG","API","generative"],
    "์ฐธ๊ณ ": ["๊ท์ ","๊ต์ก","์ฐ๊ตฌ","๋ณด์","๋ง์ผํ","benchmark","safety"],
}
|
| 49 |
|
| 50 |
|
| 51 |
+
def classify_news(title, source=""):
    """Classify one headline into perspective tags plus a relevance tier.

    Matching is a case-insensitive substring search over ``title + source``.
    Returns ``{"tags": [...], "colors": {tag: hex}, "relevance": tier}``.
    """
    text = (title + " " + source).lower()
    tags, colors = [], {}

    # At most one tag per rule; a rule fires on its first matching keyword.
    for keywords, tag, color in TAG_RULES:
        if any(kw.lower() in text for kw in keywords) and tag not in tags:
            tags.append(tag)
            colors[tag] = color

    # Fallback bucket so every item carries at least one tag.
    if not tags:
        tags.append("๐ฐ ์ผ๋ฐAI๋ด์ค")
        colors["๐ฐ ์ผ๋ฐAI๋ด์ค"] = "#64748b"

    # Highest tier whose keyword list hits wins; default otherwise.
    relevance = "์ผ๋ฐ"
    for level in ["ํต์ฌ", "์ฃผ๋ชฉ", "์ฐธ๊ณ "]:
        if any(kw.lower() in text for kw in RELEVANCE_KW[level]):
            relevance = level
            break

    return {"tags": tags, "colors": colors, "relevance": relevance}
|
| 73 |
|
| 74 |
|
| 75 |
+
def translate_to_korean(text):
    """Best-effort EN->KO translation; returns the input unchanged on any failure.

    Skips empty/None input, skips when no translator backend is available,
    and skips text that already contains Hangul syllables.
    """
    if not text or not HAS_TRANSLATOR:
        return text
    if re.search(r'[๊ฐ-ํฃ]', text):
        # Already Korean — nothing to do.
        return text
    try:
        translated = translator.translate(text)
    except Exception as e:
        print(f" ๋ฒ์ญ ์คํจ: {e}")
        return text
    # Guard against an empty/None result from the backend.
    return translated if translated else text
|
| 88 |
|
| 89 |
|
| 90 |
+
def gen_summary(title):
    """Return *title* stripped and truncated to 80 chars, with an ellipsis when cut."""
    trimmed = title.strip()
    if len(trimmed) > 80:
        return trimmed[:80] + "..."
    return trimmed
|
| 93 |
+
|
| 94 |
|
| 95 |
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
| 96 |
|
| 97 |
|
| 98 |
+
def fetch_aitimes(max_items=20):
|
|
|
|
| 99 |
print("๐ฐ AI Times ์์ง ์ค...")
|
| 100 |
urls = [
|
| 101 |
"https://www.aitimes.com/news/articleList.html?sc_multi_code=S2&view_type=sm",
|
|
|
|
| 104 |
all_news = []
|
| 105 |
today = datetime.now().strftime("%m-%d")
|
| 106 |
yesterday = (datetime.now() - timedelta(days=1)).strftime("%m-%d")
|
|
|
|
| 107 |
for url in urls:
|
| 108 |
try:
|
| 109 |
r = requests.get(url, timeout=15, headers={"User-Agent": UA})
|
| 110 |
r.raise_for_status()
|
| 111 |
r.encoding = "utf-8"
|
| 112 |
soup = BeautifulSoup(r.text, "html.parser")
|
| 113 |
+
for tag in soup.find_all("a", href=re.compile(r"/news/articleView\.html\?idxno=\d+")):
|
|
|
|
|
|
|
| 114 |
title = tag.get_text(strip=True)
|
| 115 |
link = tag.get("href", "")
|
| 116 |
if not title or len(title) < 10:
|
| 117 |
continue
|
| 118 |
if link and not link.startswith("http"):
|
| 119 |
link = "https://www.aitimes.com" + link
|
| 120 |
+
date_text = today
|
| 121 |
+
if tag.parent:
|
| 122 |
+
m = re.search(r"(\d{2}-\d{2}\s+\d{2}:\d{2})", tag.parent.get_text())
|
|
|
|
|
|
|
| 123 |
if m:
|
| 124 |
date_text = m.group(1)
|
|
|
|
|
|
|
|
|
|
| 125 |
if today not in date_text and yesterday not in date_text:
|
| 126 |
continue
|
|
|
|
| 127 |
cls = classify_news(title, "AI Times")
|
| 128 |
+
all_news.append({"title": title, "url": link, "date": date_text, "source": "AI Times", "summary": gen_summary(title), **cls})
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
time.sleep(0.5)
|
| 130 |
except Exception as e:
|
| 131 |
+
print(f" โ ๏ธ AI Times: {e}")
|
|
|
|
| 132 |
seen = set()
|
| 133 |
+
unique = [n for n in all_news if n["url"] not in seen and not seen.add(n["url"])]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
print(f" โ
AI Times {len(unique)}๊ฑด")
|
| 135 |
return unique[:max_items]
|
| 136 |
|
| 137 |
|
| 138 |
+
def fetch_hackernews(limit=15):
|
|
|
|
| 139 |
print("๐ฅ Hacker News ์์ง ์ค...")
|
| 140 |
news = []
|
| 141 |
try:
|
| 142 |
r = requests.get("https://hacker-news.firebaseio.com/v0/topstories.json", timeout=10)
|
| 143 |
ids = r.json()[:limit * 3]
|
| 144 |
cutoff = datetime.utcnow() - timedelta(hours=36)
|
|
|
|
| 145 |
for sid in ids:
|
| 146 |
if len(news) >= limit:
|
| 147 |
break
|
|
|
|
| 153 |
st = datetime.utcfromtimestamp(s.get("time", 0))
|
| 154 |
if st < cutoff:
|
| 155 |
continue
|
| 156 |
+
title_en = s.get("title", "")
|
| 157 |
+
# ํ๊ธ ๋ฒ์ญ
|
| 158 |
+
title_ko = translate_to_korean(title_en)
|
| 159 |
+
cls = classify_news(title_en + " " + title_ko, "Hacker News")
|
| 160 |
news.append({
|
| 161 |
+
"title": title_ko,
|
| 162 |
+
"title_en": title_en,
|
| 163 |
+
"url": s["url"],
|
| 164 |
"date": st.strftime("%m-%d %H:%M"),
|
| 165 |
"source": "Hacker News",
|
| 166 |
+
"summary": gen_summary(title_ko),
|
| 167 |
"score": s.get("score", 0),
|
| 168 |
**cls,
|
| 169 |
})
|
| 170 |
+
time.sleep(0.2)
|
| 171 |
except Exception:
|
| 172 |
continue
|
| 173 |
+
print(f" โ
HN {len(news)}๊ฑด (ํ๊ธ ๋ฒ์ญ ์๋ฃ)")
|
| 174 |
except Exception as e:
|
| 175 |
+
print(f" โ ๏ธ HN: {e}")
|
| 176 |
return news
|
| 177 |
|
| 178 |
|
| 179 |
+
def load_news_from_hf():
    """Load the cached news list from the HF dataset repo.

    Returns [] when the file is missing, unreadable, or does not contain a
    JSON list, so callers can always treat the result as a list
    (collect_news() indexes element 0 of whatever this returns).
    """
    try:
        path = hf_hub_download(repo_id=DATASET_REPO, filename=NEWS_FILE, repo_type="dataset", token=HF_TOKEN, force_download=True)
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        # Guard against a corrupted or mistyped upload.
        return data if isinstance(data, list) else []
    except Exception as e:
        # Expected on first run (file not yet uploaded) — log instead of
        # swallowing silently so real failures remain diagnosable.
        print(f"[INFO] News load: {e}")
        return []
|
| 186 |
|
| 187 |
|
| 188 |
+
def save_news_to_hf(news_list):
    """Serialise *news_list* to a temp JSON file and upload it to the dataset repo.

    Best-effort: any failure is reported on stdout and otherwise ignored.
    """
    try:
        tmp_path = os.path.join(tempfile.gettempdir(), NEWS_FILE)
        with open(tmp_path, "w", encoding="utf-8") as fh:
            json.dump(news_list, fh, ensure_ascii=False, indent=2)
        hf_api.upload_file(
            path_or_fileobj=tmp_path,
            path_in_repo=NEWS_FILE,
            repo_id=DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN,
        )
    except Exception as e:
        print(f"[ERROR] News save: {e}")
|
| 196 |
|
| 197 |
|
| 198 |
+
def collect_news(force=False):
    """Return the news feed: cached copy if fresh, otherwise crawl, sort, persist.

    With force=False a cached list younger than 6 hours is reused as-is.
    Fresh items are stamped with a collection timestamp, ordered by
    relevance tier, and uploaded to the HF dataset when a token is set.
    """
    if not force:
        cached = load_news_from_hf()
        if cached:
            try:
                stamp = cached[0].get("collected_at", "")
                if stamp:
                    age = (datetime.now() - datetime.fromisoformat(stamp)).total_seconds()
                    if age < 21600:  # 6 hours
                        return cached
            except Exception:
                # Malformed timestamp — fall through and re-collect.
                pass

    print("\n[NEWS] Collecting fresh news...")
    now_iso = datetime.now().isoformat()
    all_news = fetch_aitimes(20) + fetch_hackernews(15)
    for item in all_news:
        item["collected_at"] = now_iso

    # Sort by relevance tier; unknown tiers sink to the bottom.
    order = {"ํต์ฌ": 0, "์ฃผ๋ชฉ": 1, "์ฐธ๊ณ ": 2, "์ผ๋ฐ": 3}
    all_news.sort(key=lambda item: order.get(item.get("relevance", "์ผ๋ฐ"), 3))

    if HF_TOKEN and all_news:
        save_news_to_hf(all_news)
    return all_news
|