openfree committed on
Commit
a6fc716
ยท
verified ยท
1 Parent(s): 3a4965c

Update news.py

Browse files
Files changed (1) hide show
  1. news.py +85 -143
news.py CHANGED
@@ -1,110 +1,101 @@
1
  """
2
- VDash ๋‰ด์Šค ๋ชจ๋“ˆ
3
  - AI Times + Hacker News ํฌ๋กค๋ง
4
- - ๋น„๋“œ๋ž˜ํ”„ํŠธ ์กฐ์ง ๊ด€์  ์ž๋™ ๋ถ„๋ฅ˜/ํƒœ๊น…
 
5
  - HF Dataset ์˜๊ตฌ ์ €์žฅ
6
  """
7
 
8
- import requests
9
- import json
10
- import re
11
- import time
12
- import os
13
- import tempfile
14
  from datetime import datetime, timedelta
15
  from typing import List, Dict
16
  from bs4 import BeautifulSoup
17
  from huggingface_hub import HfApi, hf_hub_download
18
 
 
 
 
 
 
 
 
 
19
  HF_TOKEN = os.getenv("HF_TOKEN")
20
  SPACE_ID = os.getenv("SPACE_ID", "")
21
  OWNER = SPACE_ID.split("/")[0] if SPACE_ID else "vidraft"
22
  DATASET_REPO = os.getenv("DATASET_REPO", f"{OWNER}/vidraft-dashboard-data")
23
  NEWS_FILE = "news.json"
24
-
25
  hf_api = HfApi(token=HF_TOKEN)
26
 
27
- # ============================================================
28
- # ๋น„๋“œ๋ž˜ํ”„ํŠธ ๊ด€์  ํƒœ๊ทธ ๋ถ„๋ฅ˜ ๊ทœ์น™
29
- # ============================================================
30
-
31
  TAG_RULES = [
32
- # (ํ‚ค์›Œ๋“œ ๋ฆฌ์ŠคํŠธ, ํƒœ๊ทธ๋ช…, ์ƒ‰์ƒ์ฝ”๋“œ)
33
- (["์ •๋ถ€", "๊ณผ์ œ", "๊ณต๋ชจ", "์ง€์›์‚ฌ์—…", "IITP", "NIA", "NIPA", "๊ตญ์ฑ…"], "๐Ÿ›๏ธ ์ •๋ถ€๊ณผ์ œ", "#3b82f6"),
34
- (["ํˆฌ์ž", "ํŽ€๋”ฉ", "์‹œ๋ฆฌ์ฆˆ", "VC", "IPO", "์ธ์ˆ˜", "M&A", "๋ฐธ๋ฅ˜์—์ด์…˜"], "๐Ÿ’ฐ ํˆฌ์ž/IR", "#f59e0b"),
35
- (["์˜์ƒ", "๋น„๋””์˜ค", "video", "์ƒ์„ฑ", "sora", "gen-", "๋™์˜์ƒ", "์ด๋ฏธ์ง€์ƒ์„ฑ", "ํ…์ŠคํŠธํˆฌ๋น„๋””์˜ค"], "๐ŸŽฌ ์˜์ƒAI", "#ef4444"),
36
- (["ํ•œ๊ตญ์–ด", "korean", "multilingual", "๋ฒˆ์—ญ", "๋‹ค๊ตญ์–ด", "ํ•œ๊ธ€"], "๐Ÿ‡ฐ๐Ÿ‡ท ํ•œ๊ตญ์–ดAI", "#8b5cf6"),
37
- (["ํ—ˆ๊น…ํŽ˜์ด์Šค", "hugging", "HF", "spaces", "๋ชจ๋ธ", "์˜คํ”ˆ์†Œ์Šค", "๊นƒํ—ˆ๋ธŒ"], "๐Ÿค— HF/์˜คํ”ˆ์†Œ์Šค", "#10b981"),
38
- (["LLM", "GPT", "Claude", "Gemini", "๊ฑฐ๋Œ€์–ธ์–ด", "ํŒŒ์šด๋ฐ์ด์…˜", "ํŒŒ์ธํŠœ๋‹", "RAG", "์—์ด์ „ํŠธ", "agent"], "๐Ÿง  LLM/์—์ด์ „ํŠธ", "#6366f1"),
39
- (["GPU", "์นฉ", "๋ฐ˜๋„์ฒด", "์—”๋น„๋””์•„", "NVIDIA", "์ธํ”„๋ผ", "์„œ๋ฒ„", "ํด๋ผ์šฐ๋“œ", "๋ฐ์ดํ„ฐ์„ผํ„ฐ"], "๐Ÿ–ฅ๏ธ ์ธํ”„๋ผ/GPU", "#0d9488"),
40
- (["๋ณด์•ˆ", "๊ฐœ์ธ์ •๋ณด", "๊ทœ์ œ", "๋ฒ•์•ˆ", "์œค๋ฆฌ", "์•ˆ์ „", "์ €์ž‘๊ถŒ", "AI๋ฒ•"], "๐Ÿ”’ ๊ทœ์ œ/์œค๋ฆฌ", "#dc2626"),
41
- (["์Šคํƒ€ํŠธ์—…", "์ฐฝ์—…", "์‚ฌ์—…", "์ œํœด", "ํŒŒํŠธ๋„ˆ", "๊ณ„์•ฝ", "๋งค์ถœ", "์ˆ˜์ต"], "๐Ÿ’ผ ๋น„์ฆˆ๋‹ˆ์Šค", "#ea580c"),
42
- (["๊ต์œก", "ํ•™์Šต", "์—ฐ๊ตฌ", "๋…ผ๋ฌธ", "arXiv", "๋ฒค์น˜๋งˆํฌ", "์„ฑ๋Šฅ", "ํ‰๊ฐ€"], "๐Ÿ“š R&D/์—ฐ๊ตฌ", "#059669"),
43
- (["๋งˆ์ผ€ํŒ…", "์ฝ˜ํ…์ธ ", "SNS", "๋ธŒ๋žœ๋”ฉ", "ํ™๋ณด", "PR", "๋ฏธ๋””์–ด"], "๐Ÿ“ข ๋งˆ์ผ€ํŒ…/PR", "#9333ea"),
44
  ]
45
 
46
- RELEVANCE_KEYWORDS = {
47
- "core": ["AI ์˜์ƒ", "๋น„๋””์˜ค ์ƒ์„ฑ", "ํ•œ๊ตญ์–ด", "ํ—ˆ๊น…ํŽ˜์ด์Šค", "์˜คํ”ˆ์†Œ์Šค", "์—์ด์ „ํŠธ", "LLM", "์Šคํƒ€ํŠธ์—…", "์ •๋ถ€๊ณผ์ œ"],
48
- "high": ["GPU", "ํด๋ผ์šฐ๋“œ", "์ธํ”„๋ผ", "ํˆฌ์ž", "์ƒ์„ฑAI", "ํŒŒ์ธํŠœ๋‹", "RAG", "API"],
49
- "medium": ["๊ทœ์ œ", "๊ต์œก", "์—ฐ๊ตฌ", "๋ณด์•ˆ", "๋งˆ์ผ€ํŒ…"],
50
  }
51
 
52
 
53
- def classify_news(title: str, source: str = "") -> Dict:
54
- """๋‰ด์Šค๋ฅผ ๋น„๋“œ๋ž˜ํ”„ํŠธ ๊ด€์ ์—์„œ ๋ถ„๋ฅ˜"""
55
  text = (title + " " + source).lower()
56
- tags = []
57
- colors = {}
58
-
59
- for keywords, tag_name, color in TAG_RULES:
60
  for kw in keywords:
61
  if kw.lower() in text:
62
- if tag_name not in tags:
63
- tags.append(tag_name)
64
- colors[tag_name] = color
65
  break
66
-
67
  if not tags:
68
  tags.append("๐Ÿ“ฐ ์ผ๋ฐ˜AI๋‰ด์Šค")
69
  colors["๐Ÿ“ฐ ์ผ๋ฐ˜AI๋‰ด์Šค"] = "#64748b"
70
-
71
- # ๋น„๋“œ๋ž˜ํ”„ํŠธ ๊ด€๋ จ๋„ ํŒ๋‹จ
72
  relevance = "์ผ๋ฐ˜"
73
- for kw in RELEVANCE_KEYWORDS["core"]:
74
- if kw.lower() in text:
75
- relevance = "ํ•ต์‹ฌ"
76
- break
77
- if relevance == "์ผ๋ฐ˜":
78
- for kw in RELEVANCE_KEYWORDS["high"]:
79
- if kw.lower() in text:
80
- relevance = "์ฃผ๋ชฉ"
81
- break
82
- if relevance == "์ผ๋ฐ˜":
83
- for kw in RELEVANCE_KEYWORDS["medium"]:
84
  if kw.lower() in text:
85
- relevance = "์ฐธ๊ณ "
86
  break
87
-
 
88
  return {"tags": tags, "colors": colors, "relevance": relevance}
89
 
90
 
91
- def generate_summary(title: str) -> str:
92
- """์ œ๋ชฉ ๊ธฐ๋ฐ˜ ํ•œ์ค„ ์š”์•ฝ (LLM ์—†์ด ๊ทœ์น™ ๊ธฐ๋ฐ˜)"""
93
- t = title.strip()
94
- if len(t) > 80:
95
- return t[:77] + "..."
96
- return t
 
 
 
 
 
 
 
97
 
98
 
99
- # ============================================================
100
- # ํฌ๋กค๋Ÿฌ
101
- # ============================================================
 
102
 
103
  UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
104
 
105
 
106
- def fetch_aitimes(max_items: int = 20) -> List[Dict]:
107
- """AI Times ์ตœ์‹  ๋‰ด์Šค ํฌ๋กค๋ง"""
108
  print("๐Ÿ“ฐ AI Times ์ˆ˜์ง‘ ์ค‘...")
109
  urls = [
110
  "https://www.aitimes.com/news/articleList.html?sc_multi_code=S2&view_type=sm",
@@ -113,64 +104,44 @@ def fetch_aitimes(max_items: int = 20) -> List[Dict]:
113
  all_news = []
114
  today = datetime.now().strftime("%m-%d")
115
  yesterday = (datetime.now() - timedelta(days=1)).strftime("%m-%d")
116
-
117
  for url in urls:
118
  try:
119
  r = requests.get(url, timeout=15, headers={"User-Agent": UA})
120
  r.raise_for_status()
121
  r.encoding = "utf-8"
122
  soup = BeautifulSoup(r.text, "html.parser")
123
- articles = soup.find_all("a", href=re.compile(r"/news/articleView\.html\?idxno=\d+"))
124
-
125
- for tag in articles:
126
  title = tag.get_text(strip=True)
127
  link = tag.get("href", "")
128
  if not title or len(title) < 10:
129
  continue
130
  if link and not link.startswith("http"):
131
  link = "https://www.aitimes.com" + link
132
-
133
- parent = tag.parent
134
- date_text = ""
135
- if parent:
136
- m = re.search(r"(\d{2}-\d{2}\s+\d{2}:\d{2})", parent.get_text())
137
  if m:
138
  date_text = m.group(1)
139
- if not date_text:
140
- date_text = today
141
-
142
  if today not in date_text and yesterday not in date_text:
143
  continue
144
-
145
  cls = classify_news(title, "AI Times")
146
- all_news.append({
147
- "title": title, "url": link, "date": date_text,
148
- "source": "AI Times", "summary": generate_summary(title),
149
- **cls,
150
- })
151
  time.sleep(0.5)
152
  except Exception as e:
153
- print(f" โš ๏ธ AI Times ์˜ค๋ฅ˜: {e}")
154
-
155
  seen = set()
156
- unique = []
157
- for n in all_news:
158
- if n["url"] not in seen:
159
- seen.add(n["url"])
160
- unique.append(n)
161
  print(f" โœ… AI Times {len(unique)}๊ฑด")
162
  return unique[:max_items]
163
 
164
 
165
- def fetch_hackernews(limit: int = 15) -> List[Dict]:
166
- """Hacker News ์ตœ์‹  AI ๊ด€๋ จ ๋‰ด์Šค"""
167
  print("๐Ÿ”ฅ Hacker News ์ˆ˜์ง‘ ์ค‘...")
168
  news = []
169
  try:
170
  r = requests.get("https://hacker-news.firebaseio.com/v0/topstories.json", timeout=10)
171
  ids = r.json()[:limit * 3]
172
  cutoff = datetime.utcnow() - timedelta(hours=36)
173
-
174
  for sid in ids:
175
  if len(news) >= limit:
176
  break
@@ -182,94 +153,65 @@ def fetch_hackernews(limit: int = 15) -> List[Dict]:
182
  st = datetime.utcfromtimestamp(s.get("time", 0))
183
  if st < cutoff:
184
  continue
185
- title = s.get("title", "")
186
- cls = classify_news(title, "Hacker News")
 
 
187
  news.append({
188
- "title": title, "url": s["url"],
 
 
189
  "date": st.strftime("%m-%d %H:%M"),
190
  "source": "Hacker News",
191
- "summary": generate_summary(title),
192
  "score": s.get("score", 0),
193
  **cls,
194
  })
195
- time.sleep(0.15)
196
  except Exception:
197
  continue
198
- print(f" โœ… HN {len(news)}๊ฑด")
199
  except Exception as e:
200
- print(f" โš ๏ธ HN ์˜ค๋ฅ˜: {e}")
201
  return news
202
 
203
 
204
- # ============================================================
205
- # HF Dataset ์ €์žฅ/๋กœ๋“œ
206
- # ============================================================
207
-
208
- def load_news_from_hf() -> List[Dict]:
209
  try:
210
- path = hf_hub_download(
211
- repo_id=DATASET_REPO, filename=NEWS_FILE,
212
- repo_type="dataset", token=HF_TOKEN, force_download=True,
213
- )
214
  with open(path, "r", encoding="utf-8") as f:
215
- data = json.load(f)
216
- print(f"[OK] Loaded {len(data)} news from HF")
217
- return data
218
- except Exception as e:
219
- print(f"[INFO] News load: {e}")
220
  return []
221
 
222
 
223
- def save_news_to_hf(news_list: List[Dict]):
224
  try:
225
  tmp = os.path.join(tempfile.gettempdir(), NEWS_FILE)
226
  with open(tmp, "w", encoding="utf-8") as f:
227
  json.dump(news_list, f, ensure_ascii=False, indent=2)
228
- hf_api.upload_file(
229
- path_or_fileobj=tmp, path_in_repo=NEWS_FILE,
230
- repo_id=DATASET_REPO, repo_type="dataset", token=HF_TOKEN,
231
- )
232
- print(f"[OK] Saved {len(news_list)} news to HF")
233
  except Exception as e:
234
  print(f"[ERROR] News save: {e}")
235
 
236
 
237
- # ============================================================
238
- # ๋ฉ”์ธ ์ˆ˜์ง‘ ํ•จ์ˆ˜ (app.py์—์„œ ํ˜ธ์ถœ)
239
- # ============================================================
240
-
241
- def collect_news(force: bool = False) -> List[Dict]:
242
- """๋‰ด์Šค ์ˆ˜์ง‘ + ๋ถ„๋ฅ˜ + ์ €์žฅ"""
243
  if not force:
244
  cached = load_news_from_hf()
245
  if cached:
246
- # ์บ์‹œ๊ฐ€ 6์‹œ๊ฐ„ ์ด๋‚ด๋ฉด ์žฌ์‚ฌ์šฉ
247
  try:
248
  last = cached[0].get("collected_at", "")
249
- if last:
250
- last_dt = datetime.fromisoformat(last)
251
- if (datetime.now() - last_dt).total_seconds() < 21600:
252
- print("[NEWS] Cache fresh, reusing")
253
- return cached
254
  except Exception:
255
  pass
256
-
257
  print("\n[NEWS] Collecting fresh news...")
258
  now_iso = datetime.now().isoformat()
259
-
260
- aitimes = fetch_aitimes(20)
261
- hn = fetch_hackernews(15)
262
- all_news = aitimes + hn
263
-
264
  for n in all_news:
265
  n["collected_at"] = now_iso
266
-
267
- # ๊ด€๋ จ๋„ ์ˆœ ์ •๋ ฌ: ํ•ต์‹ฌ > ์ฃผ๋ชฉ > ์ฐธ๊ณ  > ์ผ๋ฐ˜
268
  order = {"ํ•ต์‹ฌ": 0, "์ฃผ๋ชฉ": 1, "์ฐธ๊ณ ": 2, "์ผ๋ฐ˜": 3}
269
  all_news.sort(key=lambda x: order.get(x.get("relevance", "์ผ๋ฐ˜"), 3))
270
-
271
  if HF_TOKEN and all_news:
272
  save_news_to_hf(all_news)
273
-
274
- print(f"[NEWS] Total: {len(all_news)} articles\n")
275
  return all_news
 
1
  """
2
+ VDash ๋‰ด์Šค ๋ชจ๋“ˆ v2
3
  - AI Times + Hacker News ํฌ๋กค๋ง
4
+ - HN ์˜๋ฌธ ์ œ๋ชฉ โ†’ ํ•œ๊ธ€ ์ž๋™ ๋ฒˆ์—ญ
5
+ - ๋น„๋“œ๋ž˜ํ”„ํŠธ ๊ด€์  ์ž๋™ ๋ถ„๋ฅ˜
6
  - HF Dataset ์˜๊ตฌ ์ €์žฅ
7
  """
8
 
9
+ import requests, json, re, time, os, tempfile
 
 
 
 
 
10
  from datetime import datetime, timedelta
11
  from typing import List, Dict
12
  from bs4 import BeautifulSoup
13
  from huggingface_hub import HfApi, hf_hub_download
14
 
15
+ try:
16
+ from deep_translator import GoogleTranslator
17
+ translator = GoogleTranslator(source='en', target='ko')
18
+ HAS_TRANSLATOR = True
19
+ except Exception:
20
+ HAS_TRANSLATOR = False
21
+ print("[NEWS] deep-translator not available, HN titles will stay English")
22
+
23
  HF_TOKEN = os.getenv("HF_TOKEN")
24
  SPACE_ID = os.getenv("SPACE_ID", "")
25
  OWNER = SPACE_ID.split("/")[0] if SPACE_ID else "vidraft"
26
  DATASET_REPO = os.getenv("DATASET_REPO", f"{OWNER}/vidraft-dashboard-data")
27
  NEWS_FILE = "news.json"
 
28
  hf_api = HfApi(token=HF_TOKEN)
29
 
 
 
 
 
30
  TAG_RULES = [
31
+ (["์ •๋ถ€","๊ณผ์ œ","๊ณต๋ชจ","์ง€์›์‚ฌ์—…","IITP","NIA","NIPA","๊ตญ์ฑ…","government","grant"], "๐Ÿ›๏ธ ์ •๋ถ€๊ณผ์ œ", "#3b82f6"),
32
+ (["ํˆฌ์ž","ํŽ€๋”ฉ","์‹œ๋ฆฌ์ฆˆ","VC","IPO","์ธ์ˆ˜","M&A","๋ฐธ๋ฅ˜์—์ด์…˜","funding","investment","acquisition"], "๐Ÿ’ฐ ํˆฌ์ž/IR", "#f59e0b"),
33
+ (["์˜์ƒ","๋น„๋””์˜ค","video","์ƒ์„ฑ","sora","gen-","๋™์˜์ƒ","์ด๋ฏธ์ง€์ƒ์„ฑ"], "๐ŸŽฌ ์˜์ƒAI", "#ef4444"),
34
+ (["ํ•œ๊ตญ์–ด","korean","multilingual","๋ฒˆ์—ญ","๋‹ค๊ตญ์–ด"], "๐Ÿ‡ฐ๐Ÿ‡ท ํ•œ๊ตญ์–ดAI", "#8b5cf6"),
35
+ (["ํ—ˆ๊น…ํŽ˜์ด์Šค","hugging","HF","spaces","์˜คํ”ˆ์†Œ์Šค","open source","github"], "๐Ÿค— HF/์˜คํ”ˆ์†Œ์Šค", "#10b981"),
36
+ (["LLM","GPT","Claude","Gemini","๊ฑฐ๋Œ€์–ธ์–ด","ํŒŒ์ธํŠœ๋‹","RAG","์—์ด์ „ํŠธ","agent","transformer","llama","mistral"], "๐Ÿง  LLM/์—์ด์ „ํŠธ", "#6366f1"),
37
+ (["GPU","์นฉ","๋ฐ˜๋„์ฒด","์—”๋น„๋””์•„","NVIDIA","์ธํ”„๋ผ","์„œ๋ฒ„","ํด๋ผ์šฐ๋“œ","๋ฐ์ดํ„ฐ์„ผํ„ฐ","chip","server","cloud"], "๐Ÿ–ฅ๏ธ ์ธํ”„๋ผ/GPU", "#0d9488"),
38
+ (["๋ณด์•ˆ","๊ฐœ์ธ์ •๋ณด","๊ทœ์ œ","๋ฒ•์•ˆ","์œค๋ฆฌ","์•ˆ์ „","์ €์ž‘๊ถŒ","AI๋ฒ•","regulation","safety","privacy"], "๐Ÿ”’ ๊ทœ์ œ/์œค๋ฆฌ", "#dc2626"),
39
+ (["์Šคํƒ€ํŠธ์—…","์ฐฝ์—…","์‚ฌ์—…","์ œํœด","ํŒŒํŠธ๋„ˆ","๊ณ„์•ฝ","๋งค์ถœ","startup","business","revenue"], "๐Ÿ’ผ ๋น„์ฆˆ๋‹ˆ์Šค", "#ea580c"),
40
+ (["๊ต์œก","ํ•™์Šต","์—ฐ๊ตฌ","๋…ผ๋ฌธ","arXiv","๋ฒค์น˜๋งˆํฌ","์„ฑ๋Šฅ","ํ‰๊ฐ€","paper","research","benchmark"], "๐Ÿ“š R&D/์—ฐ๊ตฌ", "#059669"),
41
+ (["๋งˆ์ผ€ํŒ…","์ฝ˜ํ…์ธ ","SNS","๋ธŒ๋žœ๋”ฉ","ํ™๋ณด","PR","๋ฏธ๋””์–ด","marketing"], "๐Ÿ“ข ๋งˆ์ผ€ํŒ…/PR", "#9333ea"),
 
42
  ]
43
 
44
+ RELEVANCE_KW = {
45
+ "ํ•ต์‹ฌ": ["AI ์˜์ƒ","๋น„๋””์˜ค ์ƒ์„ฑ","ํ•œ๊ตญ์–ด","ํ—ˆ๊น…ํŽ˜์ด์Šค","์˜คํ”ˆ์†Œ์Šค","์—์ด์ „ํŠธ","LLM","์Šคํƒ€ํŠธ์—…","์ •๋ถ€๊ณผ์ œ","video generation","hugging face"],
46
+ "์ฃผ๋ชฉ": ["GPU","ํด๋ผ์šฐ๋“œ","์ธํ”„๋ผ","ํˆฌ์ž","์ƒ์„ฑAI","ํŒŒ์ธํŠœ๋‹","RAG","API","generative"],
47
+ "์ฐธ๊ณ ": ["๊ทœ์ œ","๊ต์œก","์—ฐ๊ตฌ","๋ณด์•ˆ","๋งˆ์ผ€ํŒ…","benchmark","safety"],
48
  }
49
 
50
 
51
+ def classify_news(title, source=""):
 
52
  text = (title + " " + source).lower()
53
+ tags, colors = [], {}
54
+ for keywords, tag, color in TAG_RULES:
 
 
55
  for kw in keywords:
56
  if kw.lower() in text:
57
+ if tag not in tags:
58
+ tags.append(tag)
59
+ colors[tag] = color
60
  break
 
61
  if not tags:
62
  tags.append("๐Ÿ“ฐ ์ผ๋ฐ˜AI๋‰ด์Šค")
63
  colors["๐Ÿ“ฐ ์ผ๋ฐ˜AI๋‰ด์Šค"] = "#64748b"
 
 
64
  relevance = "์ผ๋ฐ˜"
65
+ for level in ["ํ•ต์‹ฌ", "์ฃผ๋ชฉ", "์ฐธ๊ณ "]:
66
+ for kw in RELEVANCE_KW[level]:
 
 
 
 
 
 
 
 
 
67
  if kw.lower() in text:
68
+ relevance = level
69
  break
70
+ if relevance != "์ผ๋ฐ˜":
71
+ break
72
  return {"tags": tags, "colors": colors, "relevance": relevance}
73
 
74
 
75
def translate_to_korean(text):
    """Best-effort English→Korean translation of *text*.

    Returns the input unchanged when it is empty, when deep-translator is
    unavailable, when the text already contains Hangul, or when the
    translation call fails or returns an empty result.
    """
    if not text or not HAS_TRANSLATOR:
        return text
    # Skip strings that already contain Hangul syllables.
    if re.search(r'[๊ฐ€-ํžฃ]', text):
        return text
    try:
        result = translator.translate(text)
    except Exception as e:
        print(f" ๋ฒˆ์—ญ ์‹คํŒจ: {e}")
        return text
    return result if result else text
88
 
89
 
90
def gen_summary(title):
    """Return a one-line summary of *title* capped at 80 characters.

    Whitespace is stripped; titles longer than 80 chars are truncated to
    77 chars plus "..." so the result never exceeds 80. (The previous
    `t[:80] + "..."` produced 83-character strings despite the 80-char
    length check, exceeding the intended cap.)
    """
    t = title.strip()
    if len(t) > 80:
        return t[:77] + "..."
    return t
93
+
94
 
95
  UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
96
 
97
 
98
+ def fetch_aitimes(max_items=20):
 
99
  print("๐Ÿ“ฐ AI Times ์ˆ˜์ง‘ ์ค‘...")
100
  urls = [
101
  "https://www.aitimes.com/news/articleList.html?sc_multi_code=S2&view_type=sm",
 
104
  all_news = []
105
  today = datetime.now().strftime("%m-%d")
106
  yesterday = (datetime.now() - timedelta(days=1)).strftime("%m-%d")
 
107
  for url in urls:
108
  try:
109
  r = requests.get(url, timeout=15, headers={"User-Agent": UA})
110
  r.raise_for_status()
111
  r.encoding = "utf-8"
112
  soup = BeautifulSoup(r.text, "html.parser")
113
+ for tag in soup.find_all("a", href=re.compile(r"/news/articleView\.html\?idxno=\d+")):
 
 
114
  title = tag.get_text(strip=True)
115
  link = tag.get("href", "")
116
  if not title or len(title) < 10:
117
  continue
118
  if link and not link.startswith("http"):
119
  link = "https://www.aitimes.com" + link
120
+ date_text = today
121
+ if tag.parent:
122
+ m = re.search(r"(\d{2}-\d{2}\s+\d{2}:\d{2})", tag.parent.get_text())
 
 
123
  if m:
124
  date_text = m.group(1)
 
 
 
125
  if today not in date_text and yesterday not in date_text:
126
  continue
 
127
  cls = classify_news(title, "AI Times")
128
+ all_news.append({"title": title, "url": link, "date": date_text, "source": "AI Times", "summary": gen_summary(title), **cls})
 
 
 
 
129
  time.sleep(0.5)
130
  except Exception as e:
131
+ print(f" โš ๏ธ AI Times: {e}")
 
132
  seen = set()
133
+ unique = [n for n in all_news if n["url"] not in seen and not seen.add(n["url"])]
 
 
 
 
134
  print(f" โœ… AI Times {len(unique)}๊ฑด")
135
  return unique[:max_items]
136
 
137
 
138
+ def fetch_hackernews(limit=15):
 
139
  print("๐Ÿ”ฅ Hacker News ์ˆ˜์ง‘ ์ค‘...")
140
  news = []
141
  try:
142
  r = requests.get("https://hacker-news.firebaseio.com/v0/topstories.json", timeout=10)
143
  ids = r.json()[:limit * 3]
144
  cutoff = datetime.utcnow() - timedelta(hours=36)
 
145
  for sid in ids:
146
  if len(news) >= limit:
147
  break
 
153
  st = datetime.utcfromtimestamp(s.get("time", 0))
154
  if st < cutoff:
155
  continue
156
+ title_en = s.get("title", "")
157
+ # ํ•œ๊ธ€ ๋ฒˆ์—ญ
158
+ title_ko = translate_to_korean(title_en)
159
+ cls = classify_news(title_en + " " + title_ko, "Hacker News")
160
  news.append({
161
+ "title": title_ko,
162
+ "title_en": title_en,
163
+ "url": s["url"],
164
  "date": st.strftime("%m-%d %H:%M"),
165
  "source": "Hacker News",
166
+ "summary": gen_summary(title_ko),
167
  "score": s.get("score", 0),
168
  **cls,
169
  })
170
+ time.sleep(0.2)
171
  except Exception:
172
  continue
173
+ print(f" โœ… HN {len(news)}๊ฑด (ํ•œ๊ธ€ ๋ฒˆ์—ญ ์™„๋ฃŒ)")
174
  except Exception as e:
175
+ print(f" โš ๏ธ HN: {e}")
176
  return news
177
 
178
 
179
def load_news_from_hf():
    """Load the cached news list from the HF dataset repo.

    Returns the parsed JSON list, or [] when the file is missing,
    unreachable, or unparseable. The failure is logged instead of being
    silently swallowed (the bare `except Exception: return []` hid
    download/auth errors and made cache misses undiagnosable).
    """
    try:
        path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=NEWS_FILE,
            repo_type="dataset",
            token=HF_TOKEN,
            force_download=True,  # always fetch the latest revision, not a stale local cache
        )
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except Exception as e:
        print(f"[INFO] News load: {e}")
        return []
186
 
187
 
188
def save_news_to_hf(news_list):
    """Persist *news_list* as JSON to the HF dataset repo (best effort).

    Writes to a temp file, then uploads it as NEWS_FILE. Both success and
    failure are logged — the success log was dropped in v2, leaving only
    the error path observable.
    """
    try:
        tmp = os.path.join(tempfile.gettempdir(), NEWS_FILE)
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(news_list, f, ensure_ascii=False, indent=2)
        hf_api.upload_file(
            path_or_fileobj=tmp,
            path_in_repo=NEWS_FILE,
            repo_id=DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        print(f"[OK] Saved {len(news_list)} news to HF")
    except Exception as e:
        print(f"[ERROR] News save: {e}")
196
 
197
 
198
def collect_news(force=False, cache_ttl=21600):
    """Collect, classify, sort, and persist news (entry point for app.py).

    Args:
        force: when True, skip the cache check and always re-crawl.
        cache_ttl: cache freshness window in seconds (default 21600 = 6 h);
            previously a hard-coded constant, now a backward-compatible
            parameter.

    Returns the combined news list sorted by relevance
    (ํ•ต์‹ฌ > ์ฃผ๋ชฉ > ์ฐธ๊ณ  > ์ผ๋ฐ˜).
    """
    if not force:
        cached = load_news_from_hf()
        if cached:
            try:
                last = cached[0].get("collected_at", "")
                if last:
                    age = (datetime.now() - datetime.fromisoformat(last)).total_seconds()
                    if age < cache_ttl:
                        print("[NEWS] Cache fresh, reusing")  # restored v1 diagnostic
                        return cached
            except Exception:
                pass  # malformed timestamp — fall through and re-crawl

    print("\n[NEWS] Collecting fresh news...")
    now_iso = datetime.now().isoformat()
    all_news = fetch_aitimes(20) + fetch_hackernews(15)

    # Stamp every item with one shared collection timestamp (also used
    # as the cache-freshness marker on the next run).
    for item in all_news:
        item["collected_at"] = now_iso

    # Sort by relevance priority; unknown labels sink to the bottom.
    order = {"ํ•ต์‹ฌ": 0, "์ฃผ๋ชฉ": 1, "์ฐธ๊ณ ": 2, "์ผ๋ฐ˜": 3}
    all_news.sort(key=lambda x: order.get(x.get("relevance", "์ผ๋ฐ˜"), 3))

    # Only persist when we have a token and actually collected something.
    if HF_TOKEN and all_news:
        save_news_to_hf(all_news)

    print(f"[NEWS] Total: {len(all_news)} articles\n")  # restored v1 diagnostic
    return all_news