bep40 committed on
Commit
524ed03
·
verified ·
1 Parent(s): 494ac7d

Add match detail API (commentaries + stats) + featured match endpoint

Browse files
Files changed (1) hide show
  1. main.py +112 -28
main.py CHANGED
@@ -17,7 +17,7 @@ BASE_24H = "https://www.24h.com.vn"
17
  SPACE_URL = "https://bep40-vnews.hf.space"
18
  _cache = {}
19
  _cache_ttl = 300
20
- _cache_ttl_live = 60 # 1 min for live data
21
  def _cached(key, fn, ttl=None):
22
  now=time.time();t=ttl or _cache_ttl
23
  if key in _cache and now-_cache[key]["t"]<t:return _cache[key]["d"]
@@ -32,7 +32,6 @@ def _get(url,headers=None):
32
  LEAGUE_IDS = {"nha":27110,"laliga":27233,"seriea":27044,"bundesliga":26891,"ligue1":27212}
33
 
34
  def fetch_bongda_api(endpoint):
35
- """Fetch bongda.com.vn API and return HTML content."""
36
  try:
37
  r=requests.get(f"https://bongda.com.vn{endpoint}",headers=BONGDA_HEADERS,timeout=10)
38
  if r.status_code==200:
@@ -43,43 +42,107 @@ def fetch_bongda_api(endpoint):
43
 
44
  @app.get("/api/livescore/live")
45
  def api_livescore_live():
46
- """Trận đang diễn ra."""
47
  html=_cached("ls_live",lambda:fetch_bongda_api("/api/fixtures/live"),ttl=_cache_ttl_live)
48
  return JSONResponse({"html":html})
49
 
50
  @app.get("/api/livescore/incoming")
51
  def api_livescore_incoming():
52
- """Trận sắp diễn ra."""
53
  html=_cached("ls_incoming",lambda:fetch_bongda_api("/api/fixtures/incoming"),ttl=_cache_ttl_live)
54
  return JSONResponse({"html":html})
55
 
56
  @app.get("/api/livescore/today")
57
  def api_livescore_today():
58
- """Tất cả trận hôm nay."""
59
  today=datetime.now().strftime("%Y-%m-%d")
60
  html=_cached("ls_today",lambda:fetch_bongda_api(f"/api/fixtures/get-by-date?date={today}"),ttl=_cache_ttl)
61
  return JSONResponse({"html":html})
62
 
63
  @app.get("/api/livescore/results")
64
  def api_livescore_results():
65
- """Kết quả hôm nay."""
66
  today=datetime.now().strftime("%Y-%m-%d")
67
  html=_cached("ls_results",lambda:fetch_bongda_api(f"/api/fixtures/get-by-date?date={today}&status=finished"),ttl=_cache_ttl)
68
  return JSONResponse({"html":html})
69
 
70
  @app.get("/api/livescore/standings/{league}")
71
  def api_livescore_standings(league:str):
72
- """Bảng xếp hạng theo giải."""
73
  tid=LEAGUE_IDS.get(league,27110)
74
  html=_cached(f"ls_bxh_{league}",lambda:fetch_bongda_api(f"/api/league-table/home?tournament_id={tid}&is_detail=True"),ttl=_cache_ttl)
75
  return JSONResponse({"html":html})
76
 
77
  @app.get("/api/livescore/date/{date}")
78
  def api_livescore_date(date:str):
79
- """Lịch thi đấu theo ngày (YYYY-MM-DD)."""
80
  html=fetch_bongda_api(f"/api/fixtures/get-by-date?date={date}")
81
  return JSONResponse({"html":html})
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  # ===== NEWS SCRAPERS =====
84
  def scrape_vne(cat_url):
85
  try:
@@ -112,6 +175,41 @@ def scrape_vne_article(url):
112
  elif ch.name in("h2","h3"):body.append({"type":"heading","text":ch.get_text(strip=True)})
113
  return{"title":h1.get_text(strip=True) if h1 else "","summary":desc.get_text(strip=True) if desc else "","og_image":og_img,"body":body,"source":"vne","url":url}
114
  except:return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  def scrape_bdp_videos():
116
  try:
117
  soup=_get(f"{BASE_BDP}/video");arts=[];seen=set()
@@ -165,19 +263,6 @@ def scrape_24h_shorts():
165
  if img_tag:img_src=img_tag.get("data-original") or img_tag.get("data-src") or img_tag.get("src","")
166
  if img_src and"base64" in img_src:img_src=""
167
  seen.add(href);arts.append({"title":title,"link":href,"img":img_src,"source":"24h-shorts"})
168
- if len(arts)<3:
169
- for a in soup.find_all("a",href=True):
170
- href=a.get("href","")
171
- if not href.endswith(".html") or"javascript:" in href or"-cvd" in href:continue
172
- if not href.startswith("http"):href=BASE_24H+href
173
- if href in seen:continue
174
- img=a.find("img") or(a.parent.find("img") if a.parent else None)
175
- if not img:continue
176
- img_src=img.get("data-original") or img.get("data-src") or img.get("src","")
177
- if not img_src or"base64" in img_src:continue
178
- title=img.get("alt","") or a.get("title","") or a.get_text(strip=True)
179
- if not title or len(title)<8:continue
180
- seen.add(href);arts.append({"title":title,"link":href,"img":img_src,"source":"24h-shorts"})
181
  return arts[:20]
182
  except:return[]
183
  def scrape_bbc_vietnamese():
@@ -212,9 +297,6 @@ def scrape_bbc_article(url):
212
  for p in soup.select("[data-component='text-block'] p, article p, main p"):
213
  t=p.get_text(strip=True)
214
  if t and len(t)>20:body.append({"type":"p","text":t})
215
- for img in soup.select("main img, article img"):
216
- src=img.get("src","")
217
- if src and("ichef" in src or"bbci" in src):body.append({"type":"img","src":src,"alt":img.get("alt","")})
218
  return{"title":h1.get_text(strip=True) if h1 else "","summary":"","og_image":og_img,"body":body,"source":"bbc","url":url}
219
  except:return None
220
  def extract_video_url(article_url):
@@ -222,8 +304,7 @@ def extract_video_url(article_url):
222
  r=requests.get(article_url,headers={**HEADERS,"Referer":"https://www.24h.com.vn/"},timeout=10);r.encoding="utf-8"
223
  m3u8s=re.findall(r'(https?://cdn\.24h\.com\.vn/[^\s"\'\\<>]+\.m3u8)',r.text)
224
  esc=[u.replace('\\//','/').replace('\\/','/') for u in re.findall(r'(https?:\\/\\/cdn\.24h\.com\.vn\\/[^\s"\'<>]+\.m3u8)',r.text)]
225
- all_urls=list(dict.fromkeys(m3u8s+esc));full=[u for u in all_urls if'_720p' not in u];p720=[u for u in all_urls if'_720p' in u]
226
- primary=full or p720
227
  if not primary:return None
228
  soup=BeautifulSoup(r.text,"lxml");og=soup.find("meta",property="og:image");poster=og.get("content","") if og else ""
229
  return{"src":primary[0],"poster":poster}
@@ -270,6 +351,8 @@ def api_categories():
270
  def api_highlights():return JSONResponse(_cached("highlights",scrape_24h_highlights))
271
  @app.get("/api/shorts")
272
  def api_shorts():return JSONResponse(_cached("shorts",scrape_24h_shorts))
 
 
273
  @app.get("/api/bdp_videos")
274
  def api_bdp_videos():return JSONResponse(_cached("bdp_videos",scrape_bdp_videos))
275
  @app.get("/api/video_url")
@@ -282,6 +365,7 @@ def api_video_url(url:str=Query(...)):
282
  def api_article(url:str=Query(...)):
283
  if"vnexpress.net" in url:data=scrape_vne_article(url)
284
  elif"bbc.com" in url:data=scrape_bbc_article(url)
 
285
  else:data=None
286
  return JSONResponse(data if data else{"error":"not supported"})
287
  @app.get("/v")
@@ -289,13 +373,13 @@ async def video_share(url:str=Query(default=""),title:str=Query(default="VNEWS V
289
  og_image=unquote(img) if img else "https://s1.vnecdn.net/vnexpress/restruct/i/v9505/logo_default.jpg"
290
  decoded_url=unquote(url);decoded_title=unquote(title)
291
  redirect_script=f'<script>localStorage.setItem("pending_video",JSON.stringify({{"url":"{decoded_url}","type":"{type}"}}));location.href="{SPACE_URL}";</script>' if decoded_url else f'<script>location.href="{SPACE_URL}";</script>'
292
- return HTMLResponse(f'''<!DOCTYPE html><html><head><meta charset="utf-8"><title>{decoded_title}</title><meta property="og:title" content="{decoded_title}"><meta property="og:image" content="{og_image}"><meta property="og:type" content="video.other"><meta property="og:site_name" content="VNEWS"></head><body style="background:#111;color:#fff;text-align:center;padding:40px"><p>⏳</p>{redirect_script}</body></html>''')
293
  @app.get("/s")
294
  async def share_redirect(url:str=Query(default=""),title:str=Query(default="VNEWS"),img:str=Query(default="")):
295
  og_image=unquote(img) if img else "https://s1.vnecdn.net/vnexpress/restruct/i/v9505/logo_default.jpg"
296
  decoded_url=unquote(url)
297
  redirect_script=f'<script>localStorage.setItem("pending_article","{decoded_url}");location.href="{SPACE_URL}";</script>' if decoded_url else f'<script>location.href="{SPACE_URL}";</script>'
298
- return HTMLResponse(f'''<!DOCTYPE html><html><head><meta charset="utf-8"><title>{unquote(title)}</title><meta property="og:title" content="{unquote(title)}"><meta property="og:image" content="{og_image}"><meta property="og:type" content="article"><meta property="og:site_name" content="VNEWS"></head><body>{redirect_script}</body></html>''')
299
  @app.get("/")
300
  async def index():
301
  with open("/app/static/index.html","r",encoding="utf-8") as f:return HTMLResponse(content=f.read())
 
17
  SPACE_URL = "https://bep40-vnews.hf.space"
18
  _cache = {}
19
  _cache_ttl = 300
20
+ _cache_ttl_live = 60
21
  def _cached(key, fn, ttl=None):
22
  now=time.time();t=ttl or _cache_ttl
23
  if key in _cache and now-_cache[key]["t"]<t:return _cache[key]["d"]
 
32
  LEAGUE_IDS = {"nha":27110,"laliga":27233,"seriea":27044,"bundesliga":26891,"ligue1":27212}
33
 
34
  def fetch_bongda_api(endpoint):
 
35
  try:
36
  r=requests.get(f"https://bongda.com.vn{endpoint}",headers=BONGDA_HEADERS,timeout=10)
37
  if r.status_code==200:
 
42
 
43
  @app.get("/api/livescore/live")
44
  def api_livescore_live():
 
45
  html=_cached("ls_live",lambda:fetch_bongda_api("/api/fixtures/live"),ttl=_cache_ttl_live)
46
  return JSONResponse({"html":html})
47
 
48
  @app.get("/api/livescore/incoming")
49
  def api_livescore_incoming():
 
50
  html=_cached("ls_incoming",lambda:fetch_bongda_api("/api/fixtures/incoming"),ttl=_cache_ttl_live)
51
  return JSONResponse({"html":html})
52
 
53
  @app.get("/api/livescore/today")
54
  def api_livescore_today():
 
55
  today=datetime.now().strftime("%Y-%m-%d")
56
  html=_cached("ls_today",lambda:fetch_bongda_api(f"/api/fixtures/get-by-date?date={today}"),ttl=_cache_ttl)
57
  return JSONResponse({"html":html})
58
 
59
  @app.get("/api/livescore/results")
60
  def api_livescore_results():
 
61
  today=datetime.now().strftime("%Y-%m-%d")
62
  html=_cached("ls_results",lambda:fetch_bongda_api(f"/api/fixtures/get-by-date?date={today}&status=finished"),ttl=_cache_ttl)
63
  return JSONResponse({"html":html})
64
 
65
  @app.get("/api/livescore/standings/{league}")
66
  def api_livescore_standings(league:str):
 
67
  tid=LEAGUE_IDS.get(league,27110)
68
  html=_cached(f"ls_bxh_{league}",lambda:fetch_bongda_api(f"/api/league-table/home?tournament_id={tid}&is_detail=True"),ttl=_cache_ttl)
69
  return JSONResponse({"html":html})
70
 
71
  @app.get("/api/livescore/date/{date}")
72
  def api_livescore_date(date:str):
 
73
  html=fetch_bongda_api(f"/api/fixtures/get-by-date?date={date}")
74
  return JSONResponse({"html":html})
75
 
76
@app.get("/api/match/{event_id}/commentaries")
def api_match_commentaries(event_id: int):
    """Return the match commentary (course of the match) as ``{"html": ...}``.

    The upstream endpoint is queried on every call; no caching is applied.
    """
    endpoint = f"/api/fixtures/commentaries?event_id={event_id}"
    return JSONResponse({"html": fetch_bongda_api(endpoint)})
81
+
82
@app.get("/api/match/{event_id}/stats")
def api_match_stats(event_id: int):
    """Return the per-player match statistics as ``{"html": ...}``.

    The upstream endpoint is queried on every call; no caching is applied.
    """
    endpoint = f"/api/event-standing/player-performance?event_id={event_id}"
    return JSONResponse({"html": fetch_bongda_api(endpoint)})
87
+
88
def _first_match_from_html(html, status):
    """Parse the first usable fixture out of an upstream fixtures fragment.

    Args:
        html: Markup returned by ``fetch_bongda_api``; anything falsy or not
            longer than 100 characters is treated as "no data".
        status: ``"live"`` to read score and minute from the status link's
            ``<span>`` children; any other value yields the placeholder score
            ``"VS"`` and an empty minute.

    Returns:
        A dict describing the first ``li.match-detail`` entry that has both a
        home and an away team name, or ``None`` when there is none.
    """
    if not html or len(html) <= 100:
        return None
    soup = BeautifulSoup(html, "lxml")
    for li in soup.select("li.match-detail"):
        match_div = li.select_one("div.match")
        if not match_div:
            continue
        home_el = match_div.select_one(".home-team .name")
        away_el = match_div.select_one(".away-team .name")
        if not (home_el and away_el):
            continue

        status_el = match_div.select_one(".status a")
        league_el = li.find_previous("strong")
        time_el = match_div.select_one(".match-time")
        home_logo = match_div.select_one(".home-team .logo img")
        away_logo = match_div.select_one(".away-team .logo img")

        # The event id is embedded in the match link: /tran-dau/<id>/...
        event_id = ""
        if status_el:
            m = re.search(r'/tran-dau/(\d+)/', status_el.get("href", ""))
            if m:
                event_id = m.group(1)

        if status == "live":
            score = ""
            minute = ""
            spans = status_el.find_all("span") if status_el else []
            # Spans 0 and 2 are read as the two scores, span 3 as the minute.
            if len(spans) >= 3:
                score = f"{spans[0].get_text(strip=True)} - {spans[2].get_text(strip=True)}"
            if len(spans) >= 4:
                minute = spans[3].get_text(strip=True)
        else:
            score = "VS"
            minute = ""

        return {
            "home": home_el.get_text(strip=True),
            "away": away_el.get_text(strip=True),
            "score": score,
            "minute": minute,
            "league": league_el.get_text(strip=True) if league_el else "",
            "time": time_el.get_text(strip=True) if time_el else "",
            "event_id": event_id,
            "home_logo": home_logo.get("src", "") if home_logo else "",
            "away_logo": away_logo.get("src", "") if away_logo else "",
            "status": status,
        }
    return None


@app.get("/api/livescore/featured")
def api_livescore_featured():
    """Return the featured match: first live fixture, else first upcoming one.

    No league filtering is applied; the first parsable entry in the upstream
    listing wins. The JSON body is ``null`` when neither listing yields a
    match. The result is cached under ``ls_featured`` for ``_cache_ttl_live``.
    """
    def _f():
        # Prefer a match that is currently being played.
        live = _first_match_from_html(fetch_bongda_api("/api/fixtures/live"), "live")
        if live:
            return live
        # Otherwise fall back to the next scheduled match.
        return _first_match_from_html(fetch_bongda_api("/api/fixtures/incoming"), "upcoming")

    return JSONResponse(_cached("ls_featured", _f, ttl=_cache_ttl_live))
145
+
146
  # ===== NEWS SCRAPERS =====
147
  def scrape_vne(cat_url):
148
  try:
 
175
  elif ch.name in("h2","h3"):body.append({"type":"heading","text":ch.get_text(strip=True)})
176
  return{"title":h1.get_text(strip=True) if h1 else "","summary":desc.get_text(strip=True) if desc else "","og_image":og_img,"body":body,"source":"vne","url":url}
177
  except:return None
178
def scrape_dantri_hot():
    """Scrape the Dan Tri "hot news" listing.

    Collects up to 12 distinct ``.htm`` article links from the listing page,
    fetches the ``og:image`` of the first 10 concurrently, and keeps only
    those for which an image was found.

    Returns:
        A list of at most 10 dicts with keys ``title``, ``link``, ``img`` and
        ``source``; an empty list on any failure.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import urljoin

    listing_url = "https://dantri.com.vn/tin-nong.htm"
    try:
        soup = _get(listing_url)
        arts = []
        seen = set()
        for a in soup.find_all("a", href=True):
            href = a.get("href", "")
            title = a.get("title", "") or a.get_text(strip=True)
            if not title or len(title) < 15 or "javascript:" in href:
                continue
            if not href.startswith("http"):
                # urljoin resolves "/x.htm", "x.htm" and "//host/x.htm"
                # correctly; plain concatenation only handled the first form.
                href = urljoin(listing_url, href)
            if href in seen or not href.endswith(".htm"):
                continue
            if href == listing_url:
                continue
            seen.add(href)
            arts.append({"title": title, "link": href, "img": "", "source": "dantri"})
            if len(arts) >= 12:
                break

        def _og(art):
            """Fill ``art["img"]`` from the article's og:image (best effort)."""
            try:
                r2 = requests.get(art["link"], headers=HEADERS, timeout=8)
                r2.encoding = "utf-8"
                og = BeautifulSoup(r2.text, "lxml").find("meta", property="og:image")
                art["img"] = og.get("content", "") if og else ""
            except Exception:
                # Deliberate best effort: an article whose page cannot be
                # fetched keeps an empty image and is filtered out below.
                pass

        with ThreadPoolExecutor(4) as ex:
            list(ex.map(_og, arts[:10]))
        return [a for a in arts if a["img"]][:10]
    except Exception:
        # Scrapers in this file report failure as an empty result.
        return []
195
def scrape_dantri_article(url):
    """Scrape a single Dan Tri article into the common article structure.

    Args:
        url: Absolute article URL.

    Returns:
        A dict with ``title``, ``summary``, ``og_image``, ``body`` (a list of
        ``p`` / ``heading`` / ``img`` items in document order), ``source`` and
        ``url``; ``None`` on any failure.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=15)
        r.encoding = "utf-8"
        soup = BeautifulSoup(r.text, "lxml")
        for tag in soup.find_all(["script", "style", "nav", "footer", "aside"]):
            tag.decompose()

        h1 = soup.find("h1")
        og = soup.find("meta", property="og:image")
        og_img = og.get("content", "") if og else ""
        content = soup.select_one("div.singular-content") or soup.select_one("article")

        body = []
        # find_all() yields both a <figure> and the <img> nested inside it, so
        # remember emitted sources to avoid listing the same image twice.
        seen_imgs = set()
        if content:
            for el in content.find_all(["p", "h2", "h3", "figure", "img"], recursive=True):
                if el.name == "p":
                    t = el.get_text(strip=True)
                    if t and len(t) > 15:
                        body.append({"type": "p", "text": t})
                elif el.name in ("h2", "h3"):
                    t = el.get_text(strip=True)
                    if t:
                        body.append({"type": "heading", "text": t})
                elif el.name == "figure" or el.name == "img":
                    im = el if el.name == "img" else el.find("img")
                    if not im:
                        continue
                    s = im.get("data-src") or im.get("src", "")
                    if s and "base64" not in s and s not in seen_imgs:
                        seen_imgs.add(s)
                        body.append({"type": "img", "src": s, "alt": im.get("alt", "")})

        sapo = soup.select_one("h2.singular-sapo")
        desc = sapo.get_text(strip=True) if sapo else ""
        return {
            "title": h1.get_text(strip=True) if h1 else "",
            "summary": desc,
            "og_image": og_img,
            "body": body,
            "source": "dantri",
            "url": url,
        }
    except Exception:
        # Article scrapers in this file report failure as None.
        return None
213
  def scrape_bdp_videos():
214
  try:
215
  soup=_get(f"{BASE_BDP}/video");arts=[];seen=set()
 
263
  if img_tag:img_src=img_tag.get("data-original") or img_tag.get("data-src") or img_tag.get("src","")
264
  if img_src and"base64" in img_src:img_src=""
265
  seen.add(href);arts.append({"title":title,"link":href,"img":img_src,"source":"24h-shorts"})
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  return arts[:20]
267
  except:return[]
268
  def scrape_bbc_vietnamese():
 
297
  for p in soup.select("[data-component='text-block'] p, article p, main p"):
298
  t=p.get_text(strip=True)
299
  if t and len(t)>20:body.append({"type":"p","text":t})
 
 
 
300
  return{"title":h1.get_text(strip=True) if h1 else "","summary":"","og_image":og_img,"body":body,"source":"bbc","url":url}
301
  except:return None
302
  def extract_video_url(article_url):
 
304
  r=requests.get(article_url,headers={**HEADERS,"Referer":"https://www.24h.com.vn/"},timeout=10);r.encoding="utf-8"
305
  m3u8s=re.findall(r'(https?://cdn\.24h\.com\.vn/[^\s"\'\\<>]+\.m3u8)',r.text)
306
  esc=[u.replace('\\//','/').replace('\\/','/') for u in re.findall(r'(https?:\\/\\/cdn\.24h\.com\.vn\\/[^\s"\'<>]+\.m3u8)',r.text)]
307
+ all_urls=list(dict.fromkeys(m3u8s+esc));primary=[u for u in all_urls if'_720p' not in u] or [u for u in all_urls if'_720p' in u]
 
308
  if not primary:return None
309
  soup=BeautifulSoup(r.text,"lxml");og=soup.find("meta",property="og:image");poster=og.get("content","") if og else ""
310
  return{"src":primary[0],"poster":poster}
 
351
  def api_highlights():return JSONResponse(_cached("highlights",scrape_24h_highlights))
352
  @app.get("/api/shorts")
353
  def api_shorts():return JSONResponse(_cached("shorts",scrape_24h_shorts))
354
@app.get("/api/dantri_hot")
def api_dantri_hot():
    """Return the cached Dan Tri hot-news list as JSON."""
    items = _cached("dantri_hot", scrape_dantri_hot)
    return JSONResponse(items)
356
  @app.get("/api/bdp_videos")
357
  def api_bdp_videos():return JSONResponse(_cached("bdp_videos",scrape_bdp_videos))
358
  @app.get("/api/video_url")
 
365
  def api_article(url:str=Query(...)):
366
  if"vnexpress.net" in url:data=scrape_vne_article(url)
367
  elif"bbc.com" in url:data=scrape_bbc_article(url)
368
+ elif"dantri.com.vn" in url:data=scrape_dantri_article(url)
369
  else:data=None
370
  return JSONResponse(data if data else{"error":"not supported"})
371
  @app.get("/v")
 
373
  og_image=unquote(img) if img else "https://s1.vnecdn.net/vnexpress/restruct/i/v9505/logo_default.jpg"
374
  decoded_url=unquote(url);decoded_title=unquote(title)
375
  redirect_script=f'<script>localStorage.setItem("pending_video",JSON.stringify({{"url":"{decoded_url}","type":"{type}"}}));location.href="{SPACE_URL}";</script>' if decoded_url else f'<script>location.href="{SPACE_URL}";</script>'
376
+ return HTMLResponse(f'''<!DOCTYPE html><html><head><meta charset="utf-8"><title>{decoded_title}</title><meta property="og:title" content="{decoded_title}"><meta property="og:image" content="{og_image}"></head><body style="background:#111;color:#fff;text-align:center;padding:40px"><p>⏳</p>{redirect_script}</body></html>''')
377
  @app.get("/s")
378
  async def share_redirect(url:str=Query(default=""),title:str=Query(default="VNEWS"),img:str=Query(default="")):
379
  og_image=unquote(img) if img else "https://s1.vnecdn.net/vnexpress/restruct/i/v9505/logo_default.jpg"
380
  decoded_url=unquote(url)
381
  redirect_script=f'<script>localStorage.setItem("pending_article","{decoded_url}");location.href="{SPACE_URL}";</script>' if decoded_url else f'<script>location.href="{SPACE_URL}";</script>'
382
+ return HTMLResponse(f'''<!DOCTYPE html><html><head><meta charset="utf-8"><title>{unquote(title)}</title><meta property="og:title" content="{unquote(title)}"><meta property="og:image" content="{og_image}"></head><body>{redirect_script}</body></html>''')
383
  @app.get("/")
384
  async def index():
385
  with open("/app/static/index.html","r",encoding="utf-8") as f:return HTMLResponse(content=f.read())