| """VNEWS - FastAPI backend with livescore + xemlaibongda highlights + YouTube FPT shorts""" |
import hashlib
import html as html_lib
import json
import os
import re
import subprocess
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta, timezone
from urllib.parse import quote, unquote, urlencode, urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from fastapi import FastAPI, Query, Request
from fastapi.responses import HTMLResponse, JSONResponse, Response, StreamingResponse
from fastapi.staticfiles import StaticFiles
|
|
# FastAPI application object; every route in this module registers on it.
app = FastAPI()

# Browser-like headers sent with generic scraping requests.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "vi-VN,vi;q=0.9,en;q=0.8",
}

# Headers used for the bongda.com.vn JSON endpoints (XHR-style request).
BONGDA_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Accept-Language": "vi-VN,vi;q=0.9",
    "Referer": "https://bongda.com.vn/lich-thi-dau",
    "X-Requested-With": "XMLHttpRequest",
}

BASE_BDP = "https://bongdaplus.vn"
SPACE_URL = "https://bep40-vnews.hf.space"

# In-memory cache used by _cached(): key -> {"d": data, "t": unix timestamp}.
_cache = {}
# Cache lifetimes in seconds: default, live-score data, YouTube shorts feed.
_cache_ttl = 300
_cache_ttl_live = 60
_cache_ttl_yt = 1800
# Static (video id, title) rows grouped by channel handle. They back the
# shorts feed so it is never empty when live scraping returns nothing.
_FALLBACK_ROWS = (
    ("baodantri7941", (
        ("Lu_iCQ5YwNM", "Công an lập hồ sơ xử lý người phụ nữ chửi bới, tát nam tài xế ô tô ở Hà Nội | #shorts"),
        ("CwWvijF8BOA", "Chú rể Ninh Bình bật khóc nhận món quà bí mật người cha quá cố gửi 26 năm trước | #shorts"),
        ("tvPewsc2ph4", "Tính năng ẩn trên iPhone giúp giảm mỏi mắt | #shorts"),
        ("b1Nxzv9ixlU", "Y án 3 năm tù với nữ tài xế uống 8 lon bia lái xe tông chủ tịch xã tử vong | #shorts"),
        ("Xp5eTwAZAis", "Người đánh hàng xóm tại chung cư ở Hà Nội bị tuyên hơn 4 tháng tù | #shorts"),
        ("Htzvwg6iOBM", "Xe điện Audi S6 Sportback e-tron có gì đặc biệt? | #shorts"),
        ("iMdFmWvYdlo", "Cô gái người Nga yêu thời trang và đất nước Việt Nam | #shorts"),
        ("IVaRc6moEv8", "Người nông dân Trung Quốc đột quỵ, bệnh viện giúp bán sạch 4 tấn táo | #shorts"),
        ("uVxqPxToItU", "Công an vào cuộc vụ người phụ nữ chửi bới, hành hung tài xế ô tô ở Hà Nội | #shorts"),
        ("VAfgNNgZDRs", "Khởi tố 4 đối tượng ném bom xăng vào nhà dân ở Đồng Nai | #shorts"),
        ("sBH_-zGh0Xw", "Vì sao Times New Roman vẫn nổi tiếng sau hàng chục năm? | #shorts"),
        ("woKn5f2bLHM", "Quảng Ninh ngập sâu diện rộng sau đợt mưa lớn | #shorts"),
        ("bcpgRoxbLPw", "Giông lốc quật bay mái tôn ở TP.HCM | #shorts"),
        ("ZIIC5osy544", "Bé trai Trung Quốc rơi từ tầng 11 vẫn sống sót kỳ diệu | #shorts"),
        ("uTMJ49NQpyc", "Sau lớp mascot 40kg: Câu chuyện mưu sinh của người trẻ ở TPHCM | #shorts"),
    )),
    ("baosuckhoedoisongboyte", (
        ("7Pd6vZ2Lz1M", "Hành động ấm lòng của người đàn ông tham gia tìm kiếm 5 học sinh tử vong ở sông Lô | SKĐS"),
        ("SlHLt_ZyPiE", "Xử phạt người đàn ông xóa số điện thoại cứu hộ trên cao tốc Bắc - Nam | SKĐS"),
        ("IUOprcJyYr4", "Phụ nữ táo bón có phải do lười ăn rau? | SKĐS #shorts"),
        ("YY8ojFNE-AU", "Quái xế tự quay clip nẹt pô, đánh võng đăng TikTok bị xử lý | SKĐS"),
        ("OV7_oGdQGII", "Bố cô dâu khóc sụt sùi rồi quẩy cực sung gây bão mạng | SKĐS"),
        ("FoxhFyz2skY", "Người đàn ông nước ngoài đập phá ô tô, bẻ cần gạt nước ở Đà Nẵng | SKĐS"),
        ("R1oC_I8dFPU", "Thanh niên buông tay lái, đứng trên xe máy khi đổ đèo ở Đắk Lắk | SKĐS"),
        ("U0Ft6ChWAIo", "Cô giáo kể phút tháo chạy khỏi xe khách trước khi bị lũ vò nát ở Cao Bằng | SKĐS"),
        ("hH0ANeze_4E", "Liên tiếp hàng chục con bò bị sét đánh chết trong ngày mưa dông | SKĐS"),
        ("pXWt0QbAzRQ", "Va chạm giao thông, người phụ nữ lăng mạ tài xế ô tô | SKĐS"),
        ("UWWLPY1OYt4", "CSGT chặn xe khách khống chế đối tượng cướp dây chuyền tại Gia Lai | SKĐS"),
        ("AxhVTQutsuo", "Xuất tinh sớm và những hiểu lầm thường gặp | SKĐS #shorts"),
        ("cNy6FgaNxYM", "Cô dâu khóc sưng mắt vì 6 chỉ vàng không cánh mà bay trong ngày cưới | SKĐS"),
        ("IDt_S6q59Ro", "Chở bạn gái không đội mũ bảo hiểm, thanh niên đấm CSGT | SKĐS"),
        ("LFxJ9Ik6W0A", "Mệnh lệnh từ trái tim: CSGT Hà Nội mở đường đưa bé 5 tháng tuổi đi cấp cứu | SKĐS"),
    )),
)


def _fallback_short(video_id, title, channel):
    """Build one fallback feed entry, including its derived watch link and thumbnail."""
    return {
        "id": video_id,
        "title": title,
        "channel": channel,
        "link": "https://www.youtube.com/watch?v=" + video_id,
        "img": "https://i.ytimg.com/vi/" + video_id + "/hqdefault.jpg",
        "source": "yt",
    }


# Flat list of fallback shorts, in the same order as the tables above.
SHORTS_FALLBACK = [
    _fallback_short(video_id, title, channel)
    for channel, rows in _FALLBACK_ROWS
    for video_id, title in rows
]
# Where per-short counters are persisted: under /data when that directory
# exists, otherwise under /app.
SHORT_STATS_FILE = "/data/short_stats.json" if os.path.isdir("/data") else "/app/short_stats.json"
# Guards load-modify-save cycles on the stats file.
_short_lock = threading.Lock()


def _load_short_db(path=None):
    """Load the short-stats database from disk.

    Args:
        path: JSON file to read; defaults to ``SHORT_STATS_FILE``.

    Returns:
        The stored mapping, or ``{}`` when the file is missing, unreadable,
        not valid JSON, or does not contain a JSON object.
    """
    target = SHORT_STATS_FILE if path is None else path
    try:
        with open(target, "r", encoding="utf-8") as fh:
            db = json.load(fh)
    except (OSError, ValueError):
        # Missing/unreadable file or corrupt JSON: start from an empty store.
        return {}
    # Callers use dict methods on the result, so reject any other JSON type.
    return db if isinstance(db, dict) else {}


def _save_short_db(db, path=None):
    """Atomically write *db* as JSON (temp file + ``os.replace``).

    Args:
        db: JSON-serialisable mapping to persist.
        path: Destination file; defaults to ``SHORT_STATS_FILE``.

    Returns:
        ``True`` when the file was written, ``False`` when the best-effort
        save failed (the previous file, if any, is left untouched).
    """
    target = SHORT_STATS_FILE if path is None else path
    tmp = target + ".tmp"
    try:
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(tmp, "w", encoding="utf-8") as fh:
            json.dump(db, fh, ensure_ascii=False)
        os.replace(tmp, target)
        return True
    except (OSError, TypeError, ValueError):
        # Persistence is best-effort; do not leave a partial temp file behind.
        try:
            os.remove(tmp)
        except OSError:
            pass
        return False


def _short_default():
    """Return a fresh, zeroed stats record for a short."""
    return {"views": 0, "likes": 0, "shares": 0, "comments": []}
# Where wall posts are persisted: under /data when that directory exists,
# otherwise under /app.
WALL_FILE = "/data/wall_posts.json" if os.path.isdir("/data") else "/app/wall_posts.json"


def _load_wall(path=None):
    """Load the list of wall posts from disk.

    Args:
        path: JSON file to read; defaults to ``WALL_FILE``.

    Returns:
        The stored list, or ``[]`` when the file is missing, unreadable,
        not valid JSON, or does not contain a JSON array.
    """
    target = WALL_FILE if path is None else path
    try:
        with open(target, "r", encoding="utf-8") as fh:
            posts = json.load(fh)
    except (OSError, ValueError):
        return []
    return posts if isinstance(posts, list) else []


def _save_wall(posts, path=None):
    """Atomically persist the first 100 entries of *posts* as JSON.

    Args:
        posts: List of JSON-serialisable post records.
        path: Destination file; defaults to ``WALL_FILE``.

    Returns:
        ``True`` when the file was written, ``False`` when the best-effort
        save failed.
    """
    target = WALL_FILE if path is None else path
    tmp = target + ".tmp"
    try:
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(tmp, "w", encoding="utf-8") as fh:
            json.dump(posts[:100], fh, ensure_ascii=False)
        os.replace(tmp, target)
        return True
    except (OSError, TypeError, ValueError):
        # Persistence is best-effort; do not leave a partial temp file behind.
        try:
            os.remove(tmp)
        except OSError:
            pass
        return False
# League-name substrings, most important first; used to choose the featured match.
PRIORITY_LEAGUES = [
    "Ngoại Hạng Anh",
    "FA Cup",
    "Champions League",
    "LaLiga",
    "Copa del Rey",
    "Serie A",
    "Bundesliga",
    "Ligue 1",
    "V-League",
]

# Short league slug -> tournament id passed to the league-table endpoint.
LEAGUE_IDS = dict(
    nha=27110,
    laliga=27233,
    seriea=27044,
    bundesliga=26891,
    ligue1=27212,
)

# Highlights sections: slug -> page path on the highlights site, display name, emoji.
HL_LEAGUES = {
    "premier-league": {"path": "anh/premier-league", "name": "Premier League", "emoji": "🏴"},
    "fa-cup": {"path": "anh/fa-cup", "name": "FA Cup", "emoji": "🏆"},
    "bundesliga": {"path": "duc/bundesliga", "name": "Bundesliga", "emoji": "🇩🇪"},
    "serie-a": {"path": "italy/serie-a", "name": "Serie A", "emoji": "🇮🇹"},
    "la-liga": {"path": "tay-ban-nha/la-liga", "name": "La Liga", "emoji": "🇪🇸"},
    "champions-league": {"path": "cup-chau-au/uefa-champions-league", "name": "Champions League", "emoji": "⭐"},
    "europa-league": {"path": "cup-chau-au/uefa-europa-league", "name": "Europa League", "emoji": "🟠"},
    "world-cup": {"path": "the-gioi/world-cup-qualifiers", "name": "World Cup 2026", "emoji": "🌍"},
}
def _cached(key, fn, ttl=None):
    """Return the cached value for *key*, refreshing it via *fn* once it is stale.

    Args:
        key: Name of the cache slot in the module-level ``_cache`` dict.
        fn: Zero-argument callable that produces a fresh value.
        ttl: Maximum age in seconds. ``None`` selects the module default
            ``_cache_ttl``; ``0`` forces a refresh on every call.

    Returns:
        The fresh value; or, when *fn* raises, the previously cached value
        for *key* (``[]`` when nothing was cached yet).
    """
    now = time.time()
    # Compare against None explicitly so an explicit ttl=0 is honoured.
    max_age = _cache_ttl if ttl is None else ttl
    entry = _cache.get(key)
    if entry is not None and now - entry["t"] < max_age:
        return entry["d"]
    try:
        data = fn()
    except Exception:
        # Serve stale data (or an empty list) instead of failing the request.
        # The timestamp is still refreshed below, so a failing producer is
        # retried at most once per TTL.
        data = entry["d"] if entry is not None else []
    _cache[key] = {"d": data, "t": now}
    return data
def _get(url, headers=None):
    """Download *url* and return it parsed with BeautifulSoup's lxml parser.

    The response body is decoded as UTF-8. ``HEADERS`` is used when no
    (or an empty) *headers* mapping is supplied. The request uses a
    15-second timeout, and any ``requests`` exception propagates to the caller.
    """
    request_headers = HEADERS if not headers else headers
    response = requests.get(url, headers=request_headers, timeout=15)
    response.encoding = "utf-8"
    markup = response.text
    return BeautifulSoup(markup, "lxml")
def fetch_bongda_api(endpoint):
    """Call a bongda.com.vn JSON endpoint and return its ``html`` fragment.

    Args:
        endpoint: Path (and query string) appended to ``https://bongda.com.vn``.

    Returns:
        The ``html`` field of the response when the request succeeds and the
        payload reports ``status == "success"``; otherwise an empty string.
    """
    try:
        response = requests.get(
            f"https://bongda.com.vn{endpoint}", headers=BONGDA_HEADERS, timeout=10
        )
        if response.status_code != 200:
            return ""
        payload = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or a body that is not JSON.
        return ""
    if not isinstance(payload, dict) or payload.get("status") != "success":
        return ""
    # A JSON null for "html" is normalised to "" so callers always get a string.
    return payload.get("html") or ""
def _parse_match_from_li(li, status_type="live"):
    """Convert one fixture ``<li>`` node into a flat match dict.

    Args:
        li: Parsed list item expected to contain a ``div.match`` element.
        status_type: Label copied into the result's ``status`` field.

    Returns:
        A dict with home/away names, score, minute, league, kick-off time,
        event id, team logos and status; ``None`` when the item has no
        match box or is missing a team name.
    """
    box = li.select_one("div.match")
    if not box:
        return None
    home_name = box.select_one(".home-team .name")
    away_name = box.select_one(".away-team .name")
    if not (home_name and away_name):
        return None

    def _text(node):
        return node.get_text(strip=True) if node else ""

    def _logo_src(selector):
        tag = box.select_one(selector)
        return tag.get("src", "") if tag else ""

    event_id = ""
    score = ""
    minute = ""
    status_link = box.select_one(".status a")
    if status_link:
        # The match id is embedded in the status link's href.
        hit = re.search(r'/tran-dau/(\d+)/', status_link.get("href", ""))
        if hit:
            event_id = hit.group(1)
        parts = status_link.find_all("span")
        # Spans 0 and 2 hold the two scores; span 3, when present, the minute.
        if len(parts) >= 3:
            score = f"{parts[0].get_text(strip=True)} - {parts[2].get_text(strip=True)}"
        if len(parts) >= 4:
            minute = parts[3].get_text(strip=True)

    return {
        "home": home_name.get_text(strip=True),
        "away": away_name.get_text(strip=True),
        # Matches without a score are shown as "VS".
        "score": score or "VS",
        "minute": minute,
        # The league name is the nearest preceding <strong> in the document.
        "league": _text(li.find_previous("strong")),
        "time": _text(box.select_one(".match-time")),
        "event_id": event_id,
        "home_logo": _logo_src(".home-team .logo img"),
        "away_logo": _logo_src(".away-team .logo img"),
        "status": status_type,
    }
|
|
| |
def _rewrite_m3u8_playlist(playlist_text, base_url):
    """Point every URI line of an HLS playlist at this app's proxy routes.

    Relative URIs are resolved against *base_url* first. Entries whose path
    ends in ``.m3u8`` (variant playlists) go through the playlist proxy so
    their own segments get rewritten too; everything else goes through the
    segment proxy. Tag and blank lines are kept as they are.
    """
    rewritten = []
    # splitlines() also handles CRLF playlists without leaving stray "\r".
    for raw_line in playlist_text.strip().splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            # NOTE(review): URI="..." attributes inside tags (e.g. EXT-X-KEY)
            # are not rewritten here.
            rewritten.append(line)
            continue
        absolute = urljoin(base_url, line)
        is_playlist = absolute.split("?", 1)[0].lower().endswith(".m3u8")
        route = "/api/proxy/m3u8" if is_playlist else "/api/proxy/seg"
        rewritten.append(route + "?url=" + quote(absolute, safe=""))
    return "\n".join(rewritten)


@app.get("/api/proxy/m3u8")
def proxy_m3u8(url: str = Query(...)):
    """Fetch an HLS playlist and return it with its URIs routed through this app.

    Responds 502 when the upstream request fails or does not return 200.

    NOTE(security): the target URL is caller-supplied and fetched server-side
    without a host allow-list.
    """
    try:
        upstream = requests.get(url, headers=HEADERS, timeout=15)
    except requests.RequestException:
        return Response(status_code=502, content="proxy error")
    if upstream.status_code != 200:
        return Response(status_code=502, content="upstream error")
    # Resolve relative entries against the final URL (after any redirects).
    playlist = _rewrite_m3u8_playlist(upstream.text, upstream.url or url)
    return Response(
        content=playlist.encode("utf-8"),
        media_type="application/vnd.apple.mpegurl",
        headers={"Access-Control-Allow-Origin": "*", "Cache-Control": "public, max-age=300"},
    )
|
|
@app.get("/api/proxy/seg")
def proxy_segment(url: str = Query(...)):
    """Fetch one media segment and return it as ``video/mp2t``.

    When the payload starts with a PNG signature but has a transport-stream
    sync byte (0x47) at offset 188, the first 188 bytes are dropped before
    the data is returned. Responds 502 when the upstream request fails or
    does not return 200.
    """
    try:
        upstream = requests.get(url, headers=HEADERS, timeout=30)
    except requests.RequestException:
        return Response(status_code=502, content="proxy error")
    if upstream.status_code != 200:
        return Response(status_code=502, content="upstream error")
    payload = upstream.content
    if len(payload) > 188 and payload.startswith(b"\x89PNG") and payload[188] == 0x47:
        payload = payload[188:]
    return Response(
        content=payload,
        media_type="video/mp2t",
        headers={"Access-Control-Allow-Origin": "*", "Cache-Control": "public, max-age=3600"},
    )
|
|
@app.get("/api/proxy/video")
def proxy_video(url: str = Query(...), request: Request = None):
    """Stream a remote video through this app, forwarding HTTP range requests.

    The client's ``Range`` header is passed upstream, and the upstream status
    code, ``Content-Type``, ``Content-Range`` and ``Content-Length`` are
    mirrored back. Responds 502 when the upstream request cannot be made.
    """
    upstream_headers = dict(HEADERS)
    range_header = request.headers.get("range") if request is not None else None
    if range_header:
        upstream_headers["Range"] = range_header
    try:
        upstream = requests.get(url, headers=upstream_headers, timeout=30, stream=True)
    except requests.RequestException:
        return Response(status_code=502, content="proxy error")

    resp_headers = {
        "Access-Control-Allow-Origin": "*",
        "Accept-Ranges": "bytes",
        "Content-Type": upstream.headers.get("Content-Type", "video/mp4"),
    }
    for name in ("Content-Range", "Content-Length"):
        if name in upstream.headers:
            resp_headers[name] = upstream.headers[name]

    def _relay():
        # A streamed response keeps its connection checked out until it is
        # fully read or closed; close it even when the client stops early.
        try:
            yield from upstream.iter_content(chunk_size=256 * 1024)
        finally:
            upstream.close()

    return StreamingResponse(_relay(), status_code=upstream.status_code, headers=resp_headers)
|
|
@app.get("/api/proxy/img")
def proxy_img(url: str = Query(...)):
    """Proxy images from sources that block hotlinking (DanTri CDN).

    The request is sent with a dantri.com.vn ``Referer``. Responds 502 when
    the upstream request fails or does not return 200.
    """
    try:
        upstream = requests.get(
            url, headers={**HEADERS, "Referer": "https://dantri.com.vn/"}, timeout=10
        )
    except requests.RequestException:
        return Response(status_code=502)
    if upstream.status_code != 200:
        return Response(status_code=502)
    content_type = upstream.headers.get("Content-Type", "image/jpeg")
    # The URL is caller-supplied: never relay a non-image type (e.g. text/html)
    # from this origin.
    if not content_type.lower().startswith("image/"):
        content_type = "image/jpeg"
    return Response(
        content=upstream.content,
        media_type=content_type,
        headers={"Cache-Control": "public, max-age=86400", "Access-Control-Allow-Origin": "*"},
    )
|
|
| |
def _scrape_xemlaibongda_page(page_path, limit=20):
    """Collect highlight-video links from one xemlaibongda.top listing page.

    Args:
        page_path: Path below the site root; an empty value selects the home page.
        limit: Maximum number of videos to return.

    Returns:
        A list of ``{"title", "link", "img", "source"}`` dicts, de-duplicated
        by link; ``[]`` when the page cannot be fetched or parsed.
    """
    site_root = "https://xemlaibongda.top/"
    page_url = f"https://xemlaibongda.top/{page_path}" if page_path else site_root
    try:
        response = requests.get(page_url, headers=HEADERS, timeout=15)
        if response.status_code != 200:
            return []
        response.encoding = "utf-8"
        soup = BeautifulSoup(response.text, "lxml")
        videos = []
        seen = set()
        for anchor in soup.find_all("a", href=True):
            href = anchor.get("href", "")
            if "/video/" not in href:
                continue
            # urljoin leaves absolute links alone and also handles
            # protocol-relative ("//host/...") links correctly.
            href = urljoin(site_root, href)
            if href in seen:
                continue
            seen.add(href)
            slug = href.split("/video/")[-1].rstrip("/")
            # The title is derived from the slug: title-cased words, trailing
            # date digits removed, "V"/"Vs" separators normalised to "vs".
            title = slug.replace("-", " ").title()
            title = re.sub(r'\d{4}\s*\d{2}\s*\d{2}$', '', title).strip()
            title = re.sub(r'\s+V\s+', ' vs ', title)
            title = re.sub(r'\s+Vs\s+', ' vs ', title)
            img = anchor.find("img") or (anchor.parent.find("img") if anchor.parent else None)
            img_src = ""
            if img:
                img_src = img.get("data-src", "") or img.get("src", "") or img.get("data-lazy", "")
            if not img_src:
                img_src = f"https://img.refooty.com/thumbnail/{slug}.webp"
            videos.append({"title": title, "link": href, "img": img_src, "source": "xemlaibongda"})
            if len(videos) >= limit:
                break
        return videos
    except Exception:
        # Scraping is best-effort: any network or parsing failure yields an empty list.
        return []
|
|
def scrape_xemlaibongda():
    """Return up to 20 highlight videos from the xemlaibongda.top home page."""
    home_page = ""
    return _scrape_xemlaibongda_page(home_page, limit=20)
def scrape_highlights_by_league(league_key):
    """Return up to 20 highlight videos for one league.

    *league_key* must be a key of ``HL_LEAGUES``; unknown keys yield ``[]``.
    """
    league = HL_LEAGUES.get(league_key)
    if league is None:
        return []
    return _scrape_xemlaibongda_page(league["path"], 20)
|
|
def scrape_all_league_highlights():
    """Scrape highlights for every configured league concurrently.

    Returns:
        ``{league_key: [video, ...]}`` containing only leagues that produced
        at least one video. Keys follow the order of ``HL_LEAGUES``, so the
        result does not depend on which request happens to finish first.
    """
    with ThreadPoolExecutor(8) as pool:
        futures = {key: pool.submit(scrape_highlights_by_league, key) for key in HL_LEAGUES}
    # Leaving the with-block waits for all workers, so result() never blocks here.
    results = {}
    for key, future in futures.items():
        try:
            videos = future.result()
        except Exception:
            # One failing league must not discard the others.
            continue
        if videos:
            results[key] = videos
    return results
|
|
def extract_xemlaibongda_video(url):
    """Find the playable stream on a video page.

    A ``<video>`` element (its ``src`` or first ``<source>``) is preferred;
    otherwise the first ``.m3u8`` URL found anywhere in the page is used,
    with the ``og:image`` as poster.

    Returns:
        ``{"src", "poster", "type"}`` where ``type`` is ``"hls"`` or
        ``"video"``; ``None`` when nothing is found or the page cannot be
        fetched or parsed.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        if response.status_code != 200:
            return None
        response.encoding = "utf-8"
        page = response.text
        soup = BeautifulSoup(page, "lxml")

        video = soup.find("video")
        if video:
            src = video.get("src", "")
            if not src:
                source = video.find("source")
                if source:
                    src = source.get("src", "")
            if src:
                poster = video.get("poster", "")
                return {
                    # Resolve page-relative URLs so they stay valid outside the page.
                    "src": urljoin(url, src),
                    "poster": urljoin(url, poster) if poster else "",
                    "type": "hls" if ".m3u8" in src else "video",
                }

        m3u8s = re.findall(r'(https?://[^\s"\'<>]+\.m3u8)', page)
        if m3u8s:
            og = soup.find("meta", property="og:image")
            poster = og.get("content", "") if og else ""
            return {"src": m3u8s[0], "poster": poster, "type": "hls"}
        return None
    except Exception:
        # Best-effort extraction: any failure means "no video found".
        return None
|
|
| |
def _yt_channel_shorts(channel, count=15):
    """Fast scrape of a YouTube channel's shorts tab without yt-dlp.

    Args:
        channel: Channel handle (without the leading ``@``).
        count: Maximum number of items to return.

    Returns:
        Items in page order, each with ``title``, ``link``, ``img``,
        ``source``, ``id`` and ``channel``; ``[]`` on any failure.
    """
    # A JSON string body: any run of non-quote characters or backslash escapes.
    json_string = r'((?:[^"\\]|\\.)+)'

    def _decode(raw):
        # The page embeds JSON, so titles may carry escapes such as \" or \u0026.
        try:
            return json.loads('"' + raw + '"', strict=False)
        except ValueError:
            return raw

    try:
        url = f"https://www.youtube.com/@{channel}/shorts"
        response = requests.get(
            url, headers={**HEADERS, "Accept-Language": "vi,en;q=0.8"}, timeout=15
        )
        if response.status_code != 200:
            return []
        page = response.text
        seen = set()
        items = []
        for m in re.finditer(r'"videoId":"([A-Za-z0-9_-]{11})"', page):
            vid = m.group(1)
            if vid in seen:
                continue
            seen.add(vid)
            # Look for a title in a window of text around the id.
            # NOTE(review): the window starts before the id, so it may pick
            # up a neighbouring entry's title - verify against live markup.
            snip = page[max(0, m.start() - 900):m.start() + 1600]
            mt = re.search(r'"title":\{"runs":\[\{"text":"' + json_string + '"', snip)
            if not mt:
                mt = re.search(r'"accessibilityText":"' + json_string + '"', snip)
            title = ""
            if mt:
                title = html_lib.unescape(_decode(mt.group(1))).replace('\n', ' ').strip()
            if not title:
                title = "YouTube Short"
            items.append({
                "title": title,
                "link": f"https://www.youtube.com/watch?v={vid}",
                "img": f"https://i.ytimg.com/vi/{vid}/hqdefault.jpg",
                "source": "yt",
                "id": vid,
                "channel": channel,
            })
            if len(items) >= count:
                break
        return items
    except Exception:
        # Best-effort scrape; the caller falls back to static data.
        return []
def scrape_shorts():
    """Stable shorts feed: fast HTML scrape plus static fallback so the slide never disappears.

    Both channels are scraped concurrently, but results are merged in a
    fixed channel order so the feed does not reshuffle between refreshes.
    Scraped items come first, then ``SHORTS_FALLBACK``; duplicates (by id)
    are dropped and at most 40 items are returned.
    """
    channels = ("baodantri7941", "baosuckhoedoisongboyte")
    with ThreadPoolExecutor(2) as pool:
        futures = [pool.submit(_yt_channel_shorts, channel, 24) for channel in channels]
    scraped = []
    for future in futures:
        try:
            batch = future.result()
        except Exception:
            continue
        if batch:
            scraped.extend(batch)

    merged = []
    seen = set()
    for video in scraped + SHORTS_FALLBACK:
        vid = video.get("id")
        if not vid or vid in seen:
            continue
        seen.add(vid)
        merged.append(video)
    return merged[:40]
|
|
| |
@app.get("/api/livescore/live")
def api_livescore_live():
    """Return the live-fixtures HTML fragment, cached for ``_cache_ttl_live`` seconds."""
    def _load():
        return fetch_bongda_api("/api/fixtures/live")

    fragment = _cached("ls_live", _load, ttl=_cache_ttl_live)
    return JSONResponse({"html": fragment})
@app.get("/api/livescore/incoming")
def api_livescore_incoming():
    """Return the upcoming-fixtures HTML fragment, cached for ``_cache_ttl_live`` seconds."""
    def _load():
        return fetch_bongda_api("/api/fixtures/incoming")

    fragment = _cached("ls_incoming", _load, ttl=_cache_ttl_live)
    return JSONResponse({"html": fragment})
@app.get("/api/livescore/today")
def api_livescore_today():
    """Return today's fixtures as an HTML fragment, cached for ``_cache_ttl`` seconds.

    "Today" is computed in Vietnam time (UTC+7) instead of the server's
    local clock, so the date does not lag when the host runs in another zone.
    NOTE(review): assumes the upstream API keys fixtures by Vietnam local
    date - confirm.
    """
    today = datetime.now(timezone(timedelta(hours=7))).strftime("%Y-%m-%d")

    def _load():
        return fetch_bongda_api(f"/api/fixtures/get-by-date?date={today}")

    return JSONResponse({"html": _cached("ls_today", _load, ttl=_cache_ttl)})
@app.get("/api/livescore/results")
def api_livescore_results():
    """Return today's finished fixtures as an HTML fragment, cached for ``_cache_ttl`` seconds.

    "Today" is computed in Vietnam time (UTC+7) instead of the server's
    local clock, so the date does not lag when the host runs in another zone.
    NOTE(review): assumes the upstream API keys fixtures by Vietnam local
    date - confirm.
    """
    today = datetime.now(timezone(timedelta(hours=7))).strftime("%Y-%m-%d")

    def _load():
        return fetch_bongda_api(f"/api/fixtures/get-by-date?date={today}&status=finished")

    return JSONResponse({"html": _cached("ls_results", _load, ttl=_cache_ttl)})
@app.get("/api/livescore/standings/{league}")
def api_livescore_standings(league: str):
    """Return the league-table HTML fragment for *league*.

    Unknown league slugs fall back to tournament id 27110 (the ``nha``
    entry of ``LEAGUE_IDS``).
    """
    tid = LEAGUE_IDS.get(league, 27110)

    def _load():
        return fetch_bongda_api(f"/api/league-table/home?tournament_id={tid}&is_detail=True")

    # Key the cache by tournament id, not by the raw path parameter: arbitrary
    # slugs all map to the default table and must not each create a cache entry.
    return JSONResponse({"html": _cached(f"ls_bxh_{tid}", _load, ttl=_cache_ttl)})
@app.get("/api/livescore/date/{date}")
def api_livescore_date(date: str):
    """Return the fixtures HTML fragment for *date* (not cached).

    The value is percent-encoded before it is placed in the upstream query
    string, so characters such as ``&`` cannot add extra parameters.
    """
    safe_date = quote(date, safe="")
    fragment = fetch_bongda_api(f"/api/fixtures/get-by-date?date={safe_date}")
    return JSONResponse({"html": fragment})
@app.get("/api/match/{event_id}/commentaries")
def api_match_commentaries(event_id: int):
    """Return the commentary HTML fragment for one match (not cached)."""
    endpoint = f"/api/fixtures/commentaries?event_id={event_id}"
    fragment = fetch_bongda_api(endpoint)
    return JSONResponse({"html": fragment})
@app.get("/api/match/{event_id}/stats")
def api_match_stats(event_id: int):
    """Return the player-performance HTML fragment for one match (not cached)."""
    endpoint = f"/api/event-standing/player-performance?event_id={event_id}"
    fragment = fetch_bongda_api(endpoint)
    return JSONResponse({"html": fragment})
@app.get("/api/livescore/featured")
def api_livescore_featured():
    """Pick one match to feature, cached for 30 seconds.

    Sources are tried in order: live, today, upcoming. Within the first
    source that has usable matches, the first match from the
    highest-priority league (``PRIORITY_LEAGUES``) wins; otherwise the first
    match is used. The JSON body is ``null`` when no source has a match.

    "Today" is computed in Vietnam time (UTC+7) instead of the server's
    local clock. NOTE(review): assumes the upstream API keys fixtures by
    Vietnam local date - confirm.
    """
    def _pick_featured():
        today = datetime.now(timezone(timedelta(hours=7))).strftime("%Y-%m-%d")
        sources = (
            ("/api/fixtures/live", "live"),
            ("/api/fixtures/get-by-date?date=" + today, "today"),
            ("/api/fixtures/incoming", "upcoming"),
        )
        for endpoint, stype in sources:
            html = fetch_bongda_api(endpoint)
            # Very short fragments are treated as "no fixtures".
            if not html or len(html) < 100:
                continue
            soup = BeautifulSoup(html, "lxml")
            candidates = []
            for li in soup.select("li.match-detail"):
                match = _parse_match_from_li(li, stype)
                if not match or not match["event_id"]:
                    continue
                # In today's list, skip matches whose minute field contains "KT".
                if stype == "today" and "KT" in match.get("minute", ""):
                    continue
                candidates.append(match)
            if not candidates:
                continue
            for league_name in PRIORITY_LEAGUES:
                for match in candidates:
                    if league_name in match["league"]:
                        return match
            return candidates[0]
        return None

    return JSONResponse(_cached("ls_featured", _pick_featured, ttl=30))
|
|
| |
@app.get("/api/shorts")
def api_shorts():
    """Return the merged shorts feed, cached for ``_cache_ttl_yt`` seconds."""
    feed = _cached("yt_shorts_v3", scrape_shorts, ttl=_cache_ttl_yt)
    return JSONResponse(feed)
@app.get("/api/short-stats")
def api_short_stats(ids: str = Query(default="")):
    """Return stored counters and comments for a comma-separated list of short ids.

    Ids without a stored record get zeroed defaults. At most 80 comments are
    returned per id.
    """
    def _public_view(record):
        return {
            "views": int(record.get("views", 0)),
            "likes": int(record.get("likes", 0)),
            "shares": int(record.get("shares", 0)),
            "comments": record.get("comments", [])[:80],
        }

    wanted = [part for part in ids.split(",") if part]
    with _short_lock:
        db = _load_short_db()
        stats = {vid: _public_view(db.get(vid) or _short_default()) for vid in wanted}
    return JSONResponse({"stats": stats})
|
|
@app.post("/api/short-action")
async def api_short_action(request: Request):
    """Record a view, like, share or comment for one short and return its stats.

    Expects a JSON object with ``id``, ``action`` (``view``, ``like``,
    ``share`` or ``comment``) and, for comments, ``text`` (truncated to 180
    characters; the newest 80 comments are kept). Responds 400 when ``id``
    is missing. Unrecognised actions and empty comments change nothing and
    simply return the current stats, so they cannot create records for
    arbitrary ids.
    """
    try:
        body = await request.json()
    except Exception:
        body = {}
    # A valid JSON body that is not an object (e.g. a list) has no fields.
    if not isinstance(body, dict):
        body = {}
    vid = str(body.get("id", "")).strip()
    action = str(body.get("action", "")).strip()
    txt = str(body.get("text", "")).strip()
    if not vid:
        return JSONResponse({"error": "missing id"}, status_code=400)

    counters = {"view": "views", "like": "likes", "share": "shares"}
    with _short_lock:
        db = _load_short_db()
        st = db.get(vid) or _short_default()
        changed = True
        if action in counters:
            field = counters[action]
            st[field] = int(st.get(field, 0)) + 1
        elif action == "comment" and txt:
            comments = st.get("comments", [])
            comments.insert(0, {"text": txt[:180], "ts": int(time.time())})
            st["comments"] = comments[:80]
        else:
            changed = False
        if changed:
            st["updated"] = int(time.time())
            db[vid] = st
            _save_short_db(db)
        out = {
            "views": int(st.get("views", 0)),
            "likes": int(st.get("likes", 0)),
            "shares": int(st.get("shares", 0)),
            "comments": st.get("comments", [])[:80],
        }
    return JSONResponse({"stats": out})
|
|
@app.get("/api/highlights")
def api_highlights():
    """Return home-page highlight videos, cached for ``_cache_ttl`` seconds."""
    videos = _cached("xemlaibongda_hl", scrape_xemlaibongda, ttl=_cache_ttl)
    return JSONResponse(videos)
@app.get("/api/highlights/leagues")
def api_highlights_leagues():
    """Return highlight videos grouped by league, cached for ``_cache_ttl`` seconds."""
    by_league = _cached("hl_leagues", scrape_all_league_highlights, ttl=_cache_ttl)
    return JSONResponse(by_league)
@app.get("/api/highlights/{league}")
def api_highlights_league(league: str):
    """Return highlight videos for one league, cached for ``_cache_ttl`` seconds.

    Unknown league slugs get an ``{"error": "league not found"}`` body.
    """
    if league in HL_LEAGUES:
        def _load():
            return scrape_highlights_by_league(league)

        return JSONResponse(_cached(f"hl_{league}", _load, ttl=_cache_ttl))
    return JSONResponse({"error": "league not found"})
@app.get("/api/highlights_config")
def api_highlights_config():
    """Expose the ``HL_LEAGUES`` table (path, name, emoji per league) to the client."""
    config = HL_LEAGUES
    return JSONResponse(config)
@app.get("/api/video_url")
def api_video_url(url: str = Query(...)):
    """Resolve a page URL to something the player can load.

    * YouTube links become an embed URL (``type: "youtube"``).
    * xemlaibongda.top pages are scraped for their stream; HLS streams are
      routed through the playlist proxy.
    * bongdaplus.vn articles are resolved through their ``video-embed`` page
      and routed through the video proxy.

    Returns ``{"error": "not found"}`` when nothing can be resolved.
    """
    if "youtube.com" in url or "youtu.be" in url:
        m = re.search(r'(?:v=|shorts/|youtu\.be/)([a-zA-Z0-9_-]{11})', url)
        if m:
            vid = m.group(1)
            return JSONResponse({
                "src": f"https://www.youtube.com/embed/{vid}?autoplay=1&rel=0&enablejsapi=1",
                "poster": f"https://i.ytimg.com/vi/{vid}/hqdefault.jpg",
                "type": "youtube",
            })

    # This branch fetches the caller's URL server-side, so match on the parsed
    # host name: a substring test would accept "https://evil.example/?xemlaibongda.top".
    try:
        host = (urlparse(url).hostname or "").lower()
    except ValueError:
        host = ""
    if host == "xemlaibongda.top" or host.endswith(".xemlaibongda.top"):
        v = extract_xemlaibongda_video(url)
        if v:
            if v["type"] == "hls":
                v["src"] = "/api/proxy/m3u8?url=" + quote(v["src"], safe="")
            return JSONResponse(v)

    if "bongdaplus.vn" in url:
        try:
            # Only the numeric article id is taken from the URL; the request
            # itself always goes to BASE_BDP.
            m = re.search(r'-(\d{6,})\.html', url)
            if m:
                r = requests.get(f"{BASE_BDP}/video-embed/{m.group(1)}.html", headers=HEADERS, timeout=10)
                r.encoding = "utf-8"
                soup = BeautifulSoup(r.text, "lxml")
                video = soup.select_one("video#videoPlayer")
                if video:
                    source = video.find("source")
                    src = source.get("src", "") if source else ""
                    poster = video.get("poster", "")
                    if src:
                        return JSONResponse({
                            "src": "/api/proxy/video?url=" + quote(src, safe=""),
                            "poster": poster,
                            "type": "video",
                        })
        except Exception:
            # Best-effort lookup: fall through to the not-found reply.
            pass
    return JSONResponse({"error": "not found"})
@app.get("/api/bdp_videos")
def api_bdp_videos():
    """Return up to 20 video links scraped from the bongdaplus.vn video page (cached).

    Scrape failures are deliberately not caught here: ``_cached`` handles
    them by serving the previously cached list (or ``[]``), which keeps the
    last good result instead of replacing it with an empty one.
    """
    section_links = ("/video/", "/video/ban-thang-dep", "/video/highlight")

    def _scrape():
        soup = _get(f"{BASE_BDP}/video")
        arts = []
        seen = set()
        for a in soup.find_all("a", href=True):
            href = a.get("href", "")
            if "/video/" not in href or href in section_links:
                continue
            if not href.startswith("http"):
                href = BASE_BDP + href
            if href in seen:
                continue
            # Strip a leading "MM:SS" duration from the link text.
            title = re.sub(r'^\d{2}:\d{2}', '', a.get_text(strip=True)).strip()
            if not title or len(title) < 5:
                continue
            img_tag = a.find("img") or (a.parent.find("img") if a.parent else None)
            img = (img_tag.get("data-src") or img_tag.get("src", "")) if img_tag else ""
            seen.add(href)
            arts.append({"title": title, "link": href, "img": img, "source": "bdp"})
            if len(arts) >= 20:
                break
        return arts

    return JSONResponse(_cached("bdp_videos", _scrape))
| |
def scrape_vne(cat_url):
    """Scrape up to 15 article teasers from a category page.

    Looks at ``article.item-news`` elements and their ``title-news`` links.

    Returns:
        A list of ``{"title", "link", "img", "source": "vne"}`` dicts;
        ``[]`` when the page cannot be fetched or parsed.
    """
    try:
        soup = _get(cat_url)
        arts = []
        for item in soup.select("article.item-news")[:15]:
            link_tag = item.select_one("h2.title-news a") or item.select_one("h3.title-news a")
            if not link_tag:
                continue
            title = link_tag.get("title", "") or link_tag.get_text(strip=True)
            link = link_tag.get("href", "")
            if not title or not link:
                continue
            img_tag = item.find("img")
            img = (img_tag.get("data-src") or img_tag.get("src", "")) if img_tag else ""
            # A "blank" image URL is a placeholder: take the first <source> srcset candidate instead.
            if img and 'blank' in img:
                source_tag = item.find("source")
                if source_tag:
                    img = source_tag.get("srcset", "").split(",")[0].strip().split(" ")[0]
            arts.append({"title": title, "link": link, "img": img, "source": "vne"})
        return arts
    except Exception:
        # Best-effort scrape: any network or parsing failure yields an empty list.
        return []
def scrape_vne_article(url):
    """Scrape one article page into a structured body.

    Only direct children of ``article.fck_detail`` are considered:
    paragraphs, figures (first image) and h2/h3 headings.

    Returns:
        ``{"title", "summary", "og_image", "body", "source": "vne", "url"}``
        where ``body`` is a list of ``p`` / ``img`` / ``heading`` blocks;
        ``None`` when the page cannot be fetched or parsed.
    """
    try:
        soup = _get(url)
        h1 = soup.select_one("h1.title-detail")
        desc = soup.select_one("p.description")
        og = soup.find("meta", property="og:image")
        og_img = og.get("content", "") if og else ""
        body = []
        detail = soup.select_one("article.fck_detail")
        if detail:
            for child in detail.children:
                # Skip text nodes and other non-element children.
                name = getattr(child, "name", None)
                if not name:
                    continue
                if name == "p":
                    text = child.get_text(strip=True)
                    if text:
                        body.append({"type": "p", "text": text})
                elif name == "figure":
                    img_tag = child.find("img")
                    src = (img_tag.get("data-src") or img_tag.get("src", "")) if img_tag else ""
                    # An image block without a source would render as a broken image.
                    if src:
                        body.append({"type": "img", "src": src})
                elif name in ("h2", "h3"):
                    body.append({"type": "heading", "text": child.get_text(strip=True)})
        return {
            "title": h1.get_text(strip=True) if h1 else "",
            "summary": desc.get_text(strip=True) if desc else "",
            "og_image": og_img,
            "body": body,
            "source": "vne",
            "url": url,
        }
    except Exception:
        # Best-effort scrape: failure is reported as "no article".
        return None
def _scrape_dantri_homepage(cat_filter=None, require_img=True):
    """Scrape up to 15 article links from the dantri.com.vn home page.

    Args:
        cat_filter: When set, keep only links whose URL contains
            ``/<cat_filter>/``.
        require_img: When true, articles without a CDN image are skipped;
            when false they are kept with an empty ``img``.

    Returns:
        A list of ``{"title", "link", "img", "source": "dantri"}`` dicts with
        images routed through ``/api/proxy/img``; ``[]`` on any failure.
    """
    try:
        soup = _get("https://dantri.com.vn/")
        arts = []
        seen = set()
        for a in soup.find_all("a", href=True):
            href = a.get("href", "")
            title = a.get("title", "") or a.get_text(strip=True)
            if not title or len(title) < 15 or "javascript:" in href:
                continue
            if not href.startswith("http"):
                href = "https://dantri.com.vn" + href
            if href in seen or not href.endswith(".htm"):
                continue
            if cat_filter and f"/{cat_filter}/" not in href:
                continue
            img_tag = a.find("img")
            if not img_tag and a.parent:
                img_tag = a.parent.find("img")
            img_src = ""
            if img_tag:
                img_src = img_tag.get("data-src", "") or img_tag.get("src", "")
            if img_src and "cdn" in img_src:
                # CDN images are served through this app's image proxy.
                img = "/api/proxy/img?url=" + quote(img_src, safe="")
            elif require_img:
                continue
            else:
                img = ""
            seen.add(href)
            arts.append({"title": title, "link": href, "img": img, "source": "dantri"})
            if len(arts) >= 15:
                break
        return arts
    except Exception:
        # Best-effort scrape: any network or parsing failure yields an empty list.
        return []


def scrape_dantri_hot():
    """Return home-page articles from any category (CDN image required)."""
    return _scrape_dantri_homepage()


def scrape_dantri_congnghe():
    """Return home-page articles under ``/cong-nghe/``; articles without a CDN image are kept."""
    return _scrape_dantri_homepage(cat_filter="cong-nghe", require_img=False)
def scrape_genk_ai():
    """Scrape AI articles from genk.vn - readable in-app.

    Returns:
        Up to 30 ``{"title", "link", "img", "source": "genk"}`` dicts;
        ``[]`` when the listing cannot be fetched or parsed.
    """
    def _nearby_image(anchor):
        # Walk up to six ancestor levels looking for a mediacdn image that
        # is neither an avatar nor a logo.
        container = anchor.parent
        for _ in range(6):
            if container is None:
                break
            for img in container.find_all("img"):
                src = img.get("data-src", "") or img.get("src", "")
                if src and "mediacdn" in src and "avatar" not in src and "logo" not in src:
                    return src
            container = container.parent
        return ""

    def _og_image(article_url):
        # Fallback: one extra request per article to read its og:image.
        try:
            page = requests.get(article_url, headers=HEADERS, timeout=8)
            page.encoding = "utf-8"
            tag = BeautifulSoup(page.text, "lxml").find("meta", property="og:image")
            return tag.get("content", "") if tag else ""
        except Exception:
            return ""

    try:
        r = requests.get("https://genk.vn/ai.chn", headers=HEADERS, timeout=15)
        if r.status_code != 200:
            return []
        r.encoding = "utf-8"
        soup = BeautifulSoup(r.text, "lxml")
        articles = []
        seen = set()
        for a in soup.find_all("a", href=True):
            href = a.get("href", "")
            if not href.endswith(".chn") or href == "/ai.chn":
                continue
            if href.startswith("/"):
                href = "https://genk.vn" + href
            if href in seen or "genk.vn" not in href:
                continue
            title = a.get("title", "") or a.get_text(strip=True)
            if not title or len(title) < 20:
                continue
            img_src = _nearby_image(a)
            seen.add(href)
            if not img_src:
                img_src = _og_image(href)
            articles.append({"title": title, "link": href, "img": img_src, "source": "genk"})
            if len(articles) >= 30:
                break
        return articles
    except Exception:
        # Best-effort scrape: any network or parsing failure yields an empty list.
        return []
|
|
def scrape_dantri_article(url):
    """Scrape one Dan Tri article into a structured body.

    Scripts, styles, nav, footer and aside elements are removed first.
    Paragraphs longer than 15 characters, h2/h3 headings and images are
    collected from the main content. ``cdnphoto.dantri`` images are routed
    through ``/api/proxy/img``.

    Returns:
        ``{"title", "summary", "og_image", "body", "source": "dantri", "url"}``;
        ``None`` when the page cannot be fetched or parsed.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=15)
        r.encoding = "utf-8"
        soup = BeautifulSoup(r.text, "lxml")
        for tag in soup.find_all(["script", "style", "nav", "footer", "aside"]):
            tag.decompose()
        h1 = soup.find("h1")
        og = soup.find("meta", property="og:image")
        og_img = og.get("content", "") if og else ""
        if og_img and "cdnphoto.dantri" in og_img:
            og_img = "/api/proxy/img?url=" + quote(og_img, safe="")
        content = (
            soup.select_one("main")
            or soup.select_one("div.singular-content")
            or soup.select_one("article")
        )
        body = []
        # find_all matches a <figure> and the <img> inside it, so remember
        # emitted sources to avoid listing the same image twice.
        seen_imgs = set()
        if content:
            for el in content.find_all(["p", "h2", "h3", "figure", "img"], recursive=True):
                if el.name == "p":
                    text = el.get_text(strip=True)
                    if text and len(text) > 15:
                        body.append({"type": "p", "text": text})
                elif el.name in ("h2", "h3"):
                    text = el.get_text(strip=True)
                    if text:
                        body.append({"type": "heading", "text": text})
                else:
                    im = el if el.name == "img" else el.find("img")
                    if not im:
                        continue
                    src = im.get("data-src") or im.get("src", "")
                    # Skip missing sources, inline base64 data and repeats.
                    if not src or "base64" in src or src in seen_imgs:
                        continue
                    seen_imgs.add(src)
                    if "cdnphoto.dantri" in src:
                        src = "/api/proxy/img?url=" + quote(src, safe="")
                    body.append({"type": "img", "src": src})
        desc = ""
        sapo = soup.select_one("h2.singular-sapo") or soup.select_one("h2[class*=sapo]")
        if sapo:
            desc = sapo.get_text(strip=True)
        else:
            og_desc = soup.find("meta", property="og:description")
            if og_desc:
                desc = og_desc.get("content", "")
        return {
            "title": h1.get_text(strip=True) if h1 else "",
            "summary": desc,
            "og_image": og_img,
            "body": body,
            "source": "dantri",
            "url": url,
        }
    except Exception:
        # Best-effort scrape: failure is reported as "no article".
        return None
def scrape_bbc_vietnamese():
    """Scrape up to 15 article links from the BBC Vietnamese front page.

    Returns:
        A list of ``{"title", "link", "img", "source": "bbc"}`` dicts;
        ``[]`` when the page cannot be fetched or parsed.
    """
    skip_words = ["đăng nhập", "trang chủ", "bbc news"]
    try:
        r = requests.get(
            "https://www.bbc.com/vietnamese",
            headers={"User-Agent": "Mozilla/5.0", "Accept-Language": "en-GB"},
            timeout=15,
        )
        r.encoding = "utf-8"
        soup = BeautifulSoup(r.text, "lxml")
        arts = []
        seen = set()
        for a in soup.select("a[href*='/vietnamese/']"):
            href = a.get("href", "")
            # Drop the section root and links with fewer than three slashes.
            if not href or href == "/vietnamese" or href.count("/") < 3:
                continue
            if not href.startswith("http"):
                href = "https://www.bbc.com" + href
            if href in seen:
                continue
            title = a.get_text(strip=True)
            if not title or len(title) < 15 or any(word in title.lower() for word in skip_words):
                continue
            # Look for an image in the link's parent, up to three levels up.
            img = ""
            container = a.parent
            for _ in range(3):
                if container:
                    im = container.find("img")
                    if im:
                        img = im.get("src", "") or im.get("data-src", "")
                        break
                    container = container.parent
            seen.add(href)
            arts.append({"title": title, "link": href, "img": img, "source": "bbc"})
            if len(arts) >= 15:
                break
        return arts
    except Exception:
        # Best-effort scrape: any network or parsing failure yields an empty list.
        return []
def scrape_bbc_article(url):
    """Scrape one BBC article into a list of paragraph blocks.

    Paragraphs are taken from text blocks, ``<article>`` and ``<main>``;
    only those longer than 20 characters are kept.

    Returns:
        ``{"title", "summary": "", "og_image", "body", "source": "bbc", "url"}``;
        ``None`` when the page cannot be fetched or parsed.
    """
    try:
        r = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0", "Accept-Language": "en-GB"},
            timeout=15,
        )
        r.encoding = "utf-8"
        soup = BeautifulSoup(r.text, "lxml")
        h1 = soup.find("h1")
        og = soup.find("meta", property="og:image")
        og_img = og.get("content", "") if og else ""
        paragraphs = (
            p.get_text(strip=True)
            for p in soup.select("[data-component='text-block'] p, article p, main p")
        )
        body = [{"type": "p", "text": text} for text in paragraphs if text and len(text) > 20]
        return {
            "title": h1.get_text(strip=True) if h1 else "",
            "summary": "",
            "og_image": og_img,
            "body": body,
            "source": "bbc",
            "url": url,
        }
    except Exception:
        # Best-effort scrape: failure is reported as "no article".
        return None
|
|
def scrape_ttvh_worldcup():
    """Scrape all World Cup 2026 articles from The Thao Van Hoa RSS.

    The RSS feed is tried first. When it fails or is empty, the HTML
    section page is scraped instead (up to 24 articles).

    Returns:
        A list of ``{"title", "link", "img", "source": "ttvh"}`` dicts;
        ``[]`` when both sources fail.
    """
    def _child_text(item, tag_name, separator=""):
        node = item.find(tag_name)
        return node.get_text(separator, strip=True) if node else ""

    try:
        r = requests.get("https://thethaovanhoa.vn/rss/world-cup-2026.rss", headers=HEADERS, timeout=15)
        r.encoding = "utf-8"
        soup = BeautifulSoup(r.text, "xml")
        arts = []
        seen = set()
        for it in soup.find_all("item"):
            title = _child_text(it, "title")
            link = _child_text(it, "link")
            desc = _child_text(it, "description", " ")
            # The thumbnail is the first <img> inside the description's HTML.
            img = ""
            im = BeautifulSoup(desc, "lxml").find("img")
            if im:
                img = im.get("src", "") or im.get("data-src", "")
            if title and link and link not in seen:
                seen.add(link)
                arts.append({"title": title, "link": link, "img": img, "source": "ttvh"})
        if arts:
            return arts
    except Exception:
        # Fall through to the HTML scrape below.
        pass

    try:
        soup = _get("https://thethaovanhoa.vn/world-cup-2026.htm")
        arts = []
        seen = set()
        for a in soup.find_all("a", href=True):
            href = a.get("href", "")
            if not href.startswith("http"):
                href = "https://thethaovanhoa.vn" + href
            if href in seen or "thethaovanhoa.vn" not in href:
                continue
            # Article URLs end in "-<8+ digits>.htm".
            if not re.search(r"/[^/]+-\d{8,}\.htm", href):
                continue
            title = a.get("title", "") or a.get_text(" ", strip=True)
            # Search the link and up to four ancestors for an image.
            img = None
            node = a
            for _ in range(5):
                if node is None:
                    break
                img = node.find("img")
                if img:
                    break
                node = node.parent
            img_src = ""
            if img:
                img_src = (
                    img.get("data-src", "")
                    or img.get("src", "")
                    or img.get("data-original", "")
                    or img.get("data-thumb", "")
                )
                # Short link text: use the image's alt/title text as the headline.
                if len(title) < 15:
                    title = img.get("alt", "") or img.get("title", "") or title
            if not title or len(title) < 15:
                continue
            seen.add(href)
            arts.append({"title": title, "link": href, "img": img_src, "source": "ttvh"})
            if len(arts) >= 24:
                break
        return arts
    except Exception:
        # Best-effort scrape: any network or parsing failure yields an empty list.
        return []
|
|
def scrape_ttvh_article(url):
    """Scrape a The Thao Van Hoa article page into the common article dict.

    The body is taken from the first container that exists among
    ``.detail-content``, ``.content-detail``, ``article`` and ``main``:
    paragraphs longer than 20 characters (lines containing "Theo dõi" are
    dropped), h2/h3 headings, and images. When nothing is extracted, the
    og:description is used as a single paragraph.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys title, summary, og_image, body, source and url,
        or None when the page cannot be fetched or parsed.
    """
    try:
        soup = _get(url)

        def meta_content(prop):
            tag = soup.find("meta", property=prop)
            return tag.get("content", "") if tag else ""

        h1 = soup.find("h1")
        og_img = meta_content("og:image")
        fallback_title = meta_content("og:title")
        desc = meta_content("og:description")

        cd = None
        for selector in (".detail-content", ".content-detail", "article", "main"):
            cd = soup.select_one(selector)
            if cd:
                break

        body = []
        # find_all() yields a <figure> and then the <img> nested inside it, so
        # remember emitted sources to avoid listing the same image twice.
        seen_srcs = set()
        if cd:
            for el in cd.find_all(["p", "h2", "h3", "figure", "img"], recursive=True):
                if el.name == "p":
                    t = el.get_text(strip=True)
                    if t and len(t) > 20 and "Theo dõi" not in t:
                        body.append({"type": "p", "text": t})
                elif el.name in ("h2", "h3"):
                    t = el.get_text(strip=True)
                    if t:
                        body.append({"type": "heading", "text": t})
                else:
                    im = el if el.name == "img" else el.find("img")
                    if not im:
                        continue
                    src = im.get("data-src") or im.get("src", "") or im.get("data-original", "")
                    if src and "base64" not in src and src not in seen_srcs:
                        seen_srcs.add(src)
                        body.append({"type": "img", "src": src})
        if not body and desc:
            body = [{"type": "p", "text": desc}]
        return {
            "title": h1.get_text(strip=True) if h1 else fallback_title,
            "summary": desc,
            "og_image": og_img,
            "body": body,
            "source": "ttvh",
            "url": url,
        }
    except Exception:
        # Best-effort scraper: callers treat None as "article unavailable".
        return None
|
|
# VnExpress sections: slug -> (section URL, display name). Every section URL
# is the site root followed by the slug.
VNE_CATS = {
    slug: ("https://vnexpress.net/" + slug, label)
    for slug, label in (
        ("thoi-su", "Thời Sự"),
        ("the-gioi", "Thế Giới"),
        ("kinh-doanh", "Kinh Doanh"),
        ("the-thao", "Thể Thao"),
        ("giai-tri", "Giải Trí"),
        ("suc-khoe", "Sức Khỏe"),
        ("phap-luat", "Pháp Luật"),
        ("giao-duc", "Giáo Dục"),
        ("du-lich", "Du Lịch"),
        ("doi-song", "Đời Sống"),
    )
}
@app.get("/api/homepage")
def api_homepage():
    """Return the homepage feed: nine VnExpress sections plus BBC.

    All sources are scraped in parallel and every article is tagged with a
    "group" label (the section's display name, or "BBC"). A source that
    fails is skipped. The result goes through _cached under "homepage".
    """
    section_keys = (
        "thoi-su", "the-gioi", "kinh-doanh", "the-thao", "giai-tri",
        "phap-luat", "giao-duc", "du-lich", "doi-song",
    )

    def _f():
        collected = []
        with ThreadPoolExecutor(12) as pool:
            labels = {}
            for key in section_keys:
                section_url, label = VNE_CATS[key]
                labels[pool.submit(scrape_vne, section_url)] = label
            labels[pool.submit(scrape_bbc_vietnamese)] = "BBC"
            for done in as_completed(labels):
                try:
                    for art in done.result():
                        art["group"] = labels[done]
                        collected.append(art)
                except:
                    pass
        return collected

    return JSONResponse(_cached("homepage", _f))
@app.get("/api/category/{cat_id}")
def api_category(cat_id: str):
    """Return the article list for one category.

    Args:
        cat_id: "bbc", "cong-nghe", or a key of VNE_CATS.

    Returns:
        A JSON list of articles; VnExpress articles get a "group" field
        holding the section's display name. Unknown ids yield an empty list.
    """
    # Answer unknown ids directly so arbitrary path values never reach
    # _cached (presumably each distinct key would otherwise get its own
    # cache entry; confirm against _cached).
    if cat_id not in ("bbc", "cong-nghe") and cat_id not in VNE_CATS:
        return JSONResponse([])

    def _f():
        if cat_id == "bbc":
            return scrape_bbc_vietnamese()
        if cat_id == "cong-nghe":
            return scrape_genk_ai()
        section_url, label = VNE_CATS[cat_id]
        arts = scrape_vne(section_url)
        for art in arts:
            art["group"] = label
        return arts

    return JSONResponse(_cached(f"cat_{cat_id}", _f))
@app.get("/api/categories")
def api_categories():
    """List every selectable category: BBC, GenK tech, then all VNE_CATS sections."""
    fixed = [
        {"id": "bbc", "name": "BBC Tiếng Việt", "source": "bbc"},
        {"id": "cong-nghe", "name": "Công Nghệ", "source": "genk"},
    ]
    vne = [
        {"id": slug, "name": label, "source": "vne"}
        for slug, (_url, label) in VNE_CATS.items()
    ]
    return JSONResponse(fixed + vne)
@app.get("/api/dantri_hot")
def api_dantri_hot():
    """Return the result of scrape_dantri_hot, served through _cached."""
    payload = _cached("dantri_hot", scrape_dantri_hot)
    return JSONResponse(payload)
@app.get("/api/genk_ai")
def api_genk_ai():
    """Return the result of scrape_genk_ai, cached with the default TTL."""
    payload = _cached("genk_ai", scrape_genk_ai, ttl=_cache_ttl)
    return JSONResponse(payload)
@app.get("/api/worldcup2026")
def api_worldcup2026():
    """Return the World Cup 2026 article list, cached with the default TTL."""
    payload = _cached("ttvh_worldcup", scrape_ttvh_worldcup, ttl=_cache_ttl)
    return JSONResponse(payload)
def scrape_genk_article(url):
    """Fetch a GenK article and convert it into the common article dict.

    The body is read from the ``.knc-content`` container: paragraphs longer
    than 15 characters, h2/h3 headings, and images (``data-src`` preferred
    over ``src``; inline base64 images are skipped).

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys title, summary, og_image, body, source and url,
        or None when the page cannot be fetched or parsed.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=15)
        r.encoding = "utf-8"
        soup = BeautifulSoup(r.text, "lxml")

        def meta_content(prop):
            tag = soup.find("meta", property=prop)
            return tag.get("content", "") if tag else ""

        h1 = soup.find("h1")
        og_img = meta_content("og:image")
        fallback_title = meta_content("og:title")
        desc = meta_content("og:description")

        body = []
        # find_all() yields a <figure> and then the <img> nested inside it, so
        # remember emitted sources to avoid listing the same image twice.
        seen_srcs = set()
        cd = soup.select_one(".knc-content")
        if cd:
            for el in cd.find_all(["p", "h2", "h3", "figure", "img"], recursive=True):
                if el.name == "p":
                    t = el.get_text(strip=True)
                    if t and len(t) > 15:
                        body.append({"type": "p", "text": t})
                elif el.name in ("h2", "h3"):
                    t = el.get_text(strip=True)
                    if t:
                        body.append({"type": "heading", "text": t})
                else:
                    im = el if el.name == "img" else el.find("img")
                    if not im:
                        continue
                    s = im.get("data-src") or im.get("src", "")
                    if s and "base64" not in s and s not in seen_srcs:
                        seen_srcs.add(s)
                        body.append({"type": "img", "src": s})

        # og:title was fetched as a fallback but never used, which left the
        # title empty on pages without a usable <h1>.
        title = (h1.get_text(strip=True) if h1 else "") or fallback_title
        return {
            "title": title,
            "summary": desc,
            "og_image": og_img,
            "body": body,
            "source": "genk",
            "url": url,
        }
    except Exception:
        # Best-effort scraper: callers treat None as "article unavailable".
        return None
|
|
@app.get("/api/article")
def api_article(url: str = Query(...)):
    """Scrape one article from a supported site and return it as JSON.

    The scraper is picked by looking for the site's domain inside the URL.
    Unsupported URLs and failed scrapes both yield {"error": "not supported"}.
    """
    data = None
    if "vnexpress.net" in url:
        data = scrape_vne_article(url)
    elif "bbc.com" in url:
        data = scrape_bbc_article(url)
    elif "dantri.com.vn" in url:
        data = scrape_dantri_article(url)
    elif "genk.vn" in url:
        data = scrape_genk_article(url)
    elif "thethaovanhoa.vn" in url:
        data = scrape_ttvh_article(url)

    if not data:
        return JSONResponse({"error": "not supported"})
    return JSONResponse(data)
def _web_context(topic):
    """Collect real web/news context for a topic.

    Google News RSS headlines (up to 8, each with its source name appended)
    are preferred. If none are collected, DuckDuckGo HTML results (up to 6
    "title — snippet" lines) are used. Every line is capped at 280
    characters. Returns the lines joined by newlines, possibly "".
    """
    snippets = []

    try:
        feed_url = "https://news.google.com/rss/search?q=" + quote(topic) + "&hl=vi&gl=VN&ceid=VN:vi"
        resp = requests.get(feed_url, headers=HEADERS, timeout=12)
        resp.encoding = "utf-8"
        feed = BeautifulSoup(resp.text, "xml")
        for item in feed.find_all("item")[:8]:
            title_tag = item.find("title")
            source_tag = item.find("source")
            headline = title_tag.get_text(" ", strip=True) if title_tag else ""
            outlet = source_tag.get_text(" ", strip=True) if source_tag else ""
            if not headline:
                continue
            if outlet:
                headline = headline + " — " + outlet
            snippets.append(headline[:280])
    except:
        pass
    if snippets:
        return "\n".join(snippets)

    try:
        resp = requests.get(
            "https://html.duckduckgo.com/html/?q=" + quote(topic),
            headers=HEADERS,
            timeout=12,
        )
        resp.encoding = "utf-8"
        page = BeautifulSoup(resp.text, "lxml")
        for hit in page.select(".result")[:6]:
            head = hit.select_one(".result__title")
            snip = hit.select_one(".result__snippet")
            head_text = head.get_text(" ", strip=True) if head else ""
            snip_text = snip.get_text(" ", strip=True) if snip else ""
            line = (head_text + " — " + snip_text).strip(" —")
            if line:
                snippets.append(line[:280])
    except:
        pass
    return "\n".join(snippets)
|
|
def _jina_read(url):
    """Read a page through the Jina Reader proxy and return an article dict.

    The reader's plain-text response starts with metadata lines ("Title:",
    "Image:", "Description:", ...) followed by the page content. Metadata is
    taken from the first 40 lines; every other line longer than 40
    characters becomes a body paragraph (at most 80 are kept).

    Args:
        url: Target page, with or without an http(s) scheme.

    Returns:
        A dict with the keys title, summary, og_image, body, source and url,
        or None when the reader request fails or returns nothing.
    """
    try:
        # The reader takes the target URL appended verbatim. Prefixing
        # "http://" unconditionally produced ".../http://https://..." for
        # URLs that already carry a scheme, so add one only when missing.
        has_scheme = url.lower().startswith(("http://", "https://"))
        target = url if has_scheme else "http://" + url
        r = requests.get("https://r.jina.ai/" + target, headers=HEADERS, timeout=25)
        r.encoding = "utf-8"
        if r.status_code != 200 or not r.text:
            return None

        lines = [x.rstrip() for x in r.text.splitlines()]
        title = ""
        img = ""
        summary = ""
        for ln in lines[:40]:
            if ln.startswith("Title:"):
                title = ln[len("Title:"):].strip()
            elif ln.startswith("Image:"):
                img = ln[len("Image:"):].strip()
            elif ln.startswith("Description:"):
                summary = ln[len("Description:"):].strip()

        metadata_prefixes = (
            "Title:", "URL Source:", "Published Time:",
            "Markdown Content:", "Image:", "Description:",
        )
        body = []
        for ln in lines:
            t = ln.strip()
            if not t or t.startswith(metadata_prefixes):
                continue
            if len(t) > 40:
                body.append({"type": "p", "text": t})
        if not body and summary:
            body = [{"type": "p", "text": summary}]
        return {
            "title": title or url,
            "summary": summary,
            "og_image": img,
            "body": body[:80],
            "source": "jina",
            "url": url,
        }
    except Exception:
        # Best-effort fallback reader: callers treat None as "unreadable".
        return None
|
|
def _scrape_generic_article(url):
    """Scrape an article from an arbitrary site, with Jina Reader as backup.

    The page is fetched directly and parsed from ``<article>``, ``<main>``
    or ``<body>``: paragraphs longer than 35 characters, h2/h3 headings and
    images. _jina_read() is used instead when the response is an error or
    not HTML, when no body could be extracted, or when anything raises.

    Args:
        url: Address of the page to read.

    Returns:
        A dict with the keys title, summary, og_image, body, source and url,
        or None when both the direct scrape and the reader fail.
    """
    try:
        hdr = {**HEADERS, "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"}
        r = requests.get(url, headers=hdr, timeout=15)
        r.encoding = "utf-8"
        ct = r.headers.get("content-type", "").lower()
        if r.status_code >= 400 or "text/html" not in ct:
            jr = _jina_read(url)
            if jr:
                return jr
            # Reader failed too: fall through and parse whatever came back.

        soup = BeautifulSoup(r.text, "lxml")
        # Drop page chrome so its text does not end up in the article body.
        for tag in soup.find_all(["script", "style", "nav", "footer", "aside", "form"]):
            tag.decompose()

        def meta_content(prop):
            found = soup.find("meta", property=prop)
            return found.get("content", "") if found else ""

        h1 = soup.find("h1")
        title = h1.get_text(strip=True) if h1 else meta_content("og:title")
        desc = meta_content("og:description")
        img = meta_content("og:image")

        main = soup.find("article") or soup.find("main") or soup.body
        body = []
        # find_all() yields a <figure> and then the <img> nested inside it, so
        # remember emitted sources to avoid listing the same image twice.
        seen_srcs = set()
        if main:
            for el in main.find_all(["p", "h2", "h3", "figure", "img"], recursive=True):
                if el.name == "p":
                    t = el.get_text(" ", strip=True)
                    if t and len(t) > 35:
                        body.append({"type": "p", "text": t})
                elif el.name in ("h2", "h3"):
                    t = el.get_text(" ", strip=True)
                    if t:
                        body.append({"type": "heading", "text": t})
                else:
                    im = el if el.name == "img" else el.find("img")
                    if not im:
                        continue
                    src = im.get("data-src") or im.get("src", "") or im.get("data-original", "")
                    if src and "base64" not in src and src not in seen_srcs:
                        seen_srcs.add(src)
                        body.append({"type": "img", "src": src})

        if not body:
            jr = _jina_read(url)
            if jr and jr.get("body"):
                return jr
        if not body and desc:
            body = [{"type": "p", "text": desc}]
        return {
            "title": title or url,
            "summary": desc,
            "og_image": img,
            "body": body,
            "source": "generic",
            "url": url,
        }
    except Exception:
        return _jina_read(url)
|
|
def _article_by_url(url):
    """Scrape an article with the site-specific scraper, else the generic one.

    The first entry whose domain occurs anywhere in the URL wins; order
    matters only if a URL mentions more than one of the domains.
    """
    site_scrapers = (
        ("vnexpress.net", scrape_vne_article),
        ("bbc.com", scrape_bbc_article),
        ("dantri.com.vn", scrape_dantri_article),
        ("genk.vn", scrape_genk_article),
        ("thethaovanhoa.vn", scrape_ttvh_article),
    )
    for domain, scraper in site_scrapers:
        if domain in url:
            return scraper(url)
    return _scrape_generic_article(url)
|
|
| def _call_qwen(prompt, max_tokens=1800): |
| """Try Qwen2.5-VL via HF router; return None if unavailable.""" |
| try: |
| token=os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACEHUB_API_TOKEN") or os.environ.get("VAISTUDIO") |
| if not token:return None |
| headers={"Authorization":"Bearer "+token,"Content-Type":"application/json"} |
| payload={"model":"Qwen/Qwen2.5-VL-7B-Instruct","messages":[{"role":"user","content":prompt}],"max_tokens":max_tokens,"temperature":0.7} |
| r=requests.post("https://router.huggingface.co/v1/chat/completions",headers=headers,json=payload,timeout=75) |
| if r.status_code>=300:return None |
| j=r.json();return j.get("choices",[{}])[0].get("message",{}).get("content") |
| except:return None |
|
|
| def _collect_article_text(data, limit=28000): |
| title=(data or {}).get("title","");summary=(data or {}).get("summary","") |
| parts=[] |
| if summary:parts.append(summary) |
| for b in (data or {}).get("body",[]): |
| if b.get("type")=="heading":parts.append("## "+b.get("text","") ) |
| elif b.get("type")=="p":parts.append(b.get("text","") ) |
| text="\n".join([p.strip() for p in parts if p and p.strip()]) |
| return title,text[:limit] |
|
|
def _ai_rewrite_article(data, tone="tu-nhien"):
    """Rewrite an article with the LLM, or build a plain digest without it.

    The model's answer is used when it is longer than 300 characters.
    Otherwise the result is assembled from the article itself: a heading
    line, the first paragraph, up to 18 paragraphs of body, and up to five
    bullets (each cut to 220 characters). Only paragraphs longer than 30
    characters are considered.
    """
    title, text = _collect_article_text(data)
    prompt = "".join([
        "Bạn là biên tập viên báo điện tử tiếng Việt. Hãy viết lại bài dưới đây bằng ngôn ngữ tự nhiên, mạch lạc, không cắt khúc, không bỏ ý quan trọng. ",
        "Giữ đúng sự thật, không bịa, không thêm thông tin ngoài bài. Văn phong: ",
        tone,
        ". ",
        "Đầu ra gồm: tiêu đề hấp dẫn, đoạn sapo 2-3 câu, các đoạn nội dung ngắn dễ đọc, và 3 gạch đầu dòng điểm chính.\n\n",
        "TIÊU ĐỀ GỐC: ",
        title,
        "\n\nNỘI DUNG GỐC:\n",
        text,
    ])
    answer = _call_qwen(prompt, 2200)
    if answer and len(answer) > 300:
        return answer.strip()

    # Model unavailable or answer too short: assemble a digest locally.
    paragraphs = []
    for raw in text.split("\n"):
        cleaned = raw.strip()
        if len(cleaned) > 30:
            paragraphs.append(cleaned)

    lead = paragraphs[0] if paragraphs else ""
    main_text = "\n\n".join(paragraphs[:18])
    bullet_lines = []
    for para in paragraphs[:5]:
        suffix = "..." if len(para) > 220 else ""
        bullet_lines.append("• " + para[:220] + suffix)
    bullets = "\n".join(bullet_lines)

    digest = "Bản tin AI viết lại: " + title + "\n\n" + lead + "\n\n" + main_text + "\n\nĐiểm chính:\n" + bullets
    return digest.strip()
|
|
| def _image_for_topic(topic): |
| return "https://image.pollinations.ai/prompt/"+quote("editorial illustration, Vietnamese news, "+topic,safe="")+"?width=1024&height=576&nologo=true" |
|
|
def _topic_articles(topic, limit=5):
    """Return up to `limit` Google News RSS hits for a topic.

    At most limit*3 feed items are inspected. Items without a title or link
    and repeated links are skipped. Each hit is a dict with the keys title,
    link and source. Any failure ends the search and returns what was
    collected so far.
    """
    hits = []
    known_links = set()
    try:
        feed_url = "https://news.google.com/rss/search?q=" + quote(topic) + "&hl=vi&gl=VN&ceid=VN:vi"
        resp = requests.get(feed_url, headers=HEADERS, timeout=12)
        resp.encoding = "utf-8"
        feed = BeautifulSoup(resp.text, "xml")
        for item in feed.find_all("item")[:limit * 3]:
            title_tag = item.find("title")
            link_tag = item.find("link")
            source_tag = item.find("source")
            title = title_tag.get_text(" ", strip=True) if title_tag else ""
            link = link_tag.get_text(strip=True) if link_tag else ""
            outlet = source_tag.get_text(" ", strip=True) if source_tag else ""
            if not (title and link) or link in known_links:
                continue
            known_links.add(link)
            hits.append({"title": title, "link": link, "source": outlet})
            if len(hits) >= limit:
                break
    except:
        pass
    return hits
|
|
def _topic_article_context(topic):
    """Filter readable article sources by topic, then summarize actual article bodies.

    Candidates come from the GenK, Dan Tri tech and World Cup listings. The
    first 40 are downloaded and scored by how many topic keywords occur in
    their title plus excerpt; the five best become the context.

    Args:
        topic: Free-text topic; words longer than two characters that are
            not stop words become the keywords.

    Returns:
        A (context, image_url) tuple. The context holds one
        "BÀI / URL / NỘI DUNG LỌC" chunk per selected article and the image
        is the first one available among them. When no article matches, the
        result is (_web_context(topic), "").
    """
    stop = {"trong", "năm", "the", "and", "của", "cho", "với", "một", "các", "những", "hiện", "nay"}
    raw_keys = [k.lower() for k in re.findall(r"[\wÀ-ỹ]+", topic) if len(k) > 2]
    keys = [k for k in raw_keys if k not in stop]

    candidates = []
    seen = set()
    # Lambdas keep each scraper lookup inside the try, so a missing or
    # failing source is skipped without affecting the others.
    sources = (
        lambda: scrape_genk_ai(),
        lambda: scrape_dantri_congnghe(),
        lambda: scrape_ttvh_worldcup(),
    )
    for load in sources:
        try:
            items = load()
        except Exception:
            continue
        for a in items or []:
            link = a.get("link", "")
            if not link or link in seen:
                continue
            seen.add(link)
            candidates.append(a)

    def fetch(item):
        # One unreadable article must not abort the whole topic.
        try:
            return _article_by_url(item.get("link", ""))
        except Exception:
            return None

    # Download in parallel: up to 40 sequential requests made a single call
    # take minutes. map() keeps the results in candidate order.
    shortlist = candidates[:40]
    with ThreadPoolExecutor(8) as pool:
        pages = list(pool.map(fetch, shortlist))

    scored = []
    for a, data in zip(shortlist, pages):
        if not data or not data.get("body"):
            continue
        title = data.get("title") or a.get("title", "")
        ps = [
            b.get("text", "")
            for b in data.get("body", [])
            if b.get("type") == "p" and len(b.get("text", "")) > 40
        ]
        excerpt = " ".join(ps)[:1800] or data.get("summary", "")
        hay = (title + " " + excerpt).lower()
        score = sum(1 for k in keys if k in hay)
        if keys and score == 0:
            continue
        # With several keywords, a single hit only counts when two adjacent
        # keywords also appear together as a phrase.
        if len(keys) >= 2 and score < 2 and not any(
            " ".join(keys[i:i + 2]) in hay for i in range(len(keys) - 1)
        ):
            continue
        scored.append((score, title, a.get("link", ""), excerpt, data.get("og_image") or a.get("img", "") or ""))

    scored = sorted(scored, key=lambda x: x[0], reverse=True)[:5]
    img = ""
    chunks = []
    for _score, title, link, excerpt, im in scored:
        if not img and im:
            img = im
        chunks.append("BÀI: " + title + "\nURL: " + link + "\nNỘI DUNG LỌC: " + excerpt)
    if chunks:
        return "\n\n".join(chunks), img
    return _web_context(topic), ""
|
|
def _topic_post_text(topic, ctx=None):
    """Write the wall-post text for a topic from scraped source material.

    Args:
        topic: Free-text topic to write about.
        ctx: Source context already gathered for the topic. When None it is
            collected here with _topic_article_context(), so existing
            single-argument callers behave as before.

    Returns:
        The model's article when it answers with more than 300 characters;
        otherwise a templated digest of the context (cut to 3500
        characters), or a short notice when there is no context at all.
    """
    if ctx is None:
        ctx, _img = _topic_article_context(topic)
    prompt = ("Bạn là cây bút báo điện tử tiếng Việt. Hãy lọc các thông tin thực tế trong những nguồn dưới đây để viết một bài tóm tắt theo chủ đề: " + topic +
              ". Không viết chung chung. Chỉ dùng dữ kiện có trong nguồn; nếu nguồn khác nhau thì tổng hợp khách quan. "
              "Đầu ra gồm: tiêu đề, sapo, các ý chính theo bullet, phần phân tích ngắn và kết luận.\n\nNGUỒN THỰC TẾ:\n" + ctx)
    out = _call_qwen(prompt, 1800)
    if out and len(out) > 300:
        return out.strip()
    if ctx:
        return "Bài tóm tắt theo chủ đề: " + topic + "\n\nDữ liệu thực tế đã lọc:\n" + ctx[:3500] + "\n\nTóm tắt: Các nguồn trên cho thấy chủ đề này đang có nhiều diễn biến đáng chú ý. Khi viết bài, nên nêu rõ bối cảnh, các điểm mới, tác động thực tế và những điều còn cần kiểm chứng."
    return "Chưa thu thập được dữ liệu đủ rõ cho chủ đề: " + topic


@app.get("/api/wall")
def api_wall():
    """Return the 50 first posts of the wall (new posts are inserted at the front)."""
    return JSONResponse({"posts": _load_wall()[:50]})


@app.post("/api/rewrite_share")
async def api_rewrite_share(request: Request):
    """Rewrite the article at a URL with the LLM and put it on the wall.

    Expects a JSON object with "url" and an optional "tone". Responds 400
    when the url is missing and 422 when the article has no title or no
    content; otherwise returns the stored post.
    """
    from fastapi.concurrency import run_in_threadpool

    try:
        body = await request.json()
    except Exception:
        body = {}
    if not isinstance(body, dict):
        # A JSON array or scalar has no .get(); treat it like an empty body.
        body = {}
    url = str(body.get("url", "")).strip()
    tone = str(body.get("tone", "tu-nhien")).strip()
    if not url:
        return JSONResponse({"error": "missing url"}, status_code=400)

    # Scraping and the LLM call block for a long time; run them in a worker
    # thread so the event loop keeps serving other requests.
    data = await run_in_threadpool(_article_by_url, url)
    if not data or not data.get("title") or (not data.get("body") and not data.get("summary")):
        return JSONResponse({"error": "Không đọc được bài viết"}, status_code=422)
    text = await run_in_threadpool(_ai_rewrite_article, data, tone)

    post = {
        "id": hashlib.md5((url + str(time.time())).encode()).hexdigest()[:12],
        "url": url,
        "title": data.get("title", ""),
        "img": data.get("og_image", "") or "",
        "text": text,
        "ts": int(time.time()),
        "source": data.get("source", ""),
    }
    # No await between load and save, so concurrent posts cannot interleave here.
    posts = _load_wall()
    posts.insert(0, post)
    _save_wall(posts)
    return JSONResponse({"post": post})


@app.post("/api/topic_post")
async def api_topic_post(request: Request):
    """Write an AI post about a topic and put it on the wall.

    Expects a JSON object with "topic". Responds 400 when it is missing;
    otherwise returns the stored post. The post image is the first image
    found in the source articles, else a generated illustration URL.
    """
    from fastapi.concurrency import run_in_threadpool

    try:
        body = await request.json()
    except Exception:
        body = {}
    if not isinstance(body, dict):
        # A JSON array or scalar has no .get(); treat it like an empty body.
        body = {}
    topic = str(body.get("topic", "")).strip()
    if not topic:
        return JSONResponse({"error": "missing topic"}, status_code=400)

    def build():
        # Gather the sources once and reuse them for both image and text;
        # previously the whole scrape ran twice per request.
        ctx, ctx_img = _topic_article_context(topic)
        return ctx_img, _topic_post_text(topic, ctx)

    # Blocking network work goes to a worker thread, off the event loop.
    ctx_img, text = await run_in_threadpool(build)

    post = {
        "id": hashlib.md5((topic + str(time.time())).encode()).hexdigest()[:12],
        "url": "",
        "title": topic,
        "img": ctx_img or _image_for_topic(topic),
        "text": text,
        "ts": int(time.time()),
        "source": "ai-topic",
    }
    # No await between load and save, so concurrent posts cannot interleave here.
    posts = _load_wall()
    posts.insert(0, post)
    _save_wall(posts)
    return JSONResponse({"post": post})
|
|
@app.post("/api/url_wall")
async def api_url_wall(request: Request):
    """Rewrite the article at a URL in a short, natural tone and post it.

    Expects a JSON object with "url". Responds 400 when it is missing and
    422 when the page yields no title; otherwise returns the stored post.
    """
    from fastapi.concurrency import run_in_threadpool

    try:
        body = await request.json()
    except Exception:
        body = {}
    if not isinstance(body, dict):
        # A JSON array or scalar has no .get(); treat it like an empty body.
        body = {}
    url = str(body.get("url", "")).strip()
    if not url:
        return JSONResponse({"error": "missing url"}, status_code=400)

    # Scraping and the LLM call block for a long time; run them in a worker
    # thread so the event loop keeps serving other requests.
    data = await run_in_threadpool(_article_by_url, url)
    if not data or not data.get("title"):
        return JSONResponse({"error": "Không đọc được URL"}, status_code=422)
    text = await run_in_threadpool(_ai_rewrite_article, data, "ngan-gon-tu-nhien")

    post = {
        "id": hashlib.md5((url + str(time.time())).encode()).hexdigest()[:12],
        "url": url,
        "title": data.get("title", ""),
        "img": data.get("og_image", "") or "",
        "text": text,
        "ts": int(time.time()),
        "source": data.get("source", ""),
    }
    # No await between load and save, so concurrent posts cannot interleave here.
    posts = _load_wall()
    posts.insert(0, post)
    _save_wall(posts)
    return JSONResponse({"post": post})
|
|
@app.get("/v")
async def video_share(url:str=Query(default=""),title:str=Query(default="VNEWS Video"),img:str=Query(default=""),type:str=Query(default="highlights")):
    """Serve a share page that hands a video to the app and redirects to it.

    When a url is given, the page stores {"url", "type"} as JSON under the
    localStorage key "pending_video" before redirecting to SPACE_URL; without
    one it only redirects.

    Args:
        url: Video URL (an extra level of percent-encoding is decoded).
        title: Page title (decoded the same way).
        img: Accepted for link compatibility; not used.
        type: Video kind stored with the URL.
    """
    decoded_url = unquote(url)
    decoded_title = unquote(title)

    def js_literal(value):
        # JSON is valid JavaScript. Escaping <, > and & additionally stops
        # request data from closing the <script> element or opening markup.
        encoded = json.dumps(value)
        return encoded.replace("<", "\\u003c").replace(">", "\\u003e").replace("&", "\\u0026")

    if decoded_url:
        # All values here come from the query string, so they are embedded as
        # an escaped JSON literal instead of being pasted between quotes.
        payload = js_literal({"url": decoded_url, "type": type})
        redirect_script = (
            '<script>localStorage.setItem("pending_video",JSON.stringify('
            + payload
            + '));location.href="' + SPACE_URL + '";</script>'
        )
    else:
        redirect_script = '<script>location.href="' + SPACE_URL + '";</script>'

    page = (
        '<!DOCTYPE html><html><head><meta charset="utf-8"><title>'
        + html_lib.escape(decoded_title)
        + '</title></head><body style="background:#111;color:#fff;text-align:center;padding:40px"><p>⏳</p>'
        + redirect_script
        + '</body></html>'
    )
    return HTMLResponse(page)
@app.get("/s")
async def share_redirect(url:str=Query(default=""),title:str=Query(default="VNEWS"),img:str=Query(default="")):
    """Serve a share page that hands an article to the app and redirects to it.

    When a url is given, the page stores it under the localStorage key
    "pending_article" before redirecting to SPACE_URL; without one it only
    redirects.

    Args:
        url: Article URL (an extra level of percent-encoding is decoded).
        title: Page title (decoded the same way).
        img: Accepted for link compatibility; not used.
    """
    decoded_url = unquote(url)

    def js_literal(value):
        # JSON is valid JavaScript. Escaping <, > and & additionally stops
        # request data from closing the <script> element or opening markup.
        encoded = json.dumps(value)
        return encoded.replace("<", "\\u003c").replace(">", "\\u003e").replace("&", "\\u0026")

    if decoded_url:
        # The URL comes from the query string, so it is embedded as an
        # escaped JavaScript string literal instead of pasted between quotes.
        redirect_script = (
            '<script>localStorage.setItem("pending_article",'
            + js_literal(decoded_url)
            + ');location.href="' + SPACE_URL + '";</script>'
        )
    else:
        redirect_script = '<script>location.href="' + SPACE_URL + '";</script>'

    page = (
        '<!DOCTYPE html><html><head><meta charset="utf-8"><title>'
        + html_lib.escape(unquote(title))
        + '</title></head><body>'
        + redirect_script
        + '</body></html>'
    )
    return HTMLResponse(page)
@app.get("/")
async def index():
    """Serve the single-page front end from /app/static/index.html."""
    with open("/app/static/index.html", "r", encoding="utf-8") as page_file:
        markup = page_file.read()
        return HTMLResponse(content=markup)
# Expose the files under /app/static at the /static URL prefix.
app.mount(
    "/static",
    StaticFiles(directory="/app/static"),
    name="static",
)
|
|