VNEWS

Runtime error

App Files Files Community

bep40 committed 3 days ago

Commit

0cc3b1f

verified ·

1 Parent(s): a043586

Fix topic scraping to use real article bodies; fix subtitle timing and font

Browse files

Files changed (1) hide show

ai_fix2.py +196 -30

ai_fix2.py CHANGED Viewed

@@ -1,4 +1,5 @@
-import os, re, subprocess, html as html_lib
 import requests
 import ai_patch as prev
 from ai_patch import app
@@ -12,6 +13,127 @@ def clean(s):
     return re.sub(r"\s+", " ", html_lib.unescape(s or "")).strip()
 def sentence_split(text):
     text = re.sub(r"^[•\-\*]\s*", "", text or "", flags=re.M)
     text = re.sub(r"\n+", ". ", text)
@@ -29,19 +151,62 @@ def srt_time(sec):
     return f"{sec//3600:02d}:{(sec%3600)//60:02d}:{sec%60:02d},{ms:03d}"
 def write_weighted_srt(script, path, total_duration):
     subs = sentence_split(script)
     if not subs:
         subs = [clean(script)[:140] or "VNEWS"]
     total_chars = max(1, sum(len(x) for x in subs))
-    usable = max(2.0, float(total_duration) - 0.8)
-    cur = 0.35
     with open(path, "w", encoding="utf-8") as f:
         for i, s in enumerate(subs, 1):
-            dur = max(1.4, min(6.0, usable * len(s) / total_chars))
             start = cur
-            end = min(total_duration - 0.1, cur + dur)
-            cur = end + 0.08
             f.write(f"{i}\n{srt_time(start)} --> {srt_time(end)}\n{s}\n\n")
             if cur >= total_duration - 0.2:
                 break
@@ -58,16 +223,16 @@ def tts_script_full(post, emotion):
         "energetic": "Cập nhật nổi bật.",
     }.get(emotion, "")
     script = f"{prefix} {title}. {text}".strip()
-    # Keep full summary, but prevent extremely long generation from hanging forever.
-    if len(script) > 2600:
-        cut = max(script[:2600].rfind("."), script[:2600].rfind("!"), script[:2600].rfind("?"))
-        script = script[:cut + 1] if cut > 1000 else script[:2600]
     script = re.sub(r"([\.\!\?])\s*", r"\1\n", script)
     script = re.sub(r"\n{2,}", "\n", script).strip()
     return script
-# Remove previous duplicated routes and add corrected versions last.
 _PATCH = {('/api/topic_post','POST'),('/api/ai/short/{post_id}','POST'),('/api/ai/short-file/{file_id}','GET'),('/','GET')}
 app.router.routes = [r for r in app.router.routes if not any(getattr(r,'path',None)==p and m in getattr(r,'methods',set()) for p,m in _PATCH)]
@@ -78,22 +243,18 @@ async def topic_post_aggregate(request: Request):
     topic = base._clean_text(body.get('topic',''))
     if not topic:
         return JSONResponse({'error':'missing topic'}, status_code=400)
-    articles = prev._topic_source_articles(topic, limit=5)
     if not articles:
-        return JSONResponse({'error':'Không lấy được bài viết nguồn cho chủ đề này.'}, status_code=422)
     source_blocks = []
     sources = []
     image = ""
     for i, art in enumerate(articles, 1):
         raw = art.get('raw','')
-        if len(raw) < 80:
-            continue
-        source_blocks.append(f"[Nguồn {i}] {art.get('title','')} ({art.get('via','')})\n{raw[:2600]}")
         sources.append(art.get('source') or {'title': art.get('title'), 'url': art.get('url'), 'via': art.get('via'), 'excerpt': raw[:600]})
         if not image and art.get('image'):
             image = art.get('image')
-    if not source_blocks:
-        return JSONResponse({'error':'Các nguồn tìm được không có đủ nội dung để tóm tắt.'}, status_code=422)
     ctx = "\n\n".join(source_blocks)
     prompt = f"""Bạn là biên tập viên tổng hợp tin tức tiếng Việt.
@@ -110,8 +271,8 @@ NHIỆM VỤ:
 - Cuối cùng thêm dòng: Nguồn tham khảo: tên website.
 Nội dung nguồn:
-{ctx[:15000]}"""
-    text = await base.qwen_generate(prompt, image_url=image or None, max_tokens=1000)
     text = prev._postprocess_ai_text(text, max_units=7)
     if 'Nguồn tham khảo:' not in text:
         text += '\n\n' + prev._source_line(sources)
@@ -134,38 +295,45 @@ async def ai_short_full(post_id: str, request: Request):
     if not post:
         return JSONResponse({'error':'post not found'}, status_code=404)
     os.makedirs(base.SHORTS_DIR, exist_ok=True)
-    suffix = f"_{voice}_{emotion}_{str(speed).replace('.', 'p')}_full"
     out_mp4 = os.path.join(base.SHORTS_DIR, base._safe_name(post_id + suffix) + '.mp4')
     if os.path.exists(out_mp4):
         post['video'] = '/api/ai/short-file/' + post_id + suffix
         base._save_ai_wall(posts)
         return JSONResponse({'video': post['video'], 'speed': speed, 'subtitles': True})
     work = os.path.join(base.SHORTS_DIR, base._safe_name(post_id + suffix)); os.makedirs(work, exist_ok=True)
-    img = os.path.join(work,'image.jpg'); frame = os.path.join(work,'frame.jpg'); audio = os.path.join(work,'voice.mp3'); audio_fast=os.path.join(work,'voice_fast.mp3'); srt=os.path.join(work,'subtitles.srt')
     try:
         base._download_image(post.get('img'), post.get('title','AI news'), img)
         prev._make_short_frame_full(post, img, frame)
         script = tts_script_full(post, emotion)
         edge_voice = {'nam':'vi-VN-NamMinhNeural','male':'vi-VN-NamMinhNeural','nu':'vi-VN-HoaiMyNeural','female':'vi-VN-HoaiMyNeural','mien-nam':'vi-VN-HoaiMyNeural'}.get(voice,'vi-VN-HoaiMyNeural')
         try:
-            subprocess.run(['python','-m','edge_tts','--voice',edge_voice,'--text',script,'--write-media',audio], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=240)
         except Exception:
             tld = 'com.vn' if voice in ('nu','female','mien-nam') else 'com'
             try:
                 base.gTTS(script, lang='vi', tld=tld, slow=False).save(audio)
             except TypeError:
                 base.gTTS(script, lang='vi', slow=False).save(audio)
-        subprocess.run(['ffmpeg','-y','-i',audio,'-filter:a',f'atempo={speed}','-vn',audio_fast], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=180)
         duration = 45.0
         try:
             pr = subprocess.run(['ffprobe','-v','error','-show_entries','format=duration','-of','default=noprint_wrappers=1:nokey=1',audio_fast], stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20)
             duration = float((pr.stdout or b'45').decode().strip() or 45)
         except Exception:
             pass
-        write_weighted_srt(script, srt, duration)
-        vf = "scale=1080:1920,subtitles='{}':force_style='FontName=DejaVu Sans,FontSize=22,PrimaryColour=&H00FFFFFF,OutlineColour=&HAA000000,BorderStyle=1,Outline=2,Shadow=0,Alignment=2,MarginV=55'".format(srt.replace("'", "\\'"))
         cmd = ['ffmpeg','-y','-loop','1','-i',frame,'-i',audio_fast,'-shortest','-c:v','libx264','-tune','stillimage','-pix_fmt','yuv420p','-c:a','aac','-b:a','128k','-vf',vf,out_mp4]
-        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=360)
         post['video'] = '/api/ai/short-file/' + post_id + suffix
         post['short_voice'] = voice; post['short_emotion'] = emotion; post['short_speed'] = speed; post['short_subtitles'] = True
         base._save_ai_wall(posts)
@@ -182,7 +350,6 @@ def ai_short_file_full(file_id: str):
     return FileResponse(path, media_type='video/mp4', filename=f'vnews-ai-{file_id}.mp4')
-# Use previous front-end injection but override topic alert wording and progress text.
 app.router.routes = [r for r in app.router.routes if not (getattr(r,'path',None)=='/' and 'GET' in getattr(r,'methods',set()))]
 @app.get('/')
@@ -192,8 +359,7 @@ async def index_fix2():
     inject = prev.PATCH_INJECT + r'''
 <script>
 (function(){
-const oldCreateTopic=window.createTopicPost;
-window.createTopicPost=function(){let inp=document.getElementById('ai-topic-input');let topic=(inp&&inp.value||'').trim();if(!topic)return alert('Nhập chủ đề trước');fetch('/api/topic_post',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({topic})}).then(r=>r.json().then(j=>({ok:r.ok,j}))).then(({ok,j})=>{if(ok&&j.post){window.location.reload();alert('Đã tổng hợp nội dung nhiều bài nguồn thành 1 bản tóm tắt trên Tường AI');}else alert(j.error||'Lỗi tạo bài')}).catch(e=>alert(e.message||'Lỗi tạo bài'));};
 })();
 </script>
 '''

+import os, re, subprocess, html as html_lib, json
+from urllib.parse import quote_plus, urlparse, parse_qs, unquote
 import requests
 import ai_patch as prev
 from ai_patch import app
     return re.sub(r"\s+", " ", html_lib.unescape(s or "")).strip()
def _is_real_article_text(raw):
    """Return True when *raw* looks like a full article body.

    The text is first normalised with ``clean``. It qualifies only when the
    normalised text is at least 500 characters long and contains at least
    five sentences longer than 45 characters each, which filters out
    search-result pages and title-only snippets.
    """
    body = clean(raw)
    if len(body) < 500:
        return False
    # Split after sentence-ending punctuation that is followed by whitespace.
    pieces = re.split(r"(?<=[\.\!\?])\s+", body)
    substantial = sum(1 for piece in pieces if len(piece) > 45)
    return substantial >= 5
def _extract_ddg_url(href):
    """Return the destination URL behind a DuckDuckGo result link.

    Args:
        href: Raw ``href`` value of a result anchor; may be empty or ``None``.

    Returns:
        ``""`` for a falsy *href*. For a ``duckduckgo.com/l/`` redirect link,
        the target carried in its ``uddg`` query parameter. Otherwise *href*
        itself, with ``https:`` prepended when it was protocol-relative
        (``//host/...``).
    """
    if not href:
        return ""
    if href.startswith("//"):
        href = "https:" + href
    if "duckduckgo.com/l/" in href:
        try:
            targets = parse_qs(urlparse(href).query).get("uddg")
        except ValueError:
            # Malformed link: fall through and hand the href back unchanged.
            targets = None
        if targets:
            # parse_qs has already percent-decoded the value once; decoding
            # it a second time would corrupt targets that contain literal
            # escapes (e.g. "%2520" must become "%20", not a space).
            return targets[0]
    return href
def _ddg_article_urls(topic, limit=12):
    """Search DuckDuckGo's HTML endpoint and collect result URLs for *topic*.

    Args:
        topic: Search topic; a fixed Vietnamese suffix is appended to the query.
        limit: Maximum number of URLs to return.

    Returns:
        A list of distinct ``http``/``https`` result URLs in result order,
        excluding search and social-media sites. Empty when the request or
        the parsing fails.
    """
    # Sites whose pages are not scrapable articles. Matched against the parsed
    # hostname (exact or sub-domain) so that e.g. "linux.com" or "vox.com" is
    # not rejected just because the URL text contains "x.com".
    blocked_sites = ("google.com", "youtube.com", "facebook.com", "x.com", "twitter.com")
    urls = []
    try:
        q = quote_plus(topic + " tin tức bài viết phân tích")
        r = requests.get("https://html.duckduckgo.com/html/?q=" + q, headers=base.HEADERS, timeout=18)
        r.encoding = "utf-8"
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(r.text, "lxml")
        for a in soup.select("a.result__a"):
            u = _extract_ddg_url(a.get("href", ""))
            if not u.startswith("http"):
                continue
            try:
                host = urlparse(u).hostname or ""
            except ValueError:
                # Unparseable result URL: skip it, keep the remaining results.
                continue
            if any(host == site or host.endswith("." + site) for site in blocked_sites):
                continue
            if u not in urls:
                urls.append(u)
            if len(urls) >= limit:
                break
    except Exception:
        # Best effort: a failed search yields whatever was collected so far
        # and lets the caller fall back to its other sources.
        pass
    return urls
def _rss_article_urls(topic, limit=10):
    """Query the Google News RSS search feed (Vietnamese edition) for *topic*.

    Returns a list of dicts with ``title``, ``url``, ``via`` and ``excerpt``
    keys, built from at most *limit* feed items. Items without a title or
    a link are skipped. ``via`` is the item's ``<source>`` text when present,
    otherwise ``base._domain(link)``. Any failure is swallowed and the
    entries gathered so far are returned.
    """
    found = []
    try:
        feed_url = "https://news.google.com/rss/search?q=" + quote_plus(topic) + "&hl=vi&gl=VN&ceid=VN:vi"
        resp = requests.get(feed_url, headers=base.HEADERS, timeout=15)
        resp.encoding = "utf-8"
        from bs4 import BeautifulSoup
        feed = BeautifulSoup(resp.text, "xml")
        for item in feed.find_all("item")[:limit]:
            title_tag = item.find("title")
            link_tag = item.find("link")
            source_tag = item.find("source")
            title = title_tag.get_text(" ", strip=True) if title_tag else ""
            link = link_tag.get_text(strip=True) if link_tag else ""
            via = source_tag.get_text(" ", strip=True) if source_tag else base._domain(link)
            if not (title and link):
                continue
            # The feed carries no body text, so the title doubles as excerpt.
            found.append({"title": title, "url": link, "via": via, "excerpt": title})
    except Exception:
        pass
    return found
def _topic_source_articles(topic, limit=5):
    """Scrape actual article bodies for *topic*; title-only sources are rejected.

    Candidate URLs are gathered, de-duplicated, from three places in order:
    DuckDuckGo results, ``base.web_context`` sources, then the Google News
    RSS feed. At most the first 24 candidates are scraped with
    ``base.scrape_any_url``; a page is kept only if its summary plus text
    passes ``_is_real_article_text``. Scraping stops once *limit* articles
    have been collected.

    Returns a list of dicts with ``title``, ``url``, ``raw``, ``image``,
    ``via`` and ``source`` keys.
    """
    queue = []
    known = set()

    def enqueue(entry):
        # Queue a source dict unless its URL is missing, non-http or a repeat.
        link = entry.get("url") or ""
        if link.startswith("http") and link not in known:
            known.add(link)
            queue.append(entry)

    # 1) DuckDuckGo result URLs come first.
    for link in _ddg_article_urls(topic, limit=14):
        if link in known:
            continue
        known.add(link)
        queue.append({"url": link, "title": "", "via": base._domain(link)})

    # 2) Sources reported by base.web_context; failures here are ignored.
    try:
        _ctx, extra_sources = base.web_context(topic, limit=8)
        for entry in extra_sources or []:
            enqueue(entry)
    except Exception:
        pass

    # 3) Google News RSS entries go last.
    for entry in _rss_article_urls(topic, limit=10):
        enqueue(entry)

    articles = []
    for entry in queue[:24]:
        link = entry.get("url") or ""
        try:
            page = base.scrape_any_url(link)
            body = (page.get("summary", "") + "\n" + page.get("text", "")).strip()
            if not _is_real_article_text(body):
                continue
            headline = page.get("title") or entry.get("title") or link
            origin = page.get("via") or entry.get("via") or base._domain(link)
            articles.append({
                "title": headline,
                "url": link,
                "raw": body,
                "image": page.get("image") or "",
                "via": origin,
                "source": {"title": headline, "url": link, "excerpt": body[:700], "via": origin},
            })
            if len(articles) >= limit:
                break
        except Exception:
            # A page that cannot be scraped is skipped; try the next candidate.
            continue
    return articles[:limit]
 def sentence_split(text):
     text = re.sub(r"^[•\-\*]\s*", "", text or "", flags=re.M)
     text = re.sub(r"\n+", ". ", text)
     return f"{sec//3600:02d}:{(sec%3600)//60:02d}:{sec%60:02d},{ms:03d}"
def parse_timecode(t):
    """Convert a subtitle timecode string to seconds.

    Accepts ``HH:MM:SS.mmm``, ``MM:SS.mmm`` or plain seconds; the fractional
    separator may be ``.`` (WebVTT) or ``,`` (SRT).

    Args:
        t: Timecode string, e.g. ``"00:00:01.234"`` or ``"00:00:01,234"``.

    Returns:
        The time in seconds (``float`` for every well-formed input).

    Raises:
        ValueError: If a field is not numeric, or if there are more than
            three ``:``-separated fields. Previously such input silently
            returned only the first field as seconds.
    """
    parts = t.replace(',', '.').split(':')
    if len(parts) > 3:
        raise ValueError(f"invalid timecode: {t!r}")
    if len(parts) == 3:
        return int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
    if len(parts) == 2:
        return int(parts[0]) * 60 + float(parts[1])
    return float(parts[0])
def convert_vtt_to_scaled_srt(vtt_path, srt_path, speed=1.2):
    """Rewrite a subtitle file as SRT with its timings divided by *speed*.

    Every line containing ``-->`` is treated as a cue timing line; the
    following non-blank lines are that cue's text. Only the first
    whitespace-separated token on each side of ``-->`` is parsed, so
    trailing cue settings are ignored. Cues with empty text are dropped.

    Args:
        vtt_path: Path of the subtitle file to read (UTF-8).
        srt_path: Path of the SRT file to write (UTF-8).
        speed: Factor by which the audio was sped up; cue start and end
            times are divided by it.

    Returns:
        True if at least one cue was written, False if no cue was found or
        anything failed, so the caller can fall back to another method.
    """
    try:
        # Read inside a context manager so the handle is always closed.
        with open(vtt_path, 'r', encoding='utf-8') as src:
            txt = src.read().splitlines()
        cues = []
        i = 0
        while i < len(txt):
            line = txt[i].strip()
            if '-->' in line:
                a, b = [x.strip().split()[0] for x in line.split('-->')[:2]]
                start = parse_timecode(a) / speed
                end = parse_timecode(b) / speed
                i += 1
                texts = []
                # Gather the cue text up to the next blank line or end of file.
                while i < len(txt) and txt[i].strip():
                    texts.append(txt[i].strip())
                    i += 1
                s = clean(' '.join(texts))
                if s:
                    cues.append((start, end, s))
            i += 1
        if not cues:
            return False
        with open(srt_path, 'w', encoding='utf-8') as f:
            for idx, (st, en, s) in enumerate(cues, 1):
                if en <= st:
                    # Give zero or negative length cues a visible duration.
                    en = st + 1.2
                f.write(f"{idx}\n{srt_time(st)} --> {srt_time(en)}\n{s}\n\n")
        return True
    except Exception:
        # Deliberately best effort: any read, parse or write problem
        # (including speed == 0) is reported as failure.
        return False
 def write_weighted_srt(script, path, total_duration):
     subs = sentence_split(script)
     if not subs:
         subs = [clean(script)[:140] or "VNEWS"]
     total_chars = max(1, sum(len(x) for x in subs))
+    usable = max(2.0, float(total_duration) - 1.0)
+    cur = 0.5
     with open(path, "w", encoding="utf-8") as f:
         for i, s in enumerate(subs, 1):
+            dur = max(1.8, min(7.0, usable * len(s) / total_chars))
             start = cur
+            end = min(total_duration - 0.15, cur + dur)
+            cur = end + 0.18
             f.write(f"{i}\n{srt_time(start)} --> {srt_time(end)}\n{s}\n\n")
             if cur >= total_duration - 0.2:
                 break
         "energetic": "Cập nhật nổi bật.",
     }.get(emotion, "")
     script = f"{prefix} {title}. {text}".strip()
+    # Keep complete wall summary. Only trim pathological payloads, on sentence boundary.
+    if len(script) > 3600:
+        tmp = script[:3600]
+        cut = max(tmp.rfind("."), tmp.rfind("!"), tmp.rfind("?"))
+        script = tmp[:cut + 1] if cut > 1600 else tmp
     script = re.sub(r"([\.\!\?])\s*", r"\1\n", script)
     script = re.sub(r"\n{2,}", "\n", script).strip()
     return script
 _PATCH = {('/api/topic_post','POST'),('/api/ai/short/{post_id}','POST'),('/api/ai/short-file/{file_id}','GET'),('/','GET')}
 app.router.routes = [r for r in app.router.routes if not any(getattr(r,'path',None)==p and m in getattr(r,'methods',set()) for p,m in _PATCH)]
     topic = base._clean_text(body.get('topic',''))
     if not topic:
         return JSONResponse({'error':'missing topic'}, status_code=400)
+    articles = _topic_source_articles(topic, limit=5)
     if not articles:
+        return JSONResponse({'error':'Không scrape được nội dung bài viết thật cho chủ đề này. Hãy thử chủ đề cụ thể hơn hoặc dán URL trực tiếp.'}, status_code=422)
     source_blocks = []
     sources = []
     image = ""
     for i, art in enumerate(articles, 1):
         raw = art.get('raw','')
+        source_blocks.append(f"[Nguồn {i}] {art.get('title','')} ({art.get('via','')})\n{raw[:3000]}")
         sources.append(art.get('source') or {'title': art.get('title'), 'url': art.get('url'), 'via': art.get('via'), 'excerpt': raw[:600]})
         if not image and art.get('image'):
             image = art.get('image')
     ctx = "\n\n".join(source_blocks)
     prompt = f"""Bạn là biên tập viên tổng hợp tin tức tiếng Việt.
 - Cuối cùng thêm dòng: Nguồn tham khảo: tên website.
 Nội dung nguồn:
+{ctx[:16000]}"""
+    text = await prev.base.qwen_generate(prompt, image_url=image or None, max_tokens=1100)
     text = prev._postprocess_ai_text(text, max_units=7)
     if 'Nguồn tham khảo:' not in text:
         text += '\n\n' + prev._source_line(sources)
     if not post:
         return JSONResponse({'error':'post not found'}, status_code=404)
     os.makedirs(base.SHORTS_DIR, exist_ok=True)
+    suffix = f"_{voice}_{emotion}_{str(speed).replace('.', 'p')}_fullv2"
     out_mp4 = os.path.join(base.SHORTS_DIR, base._safe_name(post_id + suffix) + '.mp4')
     if os.path.exists(out_mp4):
         post['video'] = '/api/ai/short-file/' + post_id + suffix
         base._save_ai_wall(posts)
         return JSONResponse({'video': post['video'], 'speed': speed, 'subtitles': True})
     work = os.path.join(base.SHORTS_DIR, base._safe_name(post_id + suffix)); os.makedirs(work, exist_ok=True)
+    img = os.path.join(work,'image.jpg'); frame = os.path.join(work,'frame.jpg'); audio = os.path.join(work,'voice.mp3'); audio_fast=os.path.join(work,'voice_fast.mp3'); srt=os.path.join(work,'subtitles.srt'); vtt=os.path.join(work,'subtitles.vtt')
     try:
         base._download_image(post.get('img'), post.get('title','AI news'), img)
         prev._make_short_frame_full(post, img, frame)
         script = tts_script_full(post, emotion)
         edge_voice = {'nam':'vi-VN-NamMinhNeural','male':'vi-VN-NamMinhNeural','nu':'vi-VN-HoaiMyNeural','female':'vi-VN-HoaiMyNeural','mien-nam':'vi-VN-HoaiMyNeural'}.get(voice,'vi-VN-HoaiMyNeural')
+        used_edge = False
         try:
+            subprocess.run(['python','-m','edge_tts','--voice',edge_voice,'--text',script,'--write-media',audio,'--write-subtitles',vtt], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=260)
+            used_edge = True
         except Exception:
             tld = 'com.vn' if voice in ('nu','female','mien-nam') else 'com'
             try:
                 base.gTTS(script, lang='vi', tld=tld, slow=False).save(audio)
             except TypeError:
                 base.gTTS(script, lang='vi', slow=False).save(audio)
+        subprocess.run(['ffmpeg','-y','-i',audio,'-filter:a',f'atempo={speed}','-vn',audio_fast], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=220)
         duration = 45.0
         try:
             pr = subprocess.run(['ffprobe','-v','error','-show_entries','format=duration','-of','default=noprint_wrappers=1:nokey=1',audio_fast], stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20)
             duration = float((pr.stdout or b'45').decode().strip() or 45)
         except Exception:
             pass
+        if used_edge and os.path.exists(vtt):
+            ok = convert_vtt_to_scaled_srt(vtt, srt, speed=speed)
+            if not ok:
+                write_weighted_srt(script, srt, duration)
+        else:
+            write_weighted_srt(script, srt, duration)
+        vf = "scale=1080:1920,subtitles='{}':force_style='FontName=DejaVu Sans,FontSize=16,PrimaryColour=&H00FFFFFF,OutlineColour=&HAA000000,BorderStyle=1,Outline=1.5,Shadow=0,Alignment=2,MarginV=42'".format(srt.replace("'", "\\'"))
         cmd = ['ffmpeg','-y','-loop','1','-i',frame,'-i',audio_fast,'-shortest','-c:v','libx264','-tune','stillimage','-pix_fmt','yuv420p','-c:a','aac','-b:a','128k','-vf',vf,out_mp4]
+        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=420)
         post['video'] = '/api/ai/short-file/' + post_id + suffix
         post['short_voice'] = voice; post['short_emotion'] = emotion; post['short_speed'] = speed; post['short_subtitles'] = True
         base._save_ai_wall(posts)
     return FileResponse(path, media_type='video/mp4', filename=f'vnews-ai-{file_id}.mp4')
 app.router.routes = [r for r in app.router.routes if not (getattr(r,'path',None)=='/' and 'GET' in getattr(r,'methods',set()))]
 @app.get('/')
     inject = prev.PATCH_INJECT + r'''
 <script>
 (function(){
+window.createTopicPost=function(){let inp=document.getElementById('ai-topic-input');let topic=(inp&&inp.value||'').trim();if(!topic)return alert('Nhập chủ đề trước');fetch('/api/topic_post',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({topic})}).then(r=>r.json().then(j=>({ok:r.ok,j}))).then(({ok,j})=>{if(ok&&j.post){window.location.reload();alert('Đã tổng hợp NỘI DUNG các bài nguồn thành 1 bản tóm tắt trên Tường AI');}else alert(j.error||'Lỗi tạo bài')}).catch(e=>alert(e.message||'Lỗi tạo bài'));};
 })();
 </script>
 '''