Fix topic scraping real article bodies and subtitle timing/font
Browse files- ai_fix2.py +196 -30
ai_fix2.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
-
import os, re, subprocess, html as html_lib
|
|
|
|
| 2 |
import requests
|
| 3 |
import ai_patch as prev
|
| 4 |
from ai_patch import app
|
|
@@ -12,6 +13,127 @@ def clean(s):
|
|
| 12 |
return re.sub(r"\s+", " ", html_lib.unescape(s or "")).strip()
|
| 13 |
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
def sentence_split(text):
|
| 16 |
text = re.sub(r"^[•\-\*]\s*", "", text or "", flags=re.M)
|
| 17 |
text = re.sub(r"\n+", ". ", text)
|
|
@@ -29,19 +151,62 @@ def srt_time(sec):
|
|
| 29 |
return f"{sec//3600:02d}:{(sec%3600)//60:02d}:{sec%60:02d},{ms:03d}"
|
| 30 |
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
def write_weighted_srt(script, path, total_duration):
|
| 33 |
subs = sentence_split(script)
|
| 34 |
if not subs:
|
| 35 |
subs = [clean(script)[:140] or "VNEWS"]
|
| 36 |
total_chars = max(1, sum(len(x) for x in subs))
|
| 37 |
-
usable = max(2.0, float(total_duration) -
|
| 38 |
-
cur = 0.
|
| 39 |
with open(path, "w", encoding="utf-8") as f:
|
| 40 |
for i, s in enumerate(subs, 1):
|
| 41 |
-
dur = max(1.
|
| 42 |
start = cur
|
| 43 |
-
end = min(total_duration - 0.
|
| 44 |
-
cur = end + 0.
|
| 45 |
f.write(f"{i}\n{srt_time(start)} --> {srt_time(end)}\n{s}\n\n")
|
| 46 |
if cur >= total_duration - 0.2:
|
| 47 |
break
|
|
@@ -58,16 +223,16 @@ def tts_script_full(post, emotion):
|
|
| 58 |
"energetic": "Cập nhật nổi bật.",
|
| 59 |
}.get(emotion, "")
|
| 60 |
script = f"{prefix} {title}. {text}".strip()
|
| 61 |
-
# Keep
|
| 62 |
-
if len(script) >
|
| 63 |
-
|
| 64 |
-
|
|
|
|
| 65 |
script = re.sub(r"([\.\!\?])\s*", r"\1\n", script)
|
| 66 |
script = re.sub(r"\n{2,}", "\n", script).strip()
|
| 67 |
return script
|
| 68 |
|
| 69 |
|
| 70 |
-
# Remove previous duplicated routes and add corrected versions last.
|
| 71 |
_PATCH = {('/api/topic_post','POST'),('/api/ai/short/{post_id}','POST'),('/api/ai/short-file/{file_id}','GET'),('/','GET')}
|
| 72 |
app.router.routes = [r for r in app.router.routes if not any(getattr(r,'path',None)==p and m in getattr(r,'methods',set()) for p,m in _PATCH)]
|
| 73 |
|
|
@@ -78,22 +243,18 @@ async def topic_post_aggregate(request: Request):
|
|
| 78 |
topic = base._clean_text(body.get('topic',''))
|
| 79 |
if not topic:
|
| 80 |
return JSONResponse({'error':'missing topic'}, status_code=400)
|
| 81 |
-
articles =
|
| 82 |
if not articles:
|
| 83 |
-
return JSONResponse({'error':'Không
|
| 84 |
source_blocks = []
|
| 85 |
sources = []
|
| 86 |
image = ""
|
| 87 |
for i, art in enumerate(articles, 1):
|
| 88 |
raw = art.get('raw','')
|
| 89 |
-
|
| 90 |
-
continue
|
| 91 |
-
source_blocks.append(f"[Nguồn {i}] {art.get('title','')} ({art.get('via','')})\n{raw[:2600]}")
|
| 92 |
sources.append(art.get('source') or {'title': art.get('title'), 'url': art.get('url'), 'via': art.get('via'), 'excerpt': raw[:600]})
|
| 93 |
if not image and art.get('image'):
|
| 94 |
image = art.get('image')
|
| 95 |
-
if not source_blocks:
|
| 96 |
-
return JSONResponse({'error':'Các nguồn tìm được không có đủ nội dung để tóm tắt.'}, status_code=422)
|
| 97 |
ctx = "\n\n".join(source_blocks)
|
| 98 |
prompt = f"""Bạn là biên tập viên tổng hợp tin tức tiếng Việt.
|
| 99 |
|
|
@@ -110,8 +271,8 @@ NHIỆM VỤ:
|
|
| 110 |
- Cuối cùng thêm dòng: Nguồn tham khảo: tên website.
|
| 111 |
|
| 112 |
Nội dung nguồn:
|
| 113 |
-
{ctx[:
|
| 114 |
-
text = await base.qwen_generate(prompt, image_url=image or None, max_tokens=
|
| 115 |
text = prev._postprocess_ai_text(text, max_units=7)
|
| 116 |
if 'Nguồn tham khảo:' not in text:
|
| 117 |
text += '\n\n' + prev._source_line(sources)
|
|
@@ -134,38 +295,45 @@ async def ai_short_full(post_id: str, request: Request):
|
|
| 134 |
if not post:
|
| 135 |
return JSONResponse({'error':'post not found'}, status_code=404)
|
| 136 |
os.makedirs(base.SHORTS_DIR, exist_ok=True)
|
| 137 |
-
suffix = f"_{voice}_{emotion}_{str(speed).replace('.', 'p')}
|
| 138 |
out_mp4 = os.path.join(base.SHORTS_DIR, base._safe_name(post_id + suffix) + '.mp4')
|
| 139 |
if os.path.exists(out_mp4):
|
| 140 |
post['video'] = '/api/ai/short-file/' + post_id + suffix
|
| 141 |
base._save_ai_wall(posts)
|
| 142 |
return JSONResponse({'video': post['video'], 'speed': speed, 'subtitles': True})
|
| 143 |
work = os.path.join(base.SHORTS_DIR, base._safe_name(post_id + suffix)); os.makedirs(work, exist_ok=True)
|
| 144 |
-
img = os.path.join(work,'image.jpg'); frame = os.path.join(work,'frame.jpg'); audio = os.path.join(work,'voice.mp3'); audio_fast=os.path.join(work,'voice_fast.mp3'); srt=os.path.join(work,'subtitles.srt')
|
| 145 |
try:
|
| 146 |
base._download_image(post.get('img'), post.get('title','AI news'), img)
|
| 147 |
prev._make_short_frame_full(post, img, frame)
|
| 148 |
script = tts_script_full(post, emotion)
|
| 149 |
edge_voice = {'nam':'vi-VN-NamMinhNeural','male':'vi-VN-NamMinhNeural','nu':'vi-VN-HoaiMyNeural','female':'vi-VN-HoaiMyNeural','mien-nam':'vi-VN-HoaiMyNeural'}.get(voice,'vi-VN-HoaiMyNeural')
|
|
|
|
| 150 |
try:
|
| 151 |
-
subprocess.run(['python','-m','edge_tts','--voice',edge_voice,'--text',script,'--write-media',audio], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=
|
|
|
|
| 152 |
except Exception:
|
| 153 |
tld = 'com.vn' if voice in ('nu','female','mien-nam') else 'com'
|
| 154 |
try:
|
| 155 |
base.gTTS(script, lang='vi', tld=tld, slow=False).save(audio)
|
| 156 |
except TypeError:
|
| 157 |
base.gTTS(script, lang='vi', slow=False).save(audio)
|
| 158 |
-
subprocess.run(['ffmpeg','-y','-i',audio,'-filter:a',f'atempo={speed}','-vn',audio_fast], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=
|
| 159 |
duration = 45.0
|
| 160 |
try:
|
| 161 |
pr = subprocess.run(['ffprobe','-v','error','-show_entries','format=duration','-of','default=noprint_wrappers=1:nokey=1',audio_fast], stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20)
|
| 162 |
duration = float((pr.stdout or b'45').decode().strip() or 45)
|
| 163 |
except Exception:
|
| 164 |
pass
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
cmd = ['ffmpeg','-y','-loop','1','-i',frame,'-i',audio_fast,'-shortest','-c:v','libx264','-tune','stillimage','-pix_fmt','yuv420p','-c:a','aac','-b:a','128k','-vf',vf,out_mp4]
|
| 168 |
-
subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=
|
| 169 |
post['video'] = '/api/ai/short-file/' + post_id + suffix
|
| 170 |
post['short_voice'] = voice; post['short_emotion'] = emotion; post['short_speed'] = speed; post['short_subtitles'] = True
|
| 171 |
base._save_ai_wall(posts)
|
|
@@ -182,7 +350,6 @@ def ai_short_file_full(file_id: str):
|
|
| 182 |
return FileResponse(path, media_type='video/mp4', filename=f'vnews-ai-{file_id}.mp4')
|
| 183 |
|
| 184 |
|
| 185 |
-
# Use previous front-end injection but override topic alert wording and progress text.
|
| 186 |
app.router.routes = [r for r in app.router.routes if not (getattr(r,'path',None)=='/' and 'GET' in getattr(r,'methods',set()))]
|
| 187 |
|
| 188 |
@app.get('/')
|
|
@@ -192,8 +359,7 @@ async def index_fix2():
|
|
| 192 |
inject = prev.PATCH_INJECT + r'''
|
| 193 |
<script>
|
| 194 |
(function(){
|
| 195 |
-
|
| 196 |
-
window.createTopicPost=function(){let inp=document.getElementById('ai-topic-input');let topic=(inp&&inp.value||'').trim();if(!topic)return alert('Nhập chủ đề trước');fetch('/api/topic_post',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({topic})}).then(r=>r.json().then(j=>({ok:r.ok,j}))).then(({ok,j})=>{if(ok&&j.post){window.location.reload();alert('Đã tổng hợp nội dung nhiều bài nguồn thành 1 bản tóm tắt trên Tường AI');}else alert(j.error||'Lỗi tạo bài')}).catch(e=>alert(e.message||'Lỗi tạo bài'));};
|
| 197 |
})();
|
| 198 |
</script>
|
| 199 |
'''
|
|
|
|
| 1 |
+
import os, re, subprocess, html as html_lib, json
|
| 2 |
+
from urllib.parse import quote_plus, urlparse, parse_qs, unquote
|
| 3 |
import requests
|
| 4 |
import ai_patch as prev
|
| 5 |
from ai_patch import app
|
|
|
|
| 13 |
return re.sub(r"\s+", " ", html_lib.unescape(s or "")).strip()
|
| 14 |
|
| 15 |
|
| 16 |
+
def _is_real_article_text(raw):
    """Heuristically decide whether ``raw`` looks like a real article body.

    The text is first normalised with ``clean``. It is accepted only when the
    normalised text has at least 500 characters and contains at least five
    sentences longer than 45 characters, which rejects search-result and
    title-only pages.
    """
    body = clean(raw)
    if len(body) < 500:
        return False
    # Split after sentence-ending punctuation that is followed by whitespace.
    pieces = re.split(r"(?<=[\.\!\?])\s+", body)
    substantial = sum(1 for piece in pieces if len(piece) > 45)
    return substantial >= 5
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _extract_ddg_url(href):
|
| 27 |
+
if not href:
|
| 28 |
+
return ""
|
| 29 |
+
if href.startswith("//"):
|
| 30 |
+
href = "https:" + href
|
| 31 |
+
if "duckduckgo.com/l/" in href:
|
| 32 |
+
try:
|
| 33 |
+
qs = parse_qs(urlparse(href).query)
|
| 34 |
+
if qs.get("uddg"):
|
| 35 |
+
return unquote(qs["uddg"][0])
|
| 36 |
+
except Exception:
|
| 37 |
+
pass
|
| 38 |
+
return href
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# Sites whose result pages are not scrapable articles (search, video, social).
_DDG_BLOCKED_HOSTS = ("google.com", "youtube.com", "facebook.com", "x.com", "twitter.com")


def _is_blocked_host(url):
    """Return True when the hostname of ``url`` belongs to a blocked site."""
    try:
        host = (urlparse(url).hostname or "").lower()
    except ValueError:
        # An unparseable URL cannot be scraped anyway.
        return True
    # Compare on label boundaries so "x.com" blocks "x.com" / "m.x.com" but
    # not "vox.com" or "netflix.com", and a blocked name that merely appears
    # in the path or query string does not disqualify the URL.
    padded = "." + host + "."
    return any("." + bad + "." in padded for bad in _DDG_BLOCKED_HOSTS)


def _ddg_article_urls(topic, limit=12):
    """Search DuckDuckGo's HTML endpoint and return candidate article URLs.

    Args:
        topic: Search topic; extra Vietnamese news keywords are appended.
        limit: Maximum number of URLs to return.

    Returns:
        A de-duplicated list of absolute http(s) URLs in result order, with
        blocked hosts removed. Best effort: any failure (network, parsing,
        missing parser library) ends the search and returns what was
        collected so far, possibly an empty list.
    """
    urls = []
    seen = set()
    try:
        q = quote_plus(topic + " tin tức bài viết phân tích")
        r = requests.get("https://html.duckduckgo.com/html/?q=" + q, headers=base.HEADERS, timeout=18)
        r.encoding = "utf-8"
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(r.text, "lxml")
        for a in soup.select("a.result__a"):
            u = _extract_ddg_url(a.get("href", ""))
            if not u.startswith("http") or _is_blocked_host(u):
                continue
            if u in seen:
                continue
            seen.add(u)
            urls.append(u)
            if len(urls) >= limit:
                break
    except Exception:
        # Deliberate best effort: the caller has other sources to fall back on.
        pass
    return urls
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def _rss_article_urls(topic, limit=10):
    """Query the Google News RSS search feed for ``topic``.

    Returns a list of dicts with ``title``, ``url``, ``via`` and ``excerpt``
    keys, built from at most ``limit`` feed items. Items lacking a title or a
    link are left out. Best effort: on any error the entries gathered so far
    are returned.
    """
    entries = []
    try:
        feed_url = "https://news.google.com/rss/search?q=" + quote_plus(topic) + "&hl=vi&gl=VN&ceid=VN:vi"
        resp = requests.get(feed_url, headers=base.HEADERS, timeout=15)
        resp.encoding = "utf-8"
        from bs4 import BeautifulSoup
        feed = BeautifulSoup(resp.text, "xml")
        for item in feed.find_all("item")[:limit]:
            title_tag = item.find("title")
            link_tag = item.find("link")
            source_tag = item.find("source")
            headline = title_tag.get_text(" ", strip=True) if title_tag else ""
            link = link_tag.get_text(strip=True) if link_tag else ""
            # Use the feed's <source> name when present, else the link's domain.
            if source_tag:
                publisher = source_tag.get_text(" ", strip=True)
            else:
                publisher = base._domain(link)
            if not (headline and link):
                continue
            entries.append({"title": headline, "url": link, "via": publisher, "excerpt": headline})
    except Exception:
        pass
    return entries
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def _topic_source_articles(topic, limit=5):
    """Scrape actual article bodies for ``topic``; title-only sources are rejected.

    Candidate URLs are gathered, in priority order, from DuckDuckGo results,
    ``base.web_context`` sources and the Google News RSS feed, de-duplicated
    by URL. At most the first 24 candidates are scraped with
    ``base.scrape_any_url``; a page is kept only when its combined summary and
    text pass ``_is_real_article_text``.

    Args:
        topic: Topic to search for.
        limit: Maximum number of articles to return.

    Returns:
        A list of dicts with ``title``, ``url``, ``raw``, ``image``, ``via``
        and ``source`` keys. Empty when nothing usable could be scraped.
    """
    candidates = []
    seen = set()

    def _add(entry):
        # Queue a candidate once per URL; only absolute http(s) URLs qualify.
        u = entry.get("url") or ""
        if u.startswith("http") and u not in seen:
            seen.add(u)
            candidates.append(entry)

    # 1) DuckDuckGo actual result URLs are usually more directly scrapable.
    for u in _ddg_article_urls(topic, limit=14):
        _add({"url": u, "title": "", "via": base._domain(u)})

    # 2) Add base web_context sources.
    try:
        _ctx, srcs = base.web_context(topic, limit=8)
        for s in srcs or []:
            _add(s)
    except Exception:
        pass

    # 3) Google News RSS fallback last.
    for s in _rss_article_urls(topic, limit=10):
        _add(s)

    out = []
    for s in candidates[:24]:
        url = s.get("url") or ""
        try:
            page = base.scrape_any_url(url) or {}
            # A key that is present but None must not raise here: the
            # resulting TypeError would be swallowed below and a perfectly
            # good article silently discarded.
            summary = page.get("summary") or ""
            text = page.get("text") or ""
            raw = (summary + "\n" + text).strip()
            if not _is_real_article_text(raw):
                continue
            title = page.get("title") or s.get("title") or url
            via = page.get("via") or s.get("via") or base._domain(url)
            out.append({
                "title": title,
                "url": url,
                "raw": raw,
                "image": page.get("image") or "",
                "via": via,
                "source": {"title": title, "url": url, "excerpt": raw[:700], "via": via},
            })
            if len(out) >= limit:
                break
        except Exception:
            # One unreachable or unparseable page must not abort the rest.
            continue
    return out[:limit]
|
| 135 |
+
|
| 136 |
+
|
| 137 |
def sentence_split(text):
|
| 138 |
text = re.sub(r"^[•\-\*]\s*", "", text or "", flags=re.M)
|
| 139 |
text = re.sub(r"\n+", ". ", text)
|
|
|
|
| 151 |
return f"{sec//3600:02d}:{(sec%3600)//60:02d}:{sec%60:02d},{ms:03d}"
|
| 152 |
|
| 153 |
|
| 154 |
+
def parse_timecode(t):
    """Convert a subtitle timecode string to seconds.

    Accepts ``HH:MM:SS.mmm``, ``MM:SS.mmm`` or plain ``SS.mmm``; the decimal
    separator may be ``.`` (WebVTT) or ``,`` (SRT).

    Args:
        t: Timecode string, e.g. ``"00:00:01.234"`` or ``"00:00:01,234"``.

    Returns:
        The time in seconds (a float for one-field input, otherwise the sum
        of integer hour/minute parts and the float seconds part).

    Raises:
        ValueError: If a field is not numeric, or if there are more than
            three ``:``-separated fields. The latter previously returned only
            the first field as seconds, silently yielding a wrong time.
    """
    parts = t.replace(',', '.').split(':')
    if len(parts) > 3:
        raise ValueError(f"invalid timecode: {t!r}")
    if len(parts) == 3:
        hours, minutes, seconds = parts
        return int(hours) * 3600 + int(minutes) * 60 + float(seconds)
    if len(parts) == 2:
        minutes, seconds = parts
        return int(minutes) * 60 + float(seconds)
    return float(parts[0])
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def convert_vtt_to_scaled_srt(vtt_path, srt_path, speed=1.2):
    """Convert a WebVTT file to SRT, compressing cue times by ``speed``.

    Every cue's start and end time is divided by ``speed`` so the subtitles
    line up with audio that is played back ``speed`` times faster.

    Args:
        vtt_path: Path of the WebVTT file to read (UTF-8).
        srt_path: Path of the SRT file to write (UTF-8).
        speed: Playback speed factor; must be positive.

    Returns:
        True when at least one cue was written to ``srt_path``. False when
        ``speed`` is not positive, no cue with text was found, or any error
        occurred, so the caller can fall back to another subtitle source.
    """
    try:
        if speed <= 0:
            # Dividing by a non-positive factor would yield negative or
            # undefined timestamps.
            return False
        # Context manager so the handle is closed even if reading fails.
        with open(vtt_path, 'r', encoding='utf-8') as src:
            lines = iter(src.read().splitlines())
        cues = []
        for line in lines:
            if '-->' not in line:
                continue
            # Keep only the timestamp token on each side; anything after the
            # end time (cue settings) is ignored.
            a, b = [x.strip().split()[0] for x in line.split('-->')[:2]]
            start = parse_timecode(a) / speed
            end = parse_timecode(b) / speed
            # The cue text runs until the next blank line, which this inner
            # loop also consumes.
            texts = []
            for text_line in lines:
                text_line = text_line.strip()
                if not text_line:
                    break
                texts.append(text_line)
            s = clean(' '.join(texts))
            if s:
                cues.append((start, end, s))
        if not cues:
            return False
        with open(srt_path, 'w', encoding='utf-8') as f:
            for idx, (st, en, s) in enumerate(cues, 1):
                if en <= st:
                    # Give degenerate cues a minimal on-screen duration.
                    en = st + 1.2
                f.write(f"{idx}\n{srt_time(st)} --> {srt_time(en)}\n{s}\n\n")
        return True
    except Exception:
        # Best-effort conversion: the boolean result is the whole contract.
        return False
|
| 195 |
+
|
| 196 |
+
|
| 197 |
def write_weighted_srt(script, path, total_duration):
|
| 198 |
subs = sentence_split(script)
|
| 199 |
if not subs:
|
| 200 |
subs = [clean(script)[:140] or "VNEWS"]
|
| 201 |
total_chars = max(1, sum(len(x) for x in subs))
|
| 202 |
+
usable = max(2.0, float(total_duration) - 1.0)
|
| 203 |
+
cur = 0.5
|
| 204 |
with open(path, "w", encoding="utf-8") as f:
|
| 205 |
for i, s in enumerate(subs, 1):
|
| 206 |
+
dur = max(1.8, min(7.0, usable * len(s) / total_chars))
|
| 207 |
start = cur
|
| 208 |
+
end = min(total_duration - 0.15, cur + dur)
|
| 209 |
+
cur = end + 0.18
|
| 210 |
f.write(f"{i}\n{srt_time(start)} --> {srt_time(end)}\n{s}\n\n")
|
| 211 |
if cur >= total_duration - 0.2:
|
| 212 |
break
|
|
|
|
| 223 |
"energetic": "Cập nhật nổi bật.",
|
| 224 |
}.get(emotion, "")
|
| 225 |
script = f"{prefix} {title}. {text}".strip()
|
| 226 |
+
# Keep complete wall summary. Only trim pathological payloads, on sentence boundary.
|
| 227 |
+
if len(script) > 3600:
|
| 228 |
+
tmp = script[:3600]
|
| 229 |
+
cut = max(tmp.rfind("."), tmp.rfind("!"), tmp.rfind("?"))
|
| 230 |
+
script = tmp[:cut + 1] if cut > 1600 else tmp
|
| 231 |
script = re.sub(r"([\.\!\?])\s*", r"\1\n", script)
|
| 232 |
script = re.sub(r"\n{2,}", "\n", script).strip()
|
| 233 |
return script
|
| 234 |
|
| 235 |
|
|
|
|
| 236 |
_PATCH = {('/api/topic_post','POST'),('/api/ai/short/{post_id}','POST'),('/api/ai/short-file/{file_id}','GET'),('/','GET')}
|
| 237 |
app.router.routes = [r for r in app.router.routes if not any(getattr(r,'path',None)==p and m in getattr(r,'methods',set()) for p,m in _PATCH)]
|
| 238 |
|
|
|
|
| 243 |
topic = base._clean_text(body.get('topic',''))
|
| 244 |
if not topic:
|
| 245 |
return JSONResponse({'error':'missing topic'}, status_code=400)
|
| 246 |
+
articles = _topic_source_articles(topic, limit=5)
|
| 247 |
if not articles:
|
| 248 |
+
return JSONResponse({'error':'Không scrape được nội dung bài viết thật cho chủ đề này. Hãy thử chủ đề cụ thể hơn hoặc dán URL trực tiếp.'}, status_code=422)
|
| 249 |
source_blocks = []
|
| 250 |
sources = []
|
| 251 |
image = ""
|
| 252 |
for i, art in enumerate(articles, 1):
|
| 253 |
raw = art.get('raw','')
|
| 254 |
+
source_blocks.append(f"[Nguồn {i}] {art.get('title','')} ({art.get('via','')})\n{raw[:3000]}")
|
|
|
|
|
|
|
| 255 |
sources.append(art.get('source') or {'title': art.get('title'), 'url': art.get('url'), 'via': art.get('via'), 'excerpt': raw[:600]})
|
| 256 |
if not image and art.get('image'):
|
| 257 |
image = art.get('image')
|
|
|
|
|
|
|
| 258 |
ctx = "\n\n".join(source_blocks)
|
| 259 |
prompt = f"""Bạn là biên tập viên tổng hợp tin tức tiếng Việt.
|
| 260 |
|
|
|
|
| 271 |
- Cuối cùng thêm dòng: Nguồn tham khảo: tên website.
|
| 272 |
|
| 273 |
Nội dung nguồn:
|
| 274 |
+
{ctx[:16000]}"""
|
| 275 |
+
text = await prev.base.qwen_generate(prompt, image_url=image or None, max_tokens=1100)
|
| 276 |
text = prev._postprocess_ai_text(text, max_units=7)
|
| 277 |
if 'Nguồn tham khảo:' not in text:
|
| 278 |
text += '\n\n' + prev._source_line(sources)
|
|
|
|
| 295 |
if not post:
|
| 296 |
return JSONResponse({'error':'post not found'}, status_code=404)
|
| 297 |
os.makedirs(base.SHORTS_DIR, exist_ok=True)
|
| 298 |
+
suffix = f"_{voice}_{emotion}_{str(speed).replace('.', 'p')}_fullv2"
|
| 299 |
out_mp4 = os.path.join(base.SHORTS_DIR, base._safe_name(post_id + suffix) + '.mp4')
|
| 300 |
if os.path.exists(out_mp4):
|
| 301 |
post['video'] = '/api/ai/short-file/' + post_id + suffix
|
| 302 |
base._save_ai_wall(posts)
|
| 303 |
return JSONResponse({'video': post['video'], 'speed': speed, 'subtitles': True})
|
| 304 |
work = os.path.join(base.SHORTS_DIR, base._safe_name(post_id + suffix)); os.makedirs(work, exist_ok=True)
|
| 305 |
+
img = os.path.join(work,'image.jpg'); frame = os.path.join(work,'frame.jpg'); audio = os.path.join(work,'voice.mp3'); audio_fast=os.path.join(work,'voice_fast.mp3'); srt=os.path.join(work,'subtitles.srt'); vtt=os.path.join(work,'subtitles.vtt')
|
| 306 |
try:
|
| 307 |
base._download_image(post.get('img'), post.get('title','AI news'), img)
|
| 308 |
prev._make_short_frame_full(post, img, frame)
|
| 309 |
script = tts_script_full(post, emotion)
|
| 310 |
edge_voice = {'nam':'vi-VN-NamMinhNeural','male':'vi-VN-NamMinhNeural','nu':'vi-VN-HoaiMyNeural','female':'vi-VN-HoaiMyNeural','mien-nam':'vi-VN-HoaiMyNeural'}.get(voice,'vi-VN-HoaiMyNeural')
|
| 311 |
+
used_edge = False
|
| 312 |
try:
|
| 313 |
+
subprocess.run(['python','-m','edge_tts','--voice',edge_voice,'--text',script,'--write-media',audio,'--write-subtitles',vtt], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=260)
|
| 314 |
+
used_edge = True
|
| 315 |
except Exception:
|
| 316 |
tld = 'com.vn' if voice in ('nu','female','mien-nam') else 'com'
|
| 317 |
try:
|
| 318 |
base.gTTS(script, lang='vi', tld=tld, slow=False).save(audio)
|
| 319 |
except TypeError:
|
| 320 |
base.gTTS(script, lang='vi', slow=False).save(audio)
|
| 321 |
+
subprocess.run(['ffmpeg','-y','-i',audio,'-filter:a',f'atempo={speed}','-vn',audio_fast], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=220)
|
| 322 |
duration = 45.0
|
| 323 |
try:
|
| 324 |
pr = subprocess.run(['ffprobe','-v','error','-show_entries','format=duration','-of','default=noprint_wrappers=1:nokey=1',audio_fast], stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20)
|
| 325 |
duration = float((pr.stdout or b'45').decode().strip() or 45)
|
| 326 |
except Exception:
|
| 327 |
pass
|
| 328 |
+
if used_edge and os.path.exists(vtt):
|
| 329 |
+
ok = convert_vtt_to_scaled_srt(vtt, srt, speed=speed)
|
| 330 |
+
if not ok:
|
| 331 |
+
write_weighted_srt(script, srt, duration)
|
| 332 |
+
else:
|
| 333 |
+
write_weighted_srt(script, srt, duration)
|
| 334 |
+
vf = "scale=1080:1920,subtitles='{}':force_style='FontName=DejaVu Sans,FontSize=16,PrimaryColour=&H00FFFFFF,OutlineColour=&HAA000000,BorderStyle=1,Outline=1.5,Shadow=0,Alignment=2,MarginV=42'".format(srt.replace("'", "\\'"))
|
| 335 |
cmd = ['ffmpeg','-y','-loop','1','-i',frame,'-i',audio_fast,'-shortest','-c:v','libx264','-tune','stillimage','-pix_fmt','yuv420p','-c:a','aac','-b:a','128k','-vf',vf,out_mp4]
|
| 336 |
+
subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=420)
|
| 337 |
post['video'] = '/api/ai/short-file/' + post_id + suffix
|
| 338 |
post['short_voice'] = voice; post['short_emotion'] = emotion; post['short_speed'] = speed; post['short_subtitles'] = True
|
| 339 |
base._save_ai_wall(posts)
|
|
|
|
| 350 |
return FileResponse(path, media_type='video/mp4', filename=f'vnews-ai-{file_id}.mp4')
|
| 351 |
|
| 352 |
|
|
|
|
| 353 |
app.router.routes = [r for r in app.router.routes if not (getattr(r,'path',None)=='/' and 'GET' in getattr(r,'methods',set()))]
|
| 354 |
|
| 355 |
@app.get('/')
|
|
|
|
| 359 |
inject = prev.PATCH_INJECT + r'''
|
| 360 |
<script>
|
| 361 |
(function(){
|
| 362 |
+
window.createTopicPost=function(){let inp=document.getElementById('ai-topic-input');let topic=(inp&&inp.value||'').trim();if(!topic)return alert('Nhập chủ đề trước');fetch('/api/topic_post',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({topic})}).then(r=>r.json().then(j=>({ok:r.ok,j}))).then(({ok,j})=>{if(ok&&j.post){window.location.reload();alert('Đã tổng hợp NỘI DUNG các bài nguồn thành 1 bản tóm tắt trên Tường AI');}else alert(j.error||'Lỗi tạo bài')}).catch(e=>alert(e.message||'Lỗi tạo bài'));};
|
|
|
|
| 363 |
})();
|
| 364 |
</script>
|
| 365 |
'''
|