Final AI topic aggregate and short timing fixes
Browse files- ai_fix2.py +200 -0
ai_fix2.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, re, subprocess, html as html_lib
|
| 2 |
+
import requests
|
| 3 |
+
import ai_patch as prev
|
| 4 |
+
from ai_patch import app
|
| 5 |
+
from fastapi import Request
|
| 6 |
+
from fastapi.responses import JSONResponse, HTMLResponse, FileResponse
|
| 7 |
+
|
| 8 |
+
# Alias for the `base` attribute of ai_patch (imported above as `prev`);
# the helpers and route handlers in this file reach shared functionality through it.
base = prev.base
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def clean(s):
    """Return *s* as single-line plain text.

    HTML entities are decoded, every run of whitespace is collapsed to a
    single space and leading/trailing whitespace is removed.

    Falsy input (``None``, ``""``, ``0``) yields ``""``. Any other non-string
    value is converted with ``str()`` first, so a numeric value coming from
    JSON does not raise ``TypeError`` inside ``html.unescape``.
    """
    text = s if isinstance(s, str) else str(s or "")
    return re.sub(r"\s+", " ", html_lib.unescape(text)).strip()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def sentence_split(text):
    """Split *text* into a list of cleaned sentences.

    Leading bullet markers (``•``, ``-``, ``*``) are removed from each line,
    line breaks are turned into sentence boundaries, and the result is split
    after ``.``, ``!`` or ``?`` followed by whitespace. Every piece is passed
    through ``clean``; pieces shorter than 8 characters are discarded.

    Returns a list of strings (possibly empty).
    """
    text = re.sub(r"^[•\-\*]\s*", "", text or "", flags=re.M).strip()
    # A line that already ends with a terminator only needs a space; adding
    # ". " there would produce a doubled terminator such as "câu..".
    text = re.sub(r"([.!?])\s*\n\s*", r"\1 ", text)
    # Any remaining line break separates unterminated lines: make it a boundary.
    text = re.sub(r"\s*\n\s*", ". ", text)
    parts = []
    for s in re.split(r"(?<=[\.\!\?])\s+", text):
        s = clean(s)
        if len(s) >= 8:
            parts.append(s)
    return parts
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def srt_time(sec):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond in integer arithmetic, so
    float noise cannot truncate e.g. 2.3 s to ``,299``. Negative input is
    clamped to zero because a minus sign inside a field is not a valid
    timestamp.
    """
    total_ms = max(0, round(float(sec) * 1000))
    total_s, ms = divmod(total_ms, 1000)
    total_min, s = divmod(total_s, 60)
    hours, m = divmod(total_min, 60)
    return f"{hours:02d}:{m:02d}:{s:02d},{ms:03d}"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def write_weighted_srt(script, path, total_duration):
    """Write an SRT subtitle file for *script*, timed against the audio length.

    The script is split with ``sentence_split``; each sentence becomes one
    cue whose length is its share of the characters, clamped to 1.4–6.0 s.
    Cues start 0.35 s into the audio and are separated by 0.08 s gaps.
    Writing stops once the running clock is within 0.2 s of the end.

    Args:
        script: text to subtitle.
        path: destination ``.srt`` file (overwritten, UTF-8).
        total_duration: audio length in seconds (anything ``float()`` accepts).
    """
    total_duration = float(total_duration)
    subs = sentence_split(script)
    if not subs:
        # Nothing survived the split: fall back to a single cue.
        subs = [clean(script)[:140] or "VNEWS"]
    total_chars = max(1, sum(len(x) for x in subs))
    usable = max(2.0, total_duration - 0.8)
    latest_end = total_duration - 0.1
    cur = 0.35
    # NOTE(review): because each cue is clamped to 1.4–6.0 s independently,
    # the cue lengths no longer add up to `usable`; cues can drift relative to
    # the audio and trailing sentences are dropped when the clock reaches the
    # end. Confirm this trade-off is intended.
    with open(path, "w", encoding="utf-8") as f:
        for i, s in enumerate(subs, 1):
            dur = max(1.4, min(6.0, usable * len(s) / total_chars))
            start = cur
            end = min(latest_end, cur + dur)
            if end <= start:
                # Audio shorter than the initial offset: keep the cue valid
                # (end after start) instead of emitting a reversed interval.
                end = start + dur
            cur = end + 0.08
            f.write(f"{i}\n{srt_time(start)} --> {srt_time(end)}\n{s}\n\n")
            if cur >= total_duration - 0.2:
                break
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def tts_script_full(post, emotion):
    """Build the text-to-speech script for *post*, one sentence per line.

    The script is ``<emotion prefix> <title> <text>`` where the text has the
    trailing "Nguồn tham khảo: ..." section removed. Scripts longer than
    2600 characters are cut at the last sentence terminator inside that
    window (or hard-cut if that terminator is within the first 1000 chars).

    Args:
        post: mapping with optional ``"title"`` and ``"text"`` entries.
        emotion: key selecting the spoken prefix; unknown keys add none.

    Returns:
        The script as a string with a newline after each sentence.
    """
    title = clean(post.get("title", ""))
    text = clean(post.get("text", ""))
    text = re.sub(r"Nguồn tham khảo:.*", "", text, flags=re.S).strip()
    prefix = {
        "urgent": "Tin nhanh.",
        "warm": "Câu chuyện đáng chú ý.",
        "serious": "Bản tin nghiêm túc.",
        "energetic": "Cập nhật nổi bật.",
    }.get(emotion, "")
    # Terminate the title only when it is not already terminated, and skip
    # empty parts, so no stray "." or "?." ends up in the script.
    if title and title[-1] not in ".!?":
        title += "."
    script = " ".join(part for part in (prefix, title, text) if part)
    # Keep full summary, but prevent extremely long generation from hanging forever.
    if len(script) > 2600:
        head = script[:2600]
        cut = max(head.rfind("."), head.rfind("!"), head.rfind("?"))
        script = head[:cut + 1] if cut > 1000 else head
    # Break only where the terminator is followed by whitespace; a "." inside
    # "1.000.000" or "3.5" is not a sentence end and must stay intact.
    script = re.sub(r"([\.\!\?])\s+", r"\1\n", script)
    script = re.sub(r"\n{2,}", "\n", script).strip()
    return script
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# Remove previous duplicated routes and add corrected versions last.
_PATCH = {
    ('/api/topic_post', 'POST'),
    ('/api/ai/short/{post_id}', 'POST'),
    ('/api/ai/short-file/{file_id}', 'GET'),
    ('/', 'GET'),
}


def _is_patched_route(route):
    """Return True when *route* serves one of the (path, method) pairs in ``_PATCH``."""
    # A route object may lack `methods` or carry `methods=None`; treat both as
    # "no methods" instead of failing on `m in None`.
    methods = getattr(route, 'methods', None) or ()
    path = getattr(route, 'path', None)
    return any(path == p and m in methods for p, m in _PATCH)


app.router.routes = [r for r in app.router.routes if not _is_patched_route(r)]
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@app.post('/api/topic_post')
async def topic_post_aggregate(request: Request):
    """Create one aggregated summary post for a topic from several source articles.

    Expects a JSON object with a ``topic`` field. Source articles are fetched
    with ``prev._topic_source_articles(topic, limit=5)``; articles whose
    ``raw`` text is shorter than 80 characters are skipped. The remaining
    texts are placed in a single prompt for ``base.qwen_generate``, the answer
    is post-processed, a source line is appended if the model omitted it, and
    the new post is inserted at the front of the AI wall.

    Responses:
        400 ``{'error': 'missing topic'}`` when the body is not a JSON object
            or the topic is empty.
        422 when no articles were found or none had enough content.
        200 ``{'post': ..., 'count_sources': ...}`` on success.
    """
    # Same tolerance as the shorts endpoint: an unreadable body counts as
    # empty and falls through to the "missing topic" response.
    try:
        body = await request.json()
    except Exception:
        body = {}
    if not isinstance(body, dict):
        body = {}
    topic = base._clean_text(body.get('topic',''))
    if not topic:
        return JSONResponse({'error':'missing topic'}, status_code=400)
    articles = prev._topic_source_articles(topic, limit=5)
    if not articles:
        return JSONResponse({'error':'Không lấy được bài viết nguồn cho chủ đề này.'}, status_code=422)
    source_blocks = []
    sources = []
    image = ""
    for i, art in enumerate(articles, 1):
        # `raw` may be present but None; treat that like an empty article.
        raw = art.get('raw') or ''
        if len(raw) < 80:
            continue
        source_blocks.append(f"[Nguồn {i}] {art.get('title','')} ({art.get('via','')})\n{raw[:2600]}")
        sources.append(art.get('source') or {'title': art.get('title'), 'url': art.get('url'), 'via': art.get('via'), 'excerpt': raw[:600]})
        # The first article that has an image provides the post image.
        if not image and art.get('image'):
            image = art.get('image')
    if not source_blocks:
        return JSONResponse({'error':'Các nguồn tìm được không có đủ nội dung để tóm tắt.'}, status_code=422)
    ctx = "\n\n".join(source_blocks)
    prompt = f"""Bạn là biên tập viên tổng hợp tin tức tiếng Việt.

Chủ đề: {topic}

NHIỆM VỤ:
- Đọc nội dung của TẤT CẢ các bài nguồn bên dưới.
- Tổng hợp thành 1 bản tóm tắt chung duy nhất, giống cách tóm tắt qua URL.
- Không tạo mỗi tiêu đề thành một bài riêng.
- Không chỉ liệt kê tiêu đề; phải dựa vào nội dung trong từng bài.
- Không lặp ý giữa các nguồn.
- Tối đa 6 gạch đầu dòng, mỗi dòng 1 câu rõ ràng.
- Nếu các nguồn có góc nhìn khác nhau, gộp lại thành ý tổng hợp.
- Cuối cùng thêm dòng: Nguồn tham khảo: tên website.

Nội dung nguồn:
{ctx[:15000]}"""
    text = await base.qwen_generate(prompt, image_url=image or None, max_tokens=1000)
    text = prev._postprocess_ai_text(text, max_units=7)
    # Guarantee a source line even when the model left it out.
    if 'Nguồn tham khảo:' not in text:
        text += '\n\n' + prev._source_line(sources)
    post = base.make_post('Tổng hợp: ' + topic, text, image or base.pollinations_image_url(topic), '', 'topic_aggregate', sources=sources[:5])
    posts = base._load_ai_wall()
    posts.insert(0, post)
    base._save_ai_wall(posts)
    return JSONResponse({'post': post, 'count_sources': len(sources)})
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
@app.post('/api/ai/short/{post_id}')
async def ai_short_full(post_id: str, request: Request):
    """Render (or reuse) a vertical short video with voice-over and subtitles.

    Optional JSON body fields: ``voice`` (default ``'nu'``), ``emotion``
    (default ``'neutral'``) and ``speed`` (default 1.2, clamped to
    0.85–1.35). The output file name is derived from the post id and these
    options; if that file already exists it is reused.

    Otherwise the handler downloads the post image, builds the frame,
    synthesises speech (edge_tts, falling back to gTTS), changes the tempo
    with ffmpeg, measures the audio with ffprobe, writes the SRT file and
    encodes the final mp4.

    Responses:
        404 ``{'error': 'post not found'}`` when the post id is unknown.
        500 with a truncated error message when any rendering step fails.
        200 with the video URL and the options used on success.
    """
    try:
        body = await request.json()
    except Exception:
        body = {}
    if not isinstance(body, dict):
        body = {}
    # voice/emotion become part of a file name and of the returned URL, so
    # only characters that are safe in both are kept.
    voice = re.sub(r'[^a-z0-9_-]', '', str(body.get('voice','nu')).lower().strip())[:32] or 'nu'
    emotion = re.sub(r'[^a-z0-9_-]', '', str(body.get('emotion','neutral')).lower().strip())[:32] or 'neutral'
    # A non-numeric speed falls back to the default instead of raising.
    try:
        speed = float(body.get('speed', 1.2) or 1.2)
    except (TypeError, ValueError):
        speed = 1.2
    speed = max(0.85, min(1.35, speed))
    posts = base._load_ai_wall()
    post = next((p for p in posts if str(p.get('id')) == str(post_id)), None)
    if not post:
        return JSONResponse({'error':'post not found'}, status_code=404)
    os.makedirs(base.SHORTS_DIR, exist_ok=True)
    suffix = f"_{voice}_{emotion}_{str(speed).replace('.', 'p')}_full"
    out_mp4 = os.path.join(base.SHORTS_DIR, base._safe_name(post_id + suffix) + '.mp4')
    if os.path.exists(out_mp4):
        # Already rendered with these options: just point the post at it.
        post['video'] = '/api/ai/short-file/' + post_id + suffix
        base._save_ai_wall(posts)
        return JSONResponse({'video': post['video'], 'speed': speed, 'subtitles': True})
    work = os.path.join(base.SHORTS_DIR, base._safe_name(post_id + suffix))
    os.makedirs(work, exist_ok=True)
    img = os.path.join(work,'image.jpg')
    frame = os.path.join(work,'frame.jpg')
    audio = os.path.join(work,'voice.mp3')
    audio_fast = os.path.join(work,'voice_fast.mp3')
    srt = os.path.join(work,'subtitles.srt')
    # Encode into the work directory and move into place only on success, so
    # a failed or timed-out ffmpeg run never leaves a partial file that the
    # "already exists" check above would later serve as a finished video.
    tmp_mp4 = os.path.join(work,'short.mp4')
    # NOTE(review): the subprocess calls below are synchronous inside an
    # async handler and will hold the event loop while they run — consider
    # moving the rendering to a worker thread.
    try:
        base._download_image(post.get('img'), post.get('title','AI news'), img)
        prev._make_short_frame_full(post, img, frame)
        script = tts_script_full(post, emotion)
        edge_voice = {'nam':'vi-VN-NamMinhNeural','male':'vi-VN-NamMinhNeural','nu':'vi-VN-HoaiMyNeural','female':'vi-VN-HoaiMyNeural','mien-nam':'vi-VN-HoaiMyNeural'}.get(voice,'vi-VN-HoaiMyNeural')
        try:
            subprocess.run(['python','-m','edge_tts','--voice',edge_voice,'--text',script,'--write-media',audio], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=240)
        except Exception:
            # Best-effort fallback to gTTS when edge_tts fails for any reason.
            tld = 'com.vn' if voice in ('nu','female','mien-nam') else 'com'
            try:
                base.gTTS(script, lang='vi', tld=tld, slow=False).save(audio)
            except TypeError:
                # Retry without `tld` if this gTTS rejects that argument.
                base.gTTS(script, lang='vi', slow=False).save(audio)
        subprocess.run(['ffmpeg','-y','-i',audio,'-filter:a',f'atempo={speed}','-vn',audio_fast], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=180)
        duration = 45.0
        try:
            pr = subprocess.run(['ffprobe','-v','error','-show_entries','format=duration','-of','default=noprint_wrappers=1:nokey=1',audio_fast], stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20)
            duration = float((pr.stdout or b'45').decode().strip() or 45)
        except Exception:
            # Probing is optional; keep the 45 s default when it fails.
            pass
        if not (0 < duration < float('inf')):
            # Zero, negative, NaN or infinite lengths cannot be used for timing.
            duration = 45.0
        write_weighted_srt(script, srt, duration)
        vf = "scale=1080:1920,subtitles='{}':force_style='FontName=DejaVu Sans,FontSize=22,PrimaryColour=&H00FFFFFF,OutlineColour=&HAA000000,BorderStyle=1,Outline=2,Shadow=0,Alignment=2,MarginV=55'".format(srt.replace("'", "\\'"))
        cmd = ['ffmpeg','-y','-loop','1','-i',frame,'-i',audio_fast,'-shortest','-c:v','libx264','-tune','stillimage','-pix_fmt','yuv420p','-c:a','aac','-b:a','128k','-vf',vf,tmp_mp4]
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=360)
        os.replace(tmp_mp4, out_mp4)
        post['video'] = '/api/ai/short-file/' + post_id + suffix
        post['short_voice'] = voice
        post['short_emotion'] = emotion
        post['short_speed'] = speed
        post['short_subtitles'] = True
        base._save_ai_wall(posts)
        return JSONResponse({'video': post['video'], 'voice': voice, 'emotion': emotion, 'speed': speed, 'subtitles': True, 'duration': duration})
    except Exception as e:
        return JSONResponse({'error':'Không tạo được shorts: '+str(e)[:180]}, status_code=500)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
@app.get('/api/ai/short-file/{file_id}')
def ai_short_file_full(file_id: str):
    """Serve the rendered short video identified by *file_id*.

    The file is looked up as ``<SHORTS_DIR>/<safe name of file_id>.mp4``.
    Returns the mp4 as a download named ``vnews-ai-<file_id>.mp4``, or a 404
    JSON error when no such file exists.
    """
    video_name = base._safe_name(file_id) + '.mp4'
    video_path = os.path.join(base.SHORTS_DIR, video_name)
    if os.path.exists(video_path):
        return FileResponse(video_path, media_type='video/mp4', filename=f'vnews-ai-{file_id}.mp4')
    return JSONResponse({'error':'not found'}, status_code=404)
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
# Use previous front-end injection but override topic alert wording and progress text.
# Any handler already registered for GET '/' is removed first. A route whose
# `methods` is missing or None is treated as having no methods, so it is kept
# rather than raising TypeError on the membership test.
app.router.routes = [
    r for r in app.router.routes
    if not (getattr(r, 'path', None) == '/' and 'GET' in (getattr(r, 'methods', None) or ()))
]
|
| 187 |
+
|
| 188 |
+
@app.get('/')
async def index_fix2():
    """Serve the static index page with the front-end patch scripts injected.

    Reads ``/app/static/index.html`` and inserts ``prev.PATCH_INJECT`` plus a
    script that redefines ``window.createTopicPost`` (posting to
    ``/api/topic_post``) just before the last ``</body>`` tag. If the page has
    no ``</body>`` tag the scripts are appended at the end, so the injection
    is never silently dropped.
    """
    with open('/app/static/index.html','r',encoding='utf-8') as f:
        html = f.read()
    inject = prev.PATCH_INJECT + r'''
<script>
(function(){
const oldCreateTopic=window.createTopicPost;
window.createTopicPost=function(){let inp=document.getElementById('ai-topic-input');let topic=(inp&&inp.value||'').trim();if(!topic)return alert('Nhập chủ đề trước');fetch('/api/topic_post',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({topic})}).then(r=>r.json().then(j=>({ok:r.ok,j}))).then(({ok,j})=>{if(ok&&j.post){window.location.reload();alert('Đã tổng hợp nội dung nhiều bài nguồn thành 1 bản tóm tắt trên Tường AI');}else alert(j.error||'Lỗi tạo bài')}).catch(e=>alert(e.message||'Lỗi tạo bài'));};
})();
</script>
'''
    # Inject once, before the final closing tag; fall back to appending.
    head, closing, tail = html.rpartition('</body>')
    if not closing:
        return HTMLResponse(html + inject)
    return HTMLResponse(head + inject + '\n</body>' + tail)
|