# VNEWS / ai_patch.py
# Revision 4b6e868 ("Squash main"), uploaded by bep40; 37.2 kB.
# (File-viewer chrome "Raw / History / Blame / Contribute / Delete" omitted.)
import asyncio
import functools
import hashlib
import html as html_lib
import json
import os
import random
import re
import subprocess
import sys
import time
from urllib.parse import quote_plus
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from fastapi import Request
from fastapi.responses import JSONResponse, HTMLResponse, FileResponse

import ai_ext as base
from ai_ext import app

try:
    from PIL import Image, ImageDraw, ImageFont
except Exception:
    Image = ImageDraw = ImageFont = None
def _clean(s):
s = html_lib.unescape(s or "")
s = re.sub(r"[ \t]+", " ", s)
s = re.sub(r"\n{3,}", "\n\n", s)
return s.strip()
def _norm(s):
s = s.lower()
s = re.sub(r"[^\wÀ-ỹ\s]", " ", s)
s = re.sub(r"\s+", " ", s).strip()
return s
def _similar(a, b):
    """Tell whether two texts share most of their vocabulary.

    Both texts are normalised with ``_norm`` and split into word sets. The
    texts are similar when the shared words make up at least 72% of the
    smaller set. If either set is empty the answer is ``False``.
    """
    words_a = set(_norm(a).split())
    words_b = set(_norm(b).split())
    if not (words_a and words_b):
        return False
    smaller = min(len(words_a), len(words_b))
    shared = len(words_a & words_b)
    return shared / max(1, smaller) >= 0.72
def _dedupe_units(units, max_units=25):
    """Clean ``units`` and drop exact repeats, keeping first-seen order.

    Each unit has its leading list marker removed (``-``, ``•``, ``*`` or a
    short ordinal such as ``1.`` / ``2)``) and is passed through ``_clean``.
    Units shorter than 18 characters after cleaning are skipped. Two units
    are duplicates only when their ``_norm`` forms are identical, so merely
    similar bullet points are all kept.

    Args:
        units: Iterable of text fragments.
        max_units: Maximum number of units to return.

    Returns:
        A list of at most ``max_units`` cleaned, unique units.
    """
    # Strip list markers only. An ordinal must be 1-3 digits followed by
    # "." or ")" and not by another digit, so leading figures that belong to
    # the sentence ("12 người ...", "3.5 triệu ...", "10.000 ...") survive.
    marker_re = r"^\s*(?:(?:[-•*]+|\d{1,3}[.)](?!\d))\s*)+"
    out, seen = [], set()
    for u in units:
        # Checked up front so that max_units <= 0 yields an empty list.
        if len(out) >= max_units:
            break
        u = _clean(re.sub(marker_re, "", u))
        if len(u) < 18:
            continue
        nu = _norm(u)
        if nu in seen:
            continue
        seen.add(nu)
        out.append(u)
    return out
def _postprocess_ai_text(text, max_units=20):
    """Turn raw model output into an optional title plus "• " bullet lines.

    Steps:
      1. Clean the text and split it into non-empty lines.
      2. Drop short (< 80 chars) boilerplate lines that start with one of
         the known lead-in prefixes ("dưới đây", "tiêu đề:", ...).
      3. Keep every remaining line of at least 18 characters as one unit,
         with its leading list marker removed, and de-duplicate the units.
      4. If the first line is at most 90 characters and does not start with
         a list marker, use it as the title and drop units similar to it.

    Args:
        text: Raw text returned by the model.
        max_units: Maximum number of bullet lines to emit.

    Returns:
        ``"<title>\\n\\n<bullets>"`` when a title was detected, otherwise just
        the bullets. If no unit survives, the first 900 characters of the
        cleaned text are returned; empty input is returned as cleaned.
    """
    text = _clean(text)
    if not text:
        return text
    drop_prefixes = (
        "dưới đây", "sau đây", "bài viết", "tôi sẽ", "mình sẽ",
        "tóm tắt bài", "tiêu đề:", "sapo:", "nội dung:", "kết luận:"
    )
    # Strip list markers only. An ordinal must be 1-3 digits followed by
    # "." or ")" and not by another digit, so leading figures that belong to
    # the sentence ("12 người ...", "3.5 triệu ...") are preserved.
    marker_re = r"^\s*(?:(?:[-•*]+|\d{1,3}[.)](?!\d))\s*)+"
    raw_lines = []
    for line in re.split(r"\n+", text):
        line = _clean(line)
        if not line:
            continue
        if len(line) < 80 and line.lower().strip().startswith(drop_prefixes):
            continue
        raw_lines.append(line)
    # Each bullet point is kept whole; it is never truncated or split.
    units = [
        _clean(re.sub(marker_re, "", line))
        for line in raw_lines
        if len(line) >= 18
    ]
    units = _dedupe_units(units, max_units=max_units)
    if not units:
        return text[:900]
    title = ""
    # A first line carrying any list marker (including "1." / "1)") is a
    # bullet point, not a headline.
    if raw_lines and len(raw_lines[0]) <= 90 and not re.match(marker_re, raw_lines[0]):
        title = raw_lines[0]
        units = [u for u in units if not _similar(u, title)]
    body = "\n".join("• " + u for u in units[:max_units])
    return (title + "\n\n" + body).strip() if title else body
def _fallback_summary_from_prompt(prompt, max_units=6):
    """Build an extractive bullet summary from the source text in ``prompt``.

    Used when no model output is available. The text after the first known
    content marker (tried in the listed order) is taken as the source; URLs
    are removed and whitespace is collapsed. Sentences of 45-260 characters
    become bullets.

    Args:
        prompt: The full prompt that was meant for the model.
        max_units: Maximum number of bullets.

    Returns:
        Newline-joined "• " bullets. If no sentence qualifies, a single
        bullet holding at most the first 700 characters of the source; if
        there is no source text at all, a fixed "not enough content" bullet.
    """
    text = prompt or ""
    for marker in ["Nội dung nguồn:", "Nội dung bài:", "Nội dung gốc:", "Nội dung:", "Nguồn/bối cảnh internet:"]:
        if marker in text:
            text = text.split(marker, 1)[1]
            break
    text = re.sub(r"https?://\S+", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    sentences = re.split(r"(?<=[\.\!\?])\s+(?=[A-ZÀ-Ỹ0-9])", text)
    candidates = []
    for s in sentences:
        s = _clean(s)
        if 45 <= len(s) <= 260:
            candidates.append(s)
    units = _dedupe_units(candidates, max_units=max_units)
    if units:
        return "\n".join("• " + u for u in units)
    if text:
        snippet = text
        # Trim back to a word boundary only when the text is actually cut;
        # otherwise the final word of a short text would be lost.
        if len(text) > 700:
            snippet = text[:700].rsplit(" ", 1)[0]
        return "• " + snippet
    return "• Không có đủ nội dung nguồn để tóm tắt."
def _source_line(sources):
names = []
for s in (sources or [])[:5]:
via = s.get("via") or base._domain(s.get("url", "")) or s.get("title", "")
if via and via not in names:
names.append(via)
return "Nguồn tham khảo: " + ", ".join(names[:5]) if names else "Nguồn tham khảo: tổng hợp internet"
def _make_summary_prompt(title, raw, source_hint=""):
return f"""Bạn là biên tập viên tóm tắt tin tức tiếng Việt.
NHIỆM VỤ BẮT BUỘC:
- Chỉ TÓM TẮT nội dung chính, KHÔNG viết lại toàn bộ bài.
- Không lặp lại cùng một ý, cùng một câu, cùng một chi tiết.
- Không thêm thông tin ngoài nguồn.
- Tối đa 5 gạch đầu dòng, mỗi gạch đầu dòng 1 câu ngắn.
- Nếu bài có số liệu/nhân vật/thời điểm quan trọng thì giữ lại.
- Không viết phần mở bài dài, không viết văn kể lại.
Tiêu đề nguồn: {title}
Nguồn: {source_hint}
Nội dung nguồn:
{raw[:14000]}
"""
def _direct_news_rss(topic, limit=10):
    """Search the Google News RSS feed (Vietnamese edition) for ``topic``.

    Returns a list of ``{"title", "url", "via", "excerpt"}`` dicts for up to
    ``limit`` feed items that have both a title and a link; ``excerpt`` is
    the title itself. This is best effort: any failure while fetching or
    parsing ends the scan and whatever was collected so far is returned.
    """
    results = []
    try:
        feed_url = "https://news.google.com/rss/search?q=" + quote_plus(topic) + "&hl=vi&gl=VN&ceid=VN:vi"
        resp = requests.get(feed_url, headers=base.HEADERS, timeout=15)
        resp.encoding = "utf-8"
        feed = BeautifulSoup(resp.text, "xml")
        for item in feed.find_all("item")[:limit]:
            title_tag = item.find("title")
            link_tag = item.find("link")
            source_tag = item.find("source")
            title = title_tag.get_text(" ", strip=True) if title_tag else ""
            link = link_tag.get_text(strip=True) if link_tag else ""
            # Without a <source> element, fall back to the link's domain.
            src = source_tag.get_text(" ", strip=True) if source_tag else base._domain(link)
            if title and link:
                results.append({"title": title, "url": link, "via": src, "excerpt": title})
    except Exception:
        pass
    return results
def _topic_source_articles(topic, limit=5):
    """Collect scraped article bodies for ``topic``.

    Sources come from ``base.web_context``; when that fails or returns
    nothing, from ``_direct_news_rss``. Up to ``limit * 3`` distinct http(s)
    URLs are scraped with ``base.scrape_any_url`` and the first ``limit``
    pages with at least 180 characters of text are kept. If no page could be
    scraped, the source entries themselves (title/excerpt) are used, with a
    generated illustration from ``base.pollinations_image_url``.

    Returns:
        A list of at most ``limit`` dicts with keys ``title``, ``url``,
        ``raw``, ``image``, ``via`` and ``source``.
    """
    try:
        _ctx, sources = base.web_context(topic, limit=limit)
    except Exception:
        sources = []
    if not sources:
        sources = _direct_news_rss(topic, limit=10)

    articles = []
    visited = set()
    for candidate in (sources or [])[:limit * 3]:
        url = candidate.get("url") or ""
        if not url.startswith("http") or url in visited:
            continue
        visited.add(url)
        try:
            page = base.scrape_any_url(url)
            raw = (page.get("summary", "") + "\n" + page.get("text", "")).strip()
            if len(raw) < 180:
                continue
            title = page.get("title") or candidate.get("title") or url
            via = page.get("via") or candidate.get("via") or base._domain(url)
            source = {"title": title, "url": url, "excerpt": raw[:700], "via": via}
            articles.append({
                "title": title,
                "url": url,
                "raw": raw,
                "image": page.get("image") or "",
                "via": via,
                "source": source,
            })
            if len(articles) >= limit:
                break
        except Exception:
            # A page that cannot be scraped is skipped.
            continue

    if not articles:
        # Nothing could be scraped: fall back to the search results.
        for candidate in (sources or _direct_news_rss(topic, 6))[:limit]:
            title = candidate.get("title") or topic
            excerpt = (
                candidate.get("excerpt")
                or candidate.get("description")
                or candidate.get("content")
                or title
            )
            url = candidate.get("url", "")
            via = candidate.get("via") or base._domain(url)
            source = {"title": title, "url": url, "excerpt": excerpt[:700], "via": via}
            articles.append({
                "title": title,
                "url": url,
                "raw": excerpt,
                "image": base.pollinations_image_url(title),
                "via": via,
                "source": source,
            })
    return articles[:limit]
async def qwen_generate_resilient(prompt: str, image_url=None, max_tokens: int = 1200):
    """Generate text with Qwen, falling back step by step so a string is always returned.

    Order of attempts:
      1. ``base._original_qwen_generate`` (if it is set).
      2. The Hugging Face router chat-completions endpoint, trying
         ``$QWEN_VL_MODEL`` and then a fixed list of Qwen 2.5 models. This
         step needs a token from ``base._hf_token()``.
      3. ``_fallback_summary_from_prompt`` (extractive, no model).

    ``base.LAST_QWEN_ERROR`` is cleared on success; when step 3 is reached it
    holds the last six collected error messages.

    Args:
        prompt: User prompt.
        image_url: Optional image; it is only sent to models whose name
            contains "VL".
        max_tokens: Requested completion length. For the HTTP fallback it is
            capped at 1400 (900 when falsy).

    Returns:
        The generated text, or the extractive fallback summary.
    """
    errors = []
    token = base._hf_token()
    try:
        original = getattr(base, "_original_qwen_generate", None)
        if original:
            txt = await original(prompt, image_url=image_url, max_tokens=max_tokens)
            if txt:
                base.LAST_QWEN_ERROR = ""
                return txt
            if getattr(base, "LAST_QWEN_ERROR", ""):
                errors.append("sdk: " + str(base.LAST_QWEN_ERROR)[:260])
    except Exception as e:
        errors.append(f"sdk: {type(e).__name__}: {str(e)[:260]}")
    if token:
        candidates = [
            os.getenv("QWEN_VL_MODEL", ""),
            "Qwen/Qwen2.5-VL-7B-Instruct",
            "Qwen/Qwen2.5-VL-3B-Instruct",
            "Qwen/Qwen2.5-7B-Instruct",
            "Qwen/Qwen2.5-3B-Instruct",
            "Qwen/Qwen2.5-1.5B-Instruct",
        ]
        # Order-preserving de-duplication, skipping an unset env override.
        models = list(dict.fromkeys(m for m in candidates if m))
        headers = {"Authorization": "Bearer " + token, "Content-Type": "application/json"}
        loop = asyncio.get_running_loop()
        for model in models:
            try:
                is_vl = "VL" in model and bool(image_url)
                user_content = ([{"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": prompt}] if is_vl else prompt)
                payload = {
                    "model": model,
                    "messages": [
                        {"role": "system", "content": "Bạn là biên tập viên AI tiếng Việt. Chỉ tóm tắt súc tích nội dung nguồn, không viết lại toàn bài, không lặp ý, không bịa chi tiết."},
                        {"role": "user", "content": user_content},
                    ],
                    "max_tokens": min(int(max_tokens or 900), 1400),
                    "temperature": 0.35,
                    "top_p": 0.85,
                }
                # requests is blocking. Run it in the default executor so a
                # slow model (timeout is 95 s) does not stall the event loop
                # and every other request with it.
                send = functools.partial(
                    requests.post,
                    "https://router.huggingface.co/v1/chat/completions",
                    headers=headers,
                    json=payload,
                    timeout=95,
                )
                r = await loop.run_in_executor(None, send)
                if r.status_code >= 300:
                    errors.append(f"{model}: HTTP {r.status_code} {r.text[:180]}")
                    continue
                j = r.json()
                # Treat a missing or empty "choices" list as an empty
                # response rather than an IndexError.
                choices = j.get("choices") or [{}]
                txt = ((choices[0].get("message") or {}).get("content") or "").strip()
                if txt:
                    base.LAST_QWEN_ERROR = ""
                    return txt
                errors.append(f"{model}: empty response")
            except Exception as e:
                errors.append(f"{model}: {type(e).__name__}: {str(e)[:220]}")
    else:
        errors.append("missing HF_TOKEN")
    base.LAST_QWEN_ERROR = " | ".join(errors[-6:]) or "Qwen unavailable; used extractive fallback"
    print("[qwen resilient fallback]", base.LAST_QWEN_ERROR)
    return _fallback_summary_from_prompt(prompt, max_units=12)
# Install the resilient generator in place of ai_ext's qwen_generate. The
# original is stashed only once, so executing this module again does not
# overwrite the saved original with the wrapper itself.
if not hasattr(base, "_original_qwen_generate"):
    base._original_qwen_generate = base.qwen_generate
base.qwen_generate = qwen_generate_resilient
@app.get('/api/wall')
def compat_wall():
    """Return the first 80 posts of the stored AI wall as ``{'posts': [...]}``."""
    wall = base._load_ai_wall()
    return JSONResponse({'posts': wall[:80]})
# (path, HTTP method) pairs whose already-registered handlers are removed
# here; this module registers its own handlers for them afterwards.
_PATCHED_PATHS = {
    ('/api/topic_post', 'POST'),
    ('/api/url_wall', 'POST'),
    ('/api/rewrite_share', 'POST'),
    ('/api/ai/short/{post_id}', 'POST'),
}
# Keep every route except those matching one of the pairs above. Routes
# without a ``path`` or ``methods`` attribute never match.
app.router.routes = [
    route
    for route in app.router.routes
    if not any(
        getattr(route, 'path', None) == path and method in getattr(route, 'methods', set())
        for path, method in _PATCHED_PATHS
    )
]
@app.post('/api/topic_post')
async def compat_topic_post(request: Request):
    """Create one AI-wall post per source article found for a topic.

    Request body: ``{"topic": "<text>"}``.

    Responses:
        400 if the topic is missing (an unparsable or non-object body is
        treated as missing), 422 if no source article could be obtained,
        otherwise ``{"post": <first new post>, "posts": [...], "count": n}``.
    """
    # Same tolerant body parsing as the shorts endpoint: a malformed body is
    # reported as a missing topic instead of an unhandled server error.
    try:
        body = await request.json()
    except Exception:
        body = {}
    if not isinstance(body, dict):
        body = {}
    topic = base._clean_text(body.get('topic', ''))
    if not topic:
        return JSONResponse({'error': 'missing topic'}, status_code=400)
    articles = _topic_source_articles(topic, limit=4)
    if not articles:
        return JSONResponse({'error': 'Không lấy được bài viết nguồn cho chủ đề này.'}, status_code=422)
    new_posts = []
    for art in articles:
        prompt = f"""Tóm tắt RIÊNG bài viết nguồn sau để đăng Tường AI.
Chủ đề lọc: {topic}
Tiêu đề bài nguồn: {art['title']}
Nguồn: {art['via']}
Yêu cầu bắt buộc:
- Tóm tắt nội dung trong BÀI VIẾT này, không chỉ tiêu đề.
- Không trộn với bài khác.
- Không viết lại toàn bộ bài.
- Không lặp ý.
- 4-6 gạch đầu dòng, mỗi dòng 1 câu rõ ràng.
- Giữ số liệu/nhân vật/thời điểm quan trọng nếu có.
Nội dung bài:
{art['raw'][:14000]}"""
        text = await base.qwen_generate(prompt, image_url=art.get('image') or None, max_tokens=1500)
        text = _postprocess_ai_text(text, max_units=20)
        src = [art['source']]
        if 'Nguồn tham khảo:' not in text:
            text += "\n\n" + _source_line(src)
        post = base.make_post(art['title'], text, art.get('image') or base.pollinations_image_url(art['title']), art.get('url') or '', 'topic_article', sources=src)
        new_posts.append(post)
    # Load the wall only now, after all awaited generation calls. Reading it
    # before the loop would let this handler overwrite posts that other
    # requests saved while it was waiting on the model.
    posts = new_posts + base._load_ai_wall()
    base._save_ai_wall(posts)
    return JSONResponse({'post': new_posts[0], 'posts': new_posts, 'count': len(new_posts)})
@app.post('/api/url_wall')
async def compat_url_wall(request: Request):
    """Summarise the article at a URL and put it at the top of the AI wall.

    Request body: ``{"url": "http..."}``.

    Responses:
        400 if the URL does not start with "http", 422 if the page cannot be
        scraped or has fewer than 120 characters of text, otherwise
        ``{"post": <new post>}``.
    """
    payload = await request.json()
    url = base._clean_text(payload.get('url', ''))
    if not url.startswith('http'):
        return JSONResponse({'error': 'missing url'}, status_code=400)

    try:
        page = base.scrape_any_url(url)
    except Exception as exc:
        return JSONResponse({'error': 'Không scrape được URL: ' + str(exc)[:180]}, status_code=422)

    raw = (page.get('summary', '') + '\n' + page.get('text', '')).strip()
    if len(raw) < 120:
        return JSONResponse({'error': 'URL không có đủ nội dung để tóm tắt'}, status_code=422)

    prompt = _make_summary_prompt(page.get('title', ''), raw, page.get('via', '') or base._domain(url))
    summary = await base.qwen_generate(prompt, image_url=page.get('image') or None, max_tokens=1500)
    summary = _postprocess_ai_text(summary, max_units=20)

    sources = [{
        'title': page.get('title'),
        'url': url,
        'excerpt': raw[:500],
        'via': page.get('via') or base._domain(url),
    }]
    # Add the attribution line unless the text already carries one.
    if 'Nguồn tham khảo:' not in summary:
        summary += "\n\n" + _source_line(sources)

    post = base.make_post(page.get('title') or 'Bài viết', summary, page.get('image') or '', url, 'url', sources=sources)
    wall = base._load_ai_wall()
    wall.insert(0, post)
    base._save_ai_wall(wall)
    return JSONResponse({'post': post})
def _is_relevant_image(img_url, title, text):
"""Check if an image is relevant to the article content."""
if not img_url:
return False
skip_patterns = ['pixel', 'analytics', 'tracking', '1x1.gif', 'spacer.gif',
'logo', 'icon', 'avatar', 'emoji', 'smiley', 'sprite',
'advertisement', 'ad-banner', 'sponsored', 'banner-ads']
img_lower = img_url.lower()
for p in skip_patterns:
if p in img_lower:
return False
if not any(img_lower.endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.webp', '.gif']):
return False
return True
def _filter_relevant_images(images, title, text, max_images=8):
    """Return up to ``max_images`` distinct images accepted by ``_is_relevant_image``.

    The input order is kept and each distinct URL is checked once. No
    re-ranking takes place. Empty or missing input gives ``[]``.
    """
    if not images:
        return []
    # dict.fromkeys de-duplicates while preserving first-seen order.
    distinct = dict.fromkeys(images)
    kept = [img for img in distinct if _is_relevant_image(img, title, text)]
    return kept[:max_images]
def _extract_key_points_for_slides(paragraphs, max_points=12):
    """Split paragraphs into sentences and return up to ``max_points`` of them.

    Every sentence of every paragraph is considered, in order. A sentence is
    skipped when it is shorter than 30 characters or when its first 60
    characters already occur inside a collected point. Kept sentences get a
    trailing "." if they do not already end in ".", "!" or "?".
    """
    points = []
    for paragraph in paragraphs:
        if len(points) >= max_points:
            break
        paragraph = _clean(paragraph)
        if not paragraph:
            continue
        # Break after sentence punctuation that is followed by whitespace
        # and an uppercase letter (including Vietnamese) or a digit.
        for piece in re.split(r'(?<=[.!?])\s+(?=[A-ZÀ-Ỹ0-9])', paragraph):
            if not piece.strip():
                continue
            if len(points) >= max_points:
                break
            sentence = _clean(piece.strip())
            if len(sentence) < 30:
                continue
            head = sentence[:60]
            if any(head in existing for existing in points):
                continue
            if not sentence.endswith(('.', '!', '?')):
                sentence += '.'
            points.append(sentence)
    return points
def _abs_image_url(src, page_url):
    """Resolve an image reference found on ``page_url`` to an absolute URL."""
    if src.startswith('//'):
        return 'https:' + src
    try:
        return urljoin(page_url, src)
    except ValueError:
        # Leave an unparsable reference untouched rather than fail the page.
        return src


def _scrape_article_images(url):
    """Fetch an article page and extract its title, paragraphs and images.

    The article container is the first of a list of known selectors that
    holds at least two ``<p>`` elements; otherwise the whole body is used.
    Paragraphs longer than 40 characters are collected. Image URLs are taken
    from ``og:image`` and from ``<img>``/``<figure>`` elements (``data-src``,
    then ``src``, then ``data-original``), made absolute against the final
    page URL, de-duplicated and filtered with ``_filter_relevant_images``.

    Returns:
        ``{'title', 'paragraphs', 'images', 'og_img'}``, or ``None`` if the
        page could not be fetched or parsed.
    """
    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
                   "Accept-Language": "vi-VN,vi;q=0.9,en;q=0.8"}
        r = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
        r.encoding = 'utf-8'
        # Relative references are resolved against the post-redirect URL.
        page_url = r.url or url
        soup = BeautifulSoup(r.text, 'lxml')
        for tag in soup.find_all(['script', 'style', 'nav', 'footer', 'aside', 'form']):
            tag.decompose()
        h1 = soup.find('h1')
        ogt = soup.find('meta', property='og:title')
        title = (h1.get_text(strip=True) if h1 else '') or (ogt.get('content', '') if ogt else '')
        ogi = soup.find('meta', property='og:image')
        og_img = ogi.get('content', '') if ogi else ''
        if og_img:
            og_img = _abs_image_url(og_img, page_url)
        block = None
        for sel in ['article', '.singular-content', '.detail-content', '.fck_detail', '.content-detail', '.knc-content', 'main', '.cms-body', '.article__body']:
            el = soup.select_one(sel)
            if el and len(el.find_all('p')) >= 2:
                block = el
                break
        if not block:
            block = soup.body or soup
        paragraphs = []
        all_images = []
        seen_imgs = set()
        if og_img:
            all_images.append(og_img)
            seen_imgs.add(og_img)
        for el in block.find_all(['p', 'h2', 'h3', 'figure', 'img'], recursive=True):
            if el.name == 'p':
                t = _clean(el.get_text(strip=True))
                if t and len(t) > 40:
                    paragraphs.append(t)
            elif el.name in ('figure', 'img'):
                im = el if el.name == 'img' else el.find('img')
                if not im:
                    continue
                src = im.get('data-src') or im.get('src') or im.get('data-original') or ''
                # Inline base64 payloads are not usable as slide images.
                if not src or 'base64' in src:
                    continue
                # Site-relative paths ("/img/a.jpg") would otherwise be
                # handed to the client unresolved.
                src = _abs_image_url(src, page_url)
                if src not in seen_imgs:
                    all_images.append(src)
                    seen_imgs.add(src)
        relevant_images = _filter_relevant_images(all_images, title, ' '.join(paragraphs[:5]))
        return {'title': _clean(title), 'paragraphs': paragraphs, 'images': relevant_images, 'og_img': og_img}
    except Exception:
        return None
@app.post('/api/rewrite_share')
async def compat_rewrite_share(request: Request):
    """Summarise an article, post it on the AI wall and build slide data.

    Request body: ``{"url": "http..."}``.

    Responses:
        400 if the URL does not start with "http", 422 if the page cannot be
        read or has fewer than 120 characters of text, otherwise
        ``{"post": <new post>, "slides": [{"text", "image", "index"}, ...]}``.
        ``slides`` is empty when the page yields no paragraphs or key points.
    """
    payload = await request.json()
    url = base._clean_text(payload.get('url', ''))
    if not url.startswith('http'):
        return JSONResponse({'error': 'missing url'}, status_code=400)

    try:
        page = base.scrape_any_url(url)
    except Exception as exc:
        return JSONResponse({'error': 'Không đọc được bài viết: ' + str(exc)[:180]}, status_code=422)

    raw = (page.get('summary', '') + '\n' + page.get('text', '')).strip()
    if len(raw) < 120:
        return JSONResponse({'error': 'Bài viết không đủ nội dung để tóm tắt'}, status_code=422)

    prompt = _make_summary_prompt(page.get('title', ''), raw, page.get('via', '') or base._domain(url))
    summary = await base.qwen_generate(prompt, image_url=page.get('image') or None, max_tokens=1500)
    summary = _postprocess_ai_text(summary, max_units=20)

    sources = [{
        'title': page.get('title'),
        'url': url,
        'excerpt': raw[:500],
        'via': page.get('via') or base._domain(url),
    }]
    if 'Nguồn tham khảo:' not in summary:
        summary += "\n\n" + _source_line(sources)

    post = base.make_post(page.get('title') or 'Bài viết', summary, page.get('image') or '', url, 'summary', sources=sources)
    wall = base._load_ai_wall()
    wall.insert(0, post)
    base._save_ai_wall(wall)

    # Slides: one per key sentence, paired with the filtered page images.
    slides = []
    page_data = _scrape_article_images(url)
    if page_data and page_data.get('paragraphs'):
        key_points = _extract_key_points_for_slides(page_data['paragraphs'], max_points=12)
        if key_points:
            slide_imgs = page_data.get('images', [])
            if not slide_imgs and page_data.get('og_img'):
                slide_imgs = [page_data['og_img']]
            # When points outnumber images, the last image is reused.
            last_img = slide_imgs[-1] if slide_imgs else ''
            for number, point in enumerate(key_points, 1):
                picture = slide_imgs[number - 1] if number - 1 < len(slide_imgs) else last_img
                slides.append({'text': point, 'image': picture, 'index': number})
    return JSONResponse({'post': post, 'slides': slides})
def _emotion_script(text, emotion):
    """Return the text to be spoken for ``text``.

    No emotion-specific prefix is added, so the spoken content stays clean.
    ``emotion`` is accepted for interface compatibility and is not used
    here. The text is only normalised with ``_clean``.
    """
    spoken = _clean(text)
    return spoken
def _tts_script_smart(post, emotion):
    """Build a one-sentence-per-line narration script for ``post``.

    The source is ``base._short_script(post)`` when that helper exists,
    otherwise the cleaned post text (or title). Bullet markers are removed,
    line breaks become sentence breaks and each sentence is put on its own
    line. Scripts longer than 3000 characters are cut to 3000 and then, if
    a sentence end lies beyond position 700, trimmed back to it.
    ``emotion`` is not used.
    """
    if hasattr(base, '_short_script'):
        script = base._short_script(post)
    else:
        script = _clean(post.get('text', '') or post.get('title', ''))

    script = re.sub(r"^[•\-\*]\s*", "", script, flags=re.M)
    script = re.sub(r"\s*\n\s*", ". ", script)
    script = re.sub(r"([\.\!\?])\s*", r"\1\n", script)
    script = re.sub(r"\n{2,}", "\n", script).strip()

    if len(script) <= 3000:
        return script

    script = script[:3000]
    last_stop = max(script.rfind("."), script.rfind("!"), script.rfind("?"))
    if last_stop > 700:
        script = script[:last_stop + 1]
    return script
def _split_subtitle_sentences(script):
    """Split ``script`` into at most 12 subtitle sentences.

    Each non-empty line is split after ".", "!" or "?". Only sentences of
    8 to 140 characters (after cleaning) are kept.
    """
    sentences = []
    for raw_line in script.splitlines():
        line = _clean(raw_line)
        if not line:
            continue
        cleaned = (_clean(piece) for piece in re.split(r"(?<=[\.\!\?])\s+", line))
        sentences.extend(s for s in cleaned if 8 <= len(s) <= 140)
    return sentences[:12]
def _srt_time(sec):
ms = int((sec - int(sec)) * 1000)
sec = int(sec)
h = sec // 3600
m = (sec % 3600) // 60
s = sec % 60
return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
def _write_srt(script, path, total_duration=30):
    """Write an SRT subtitle file for ``script`` to ``path`` (UTF-8).

    Sentences come from ``_split_subtitle_sentences``; if there are none,
    the first 120 characters of the script form the only cue. Every cue
    lasts ``total_duration / cue count`` seconds, clamped to 2.2-5.0. The
    first cue starts at 0.3 s and cues are separated by 0.15 s.
    """
    sentences = _split_subtitle_sentences(script) or [script[:120]]
    cue_len = max(2.2, min(5.0, total_duration / max(1, len(sentences))))

    cues = []
    start = 0.3
    for number, sentence in enumerate(sentences, 1):
        end = start + cue_len
        cues.append(f"{number}\n{_srt_time(start)} --> {_srt_time(end)}\n{sentence}\n\n")
        start = end + 0.15

    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(cues)
def _wrap_text_px(draw, text, font, max_width, max_lines):
    """Greedily wrap ``text`` into lines no wider than ``max_width`` pixels.

    Width is measured with ``draw.textbbox``; if measuring fails, 20 pixels
    per character is assumed. Wrapping stops once ``max_lines`` lines exist
    and any remaining words are dropped.

    Returns:
        A list of at most ``max_lines`` strings.
    """
    lines = []
    current = ""
    for word in _clean(text).split():
        candidate = (current + " " + word).strip()
        try:
            width = draw.textbbox((0, 0), candidate, font=font)[2]
        except Exception:
            width = len(candidate) * 20
        if width <= max_width:
            current = candidate
            continue
        # The word does not fit: close the current line and start a new one.
        if current:
            lines.append(current)
        current = word
        if len(lines) >= max_lines:
            break
    if current and len(lines) < max_lines:
        lines.append(current)
    return lines
def _make_short_frame_full(post, img_path, out_path):
    """Render a 1080x1920 frame: image on top, title and body text below.

    Without Pillow the call is delegated to ``base._make_short_frame``. The
    image is scaled to cover a 1080x760 box and centre-cropped; an image
    that cannot be processed leaves the dark background in its place. The
    body is the post text with everything from "Nguồn tham khảo:" onward
    removed. The frame is saved to ``out_path``.
    """
    if Image is None:
        return base._make_short_frame(post, img_path, out_path)

    W, H = 1080, 1920
    canvas = Image.new("RGB", (W, H), (14, 14, 14))
    try:
        photo = Image.open(img_path).convert("RGB")
        box_w, box_h = 1080, 760
        photo_ratio = photo.width / photo.height
        box_ratio = box_w / box_h
        # Scale so the photo covers the box, then crop the overflow evenly.
        if photo_ratio > box_ratio:
            new_h = box_h
            new_w = int(new_h * photo_ratio)
        else:
            new_w = box_w
            new_h = int(new_w / photo_ratio)
        photo = photo.resize((new_w, new_h))
        left = (new_w - box_w) // 2
        top = (new_h - box_h) // 2
        photo = photo.crop((left, top, left + box_w, top + box_h))
        canvas.paste(photo, (0, 0))
    except Exception:
        pass

    draw = ImageDraw.Draw(canvas)
    try:
        font_title = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 54)
        font_body = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 38)
        font_label = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 30)
    except Exception:
        font_title = font_body = font_label = None

    # Text panel; it starts at y=720 and so overlaps the bottom of the image.
    draw.rectangle((0, 720, W, H), fill=(14, 14, 14))
    margin = 48
    text_width = W - margin * 2
    draw.text((margin, 770), "VNEWS · Tường AI", fill=(92, 184, 122), font=font_label)

    y = 830
    for title_line in _wrap_text_px(draw, post.get("title", ""), font_title, text_width, 4):
        draw.text((margin, y), title_line, fill=(255, 255, 255), font=font_title)
        y += 66
    y += 18

    body = re.sub(r"Nguồn tham khảo:.*", "", post.get("text", ""), flags=re.S).strip()
    for body_line in _wrap_text_px(draw, body, font_body, text_width, 14):
        draw.text((margin, y), body_line, fill=(220, 220, 220), font=font_body)
        y += 50
        if y > 1640:
            break
    canvas.save(out_path, quality=92)
def _summary_segments_from_post(post, max_segments=25):
    """Split a wall post into narration segments, one per scene.

    The post text (or title) is cleaned, a leading "Bản tin AI viết lại:"
    label and everything from "Nguồn tham khảo:" onward are removed. Each
    remaining line of at least 18 characters, without its list marker and
    not a heading label, becomes a segment. If fewer than three such lines
    exist, the text is split into sentences of at least 25 characters
    instead. Segments are de-duplicated with ``_dedupe_units``.

    Returns:
        Up to ``max_segments`` segments. If none survive, a one-element
        list holding the post title, or "Bản tin VNEWS" when the title is
        missing or empty.
    """
    raw = _clean(post.get('text') or post.get('title') or '')
    raw = re.sub(r'^Bản tin AI viết lại:\s*', '', raw, flags=re.I)
    raw = re.sub(r'Nguồn tham khảo:.*$', '', raw, flags=re.I | re.S).strip()
    # Strip list markers only. An ordinal must be 1-3 digits followed by
    # "." or ")" and not by another digit, so a figure that opens the
    # sentence ("12 người ...", "3.5 triệu ...") is still read out.
    marker_re = r'^\s*(?:(?:[-•*]+|\d{1,3}[.)](?!\d))\s*)+'
    lines = []
    for ln in raw.splitlines():
        ln = _clean(re.sub(marker_re, '', ln))
        if not ln:
            continue
        if ln.lower().startswith(('điểm chính', 'tiêu đề', 'sapo', 'nguồn tham khảo')):
            continue
        if len(ln) >= 18:
            lines.append(ln)
    if len(lines) < 3:
        sentences = (_clean(s) for s in re.split(r'(?<=[\.\!\?])\s+', raw))
        lines = [s for s in sentences if len(s) >= 25]
    segs = _dedupe_units(lines, max_units=max_segments)
    if segs:
        return segs[:max_segments]
    # "or" also covers a title key that exists but is empty, which would
    # otherwise produce an empty segment.
    return [post.get('title') or 'Bản tin VNEWS']
def _make_scene_frame(post, segment, idx, total, img_path, out_path, emotion='neutral'):
    """Render the 1080x1920 frame for one narration segment.

    Layout: the post image, darkened, as a full-frame backdrop; the same
    image again as a 720 px hero strip on top; then a dark panel with a
    progress bar (one pill per segment, the current one highlighted), the
    brand label, a "Đoạn i/n" badge, the segment text and the post title.
    Without Pillow the call is delegated to ``_make_short_frame_full``. An
    image that cannot be processed is skipped. ``emotion`` is not used. The
    frame is saved to ``out_path``.
    """
    if Image is None:
        return _make_short_frame_full(post, img_path, out_path)

    W, H = 1080, 1920

    def _cover_crop(image, aspect, box_w, box_h):
        # Scale the image to cover the box, then centre-crop to the box.
        if aspect > box_w / box_h:
            new_h = box_h
            new_w = int(new_h * aspect)
        else:
            new_w = box_w
            new_h = int(new_w / aspect)
        resized = image.resize((new_w, new_h))
        left = (new_w - box_w) // 2
        top = (new_h - box_h) // 2
        return resized.crop((left, top, left + box_w, top + box_h))

    bg = Image.new('RGB', (W, H), (10, 10, 10))
    try:
        im = Image.open(img_path).convert('RGB')
        ratio = im.width / max(1, im.height)
        bg.paste(_cover_crop(im, ratio, W, H), (0, 0))
        # Darken the backdrop by blending it half-way towards black.
        bg = Image.blend(bg, Image.new('RGB', (W, H), (0, 0, 0)), 0.50)
        hero_h = 720
        bg.paste(_cover_crop(im, ratio, W, hero_h), (0, 0))
    except Exception:
        pass

    draw = ImageDraw.Draw(bg)
    try:
        font_brand = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 34)
        font_small = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', 28)
        font_seg = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 58)
        font_title = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', 34)
    except Exception:
        font_brand = font_small = font_seg = font_title = None

    draw.rectangle((0, 680, W, H), fill=(12, 12, 12))

    # Progress bar: one pill per segment.
    dot_x, dot_y = 48, 742
    for i in range(total):
        colour = (92, 184, 122) if i == idx else (70, 70, 70)
        x0 = dot_x + i * 38
        draw.rounded_rectangle((x0, dot_y, x0 + 24, dot_y + 10), radius=5, fill=colour)

    draw.text((48, 780), 'VNEWS AI SHORT', fill=(110, 231, 143), font=font_brand)
    draw.rounded_rectangle((48, 834, 260, 880), radius=20, fill=(28, 70, 45))
    draw.text((66, 842), f'Đoạn {idx+1}/{total}', fill=(235, 235, 235), font=font_small)

    text_width = W - 96
    y = 940
    # Up to 18 wrapped lines are requested; drawing stops once y passes 1500.
    for seg_line in _wrap_text_px(draw, segment, font_seg, text_width, 18):
        draw.text((48, y), seg_line, fill=(255, 255, 255), font=font_seg)
        y += 74
        if y > 1500:
            break

    title_y = 1640
    draw.line((48, title_y - 22, W - 48, title_y - 22), fill=(70, 70, 70), width=2)
    for title_line in _wrap_text_px(draw, post.get('title', ''), font_title, text_width, 3):
        draw.text((48, title_y), title_line, fill=(220, 220, 220), font=font_title)
        title_y += 46
    bg.save(out_path, quality=92)
def _estimate_audio_duration(path, fallback=15.0):
    """Return the duration of the audio file at ``path`` in seconds.

    The duration is read with ``ffprobe`` and floored at 12.0 seconds. If
    ffprobe prints nothing, ``fallback`` takes the place of the reported
    value and is floored the same way. If ffprobe cannot be run, times out
    or prints something that is not a number, ``fallback`` is returned
    unchanged.
    """
    probe_cmd = ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
                 '-of', 'default=noprint_wrappers=1:no_key=1', path]
    try:
        probe = subprocess.run(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20)
        reported = (probe.stdout or b'').decode().strip()
        return max(12.0, float(reported or fallback))
    except Exception:
        return fallback
# Lower-cased voice id or alias from the request -> edge-tts voice name.
# Anything not listed falls back to 'vi-VN-HoaiMyNeural'.
_EDGE_VOICES = {
    # Vietnamese
    'vi-vn-hoaimyneural': 'vi-VN-HoaiMyNeural',
    'vi-vn-namminhneural': 'vi-VN-NamMinhNeural',
    'hoaimy': 'vi-VN-HoaiMyNeural',
    'namminh': 'vi-VN-NamMinhNeural',
    'nam': 'vi-VN-NamMinhNeural',
    'male': 'vi-VN-NamMinhNeural',
    'nu': 'vi-VN-HoaiMyNeural',
    'female': 'vi-VN-HoaiMyNeural',
    'mien-nam': 'vi-VN-HoaiMyNeural',
    # English - Multilingual
    'en-us-andrewmultilingualneural': 'en-US-AndrewMultilingualNeural',
    'en-au-williammultilingualneural': 'en-AU-WilliamMultilingualNeural',
    'andrew': 'en-US-AndrewMultilingualNeural',
    'en_andrew': 'en-US-AndrewMultilingualNeural',
    'jenny': 'en-US-AndrewMultilingualNeural',
    'en_jenny': 'en-US-AndrewMultilingualNeural',
    # Portuguese - Multilingual (only Thalita)
    'pt-br-thalitamultilingualneural': 'pt-BR-ThalitaMultilingualNeural',
    'thalita': 'pt-BR-ThalitaMultilingualNeural',
    'pt_thalita': 'pt-BR-ThalitaMultilingualNeural',
    'pt_br_thalita': 'pt-BR-ThalitaMultilingualNeural',
    'pt': 'pt-BR-ThalitaMultilingualNeural',
    'pt_francisco': 'pt-BR-ThalitaMultilingualNeural',
    # French - Multilingual
    'fr-fr-viviennemultilingualneural': 'fr-FR-VivienneMultilingualNeural',
    'fr-fr-remymultilingualneural': 'fr-FR-RemyMultilingualNeural',
    'denise': 'fr-FR-VivienneMultilingualNeural',
    'fr': 'fr-FR-VivienneMultilingualNeural',
    'fr_denise': 'fr-FR-VivienneMultilingualNeural',
    # German - Multilingual
    'de-de-seraphinamultilingualneural': 'de-DE-SeraphinaMultilingualNeural',
    'de-de-florianmultilingualneural': 'de-DE-FlorianMultilingualNeural',
    'katja': 'de-DE-SeraphinaMultilingualNeural',
    'de': 'de-DE-SeraphinaMultilingualNeural',
    'de_katja': 'de-DE-SeraphinaMultilingualNeural',
    # Korean - Multilingual (Hyunsu, not SunHee). The correctly spelled id
    # was missing; the misspelled "hyusu" key is kept for old clients.
    'ko-kr-hyunsumultilingualneural': 'ko-KR-HyunsuMultilingualNeural',
    'ko-kr-hyusumultilingualneural': 'ko-KR-HyunsuMultilingualNeural',
    'ko-kr-hyunsuneural': 'ko-KR-HyunsuMultilingualNeural',
    'sunhee': 'ko-KR-HyunsuMultilingualNeural',
    'ko': 'ko-KR-HyunsuMultilingualNeural',
    'ko_sunhee': 'ko-KR-HyunsuMultilingualNeural',
    # Italian - Multilingual
    'it-it-giuseppemultilingualneural': 'it-IT-GiuseppeMultilingualNeural',
    # Spanish (kept for backward compatibility)
    'ela': 'en-US-AndrewMultilingualNeural',
    'es_ela': 'en-US-AndrewMultilingualNeural',
    'es': 'en-US-AndrewMultilingualNeural',
    'es_carlos': 'en-US-AndrewMultilingualNeural',
    # Japanese (kept for backward compatibility)
    'nanami': 'en-US-AndrewMultilingualNeural',
    'ja': 'en-US-AndrewMultilingualNeural',
    'ja_nanami': 'en-US-AndrewMultilingualNeural',
    # Chinese (kept for backward compatibility)
    'xiaochen': 'en-US-AndrewMultilingualNeural',
    'zh': 'en-US-AndrewMultilingualNeural',
    'zh_xiaochen': 'en-US-AndrewMultilingualNeural',
}


def _parse_short_speed(value, default=1.0):
    """Parse the requested playback speed, clamped to the range 0.85-1.35."""
    try:
        speed = float(value or default)
    except (TypeError, ValueError):
        speed = default
    return max(0.85, min(1.35, speed))


def _publish_short(post, posts, video_url, voice, emotion, speed, segments):
    """Record the short's metadata on ``post``, save the wall, build the response."""
    post['video'] = video_url
    post['short_voice'] = voice
    post['short_emotion'] = emotion
    post['short_speed'] = speed
    post['short_segments'] = segments
    post['short_subtitles'] = False
    base._save_ai_wall(posts)
    return JSONResponse({'video': post['video'], 'voice': voice, 'emotion': emotion, 'speed': speed, 'subtitles': False, 'segments': segments})


@app.post('/api/ai/short/{post_id}')
async def patched_ai_short(post_id: str, request: Request):
    """Render (or reuse) a vertical short video for a wall post.

    Optional JSON body: ``voice`` (default "nu"), ``emotion`` (default
    "neutral") and ``speed`` (default 1.0, clamped to 0.85-1.35). An
    unparsable or non-object body is treated as empty.

    The post is split into segments. Each segment gets its own frame and
    narration (edge-tts, with gTTS as fallback), is sped up with ffmpeg and
    encoded as a clip; the clips are then concatenated. The output file
    name encodes voice, emotion, speed and a hash of the segments, so an
    existing file is reused.

    Responses:
        404 if the post does not exist, 503 if gTTS is unavailable, 500 if
        rendering fails, otherwise ``{'video', 'voice', 'emotion', 'speed',
        'subtitles', 'segments'}``.
    """
    try:
        body = await request.json()
    except Exception:
        body = {}
    if not isinstance(body, dict):
        body = {}
    voice = str(body.get('voice', 'nu')).strip().lower()
    emotion = str(body.get('emotion', 'neutral')).strip().lower()
    # A non-numeric speed falls back to 1.0 instead of raising.
    speed = _parse_short_speed(body.get('speed', 1.0))
    posts = base._load_ai_wall()
    post = next((p for p in posts if str(p.get('id')) == str(post_id)), None)
    if not post:
        return JSONResponse({'error': 'post not found'}, status_code=404)
    segments = _summary_segments_from_post(post, max_segments=25)
    seg_hash = hashlib.md5(('|'.join(segments) + voice + emotion + str(speed)).encode('utf-8')).hexdigest()[:8]
    os.makedirs(base.SHORTS_DIR, exist_ok=True)
    suffix = f"_{voice}_{emotion}_{str(speed).replace('.', 'p')}_{seg_hash}_scenes_nosub"
    out_mp4 = os.path.join(base.SHORTS_DIR, base._safe_name(post_id + suffix) + '.mp4')
    video_url = '/api/ai/short-file/' + post_id + suffix
    if os.path.exists(out_mp4):
        return _publish_short(post, posts, video_url, voice, emotion, speed, segments)
    if base.gTTS is None:
        return JSONResponse({'error': 'gTTS chưa sẵn sàng'}, status_code=503)
    work = os.path.join(base.SHORTS_DIR, base._safe_name(post_id + suffix))
    os.makedirs(work, exist_ok=True)
    img = os.path.join(work, 'image.jpg')
    try:
        base._download_image(post.get('img'), post.get('title', 'AI news'), img)
        edge_voice = _EDGE_VOICES.get(voice, 'vi-VN-HoaiMyNeural')
        # Launch edge_tts with the interpreter running this app; a bare
        # "python" may be absent from PATH or be a different environment.
        python_exe = sys.executable or 'python'
        part_files = []
        for idx, seg in enumerate(segments):
            frame = os.path.join(work, f'frame_{idx:02d}.jpg')
            aud = os.path.join(work, f'voice_{idx:02d}.mp3')
            aud_fast = os.path.join(work, f'voice_{idx:02d}_fast.mp3')
            part = os.path.join(work, f'part_{idx:02d}.mp4')
            _make_scene_frame(post, seg, idx, len(segments), img, frame, emotion=emotion)
            spoken = _emotion_script(seg, emotion)
            try:
                subprocess.run([python_exe, '-m', 'edge_tts', '--voice', edge_voice, '--text', spoken, '--write-media', aud], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=120)
            except Exception:
                # edge-tts failed: fall back to gTTS.
                tld = 'com.vn' if voice in ('nu', 'female', 'mien-nam', 'hoaimy') else 'com'
                try:
                    base.gTTS(spoken, lang='vi', tld=tld, slow=False).save(aud)
                except TypeError:
                    # Retry without ``tld`` when gTTS rejects that keyword.
                    base.gTTS(spoken, lang='vi', slow=False).save(aud)
            subprocess.run(['ffmpeg', '-y', '-i', aud, '-filter:a', f'atempo={speed}', '-vn', aud_fast], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=90)
            dur = _estimate_audio_duration(aud_fast, fallback=15.0) + 0.35
            subprocess.run(['ffmpeg', '-y', '-loop', '1', '-t', str(dur), '-i', frame, '-i', aud_fast, '-shortest', '-c:v', 'libx264', '-tune', 'stillimage', '-pix_fmt', 'yuv420p', '-c:a', 'aac', '-b:a', '128k', part], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=150)
            part_files.append(part)
        concat = os.path.join(work, 'concat.txt')
        with open(concat, 'w', encoding='utf-8') as f:
            for p in part_files:
                # Escape single quotes for the concat list file.
                f.write("file '" + p.replace("'", "'\\''") + "'\n")
        # Concatenate into the work directory first and move the result into
        # place only on success. Otherwise a failed or interrupted ffmpeg
        # run leaves a broken file at out_mp4 that later requests would
        # serve as a cached result.
        tmp_mp4 = os.path.join(work, 'final.mp4')
        subprocess.run(['ffmpeg', '-y', '-f', 'concat', '-safe', '0', '-i', concat, '-c', 'copy', tmp_mp4], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=180)
        os.replace(tmp_mp4, out_mp4)
        return _publish_short(post, posts, video_url, voice, emotion, speed, segments)
    except Exception as e:
        return JSONResponse({'error': 'Không tạo được shorts: ' + str(e)[:220]}, status_code=500)
@app.get('/api/ai/short-file/{file_id}')
def patched_ai_short_file(file_id: str):
    """Serve a rendered short from ``base.SHORTS_DIR`` as an MP4 download.

    The file name is ``base._safe_name(file_id) + '.mp4'``. Returns 404 JSON
    if no such file exists.
    """
    video_path = os.path.join(base.SHORTS_DIR, base._safe_name(file_id) + '.mp4')
    if os.path.exists(video_path):
        return FileResponse(video_path, media_type='video/mp4', filename=f'vnews-ai-{file_id}.mp4')
    return JSONResponse({'error': 'not found'}, status_code=404)
@app.get('/api/ai_shorts')
def api_ai_shorts():
    """Return the first 80 wall posts that have a ``video`` value."""
    with_video = []
    for entry in base._load_ai_wall():
        if entry.get('video'):
            with_video.append(entry)
    return JSONResponse({'posts': with_video[:80]})
# Unregister every existing GET handler for the site root ("/").
app.router.routes = [
    route
    for route in app.router.routes
    if getattr(route, 'path', None) != '/' or 'GET' not in getattr(route, 'methods', set())
]