# VNEWS
#
# Running
#
# File size: 37,178 Bytes
#
# 4b6e868

import os
import re
import time
import random
import json
import html as html_lib
import subprocess
import requests
import hashlib
import ai_ext as base
from ai_ext import app
from fastapi import Request
from fastapi.responses import JSONResponse, HTMLResponse, FileResponse
from bs4 import BeautifulSoup
from urllib.parse import quote_plus

try:
    from PIL import Image, ImageDraw, ImageFont
except Exception:
    Image = ImageDraw = ImageFont = None


def _clean(s):
    s = html_lib.unescape(s or "")
    s = re.sub(r"[ \t]+", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()


def _norm(s):
    s = s.lower()
    s = re.sub(r"[^\wÀ-ỹ\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s


def _similar(a, b):
    """Tell whether two texts share most of their normalized tokens.

    The overlap is measured against the smaller token set; a ratio of at
    least 0.72 counts as similar. If either text has no tokens the answer is
    ``False``.
    """
    tokens_a = set(_norm(a).split())
    tokens_b = set(_norm(b).split())
    if tokens_a and tokens_b:
        shared = len(tokens_a.intersection(tokens_b))
        smaller = min(len(tokens_a), len(tokens_b))
        return shared / max(1, smaller) >= 0.72
    return False


def _dedupe_units(units, max_units=25):
    """Deduplicate units - only skip exact matches to ensure all bullet points are read."""
    out, seen = [], set()
    for u in units:
        u = _clean(re.sub(r"^[-•*\d\.\)\s]+", "", u))
        if len(u) < 18:
            continue
        nu = _norm(u)
        # Only skip exact matches, NOT similar content (to avoid skipping valid bullet points)
        if nu in seen:
            continue
        seen.add(nu)
        out.append(u)
        if len(out) >= max_units:
            break
    return out


def _postprocess_ai_text(text, max_units=20):
    """Turn raw model output into an optional title followed by "• " bullets.

    Steps:
      1. Clean the text and split it into non-empty lines.
      2. Drop short (< 80 chars) boilerplate lines such as "dưới đây ..." or
         "tiêu đề: ...".
      3. Strip list markers from every line of at least 18 characters and
         de-duplicate the result via ``_dedupe_units``.
      4. Use the first kept line as a title when it is at most 90 characters
         and is not itself a list item; bullets similar to the title are
         then removed.

    Args:
        text: Raw text returned by the model (may be ``None`` or empty).
        max_units: Maximum number of bullet lines to keep.

    Returns:
        The formatted text. Empty input yields the cleaned (empty) text; if
        no line qualifies as a bullet, the first 900 characters of the
        cleaned text are returned instead.
    """
    text = _clean(text)
    if not text:
        return text
    drop_prefixes = (
        "dưới đây", "sau đây", "bài viết", "tôi sẽ", "mình sẽ",
        "tóm tắt bài", "tiêu đề:", "sapo:", "nội dung:", "kết luận:"
    )
    raw_lines = []
    for line in re.split(r"\n+", text):
        line = _clean(line)
        if not line:
            continue
        low = line.lower().strip()
        # Only short lines are treated as boilerplate; a long line that
        # happens to start with one of the prefixes is real content.
        if len(line) < 80 and low.startswith(drop_prefixes):
            continue
        raw_lines.append(line)
    units = []
    for line in raw_lines:
        # Keep the full bullet point - never truncate or split it.
        if len(line) >= 18:
            units.append(_clean(re.sub(r"^[-•*\d\.\)\s]+", "", line)))
    units = _dedupe_units(units, max_units=max_units)
    if not units:
        return text[:900]
    title = ""
    first = raw_lines[0] if raw_lines else ""
    # A first line that is a list item ("- x", "• x", "* x", "1. x", "2) x")
    # is content, not a title. Numbered items must be recognised here too,
    # otherwise the first numbered bullet is promoted to title and dropped
    # from the bullet list by the similarity filter below.
    is_list_item = bool(re.match(r"[-•*]|\d+[\.\)]", first))
    if first and len(first) <= 90 and not is_list_item:
        title = first
        units = [u for u in units if not _similar(u, title)]
    body = "\n".join("• " + u for u in units[:max_units])
    return (title + "\n\n" + body).strip() if title else body


def _fallback_summary_from_prompt(prompt, max_units=6):
    """Build an extractive bullet summary straight from a prompt's source text.

    Used when no model answer is available. The source text is whatever
    follows the first known content marker in the prompt (or the whole prompt
    when no marker is present). URLs are removed, whitespace is flattened and
    the text is split into sentences; sentences of 45-260 characters become
    bullet candidates.

    Args:
        prompt: The full prompt that was going to be sent to the model.
        max_units: Maximum number of bullets to return.

    Returns:
        A newline-joined string of "• " bullets. When no sentence qualifies,
        a single bullet holding up to 700 characters of the text is returned;
        when there is no text at all, a fixed "not enough content" bullet.
    """
    text = prompt or ""
    for marker in ["Nội dung nguồn:", "Nội dung bài:", "Nội dung gốc:", "Nội dung:", "Nguồn/bối cảnh internet:"]:
        if marker in text:
            text = text.split(marker, 1)[1]
            break
    text = re.sub(r"https?://\S+", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    sentences = re.split(r"(?<=[\.\!\?])\s+(?=[A-ZÀ-Ỹ0-9])", text)
    candidates = []
    for s in sentences:
        s = _clean(s)
        if 45 <= len(s) <= 260:
            candidates.append(s)
    units = _dedupe_units(candidates, max_units=max_units)
    if units:
        return "\n".join("• " + u for u in units)
    if text:
        if len(text) <= 700:
            # Nothing is cut off, so every word is complete - keep them all.
            return "• " + text
        # The cut may land mid-word: drop the trailing partial word.
        return "• " + text[:700].rsplit(" ", 1)[0]
    return "• Không có đủ nội dung nguồn để tóm tắt."


def _source_line(sources):
    names = []
    for s in (sources or [])[:5]:
        via = s.get("via") or base._domain(s.get("url", "")) or s.get("title", "")
        if via and via not in names:
            names.append(via)
    return "Nguồn tham khảo: " + ", ".join(names[:5]) if names else "Nguồn tham khảo: tổng hợp internet"


def _make_summary_prompt(title, raw, source_hint=""):
    return f"""Bạn là biên tập viên tóm tắt tin tức tiếng Việt.

NHIỆM VỤ BẮT BUỘC:
- Chỉ TÓM TẮT nội dung chính, KHÔNG viết lại toàn bộ bài.
- Không lặp lại cùng một ý, cùng một câu, cùng một chi tiết.
- Không thêm thông tin ngoài nguồn.
- Tối đa 5 gạch đầu dòng, mỗi gạch đầu dòng 1 câu ngắn.
- Nếu bài có số liệu/nhân vật/thời điểm quan trọng thì giữ lại.
- Không viết phần mở bài dài, không viết văn kể lại.

Tiêu đề nguồn: {title}
Nguồn: {source_hint}

Nội dung nguồn:
{raw[:14000]}
"""


def _direct_news_rss(topic, limit=10):
    """Query the Google News RSS search feed (Vietnamese edition) for a topic.

    Returns a list of ``{"title", "url", "via", "excerpt"}`` dicts built from
    at most ``limit`` feed items; the excerpt is simply the item title. Any
    error (network, parsing) is swallowed and whatever was collected so far,
    possibly nothing, is returned.
    """
    results = []
    try:
        feed_url = "https://news.google.com/rss/search?q=" + quote_plus(topic) + "&hl=vi&gl=VN&ceid=VN:vi"
        resp = requests.get(feed_url, headers=base.HEADERS, timeout=15)
        resp.encoding = "utf-8"
        feed = BeautifulSoup(resp.text, "xml")
        for item in feed.find_all("item")[:limit]:
            title_tag = item.find("title")
            link_tag = item.find("link")
            source_tag = item.find("source")
            headline = title_tag.get_text(" ", strip=True) if title_tag else ""
            link = link_tag.get_text(strip=True) if link_tag else ""
            # Prefer the feed's <source> name; otherwise derive it from the link.
            publisher = source_tag.get_text(" ", strip=True) if source_tag else base._domain(link)
            if not (headline and link):
                continue
            results.append({"title": headline, "url": link, "via": publisher, "excerpt": headline})
    except Exception:
        pass
    return results


def _topic_source_articles(topic, limit=5):
    """Collect scraped article bodies for a topic, one dict per source.

    Sources come from ``base.web_context`` and fall back to the Google News
    RSS search when that fails or returns nothing. Up to ``limit * 3``
    sources are tried; pages whose combined summary and text is shorter than
    180 characters, or that fail to scrape, are skipped. When no page could
    be scraped at all, the source excerpts (or titles) are used as the raw
    text instead.

    Returns at most ``limit`` dicts with the keys ``title``, ``url``,
    ``raw``, ``image``, ``via`` and ``source``.
    """
    try:
        _ctx, sources = base.web_context(topic, limit=limit)
    except Exception:
        sources = []
    if not sources:
        sources = _direct_news_rss(topic, limit=10)

    articles = []
    visited = set()
    for candidate in (sources or [])[:limit * 3]:
        link = candidate.get("url") or ""
        if not link.startswith("http") or link in visited:
            continue
        visited.add(link)
        try:
            page = base.scrape_any_url(link)
            body = (page.get("summary", "") + "\n" + page.get("text", "")).strip()
            if len(body) < 180:
                continue
            headline = page.get("title") or candidate.get("title") or link
            publisher = page.get("via") or candidate.get("via") or base._domain(link)
            source_info = {"title": headline, "url": link, "excerpt": body[:700], "via": publisher}
            articles.append({
                "title": headline,
                "url": link,
                "raw": body,
                "image": page.get("image") or "",
                "via": publisher,
                "source": source_info,
            })
            if len(articles) >= limit:
                break
        except Exception:
            continue

    if not articles:
        # Nothing could be scraped: fall back to the listing data itself.
        for candidate in (sources or _direct_news_rss(topic, 6))[:limit]:
            headline = candidate.get("title") or topic
            snippet = (
                candidate.get("excerpt")
                or candidate.get("description")
                or candidate.get("content")
                or headline
            )
            link = candidate.get("url", "")
            publisher = candidate.get("via") or base._domain(link)
            source_info = {"title": headline, "url": link, "excerpt": snippet[:700], "via": publisher}
            articles.append({
                "title": headline,
                "url": link,
                "raw": snippet,
                "image": base.pollinations_image_url(headline),
                "via": publisher,
                "source": source_info,
            })
    return articles[:limit]


async def qwen_generate_resilient(prompt: str, image_url=None, max_tokens: int = 1200):
    """Generate text with several fallbacks so that a result is always returned.

    Order of attempts:
      1. ``base._original_qwen_generate`` (when that attribute exists).
      2. If an HF token is available, the Hugging Face router chat-completions
         endpoint, trying each model in the list below in turn.
      3. An extractive summary built from the prompt itself via
         ``_fallback_summary_from_prompt``.

    On success ``base.LAST_QWEN_ERROR`` is reset to an empty string; when the
    extractive fallback is used it holds the last six collected error
    messages.

    Args:
        prompt: The user prompt.
        image_url: Optional image URL; only sent to models whose name
            contains "VL".
        max_tokens: Requested completion length. It is passed unchanged to
            the original generator and capped at 1400 for router requests.

    Returns:
        The generated text, or the extractive fallback summary.
    """
    errors = []
    token = base._hf_token()
    try:
        original = getattr(base, "_original_qwen_generate", None)
        if original:
            txt = await original(prompt, image_url=image_url, max_tokens=max_tokens)
            if txt:
                base.LAST_QWEN_ERROR = ""
                return txt
            # Empty result: record whatever error the original generator left behind.
            if getattr(base, "LAST_QWEN_ERROR", ""):
                errors.append("sdk: " + str(base.LAST_QWEN_ERROR)[:260])
    except Exception as e:
        errors.append(f"sdk: {type(e).__name__}: {str(e)[:260]}")
    if token:
        # Candidate models in priority order; the env override goes first and
        # duplicates/empty names are skipped.
        models = []
        for m in [
            os.getenv("QWEN_VL_MODEL", ""),
            "Qwen/Qwen2.5-VL-7B-Instruct",
            "Qwen/Qwen2.5-VL-3B-Instruct",
            "Qwen/Qwen2.5-7B-Instruct",
            "Qwen/Qwen2.5-3B-Instruct",
            "Qwen/Qwen2.5-1.5B-Instruct",
        ]:
            if m and m not in models:
                models.append(m)
        headers = {"Authorization": "Bearer " + token, "Content-Type": "application/json"}
        for model in models:
            try:
                # Multimodal content is only built for "VL" models when an image is given.
                is_vl = "VL" in model and bool(image_url)
                user_content = ([{"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": prompt}] if is_vl else prompt)
                payload = {
                    "model": model,
                    "messages": [
                        {"role": "system", "content": "Bạn là biên tập viên AI tiếng Việt. Chỉ tóm tắt súc tích nội dung nguồn, không viết lại toàn bài, không lặp ý, không bịa chi tiết."},
                        {"role": "user", "content": user_content},
                    ],
                    "max_tokens": min(int(max_tokens or 900), 1400),
                    "temperature": 0.35,
                    "top_p": 0.85,
                }
                # NOTE(review): requests.post is a synchronous call made directly
                # inside this coroutine (timeout 95s per model).
                r = requests.post("https://router.huggingface.co/v1/chat/completions", headers=headers, json=payload, timeout=95)
                if r.status_code >= 300:
                    errors.append(f"{model}: HTTP {r.status_code} {r.text[:180]}")
                    continue
                j = r.json()
                # An empty "choices" list raises IndexError here, which the
                # except below records as an error for this model.
                txt = (j.get("choices", [{}])[0].get("message", {}).get("content") or "").strip()
                if txt:
                    base.LAST_QWEN_ERROR = ""
                    return txt
                errors.append(f"{model}: empty response")
            except Exception as e:
                errors.append(f"{model}: {type(e).__name__}: {str(e)[:220]}")
    else:
        errors.append("missing HF_TOKEN")
    # Every attempt failed: keep the most recent errors for diagnostics and
    # fall back to an extractive summary of the prompt.
    base.LAST_QWEN_ERROR = " | ".join(errors[-6:]) or "Qwen unavailable; used extractive fallback"
    print("[qwen resilient fallback]", base.LAST_QWEN_ERROR)
    return _fallback_summary_from_prompt(prompt, max_units=12)


# Monkeypatch: remember the base module's generator under
# `_original_qwen_generate` (only when that attribute does not exist yet, so
# an existing reference is never replaced), then make `base.qwen_generate`
# point at the resilient wrapper, which calls the saved original first.
if not hasattr(base, "_original_qwen_generate"):
    base._original_qwen_generate = base.qwen_generate
base.qwen_generate = qwen_generate_resilient


@app.get('/api/wall')
def compat_wall():
    """Return the first 80 posts of the stored AI wall as JSON."""
    latest_posts = base._load_ai_wall()[:80]
    return JSONResponse({'posts': latest_posts})


# (path, HTTP method) pairs whose existing handlers are replaced by the
# handlers registered later in this module.
_PATCHED_PATHS = {
    ('/api/topic_post', 'POST'),
    ('/api/url_wall', 'POST'),
    ('/api/rewrite_share', 'POST'),
    ('/api/ai/short/{post_id}', 'POST'),
}
# Drop every already-registered route that matches one of the pairs above.
# Routes without a `path` or `methods` attribute never match and are kept.
app.router.routes = [
    r for r in app.router.routes
    if not any(getattr(r, 'path', None) == p and m in getattr(r, 'methods', set()) for p, m in _PATCHED_PATHS)
]


@app.post('/api/topic_post')
async def compat_topic_post(request: Request):
    """Create one Wall AI post per scraped source article for a topic.

    Request JSON: ``{"topic": "<text>"}``.

    Responses:
        400 when the topic is missing (a body that is not a JSON object is
            treated as having no topic),
        422 when no source article could be collected,
        200 with ``{"post": first_post, "posts": [...], "count": n}``.

    Each article is summarized separately and prepended to the stored wall.
    """
    # A malformed or non-object body is handled like a missing topic instead
    # of surfacing as an unhandled error.
    try:
        body = await request.json()
    except Exception:
        body = {}
    if not isinstance(body, dict):
        body = {}
    topic = base._clean_text(body.get('topic', ''))
    if not topic:
        return JSONResponse({'error': 'missing topic'}, status_code=400)
    articles = _topic_source_articles(topic, limit=4)
    if not articles:
        return JSONResponse({'error': 'Không lấy được bài viết nguồn cho chủ đề này.'}, status_code=422)
    new_posts = []
    for art in articles:
        prompt = f"""Tóm tắt RIÊNG bài viết nguồn sau để đăng Tường AI.

Chủ đề lọc: {topic}
Tiêu đề bài nguồn: {art['title']}
Nguồn: {art['via']}

Yêu cầu bắt buộc:
- Tóm tắt nội dung trong BÀI VIẾT này, không chỉ tiêu đề.
- Không trộn với bài khác.
- Không viết lại toàn bộ bài.
- Không lặp ý.
- 4-6 gạch đầu dòng, mỗi dòng 1 câu rõ ràng.
- Giữ số liệu/nhân vật/thời điểm quan trọng nếu có.

Nội dung bài:
{art['raw'][:14000]}"""
        text = await base.qwen_generate(prompt, image_url=art.get('image') or None, max_tokens=1500)
        text = _postprocess_ai_text(text, max_units=20)
        src = [art['source']]
        if 'Nguồn tham khảo:' not in text:
            text += "\n\n" + _source_line(src)
        post = base.make_post(art['title'], text, art.get('image') or base.pollinations_image_url(art['title']), art.get('url') or '', 'topic_article', sources=src)
        new_posts.append(post)
    # Load the wall only now, right before saving. Loading it before the
    # (slow, awaited) generation loop would write back a stale snapshot and
    # discard any post saved by another request in the meantime.
    posts = new_posts + base._load_ai_wall()
    base._save_ai_wall(posts)
    return JSONResponse({'post': new_posts[0], 'posts': new_posts, 'count': len(new_posts)})


@app.post('/api/url_wall')
async def compat_url_wall(request: Request):
    """Scrape a URL, summarize it and put the result at the top of the wall.

    Request JSON: ``{"url": "http..."}``. Answers 400 for a missing or
    non-http URL, 422 when the page cannot be scraped or holds fewer than 120
    characters of content, otherwise ``{"post": post}``.
    """
    payload = await request.json()
    url = base._clean_text(payload.get('url', ''))
    if not url.startswith('http'):
        return JSONResponse({'error': 'missing url'}, status_code=400)

    try:
        page = base.scrape_any_url(url)
    except Exception as exc:
        return JSONResponse({'error': 'Không scrape được URL: ' + str(exc)[:180]}, status_code=422)

    raw = (page.get('summary', '') + '\n' + page.get('text', '')).strip()
    if len(raw) < 120:
        return JSONResponse({'error': 'URL không có đủ nội dung để tóm tắt'}, status_code=422)

    prompt = _make_summary_prompt(page.get('title', ''), raw, page.get('via', '') or base._domain(url))
    summary = await base.qwen_generate(prompt, image_url=page.get('image') or None, max_tokens=1500)
    summary = _postprocess_ai_text(summary, max_units=20)

    src = [{
        'title': page.get('title'),
        'url': url,
        'excerpt': raw[:500],
        'via': page.get('via') or base._domain(url),
    }]
    # Append the attribution line unless the summary already carries one.
    if 'Nguồn tham khảo:' not in summary:
        summary += "\n\n" + _source_line(src)

    post = base.make_post(page.get('title') or 'Bài viết', summary, page.get('image') or '', url, 'url', sources=src)
    wall = base._load_ai_wall()
    wall.insert(0, post)
    base._save_ai_wall(wall)
    return JSONResponse({'post': post})


def _is_relevant_image(img_url, title, text):
    """Check if an image is relevant to the article content."""
    if not img_url:
        return False
    skip_patterns = ['pixel', 'analytics', 'tracking', '1x1.gif', 'spacer.gif',
                     'logo', 'icon', 'avatar', 'emoji', 'smiley', 'sprite',
                     'advertisement', 'ad-banner', 'sponsored', 'banner-ads']
    img_lower = img_url.lower()
    for p in skip_patterns:
        if p in img_lower:
            return False
    if not any(img_lower.endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.webp', '.gif']):
        return False
    return True


def _filter_relevant_images(images, title, text, max_images=8):
    """Filter and rank images by relevance to article content."""
    if not images:
        return []
    seen = set()
    relevant = []
    for img in images:
        if img in seen:
            continue
        seen.add(img)
        if _is_relevant_image(img, title, text):
            relevant.append(img)
    return relevant[:max_images]


def _extract_key_points_for_slides(paragraphs, max_points=12):
    """Extract key points from paragraphs for slides - extracts ALL sentences, not just first one."""
    points = []
    for p in paragraphs:
        if len(points) >= max_points:
            break
        p = _clean(p)
        if not p:
            continue
        # Split paragraph into sentences using Vietnamese + English punctuation - GET ALL SENTENCES
        sentences = re.split(r'(?<=[.!?])\s+(?=[A-ZÀ-Ỹ0-9])', p)
        sentences = [s.strip() for s in sentences if s.strip()]
        
        for sentence in sentences:
            if len(points) >= max_points:
                break
            sentence = _clean(sentence)
            if len(sentence) < 30:
                continue
            if any(sentence[:60] in existing for existing in points):
                continue
            if not sentence.endswith(('.', '!', '?')):
                sentence = sentence + '.'
            points.append(sentence)
    return points


def _scrape_article_images(url):
    """Fetch an article page and collect its title, paragraphs and images.

    The main content block is the first matching selector (``article``,
    several site-specific classes, ``main``) that holds at least two ``<p>``
    tags, falling back to the whole body. Paragraphs longer than 40
    characters are kept. Image URLs are taken from ``og:image`` and from
    ``<img>``/``<figure>`` tags inside the block, made absolute, de-duplicated
    and passed through ``_filter_relevant_images``.

    Relative image paths (``/img/a.jpg``, ``img/a.jpg``) are resolved against
    the final page URL; previously only scheme-relative ``//host/...`` URLs
    were normalised, so site-relative sources were returned unusable.

    Args:
        url: Address of the article page.

    Returns:
        ``{'title', 'paragraphs', 'images', 'og_img'}``, or ``None`` when the
        page cannot be fetched or parsed (best effort: errors are swallowed).
    """
    # Local import: the module's import block only brings in quote_plus.
    from urllib.parse import urljoin

    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
                   "Accept-Language": "vi-VN,vi;q=0.9,en;q=0.8"}
        r = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
        r.encoding = 'utf-8'
        # Resolve relative paths against the URL actually served (after redirects).
        page_url = r.url or url

        def absolute(src):
            """Return *src* as an absolute URL ('' when *src* is empty)."""
            src = (src or '').strip()
            if not src:
                return ''
            if src.startswith('//'):
                return 'https:' + src
            return urljoin(page_url, src)

        soup = BeautifulSoup(r.text, 'lxml')
        # Remove non-content elements before looking for text and images.
        for tag in soup.find_all(['script', 'style', 'nav', 'footer', 'aside', 'form']):
            tag.decompose()
        h1 = soup.find('h1')
        ogt = soup.find('meta', property='og:title')
        title = (h1.get_text(strip=True) if h1 else '') or (ogt.get('content', '') if ogt else '')
        ogi = soup.find('meta', property='og:image')
        og_img = absolute(ogi.get('content', '') if ogi else '')
        block = None
        for sel in ['article', '.singular-content', '.detail-content', '.fck_detail', '.content-detail', '.knc-content', 'main', '.cms-body', '.article__body']:
            el = soup.select_one(sel)
            if el and len(el.find_all('p')) >= 2:
                block = el
                break
        if not block:
            block = soup.body or soup
        paragraphs = []
        all_images = []
        seen_imgs = set()
        if og_img:
            all_images.append(og_img)
            seen_imgs.add(og_img)
        for el in block.find_all(['p', 'h2', 'h3', 'figure', 'img'], recursive=True):
            if el.name == 'p':
                t = _clean(el.get_text(strip=True))
                if t and len(t) > 40:
                    paragraphs.append(t)
            elif el.name in ('figure', 'img'):
                im = el if el.name == 'img' else el.find('img')
                if im:
                    # Lazy-loaded images keep the real address in data-* attributes.
                    src = im.get('data-src') or im.get('src') or im.get('data-original') or ''
                    if src and 'base64' not in src:
                        src = absolute(src)
                        if src and src not in seen_imgs:
                            all_images.append(src)
                            seen_imgs.add(src)
        relevant_images = _filter_relevant_images(all_images, title, ' '.join(paragraphs[:5]))
        return {'title': _clean(title), 'paragraphs': paragraphs, 'images': relevant_images, 'og_img': og_img}
    except Exception:
        # Best effort: the caller treats None as "no page data".
        return None


@app.post('/api/rewrite_share')
async def compat_rewrite_share(request: Request):
    """Summarize an article URL into a wall post and build slides for sharing.

    Request JSON: ``{"url": "http..."}``. Answers 400 for a missing or
    non-http URL, 422 when the article cannot be read or holds fewer than 120
    characters, otherwise ``{"post": post, "slides": [...]}``. Each slide is
    ``{"text", "image", "index"}``; the slide list is empty when the page
    yields no paragraphs or key points.
    """
    payload = await request.json()
    url = base._clean_text(payload.get('url', ''))
    if not url.startswith('http'):
        return JSONResponse({'error': 'missing url'}, status_code=400)

    try:
        article = base.scrape_any_url(url)
    except Exception as exc:
        return JSONResponse({'error': 'Không đọc được bài viết: ' + str(exc)[:180]}, status_code=422)

    raw = (article.get('summary', '') + '\n' + article.get('text', '')).strip()
    if len(raw) < 120:
        return JSONResponse({'error': 'Bài viết không đủ nội dung để tóm tắt'}, status_code=422)

    prompt = _make_summary_prompt(article.get('title', ''), raw, article.get('via', '') or base._domain(url))
    summary = await base.qwen_generate(prompt, image_url=article.get('image') or None, max_tokens=1500)
    summary = _postprocess_ai_text(summary, max_units=20)

    src = [{
        'title': article.get('title'),
        'url': url,
        'excerpt': raw[:500],
        'via': article.get('via') or base._domain(url),
    }]
    if 'Nguồn tham khảo:' not in summary:
        summary += "\n\n" + _source_line(src)

    post = base.make_post(article.get('title') or 'Bài viết', summary, article.get('image') or '', url, 'summary', sources=src)
    wall = base._load_ai_wall()
    wall.insert(0, post)
    base._save_ai_wall(wall)

    # Slides: one per key point, paired with the relevant images in order.
    slides = []
    page_data = _scrape_article_images(url)
    paragraphs = page_data.get('paragraphs') if page_data else None
    key_points = _extract_key_points_for_slides(paragraphs, max_points=12) if paragraphs else []
    if key_points:
        pictures = page_data.get('images', [])
        if not pictures and page_data.get('og_img'):
            pictures = [page_data['og_img']]
        # When points outnumber images, the last image is reused.
        last_picture = pictures[-1] if pictures else ''
        for pos, point in enumerate(key_points):
            picture = pictures[pos] if pos < len(pictures) else last_picture
            slides.append({'text': point, 'image': picture, 'index': pos + 1})

    return JSONResponse({'post': post, 'slides': slides})


def _emotion_script(text, emotion):
    """Return the text to be spoken, cleaned and without any emotion prefix.

    No prefix is prepended, so the spoken content stays uncluttered;
    ``emotion`` is accepted but not used here.
    """
    return _clean(text)


def _tts_script_smart(post, emotion):
    """Build the text-to-speech script for a post, one sentence per line.

    The text comes from ``base._short_script`` when that helper exists,
    otherwise from the post's cleaned text (or title). Bullet markers are
    removed, line breaks become sentence breaks and each sentence is placed
    on its own line. Scripts longer than 3000 characters are cut there and,
    when a sentence end exists beyond position 700, trimmed back to it.
    ``emotion`` is not used; the content is read without any prefix.
    """
    if hasattr(base, '_short_script'):
        script = base._short_script(post)
    else:
        script = _clean(post.get('text', '') or post.get('title', ''))
    script = re.sub(r"^[•\-\*]\s*", "", script, flags=re.M)
    script = re.sub(r"\s*\n\s*", ". ", script)
    script = re.sub(r"([\.\!\?])\s*", r"\1\n", script)
    script = re.sub(r"\n{2,}", "\n", script).strip()
    if len(script) <= 3000:
        return script
    script = script[:3000]
    last_stop = max(script.rfind(mark) for mark in ".!?")
    if last_stop > 700:
        script = script[:last_stop + 1]
    return script


def _split_subtitle_sentences(script):
    parts = []
    for line in script.splitlines():
        line = _clean(line)
        if not line:
            continue
        for s in re.split(r"(?<=[\.\!\?])\s+", line):
            s = _clean(s)
            if 8 <= len(s) <= 140:
                parts.append(s)
    return parts[:12]


def _srt_time(sec):
    ms = int((sec - int(sec)) * 1000)
    sec = int(sec)
    h = sec // 3600
    m = (sec % 3600) // 60
    s = sec % 60
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"


def _write_srt(script, path, total_duration=30):
    """Write an SRT subtitle file with evenly timed cues for a script.

    Cues come from ``_split_subtitle_sentences``; if there are none, the
    first 120 characters of the script form a single cue. Every cue lasts
    ``total_duration / cue_count`` seconds, clamped to the range 2.2-5.0.
    The first cue starts at 0.3s and cues are separated by a 0.15s gap.
    """
    cues = _split_subtitle_sentences(script) or [script[:120]]
    per_cue = max(2.2, min(5.0, total_duration / max(1, len(cues))))
    clock = 0.3
    with open(path, 'w', encoding='utf-8') as handle:
        for number, cue in enumerate(cues, 1):
            begin, finish = clock, clock + per_cue
            clock = finish + 0.15
            handle.write(f"{number}\n{_srt_time(begin)} --> {_srt_time(finish)}\n{cue}\n\n")


def _wrap_text_px(draw, text, font, max_width, max_lines):
    """Greedily wrap text into lines no wider than ``max_width`` pixels.

    Widths are measured with ``draw.textbbox``; if measuring fails, a rough
    estimate of 20 pixels per character is used. At most ``max_lines`` lines
    are returned and any remaining text is dropped.
    """
    def measure(candidate):
        try:
            return draw.textbbox((0, 0), candidate, font=font)[2]
        except Exception:
            return len(candidate) * 20

    wrapped = []
    pending = ""
    for word in _clean(text).split():
        candidate = (pending + " " + word).strip()
        if measure(candidate) <= max_width:
            pending = candidate
        else:
            # The word does not fit: close the current line and start a new one.
            if pending:
                wrapped.append(pending)
            pending = word
        if len(wrapped) >= max_lines:
            break
    if pending and len(wrapped) < max_lines:
        wrapped.append(pending)
    return wrapped


def _make_short_frame_full(post, img_path, out_path):
    """Render a 1080x1920 frame: image on top, title and body text below.

    Falls back to ``base._make_short_frame`` when Pillow is not available.
    The source image is scaled and center-cropped to 1080x760 and pasted at
    the top; if it cannot be opened the dark background is kept. The post
    title (up to 4 lines) and the post text with its "Nguồn tham khảo:"
    tail removed (up to 14 lines) are drawn underneath, and the frame is
    saved to ``out_path``.
    """
    if Image is None:
        return base._make_short_frame(post, img_path, out_path)
    W, H = 1080, 1920
    bg = Image.new("RGB", (W, H), (14, 14, 14))
    try:
        im = Image.open(img_path).convert("RGB")
        target = (1080, 760)
        im_ratio = im.width / im.height
        target_ratio = target[0] / target[1]
        # Scale so the image covers the target box, then center-crop.
        if im_ratio > target_ratio:
            new_h = target[1]
            new_w = int(new_h * im_ratio)
        else:
            new_w = target[0]
            new_h = int(new_w / im_ratio)
        im = im.resize((new_w, new_h))
        left = (new_w - target[0]) // 2
        top = (new_h - target[1]) // 2
        im = im.crop((left, top, left + target[0], top + target[1]))
        bg.paste(im, (0, 0))
    except Exception:
        # Best effort: an unreadable image leaves the plain background.
        pass
    draw = ImageDraw.Draw(bg)
    try:
        font_title = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 54)
        font_body = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 38)
        font_label = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 30)
    except Exception:
        # Fonts not found: font=None is passed to the draw calls below.
        font_title = font_body = font_label = None
    # Text panel; it starts at y=720 and so covers the bottom 40px of the image.
    draw.rectangle((0, 720, W, H), fill=(14, 14, 14))
    margin = 48
    maxw = W - margin * 2
    draw.text((margin, 770), "VNEWS · Tường AI", fill=(92, 184, 122), font=font_label)
    y = 830
    for ln in _wrap_text_px(draw, post.get("title", ""), font_title, maxw, 4):
        draw.text((margin, y), ln, fill=(255, 255, 255), font=font_title)
        y += 66
    y += 18
    text = post.get("text", "")
    # Drop the attribution line and everything after it.
    text = re.sub(r"Nguồn tham khảo:.*", "", text, flags=re.S).strip()
    body_lines = _wrap_text_px(draw, text, font_body, maxw, 14)
    for ln in body_lines:
        draw.text((margin, y), ln, fill=(220, 220, 220), font=font_body)
        y += 50
        # Stop once the next line would start below y=1640.
        if y > 1640:
            break
    bg.save(out_path, quality=92)




def _summary_segments_from_post(post, max_segments=25):
    """Split a post's summary into the segments narrated in the short video.

    The text (or title) is cleaned, a leading "Bản tin AI viết lại:" label
    and the trailing "Nguồn tham khảo:" part are removed, and every line of
    at least 18 characters that is not a heading becomes a segment. If fewer
    than three lines qualify, the text is split into sentences of at least
    25 characters instead. Segments are de-duplicated and capped at
    ``max_segments``; when nothing remains, the post title is the only
    segment.
    """
    body = _clean(post.get('text') or post.get('title') or '')
    body = re.sub(r'^Bản tin AI viết lại:\s*', '', body, flags=re.I)
    body = re.sub(r'Nguồn tham khảo:.*$', '', body, flags=re.I | re.S).strip()

    heading_prefixes = ('điểm chính', 'tiêu đề', 'sapo', 'nguồn tham khảo')
    candidates = []
    for row in body.splitlines():
        row = _clean(re.sub(r'^[•\-\*\d\.\)\s]+', '', row))
        if not row or row.lower().startswith(heading_prefixes):
            continue
        if len(row) >= 18:
            candidates.append(row)

    if len(candidates) < 3:
        # Too few bullet lines: fall back to sentence splitting.
        sentences = (_clean(part) for part in re.split(r'(?<=[\.\!\?])\s+', body))
        candidates = [sentence for sentence in sentences if len(sentence) >= 25]

    segments = _dedupe_units(candidates, max_units=max_segments)
    if not segments:
        return [post.get('title', 'Bản tin VNEWS')]
    return segments[:max_segments]


def _make_scene_frame(post, segment, idx, total, img_path, out_path, emotion='neutral'):
    """Render the 1080x1920 frame for one segment of a short video.

    Layout, top to bottom: the post image as a 720px-high hero, a dark text
    panel from y=680 with progress pills (the one at ``idx`` highlighted),
    the brand line, a "Đoạn i/total" badge, the wrapped segment text and,
    below a divider, the post title. Falls back to ``_make_short_frame_full``
    when Pillow is not available. ``emotion`` is not referenced in the body.
    The frame is saved to ``out_path``.
    """
    if Image is None:
        return _make_short_frame_full(post, img_path, out_path)
    W,H=1080,1920
    bg=Image.new('RGB',(W,H),(10,10,10))
    try:
        im=Image.open(img_path).convert('RGB')
        ratio=im.width/max(1,im.height); target=W/H
        # Full-frame cover: scale to fill the frame, center-crop, then darken
        # by blending 50% with black.
        if ratio>target:
            nh=H; nw=int(nh*ratio)
        else:
            nw=W; nh=int(nw/ratio)
        cover=im.resize((nw,nh)); left=(nw-W)//2; top=(nh-H)//2
        cover=cover.crop((left,top,left+W,top+H))
        bg.paste(cover,(0,0))
        bg=Image.blend(bg, Image.new('RGB',(W,H),(0,0,0)), 0.50)
        # Hero image: same cover/crop logic for a W x 720 box pasted at the top.
        hero_h=720; target=W/hero_h
        if ratio>target:
            nh=hero_h; nw=int(nh*ratio)
        else:
            nw=W; nh=int(nw/ratio)
        hero=im.resize((nw,nh)); left=(nw-W)//2; top=(nh-hero_h)//2
        hero=hero.crop((left,top,left+W,top+hero_h))
        bg.paste(hero,(0,0))
    except Exception:
        # Best effort: on any image error, drawing continues on whatever
        # background exists at that point.
        pass
    draw=ImageDraw.Draw(bg)
    try:
        font_brand=ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',34)
        font_small=ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',28)
        font_seg=ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',58)
        font_title=ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',34)
    except Exception:
        # Fonts not found: font=None is passed to the draw calls below.
        font_brand=font_small=font_seg=font_title=None
    # Text panel; it starts at y=680 and so covers the bottom 40px of the hero.
    draw.rectangle((0,680,W,H), fill=(12,12,12))
    # Progress indicator: one pill per segment, 38px apart, current one in green.
    dot_x=48; dot_y=742
    for i in range(total):
        fill=(92,184,122) if i==idx else (70,70,70)
        draw.rounded_rectangle((dot_x+i*38,dot_y,dot_x+i*38+24,dot_y+10), radius=5, fill=fill)
    draw.text((48,780),'VNEWS AI SHORT',fill=(110,231,143),font=font_brand)
    draw.rounded_rectangle((48,834,260,880), radius=20, fill=(28,70,45))
    draw.text((66,842),f'Đoạn {idx+1}/{total}',fill=(235,235,235),font=font_small)
    y=940; maxw=W-96
    # Up to 18 wrapped lines are requested, but drawing stops once the next
    # line would start below y=1500.
    for ln in _wrap_text_px(draw, segment, font_seg, maxw, 18):
        draw.text((48,y),ln,fill=(255,255,255),font=font_seg)
        y+=74
        if y>1500: break
    y2=1640
    # Divider line above the post title (up to 3 lines).
    draw.line((48,y2-22,W-48,y2-22),fill=(70,70,70),width=2)
    for ln in _wrap_text_px(draw, post.get('title',''), font_title, maxw, 3):
        draw.text((48,y2),ln,fill=(220,220,220),font=font_title)
        y2+=46
    bg.save(out_path, quality=92)


def _estimate_audio_duration(path, fallback=15.0):
    """Estimate audio duration with 15s minimum per segment for complete bullet reading."""
    try:
        pr=subprocess.run(['ffprobe','-v','error','-show_entries','format=duration','-of','default=noprint_wrappers=1:no_key=1',path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20)
        return max(12.0, float((pr.stdout or b'').decode().strip() or fallback))
    except Exception:
        return fallback


@app.post('/api/ai/short/{post_id}')
async def patched_ai_short(post_id: str, request: Request):
    try:
        body = await request.json()
    except Exception:
        body = {}
    voice = str(body.get('voice', 'nu')).strip().lower()
    emotion = str(body.get('emotion', 'neutral')).strip().lower()
    speed = float(body.get('speed', 1.0) or 1.0)
    speed = max(0.85, min(1.35, speed))

    posts = base._load_ai_wall()
    post = next((p for p in posts if str(p.get('id')) == str(post_id)), None)
    if not post:
        return JSONResponse({'error': 'post not found'}, status_code=404)

    segments = _summary_segments_from_post(post, max_segments=25)
    seg_hash = hashlib.md5(('|'.join(segments)+voice+emotion+str(speed)).encode('utf-8')).hexdigest()[:8]
    os.makedirs(base.SHORTS_DIR, exist_ok=True)
    suffix = f"_{voice}_{emotion}_{str(speed).replace('.', 'p')}_{seg_hash}_scenes_nosub"
    out_mp4 = os.path.join(base.SHORTS_DIR, base._safe_name(post_id + suffix) + '.mp4')
    if os.path.exists(out_mp4):
        post['video'] = '/api/ai/short-file/' + post_id + suffix
        post['short_voice'] = voice
        post['short_emotion'] = emotion
        post['short_speed'] = speed
        post['short_segments'] = segments
        post['short_subtitles'] = False
        base._save_ai_wall(posts)
        return JSONResponse({'video': post['video'], 'voice': voice, 'emotion': emotion, 'speed': speed, 'subtitles': False, 'segments': segments})
    if base.gTTS is None:
        return JSONResponse({'error': 'gTTS chưa sẵn sàng'}, status_code=503)

    work = os.path.join(base.SHORTS_DIR, base._safe_name(post_id + suffix))
    os.makedirs(work, exist_ok=True)
    img = os.path.join(work, 'image.jpg')
    try:
        base._download_image(post.get('img'), post.get('title', 'AI news'), img)
        edge_voice = {
            # Vietnamese
            'vi-vn-hoaimyneural': 'vi-VN-HoaiMyNeural',
            'vi-vn-namminhneural': 'vi-VN-NamMinhNeural',
            'hoaimy': 'vi-VN-HoaiMyNeural',
            'namminh': 'vi-VN-NamMinhNeural',
            'nam': 'vi-VN-NamMinhNeural',
            'male': 'vi-VN-NamMinhNeural',
            'nu': 'vi-VN-HoaiMyNeural',
            'female': 'vi-VN-HoaiMyNeural',
            'mien-nam': 'vi-VN-HoaiMyNeural',
            # English - Multilingual
            'en-us-andrewmultilingualneural': 'en-US-AndrewMultilingualNeural',
            'en-au-williammultilingualneural': 'en-AU-WilliamMultilingualNeural',
            'andrew': 'en-US-AndrewMultilingualNeural',
            'en_andrew': 'en-US-AndrewMultilingualNeural',
            'jenny': 'en-US-AndrewMultilingualNeural',
            'en_jenny': 'en-US-AndrewMultilingualNeural',
            # Portuguese - Multilingual (ONLY Thalita)
            'pt-br-thalitamultilingualneural': 'pt-BR-ThalitaMultilingualNeural',
            'thalita': 'pt-BR-ThalitaMultilingualNeural',
            'pt_thalita': 'pt-BR-ThalitaMultilingualNeural',
            'pt_br_thalita': 'pt-BR-ThalitaMultilingualNeural',
            'pt': 'pt-BR-ThalitaMultilingualNeural',
            'pt_francisco': 'pt-BR-ThalitaMultilingualNeural',
            # French - Multilingual
            'fr-fr-viviennemultilingualneural': 'fr-FR-VivienneMultilingualNeural',
            'fr-fr-remymultilingualneural': 'fr-FR-RemyMultilingualNeural',
            'denise': 'fr-FR-VivienneMultilingualNeural',
            'fr': 'fr-FR-VivienneMultilingualNeural',
            'fr_denise': 'fr-FR-VivienneMultilingualNeural',
            # German - Multilingual
            'de-de-seraphinamultilingualneural': 'de-DE-SeraphinaMultilingualNeural',
            'de-de-florianmultilingualneural': 'de-DE-FlorianMultilingualNeural',
            'katja': 'de-DE-SeraphinaMultilingualNeural',
            'de': 'de-DE-SeraphinaMultilingualNeural',
            'de_katja': 'de-DE-SeraphinaMultilingualNeural',
            # Korean - Multilingual (Hyunsu, NOT SunHee)
            'ko-kr-hyusumultilingualneural': 'ko-KR-HyunsuMultilingualNeural',
            'ko-kr-hyunsuneural': 'ko-KR-HyunsuMultilingualNeural',
            'sunhee': 'ko-KR-HyunsuMultilingualNeural',
            'ko': 'ko-KR-HyunsuMultilingualNeural',
            'ko_sunhee': 'ko-KR-HyunsuMultilingualNeural',
            # Italian - Multilingual
            'it-it-giuseppemultilingualneural': 'it-IT-GiuseppeMultilingualNeural',
            # Spanish (keep for backward compat)
            'ela': 'en-US-AndrewMultilingualNeural',
            'es_ela': 'en-US-AndrewMultilingualNeural',
            'es': 'en-US-AndrewMultilingualNeural',
            'es_carlos': 'en-US-AndrewMultilingualNeural',
            # Japanese (keep for backward compat)
            'nanami': 'en-US-AndrewMultilingualNeural',
            'ja': 'en-US-AndrewMultilingualNeural',
            'ja_nanami': 'en-US-AndrewMultilingualNeural',
            # Chinese (keep for backward compat)
            'xiaochen': 'en-US-AndrewMultilingualNeural',
            'zh': 'en-US-AndrewMultilingualNeural',
            'zh_xiaochen': 'en-US-AndrewMultilingualNeural',
        }.get(voice, 'vi-VN-HoaiMyNeural')
        part_files=[]
        for idx, seg in enumerate(segments):
            frame=os.path.join(work,f'frame_{idx:02d}.jpg')
            aud=os.path.join(work,f'voice_{idx:02d}.mp3')
            aud_fast=os.path.join(work,f'voice_{idx:02d}_fast.mp3')
            part=os.path.join(work,f'part_{idx:02d}.mp4')
            _make_scene_frame(post, seg, idx, len(segments), img, frame, emotion=emotion)
            spoken=_emotion_script(seg, emotion)
            try:
                subprocess.run(['python','-m','edge_tts','--voice',edge_voice,'--text',spoken,'--write-media',aud], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=120)
            except Exception:
                tld='com.vn' if voice in ('nu','female','mien-nam','hoaimy') else 'com'
                try:
                    base.gTTS(spoken, lang='vi', tld=tld, slow=False).save(aud)
                except TypeError:
                    base.gTTS(spoken, lang='vi', slow=False).save(aud)
            subprocess.run(['ffmpeg','-y','-i',aud,'-filter:a',f'atempo={speed}','-vn',aud_fast], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=90)
            dur=_estimate_audio_duration(aud_fast, fallback=15.0)+0.35
            subprocess.run(['ffmpeg','-y','-loop','1','-t',str(dur),'-i',frame,'-i',aud_fast,'-shortest','-c:v','libx264','-tune','stillimage','-pix_fmt','yuv420p','-c:a','aac','-b:a','128k',part], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=150)
            part_files.append(part)
        concat=os.path.join(work,'concat.txt')
        with open(concat,'w',encoding='utf-8') as f:
            for p in part_files:
                f.write("file '" + p.replace("'", "'\\''") + "'\n")
        subprocess.run(['ffmpeg','-y','-f','concat','-safe','0','-i',concat,'-c','copy',out_mp4], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=180)
        post['video'] = '/api/ai/short-file/' + post_id + suffix
        post['short_voice'] = voice
        post['short_emotion'] = emotion
        post['short_speed'] = speed
        post['short_segments'] = segments
        post['short_subtitles'] = False
        base._save_ai_wall(posts)
        return JSONResponse({'video': post['video'], 'voice': voice, 'emotion': emotion, 'speed': speed, 'subtitles': False, 'segments': segments})
    except Exception as e:
        return JSONResponse({'error': 'Không tạo được shorts: ' + str(e)[:220]}, status_code=500)


@app.get('/api/ai/short-file/{file_id}')
def patched_ai_short_file(file_id: str):
    """Serve a generated short video by its id.

    The id is passed through ``base._safe_name`` and looked up as
    ``<SHORTS_DIR>/<safe id>.mp4``.

    Args:
        file_id: Identifier taken from the URL path.

    Returns:
        A ``FileResponse`` streaming the MP4 (download name
        ``vnews-ai-<file_id>.mp4``) when the file exists, otherwise a JSON
        ``{'error': 'not found'}`` body with HTTP status 404.
    """
    video_name = base._safe_name(file_id) + '.mp4'
    video_path = os.path.join(base.SHORTS_DIR, video_name)
    if os.path.exists(video_path):
        download_name = 'vnews-ai-{}.mp4'.format(file_id)
        return FileResponse(video_path, media_type='video/mp4', filename=download_name)
    return JSONResponse({'error': 'not found'}, status_code=404)


@app.get('/api/ai_shorts')
def api_ai_shorts():
    """Return the AI wall posts that have a video attached.

    Every post returned by ``base._load_ai_wall()`` is checked for a truthy
    ``'video'`` entry; the first 80 matches are returned, in stored order,
    under the ``'posts'`` key of the JSON response.
    """
    with_video = []
    for entry in base._load_ai_wall():
        if entry.get('video'):
            with_video.append(entry)
    return JSONResponse({'posts': with_video[:80]})


# Rebuild the route table without any route whose path is exactly '/' and
# whose methods include GET; every other route is kept in its original order.
# NOTE(review): presumably this clears the inherited home-page handler so a
# replacement can be registered elsewhere - confirm against the rest of the file.
app.router.routes = [
    route
    for route in app.router.routes
    if getattr(route, 'path', None) != '/'
    or 'GET' not in getattr(route, 'methods', set())
]