Spaces:
Running
Running
| import os | |
| import re | |
| import time | |
| import random | |
| import json | |
| import html as html_lib | |
| import subprocess | |
| import requests | |
| import hashlib | |
| import ai_ext as base | |
| from ai_ext import app | |
| from fastapi import Request | |
| from fastapi.responses import JSONResponse, HTMLResponse, FileResponse | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import quote_plus | |
| try: | |
| from PIL import Image, ImageDraw, ImageFont | |
| except Exception: | |
| Image = ImageDraw = ImageFont = None | |
| def _clean(s): | |
| s = html_lib.unescape(s or "") | |
| s = re.sub(r"[ \t]+", " ", s) | |
| s = re.sub(r"\n{3,}", "\n\n", s) | |
| return s.strip() | |
| def _norm(s): | |
| s = s.lower() | |
| s = re.sub(r"[^\wÀ-ỹ\s]", " ", s) | |
| s = re.sub(r"\s+", " ", s).strip() | |
| return s | |
def _similar(a, b):
    """Tell whether two texts share most of their normalized words.

    The overlap is measured against the smaller of the two word sets; a ratio
    of at least 0.72 counts as similar. If either text has no words after
    normalization the answer is ``False``.
    """
    words_a, words_b = (set(_norm(value).split()) for value in (a, b))
    if not (words_a and words_b):
        return False
    shared = len(words_a.intersection(words_b))
    smaller = min(len(words_a), len(words_b))
    return shared / max(1, smaller) >= 0.72
def _dedupe_units(units, max_units=25):
    """Drop exact duplicates from ``units`` while keeping their order.

    Each unit first loses any leading bullet/number markers and is cleaned;
    units shorter than 18 characters are discarded. Two units are duplicates
    only when their normalized forms are identical (merely similar units are
    both kept). Collection stops once ``max_units`` units have been kept.
    """
    kept = {}  # normalized key -> first unit seen with that key (insertion ordered)
    for raw_unit in units:
        unit = _clean(re.sub(r"^[-•*\d\.\)\s]+", "", raw_unit))
        if len(unit) < 18:
            continue
        key = _norm(unit)
        if key not in kept:
            kept[key] = unit
            if len(kept) >= max_units:
                break
    return list(kept.values())
def _postprocess_ai_text(text, max_units=20):
    """Turn raw model output into an optional title plus a bullet list.

    Short lines (< 80 chars) that start with a known boilerplate prefix are
    dropped. Remaining lines of at least 18 characters become bullet
    candidates, kept whole and de-duplicated. When the first remaining line is
    at most 90 characters and is not itself a bullet, it is used as the title
    and bullets similar to it are removed. If no bullet survives
    de-duplication, the first 900 characters of the cleaned text are returned.
    """
    cleaned = _clean(text)
    if not cleaned:
        return cleaned

    boilerplate = (
        "dưới đây", "sau đây", "bài viết", "tôi sẽ", "mình sẽ",
        "tóm tắt bài", "tiêu đề:", "sapo:", "nội dung:", "kết luận:"
    )

    def _is_boilerplate(line):
        return len(line) < 80 and line.lower().strip().startswith(boilerplate)

    kept_lines = [
        line
        for line in (_clean(part) for part in re.split(r"\n+", cleaned))
        if line and not _is_boilerplate(line)
    ]

    # Each bullet is kept whole: only the leading marker is stripped.
    candidates = [
        _clean(re.sub(r"^[-•*\d\.\)\s]+", "", line))
        for line in kept_lines
        if len(line) >= 18
    ]
    bullets = _dedupe_units(candidates, max_units=max_units)
    if not bullets:
        return cleaned[:900]

    heading = ""
    if kept_lines and len(kept_lines[0]) <= 90 and not kept_lines[0].startswith(("-", "•", "*")):
        heading = kept_lines[0]
        bullets = [b for b in bullets if not _similar(b, heading)]

    body = "\n".join(f"• {b}" for b in bullets[:max_units])
    if not heading:
        return body
    return f"{heading}\n\n{body}".strip()
def _fallback_summary_from_prompt(prompt, max_units=6):
    """Build an extractive bullet summary straight from a prompt.

    Everything up to and including the first known "source content" marker
    found (markers are tried in a fixed order) is discarded, URLs are removed
    and whitespace is flattened. Sentences between 45 and 260 characters are
    de-duplicated and returned as bullets. With no usable sentence, the first
    700 characters (cut at the last space) are returned as one bullet; with no
    text at all, a fixed "not enough content" bullet is returned.
    """
    markers = (
        "Nội dung nguồn:", "Nội dung bài:", "Nội dung gốc:", "Nội dung:",
        "Nguồn/bối cảnh internet:",
    )
    source_text = prompt or ""
    found = next((m for m in markers if m in source_text), None)
    if found is not None:
        source_text = source_text.partition(found)[2]

    source_text = re.sub(r"https?://\S+", "", source_text)
    source_text = re.sub(r"\s+", " ", source_text).strip()

    pieces = re.split(r"(?<=[\.\!\?])\s+(?=[A-ZÀ-Ỹ0-9])", source_text)
    eligible = [c for c in (_clean(p) for p in pieces) if 45 <= len(c) <= 260]
    bullets = _dedupe_units(eligible, max_units=max_units)
    if bullets:
        return "\n".join(f"• {b}" for b in bullets)
    if not source_text:
        return "• Không có đủ nội dung nguồn để tóm tắt."
    return "• " + source_text[:700].rsplit(" ", 1)[0]
| def _source_line(sources): | |
| names = [] | |
| for s in (sources or [])[:5]: | |
| via = s.get("via") or base._domain(s.get("url", "")) or s.get("title", "") | |
| if via and via not in names: | |
| names.append(via) | |
| return "Nguồn tham khảo: " + ", ".join(names[:5]) if names else "Nguồn tham khảo: tổng hợp internet" | |
| def _make_summary_prompt(title, raw, source_hint=""): | |
| return f"""Bạn là biên tập viên tóm tắt tin tức tiếng Việt. | |
| NHIỆM VỤ BẮT BUỘC: | |
| - Chỉ TÓM TẮT nội dung chính, KHÔNG viết lại toàn bộ bài. | |
| - Không lặp lại cùng một ý, cùng một câu, cùng một chi tiết. | |
| - Không thêm thông tin ngoài nguồn. | |
| - Tối đa 5 gạch đầu dòng, mỗi gạch đầu dòng 1 câu ngắn. | |
| - Nếu bài có số liệu/nhân vật/thời điểm quan trọng thì giữ lại. | |
| - Không viết phần mở bài dài, không viết văn kể lại. | |
| Tiêu đề nguồn: {title} | |
| Nguồn: {source_hint} | |
| Nội dung nguồn: | |
| {raw[:14000]} | |
| """ | |
def _direct_news_rss(topic, limit=10):
    """Search Google News RSS (Vietnamese edition) for ``topic``.

    Returns a list of at most ``limit`` dicts with keys ``title``, ``url``,
    ``via`` and ``excerpt`` (the excerpt is the title). Items lacking a title
    or a link are skipped. This is a best-effort helper: on any failure it
    returns whatever was collected so far (possibly an empty list), but the
    failure is now printed instead of being silently discarded.
    """
    results = []
    try:
        feed_url = "https://news.google.com/rss/search?q=" + quote_plus(topic) + "&hl=vi&gl=VN&ceid=VN:vi"
        resp = requests.get(feed_url, headers=base.HEADERS, timeout=15)
        resp.encoding = "utf-8"
        soup = BeautifulSoup(resp.text, "xml")
        for item in soup.find_all("item")[:limit]:
            # Look each child tag up once instead of once for the test and once for the value.
            title_tag = item.find("title")
            link_tag = item.find("link")
            source_tag = item.find("source")
            title = title_tag.get_text(" ", strip=True) if title_tag else ""
            link = link_tag.get_text(strip=True) if link_tag else ""
            via = source_tag.get_text(" ", strip=True) if source_tag else base._domain(link)
            if title and link:
                results.append({"title": title, "url": link, "via": via, "excerpt": title})
    except Exception as exc:
        # Still best-effort, but leave a trace so feed outages can be diagnosed.
        print("[direct news rss]", f"{type(exc).__name__}: {str(exc)[:220]}")
    return results
def _topic_source_articles(topic, limit=5):
    """Collect up to ``limit`` scraped source articles for ``topic``.

    Sources come from ``base.web_context``; if that fails or yields nothing,
    from the Google News RSS helper. Up to ``limit * 3`` distinct http(s) URLs
    are scraped with ``base.scrape_any_url``; pages with fewer than 180
    characters of text, or whose scrape raises, are skipped. If nothing could
    be scraped, entries are built from the source listings themselves (their
    excerpt becomes the text, and the image is a generated one).

    Each entry has ``title``, ``url``, ``raw``, ``image``, ``via`` and a
    ``source`` dict whose ``excerpt`` is the first 700 characters of ``raw``.
    """
    def _entry(title, url, raw, image, via):
        return {
            "title": title,
            "url": url,
            "raw": raw,
            "image": image,
            "via": via,
            "source": {"title": title, "url": url, "excerpt": raw[:700], "via": via}
        }

    try:
        _ctx, sources = base.web_context(topic, limit=limit)
    except Exception:
        sources = []
    if not sources:
        sources = _direct_news_rss(topic, limit=10)

    articles = []
    visited = set()
    for src in (sources or [])[:limit * 3]:
        url = src.get("url") or ""
        if url in visited or not url.startswith("http"):
            continue
        visited.add(url)
        try:
            page = base.scrape_any_url(url)
            raw = (page.get("summary", "") + "\n" + page.get("text", "")).strip()
            if len(raw) < 180:
                continue
            title = page.get("title") or src.get("title") or url
            via = page.get("via") or src.get("via") or base._domain(url)
            articles.append(_entry(title, url, raw, page.get("image") or "", via))
            if len(articles) >= limit:
                break
        except Exception:
            continue

    if not articles:
        # Nothing could be scraped: fall back to the listing metadata.
        for src in (sources or _direct_news_rss(topic, 6))[:limit]:
            title = src.get("title") or topic
            excerpt = src.get("excerpt") or src.get("description") or src.get("content") or title
            url = src.get("url", "")
            via = src.get("via") or base._domain(url)
            articles.append(_entry(title, url, excerpt, base.pollinations_image_url(title), via))
    return articles[:limit]
async def qwen_generate_resilient(prompt: str, image_url=None, max_tokens: int = 1200):
    """Generate text with several fallbacks; always returns a string.

    Order of attempts:
      1. ``base._original_qwen_generate`` (the generator saved before patching).
      2. The Hugging Face router chat-completions endpoint, trying a list of
         Qwen models in turn (requires a token from ``base._hf_token()``).
      3. An extractive summary built from the prompt itself.

    On success ``base.LAST_QWEN_ERROR`` is cleared; when the extractive
    fallback is used it is set to the last (up to six) collected errors.

    Args:
        prompt: User prompt text.
        image_url: Optional image URL; only sent to models whose name contains "VL".
        max_tokens: Requested completion length; the HTTP path caps it at 1400.

    The HTTP calls run in the default executor so the blocking ``requests``
    call (95 s timeout per model) no longer stalls the event loop.
    """
    import asyncio
    import functools

    errors = []
    token = base._hf_token()

    # 1) Preferred path: the original generator.
    try:
        original = getattr(base, "_original_qwen_generate", None)
        if original:
            txt = await original(prompt, image_url=image_url, max_tokens=max_tokens)
            if txt:
                base.LAST_QWEN_ERROR = ""
                return txt
            if getattr(base, "LAST_QWEN_ERROR", ""):
                errors.append("sdk: " + str(base.LAST_QWEN_ERROR)[:260])
    except Exception as e:
        errors.append(f"sdk: {type(e).__name__}: {str(e)[:260]}")

    # 2) Direct HTTP calls, one model after another.
    if token:
        candidates = [
            os.getenv("QWEN_VL_MODEL", ""),
            "Qwen/Qwen2.5-VL-7B-Instruct",
            "Qwen/Qwen2.5-VL-3B-Instruct",
            "Qwen/Qwen2.5-7B-Instruct",
            "Qwen/Qwen2.5-3B-Instruct",
            "Qwen/Qwen2.5-1.5B-Instruct",
        ]
        # Order-preserving de-duplication that also drops the empty env default.
        models = [m for m in dict.fromkeys(candidates) if m]
        headers = {"Authorization": "Bearer " + token, "Content-Type": "application/json"}
        loop = asyncio.get_running_loop()
        for model in models:
            try:
                is_vl = "VL" in model and bool(image_url)
                user_content = ([{"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": prompt}] if is_vl else prompt)
                payload = {
                    "model": model,
                    "messages": [
                        {"role": "system", "content": "Bạn là biên tập viên AI tiếng Việt. Chỉ tóm tắt súc tích nội dung nguồn, không viết lại toàn bài, không lặp ý, không bịa chi tiết."},
                        {"role": "user", "content": user_content},
                    ],
                    "max_tokens": min(int(max_tokens or 900), 1400),
                    "temperature": 0.35,
                    "top_p": 0.85,
                }
                post_call = functools.partial(
                    requests.post,
                    "https://router.huggingface.co/v1/chat/completions",
                    headers=headers, json=payload, timeout=95,
                )
                r = await loop.run_in_executor(None, post_call)
                if r.status_code >= 300:
                    errors.append(f"{model}: HTTP {r.status_code} {r.text[:180]}")
                    continue
                j = r.json()
                # `or [{}]` also covers an empty "choices" list, reported below as an empty response.
                first_choice = (j.get("choices") or [{}])[0]
                txt = (first_choice.get("message", {}).get("content") or "").strip()
                if txt:
                    base.LAST_QWEN_ERROR = ""
                    return txt
                errors.append(f"{model}: empty response")
            except Exception as e:
                errors.append(f"{model}: {type(e).__name__}: {str(e)[:220]}")
    else:
        errors.append("missing HF_TOKEN")

    # 3) Last resort: extractive summary of the prompt.
    base.LAST_QWEN_ERROR = " | ".join(errors[-6:]) or "Qwen unavailable; used extractive fallback"
    print("[qwen resilient fallback]", base.LAST_QWEN_ERROR)
    return _fallback_summary_from_prompt(prompt, max_units=12)
# Save the generator currently exposed by `base` the first time this runs; the
# guard means a later execution of these statements does not overwrite the
# saved reference. Then route `base.qwen_generate` through the resilient wrapper.
if not hasattr(base, "_original_qwen_generate"):
    base._original_qwen_generate = base.qwen_generate
base.qwen_generate = qwen_generate_resilient
def compat_wall():
    """Return the first 80 stored AI-wall posts as ``{'posts': [...]}``."""
    wall_posts = base._load_ai_wall()
    return JSONResponse({'posts': wall_posts[:80]})
# (path, HTTP method) pairs whose already-registered routes are removed below.
_PATCHED_PATHS = {
    ('/api/topic_post', 'POST'),
    ('/api/url_wall', 'POST'),
    ('/api/rewrite_share', 'POST'),
    ('/api/ai/short/{post_id}', 'POST'),
}
# Rebuild the route table without any route matching one of the pairs above.
# getattr defaults are used because a route object may lack `path` or `methods`.
app.router.routes = [
    r for r in app.router.routes
    if not any(getattr(r, 'path', None) == p and m in getattr(r, 'methods', set()) for p, m in _PATCHED_PATHS)
]
async def compat_topic_post(request: Request):
    """Create one AI-wall post per source article found for a topic.

    Expects a JSON object with a ``topic`` field. Responds with:
      * 400 when the topic is missing (including malformed/non-object JSON),
      * 422 when no source article could be obtained,
      * otherwise ``{'post': first_new_post, 'posts': new_posts, 'count': n}``.

    Each article is summarized on its own, post-processed into bullets, and
    gets a source attribution line appended if the text does not have one.
    """
    # Same tolerant parsing as patched_ai_short: a bad body becomes "missing topic".
    try:
        body = await request.json()
    except Exception:
        body = {}
    if not isinstance(body, dict):
        body = {}

    topic = base._clean_text(body.get('topic', ''))
    if not topic:
        return JSONResponse({'error': 'missing topic'}, status_code=400)
    articles = _topic_source_articles(topic, limit=4)
    if not articles:
        return JSONResponse({'error': 'Không lấy được bài viết nguồn cho chủ đề này.'}, status_code=422)

    new_posts = []
    for art in articles:
        prompt = f"""Tóm tắt RIÊNG bài viết nguồn sau để đăng Tường AI.
Chủ đề lọc: {topic}
Tiêu đề bài nguồn: {art['title']}
Nguồn: {art['via']}
Yêu cầu bắt buộc:
- Tóm tắt nội dung trong BÀI VIẾT này, không chỉ tiêu đề.
- Không trộn với bài khác.
- Không viết lại toàn bộ bài.
- Không lặp ý.
- 4-6 gạch đầu dòng, mỗi dòng 1 câu rõ ràng.
- Giữ số liệu/nhân vật/thời điểm quan trọng nếu có.
Nội dung bài:
{art['raw'][:14000]}"""
        text = await base.qwen_generate(prompt, image_url=art.get('image') or None, max_tokens=1500)
        text = _postprocess_ai_text(text, max_units=20)
        src = [art['source']]
        if 'Nguồn tham khảo:' not in text:
            text += "\n\n" + _source_line(src)
        post = base.make_post(art['title'], text, art.get('image') or base.pollinations_image_url(art['title']), art.get('url') or '', 'topic_article', sources=src)
        new_posts.append(post)

    # Load the wall only now, after all awaits: loading it before generation
    # would write back a stale snapshot and drop posts saved by other requests
    # while this one was waiting on the model.
    posts = new_posts + base._load_ai_wall()
    base._save_ai_wall(posts)
    return JSONResponse({'post': new_posts[0], 'posts': new_posts, 'count': len(new_posts)})
async def compat_url_wall(request: Request):
    """Summarize the article at a URL and put it at the top of the AI wall.

    Expects a JSON object with a ``url`` field starting with ``http``.
    Responds with 400 for a missing/invalid URL (including malformed or
    non-object JSON), 422 when the page cannot be scraped or has fewer than
    120 characters of content, otherwise ``{'post': post}``.
    """
    # Same tolerant parsing as patched_ai_short: a bad body becomes "missing url".
    try:
        body = await request.json()
    except Exception:
        body = {}
    if not isinstance(body, dict):
        body = {}

    url = base._clean_text(body.get('url', ''))
    if not url.startswith('http'):
        return JSONResponse({'error': 'missing url'}, status_code=400)
    try:
        data = base.scrape_any_url(url)
    except Exception as e:
        return JSONResponse({'error': 'Không scrape được URL: ' + str(e)[:180]}, status_code=422)

    # `or ''` guards against keys that are present but None, which would
    # otherwise raise a TypeError here, outside the try above.
    raw = ((data.get('summary') or '') + '\n' + (data.get('text') or '')).strip()
    if len(raw) < 120:
        return JSONResponse({'error': 'URL không có đủ nội dung để tóm tắt'}, status_code=422)

    prompt = _make_summary_prompt(data.get('title', ''), raw, data.get('via', '') or base._domain(url))
    text = await base.qwen_generate(prompt, image_url=data.get('image') or None, max_tokens=1500)
    text = _postprocess_ai_text(text, max_units=20)
    src = [{'title': data.get('title'), 'url': url, 'excerpt': raw[:500], 'via': data.get('via') or base._domain(url)}]
    if 'Nguồn tham khảo:' not in text:
        text += "\n\n" + _source_line(src)
    post = base.make_post(data.get('title') or 'Bài viết', text, data.get('image') or '', url, 'url', sources=src)

    posts = base._load_ai_wall()
    posts.insert(0, post)
    base._save_ai_wall(posts)
    return JSONResponse({'post': post})
| def _is_relevant_image(img_url, title, text): | |
| """Check if an image is relevant to the article content.""" | |
| if not img_url: | |
| return False | |
| skip_patterns = ['pixel', 'analytics', 'tracking', '1x1.gif', 'spacer.gif', | |
| 'logo', 'icon', 'avatar', 'emoji', 'smiley', 'sprite', | |
| 'advertisement', 'ad-banner', 'sponsored', 'banner-ads'] | |
| img_lower = img_url.lower() | |
| for p in skip_patterns: | |
| if p in img_lower: | |
| return False | |
| if not any(img_lower.endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.webp', '.gif']): | |
| return False | |
| return True | |
def _filter_relevant_images(images, title, text, max_images=8):
    """Return the first ``max_images`` distinct images that pass the relevance check.

    Duplicates are removed while keeping first-seen order; no re-ranking is
    done. An empty or missing ``images`` value yields an empty list.
    """
    if not images:
        return []
    distinct = dict.fromkeys(images)  # ordered de-duplication
    accepted = [img for img in distinct if _is_relevant_image(img, title, text)]
    return accepted[:max_images]
def _extract_key_points_for_slides(paragraphs, max_points=12):
    """Collect up to ``max_points`` sentences from ``paragraphs`` for slides.

    Every paragraph is split into sentences at ``.``/``!``/``?`` followed by
    whitespace and an uppercase letter or digit. Sentences shorter than 30
    characters are skipped, as are sentences whose first 60 characters already
    occur inside a collected point. A final ``.`` is added when a sentence has
    no terminal punctuation.
    """
    sentence_break = re.compile(r'(?<=[.!?])\s+(?=[A-ZÀ-Ỹ0-9])')
    collected = []
    for paragraph in paragraphs:
        if len(collected) >= max_points:
            break
        cleaned_paragraph = _clean(paragraph)
        if not cleaned_paragraph:
            continue
        for piece in sentence_break.split(cleaned_paragraph):
            if len(collected) >= max_points:
                break
            candidate = _clean(piece.strip())
            if len(candidate) < 30:
                continue
            head = candidate[:60]
            if any(head in earlier for earlier in collected):
                continue
            collected.append(candidate if candidate.endswith(('.', '!', '?')) else candidate + '.')
    return collected
def _scrape_article_images(url):
    """Fetch an article page and return its title, paragraphs and images.

    Returns a dict with ``title``, ``paragraphs`` (texts of ``<p>`` elements
    longer than 40 characters), ``images`` (filtered through the relevance
    check, ``og:image`` first) and ``og_img``; returns ``None`` if anything
    fails.

    Image URLs are now made absolute: protocol-relative URLs get ``https:``
    and root- or path-relative URLs are resolved against the final page URL.
    Previously relative URLs were returned unchanged, which made them
    unusable outside the page they came from.
    """
    from urllib.parse import urljoin  # function-scope: the file only imports quote_plus

    def _absolute(src, page_url):
        if src.startswith('//'):
            return 'https:' + src
        return urljoin(page_url, src)

    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
                   "Accept-Language": "vi-VN,vi;q=0.9,en;q=0.8"}
        r = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
        r.encoding = 'utf-8'
        page_url = getattr(r, 'url', None) or url  # final URL after redirects
        soup = BeautifulSoup(r.text, 'lxml')
        for tag in soup.find_all(['script', 'style', 'nav', 'footer', 'aside', 'form']):
            tag.decompose()

        h1 = soup.find('h1')
        ogt = soup.find('meta', property='og:title')
        title = (h1.get_text(strip=True) if h1 else '') or (ogt.get('content', '') if ogt else '')

        ogi = soup.find('meta', property='og:image')
        og_img = ogi.get('content', '') if ogi else ''
        if og_img:
            og_img = _absolute(og_img, page_url)

        # First known article container with at least two paragraphs, else the whole body.
        block = None
        for sel in ['article', '.singular-content', '.detail-content', '.fck_detail', '.content-detail', '.knc-content', 'main', '.cms-body', '.article__body']:
            el = soup.select_one(sel)
            if el and len(el.find_all('p')) >= 2:
                block = el
                break
        if not block:
            block = soup.body or soup

        paragraphs = []
        all_images = []
        seen_imgs = set()
        if og_img:
            all_images.append(og_img)
            seen_imgs.add(og_img)

        for el in block.find_all(['p', 'h2', 'h3', 'figure', 'img'], recursive=True):
            if el.name == 'p':
                t = _clean(el.get_text(strip=True))
                if t and len(t) > 40:
                    paragraphs.append(t)
            elif el.name in ('figure', 'img'):
                im = el if el.name == 'img' else el.find('img')
                if im:
                    # Lazy-loading attributes take precedence over a placeholder `src`.
                    src = im.get('data-src') or im.get('src') or im.get('data-original') or ''
                    if src and 'base64' not in src:
                        src = _absolute(src, page_url)
                        if src not in seen_imgs:
                            all_images.append(src)
                            seen_imgs.add(src)

        relevant_images = _filter_relevant_images(all_images, title, ' '.join(paragraphs[:5]))
        return {'title': _clean(title), 'paragraphs': paragraphs, 'images': relevant_images, 'og_img': og_img}
    except Exception:
        return None
async def compat_rewrite_share(request: Request):
    """Summarize an article URL into a wall post and build image slides for it.

    Expects a JSON object with a ``url`` field starting with ``http``.
    Responds with 400 for a missing/invalid URL (including malformed or
    non-object JSON), 422 when the article cannot be read or has fewer than
    120 characters of content, otherwise ``{'post': post, 'slides': slides}``.

    Slides pair up to 12 key sentences of the article with its relevant
    images; when there are more sentences than images the last image is
    reused, and the image is ``''`` when none is available.
    """
    # Same tolerant parsing as patched_ai_short: a bad body becomes "missing url".
    try:
        body = await request.json()
    except Exception:
        body = {}
    if not isinstance(body, dict):
        body = {}

    url = base._clean_text(body.get('url', ''))
    if not url.startswith('http'):
        return JSONResponse({'error': 'missing url'}, status_code=400)
    try:
        data = base.scrape_any_url(url)
    except Exception as e:
        return JSONResponse({'error': 'Không đọc được bài viết: ' + str(e)[:180]}, status_code=422)

    # `or ''` guards against keys that are present but None, which would
    # otherwise raise a TypeError here, outside the try above.
    raw = ((data.get('summary') or '') + '\n' + (data.get('text') or '')).strip()
    if len(raw) < 120:
        return JSONResponse({'error': 'Bài viết không đủ nội dung để tóm tắt'}, status_code=422)

    prompt = _make_summary_prompt(data.get('title', ''), raw, data.get('via', '') or base._domain(url))
    text = await base.qwen_generate(prompt, image_url=data.get('image') or None, max_tokens=1500)
    text = _postprocess_ai_text(text, max_units=20)
    src = [{'title': data.get('title'), 'url': url, 'excerpt': raw[:500], 'via': data.get('via') or base._domain(url)}]
    if 'Nguồn tham khảo:' not in text:
        text += "\n\n" + _source_line(src)
    post = base.make_post(data.get('title') or 'Bài viết', text, data.get('image') or '', url, 'summary', sources=src)

    posts = base._load_ai_wall()
    posts.insert(0, post)
    base._save_ai_wall(posts)

    # Slides: one per key sentence, using only images that passed the relevance filter.
    slides = []
    page_data = _scrape_article_images(url)
    if page_data and page_data.get('paragraphs'):
        key_points = _extract_key_points_for_slides(page_data['paragraphs'], max_points=12)
        if key_points:
            relevant_imgs = page_data.get('images', [])
            if not relevant_imgs and page_data.get('og_img'):
                relevant_imgs = [page_data['og_img']]
            for i, point in enumerate(key_points):
                img = relevant_imgs[i] if i < len(relevant_imgs) else (relevant_imgs[-1] if relevant_imgs else '')
                slides.append({'text': point, 'image': img, 'index': i + 1})
    return JSONResponse({'post': post, 'slides': slides})
def _emotion_script(text, emotion):
    """Return ``text`` cleaned for speech, without any emotion prefix.

    ``emotion`` is accepted for interface compatibility and ignored by this
    function: no prefix is prepended, so the spoken content stays exactly the
    cleaned text.
    """
    return _clean(text)
def _tts_script_smart(post, emotion):
    """Build a text-to-speech script from a post, one sentence per line.

    The source text is ``base._short_script(post)`` when ``base`` provides it,
    otherwise the cleaned post text (or title). Leading bullet markers are
    removed, line breaks become sentence breaks, and every ``.``/``!``/``?``
    is followed by a newline. Scripts longer than 3000 characters are cut to
    3000 and then back to the last sentence terminator, provided that
    terminator lies beyond index 700. ``emotion`` is not used.
    """
    if hasattr(base, '_short_script'):
        script = base._short_script(post)
    else:
        script = _clean(post.get('text', '') or post.get('title', ''))

    rewrites = (
        (r"^[•\-\*]\s*", "", re.M),       # strip bullet markers at line starts
        (r"\s*\n\s*", ". ", 0),           # line breaks -> sentence breaks
        (r"([\.\!\?])\s*", r"\1\n", 0),   # one sentence per line
        (r"\n{2,}", "\n", 0),             # no empty lines
    )
    for pattern, replacement, flags in rewrites:
        script = re.sub(pattern, replacement, script, flags=flags)
    script = script.strip()

    if len(script) <= 3000:
        return script
    script = script[:3000]
    last_stop = max(script.rfind(mark) for mark in ".!?")
    if last_stop > 700:
        script = script[:last_stop + 1]
    return script
def _split_subtitle_sentences(script):
    """Split ``script`` into at most 12 subtitle sentences.

    Each non-empty line is split after ``.``/``!``/``?`` followed by
    whitespace; only pieces between 8 and 140 characters long are kept.
    """
    subtitles = []
    for row in (_clean(line) for line in script.splitlines()):
        if not row:
            continue
        pieces = (_clean(piece) for piece in re.split(r"(?<=[\.\!\?])\s+", row))
        subtitles.extend(piece for piece in pieces if 8 <= len(piece) <= 140)
    return subtitles[:12]
| def _srt_time(sec): | |
| ms = int((sec - int(sec)) * 1000) | |
| sec = int(sec) | |
| h = sec // 3600 | |
| m = (sec % 3600) // 60 | |
| s = sec % 60 | |
| return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" | |
def _write_srt(script, path, total_duration=30):
    """Write an SRT subtitle file for ``script`` to ``path`` (UTF-8).

    Cues are the subtitle sentences of the script, or the first 120 characters
    of the script when there are none. Every cue lasts ``total_duration``
    divided by the number of cues, clamped to 2.2–5.0 seconds. The first cue
    starts at 0.3 s and consecutive cues are separated by 0.15 s.
    """
    cues = _split_subtitle_sentences(script) or [script[:120]]
    cue_length = max(2.2, min(5.0, total_duration / max(1, len(cues))))
    begin = 0.3
    with open(path, 'w', encoding='utf-8') as srt_file:
        for number, caption in enumerate(cues, 1):
            finish = begin + cue_length
            srt_file.write(f"{number}\n{_srt_time(begin)} --> {_srt_time(finish)}\n{caption}\n\n")
            begin = finish + 0.15
def _wrap_text_px(draw, text, font, max_width, max_lines):
    """Greedily wrap ``text`` into lines no wider than ``max_width`` pixels.

    Width is measured with ``draw.textbbox``; if that call fails, 20 pixels
    per character is assumed. At most ``max_lines`` lines are returned; words
    that do not fit are dropped. A single word wider than ``max_width`` still
    gets a line of its own.
    """
    def _pixel_width(candidate):
        try:
            return draw.textbbox((0, 0), candidate, font=font)[2]
        except Exception:
            return len(candidate) * 20

    wrapped = []
    current = ""
    for word in _clean(text).split():
        candidate = f"{current} {word}".strip()
        if _pixel_width(candidate) <= max_width:
            current = candidate
            continue
        # The word does not fit: close the current line and start a new one with it.
        if current:
            wrapped.append(current)
        current = word
        if len(wrapped) >= max_lines:
            break
    if current and len(wrapped) < max_lines:
        wrapped.append(current)
    return wrapped
def _make_short_frame_full(post, img_path, out_path):
    """Render a 1080x1920 frame: photo on top, title and body text below.

    When Pillow is unavailable the call is delegated to
    ``base._make_short_frame``. The photo is scaled and centre-cropped to fill
    a 1080x760 box at the top; if it cannot be loaded the area stays dark.
    Below it come the brand label, the title (up to 4 lines) and the post text
    without its "Nguồn tham khảo" tail (up to 14 lines, stopping once the text
    cursor passes y=1640). The image is saved to ``out_path``.
    """
    if Image is None:
        return base._make_short_frame(post, img_path, out_path)

    canvas_w, canvas_h = 1080, 1920
    canvas = Image.new("RGB", (canvas_w, canvas_h), (14, 14, 14))
    try:
        photo = Image.open(img_path).convert("RGB")
        box_w, box_h = 1080, 760
        photo_ratio = photo.width / photo.height
        if photo_ratio > box_w / box_h:
            # Photo is wider than the box: match heights, crop the sides.
            scaled_h = box_h
            scaled_w = int(scaled_h * photo_ratio)
        else:
            # Photo is taller than the box: match widths, crop top and bottom.
            scaled_w = box_w
            scaled_h = int(scaled_w / photo_ratio)
        photo = photo.resize((scaled_w, scaled_h))
        x0 = (scaled_w - box_w) // 2
        y0 = (scaled_h - box_h) // 2
        canvas.paste(photo.crop((x0, y0, x0 + box_w, y0 + box_h)), (0, 0))
    except Exception:
        pass

    draw = ImageDraw.Draw(canvas)
    try:
        font_title = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 54)
        font_body = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 38)
        font_label = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 30)
    except Exception:
        font_title = font_body = font_label = None

    # Dark panel for the text; it overlaps the bottom 40 px of the photo box.
    draw.rectangle((0, 720, canvas_w, canvas_h), fill=(14, 14, 14))
    margin = 48
    text_width = canvas_w - margin * 2
    draw.text((margin, 770), "VNEWS · Tường AI", fill=(92, 184, 122), font=font_label)

    cursor_y = 830
    for title_line in _wrap_text_px(draw, post.get("title", ""), font_title, text_width, 4):
        draw.text((margin, cursor_y), title_line, fill=(255, 255, 255), font=font_title)
        cursor_y += 66
    cursor_y += 18

    body_text = re.sub(r"Nguồn tham khảo:.*", "", post.get("text", ""), flags=re.S).strip()
    for body_line in _wrap_text_px(draw, body_text, font_body, text_width, 14):
        draw.text((margin, cursor_y), body_line, fill=(220, 220, 220), font=font_body)
        cursor_y += 50
        if cursor_y > 1640:
            break
    canvas.save(out_path, quality=92)
def _summary_segments_from_post(post, max_segments=25):
    """Split a post into the text segments that become video scenes.

    The post text (or title) loses a leading "Bản tin AI viết lại:" label and
    everything from "Nguồn tham khảo:" onwards. Lines of at least 18
    characters, stripped of bullet/number markers and not starting with a
    section label, are the candidates. With fewer than three such lines the
    text is split into sentences of at least 25 characters instead.
    Candidates are de-duplicated and capped at ``max_segments``; if none
    remain, a single segment with the post title (or a default) is returned.
    """
    body = _clean(post.get('text') or post.get('title') or '')
    body = re.sub(r'^Bản tin AI viết lại:\s*', '', body, flags=re.I)
    body = re.sub(r'Nguồn tham khảo:.*$', '', body, flags=re.I|re.S).strip()

    section_labels = ('điểm chính','tiêu đề','sapo','nguồn tham khảo')
    stripped_lines = (_clean(re.sub(r'^[•\-\*\d\.\)\s]+','',row)) for row in body.splitlines())
    candidates = [
        row for row in stripped_lines
        if len(row) >= 18 and not row.lower().startswith(section_labels)
    ]
    if len(candidates) < 3:
        sentences = (_clean(part) for part in re.split(r'(?<=[\.\!\?])\s+', body))
        candidates = [sentence for sentence in sentences if len(sentence) >= 25]

    segments = _dedupe_units(candidates, max_units=max_segments)
    if not segments:
        return [post.get('title','Bản tin VNEWS')]
    return segments[:max_segments]
def _make_scene_frame(post, segment, idx, total, img_path, out_path, emotion='neutral'):
    """Render the 1080x1920 frame for one scene of a short video.

    When Pillow is unavailable, the single-frame renderer is used instead.
    Otherwise the photo fills the whole canvas, darkened by blending 50% with
    black, and a 1080x720 crop of it is pasted on top; if the photo cannot be
    processed the drawing continues on whatever was painted so far. Below come
    a progress bar with one marker per scene, the brand line, a
    "Đoạn i/total" badge, the segment text (up to 18 lines, stopping once the
    cursor passes y=1500) and the post title (up to 3 lines). ``emotion`` is
    not used for drawing.
    """
    if Image is None:
        return _make_short_frame_full(post, img_path, out_path)

    W, H = 1080, 1920

    def _cover_crop(image, ratio, box_w, box_h):
        # Scale `image` so it covers the box, then crop the centre.
        if ratio > box_w / box_h:
            new_h = box_h
            new_w = int(new_h * ratio)
        else:
            new_w = box_w
            new_h = int(new_w / ratio)
        resized = image.resize((new_w, new_h))
        left = (new_w - box_w) // 2
        top = (new_h - box_h) // 2
        return resized.crop((left, top, left + box_w, top + box_h))

    canvas = Image.new('RGB', (W, H), (10, 10, 10))
    try:
        photo = Image.open(img_path).convert('RGB')
        ratio = photo.width / max(1, photo.height)
        canvas.paste(_cover_crop(photo, ratio, W, H), (0, 0))
        canvas = Image.blend(canvas, Image.new('RGB', (W, H), (0, 0, 0)), 0.50)
        canvas.paste(_cover_crop(photo, ratio, W, 720), (0, 0))
    except Exception:
        pass

    draw = ImageDraw.Draw(canvas)
    try:
        font_brand = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 34)
        font_small = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', 28)
        font_seg = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 58)
        font_title = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', 34)
    except Exception:
        font_brand = font_small = font_seg = font_title = None

    # Text panel; it overlaps the bottom 40 px of the 720 px photo crop.
    draw.rectangle((0, 680, W, H), fill=(12, 12, 12))

    # Progress bar: one pill per scene, the current one highlighted.
    bar_x, bar_y = 48, 742
    for position in range(total):
        colour = (92, 184, 122) if position == idx else (70, 70, 70)
        x_left = bar_x + position * 38
        draw.rounded_rectangle((x_left, bar_y, x_left + 24, bar_y + 10), radius=5, fill=colour)

    draw.text((48, 780), 'VNEWS AI SHORT', fill=(110, 231, 143), font=font_brand)
    draw.rounded_rectangle((48, 834, 260, 880), radius=20, fill=(28, 70, 45))
    draw.text((66, 842), f'Đoạn {idx+1}/{total}', fill=(235, 235, 235), font=font_small)

    text_width = W - 96
    cursor_y = 940
    for segment_line in _wrap_text_px(draw, segment, font_seg, text_width, 18):
        draw.text((48, cursor_y), segment_line, fill=(255, 255, 255), font=font_seg)
        cursor_y += 74
        if cursor_y > 1500:
            break

    title_y = 1640
    draw.line((48, title_y - 22, W - 48, title_y - 22), fill=(70, 70, 70), width=2)
    for title_line in _wrap_text_px(draw, post.get('title', ''), font_title, text_width, 3):
        draw.text((48, title_y), title_line, fill=(220, 220, 220), font=font_title)
        title_y += 46
    canvas.save(out_path, quality=92)
| def _estimate_audio_duration(path, fallback=15.0): | |
| """Estimate audio duration with 15s minimum per segment for complete bullet reading.""" | |
| try: | |
| pr=subprocess.run(['ffprobe','-v','error','-show_entries','format=duration','-of','default=noprint_wrappers=1:no_key=1',path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20) | |
| return max(12.0, float((pr.stdout or b'').decode().strip() or fallback)) | |
| except Exception: | |
| return fallback | |
# Lower-cased client voice names/aliases -> edge-tts voice ids. Several legacy
# aliases (Spanish, Japanese, Chinese, ...) intentionally map to a multilingual
# English voice, as in the original table.
_EDGE_VOICE_ALIASES = {
    # Vietnamese
    'vi-vn-hoaimyneural': 'vi-VN-HoaiMyNeural',
    'vi-vn-namminhneural': 'vi-VN-NamMinhNeural',
    'hoaimy': 'vi-VN-HoaiMyNeural',
    'namminh': 'vi-VN-NamMinhNeural',
    'nam': 'vi-VN-NamMinhNeural',
    'male': 'vi-VN-NamMinhNeural',
    'nu': 'vi-VN-HoaiMyNeural',
    'female': 'vi-VN-HoaiMyNeural',
    'mien-nam': 'vi-VN-HoaiMyNeural',
    # English - Multilingual
    'en-us-andrewmultilingualneural': 'en-US-AndrewMultilingualNeural',
    'en-au-williammultilingualneural': 'en-AU-WilliamMultilingualNeural',
    'andrew': 'en-US-AndrewMultilingualNeural',
    'en_andrew': 'en-US-AndrewMultilingualNeural',
    'jenny': 'en-US-AndrewMultilingualNeural',
    'en_jenny': 'en-US-AndrewMultilingualNeural',
    # Portuguese - Multilingual (ONLY Thalita)
    'pt-br-thalitamultilingualneural': 'pt-BR-ThalitaMultilingualNeural',
    'thalita': 'pt-BR-ThalitaMultilingualNeural',
    'pt_thalita': 'pt-BR-ThalitaMultilingualNeural',
    'pt_br_thalita': 'pt-BR-ThalitaMultilingualNeural',
    'pt': 'pt-BR-ThalitaMultilingualNeural',
    'pt_francisco': 'pt-BR-ThalitaMultilingualNeural',
    # French - Multilingual
    'fr-fr-viviennemultilingualneural': 'fr-FR-VivienneMultilingualNeural',
    'fr-fr-remymultilingualneural': 'fr-FR-RemyMultilingualNeural',
    'denise': 'fr-FR-VivienneMultilingualNeural',
    'fr': 'fr-FR-VivienneMultilingualNeural',
    'fr_denise': 'fr-FR-VivienneMultilingualNeural',
    # German - Multilingual
    'de-de-seraphinamultilingualneural': 'de-DE-SeraphinaMultilingualNeural',
    'de-de-florianmultilingualneural': 'de-DE-FlorianMultilingualNeural',
    'katja': 'de-DE-SeraphinaMultilingualNeural',
    'de': 'de-DE-SeraphinaMultilingualNeural',
    'de_katja': 'de-DE-SeraphinaMultilingualNeural',
    # Korean - Multilingual (Hyunsu, NOT SunHee)
    'ko-kr-hyusumultilingualneural': 'ko-KR-HyunsuMultilingualNeural',   # misspelled key kept for old clients
    'ko-kr-hyunsumultilingualneural': 'ko-KR-HyunsuMultilingualNeural',  # lower-cased form of the real voice id
    'ko-kr-hyunsuneural': 'ko-KR-HyunsuMultilingualNeural',
    'sunhee': 'ko-KR-HyunsuMultilingualNeural',
    'ko': 'ko-KR-HyunsuMultilingualNeural',
    'ko_sunhee': 'ko-KR-HyunsuMultilingualNeural',
    # Italian - Multilingual
    'it-it-giuseppemultilingualneural': 'it-IT-GiuseppeMultilingualNeural',
    # Spanish (keep for backward compat)
    'ela': 'en-US-AndrewMultilingualNeural',
    'es_ela': 'en-US-AndrewMultilingualNeural',
    'es': 'en-US-AndrewMultilingualNeural',
    'es_carlos': 'en-US-AndrewMultilingualNeural',
    # Japanese (keep for backward compat)
    'nanami': 'en-US-AndrewMultilingualNeural',
    'ja': 'en-US-AndrewMultilingualNeural',
    'ja_nanami': 'en-US-AndrewMultilingualNeural',
    # Chinese (keep for backward compat)
    'xiaochen': 'en-US-AndrewMultilingualNeural',
    'zh': 'en-US-AndrewMultilingualNeural',
    'zh_xiaochen': 'en-US-AndrewMultilingualNeural',
}


async def patched_ai_short(post_id: str, request: Request):
    """Create (or reuse) a narrated vertical short video for a wall post.

    Optional JSON body fields: ``voice`` (default ``'nu'``), ``emotion``
    (default ``'neutral'``) and ``speed`` (default 1.0, clamped to
    0.85–1.35; an unparseable value now falls back to 1.0 instead of raising).

    The post is split into segments; each segment gets its own frame and
    narration (edge-tts, with gTTS as fallback), is encoded into a clip with
    ffmpeg, and the clips are concatenated. The output name encodes voice,
    emotion, speed and a hash of the segments, so an identical request reuses
    the existing file. The post is updated with the video URL and short
    metadata and the wall is saved.

    Responds with 404 when the post does not exist, 503 when gTTS is not
    available, 500 when rendering fails, otherwise a dict with ``video``,
    ``voice``, ``emotion``, ``speed``, ``subtitles`` and ``segments``.

    NOTE(review): the subprocess calls below are synchronous and run inside
    this coroutine; consider moving the rendering to a worker thread.
    """
    import sys  # function-scope: `sys` is not among the file-level imports shown

    try:
        body = await request.json()
    except Exception:
        body = {}
    if not isinstance(body, dict):
        body = {}

    voice = str(body.get('voice', 'nu')).strip().lower()
    emotion = str(body.get('emotion', 'neutral')).strip().lower()
    try:
        speed = float(body.get('speed', 1.0) or 1.0)
    except (TypeError, ValueError):
        speed = 1.0
    speed = max(0.85, min(1.35, speed))

    posts = base._load_ai_wall()
    post = next((p for p in posts if str(p.get('id')) == str(post_id)), None)
    if not post:
        return JSONResponse({'error': 'post not found'}, status_code=404)

    segments = _summary_segments_from_post(post, max_segments=25)
    seg_hash = hashlib.md5(('|'.join(segments)+voice+emotion+str(speed)).encode('utf-8')).hexdigest()[:8]
    os.makedirs(base.SHORTS_DIR, exist_ok=True)
    suffix = f"_{voice}_{emotion}_{str(speed).replace('.', 'p')}_{seg_hash}_scenes_nosub"
    out_mp4 = os.path.join(base.SHORTS_DIR, base._safe_name(post_id + suffix) + '.mp4')

    def _publish():
        # Record the video on the post, persist the wall and build the response.
        post['video'] = '/api/ai/short-file/' + post_id + suffix
        post['short_voice'] = voice
        post['short_emotion'] = emotion
        post['short_speed'] = speed
        post['short_segments'] = segments
        post['short_subtitles'] = False
        base._save_ai_wall(posts)
        return JSONResponse({'video': post['video'], 'voice': voice, 'emotion': emotion, 'speed': speed, 'subtitles': False, 'segments': segments})

    if os.path.exists(out_mp4):
        return _publish()
    if base.gTTS is None:
        return JSONResponse({'error': 'gTTS chưa sẵn sàng'}, status_code=503)

    work = os.path.join(base.SHORTS_DIR, base._safe_name(post_id + suffix))
    os.makedirs(work, exist_ok=True)
    img = os.path.join(work, 'image.jpg')
    # Run edge_tts with the interpreter that runs this app rather than whatever
    # `python` resolves to on PATH.
    python_exe = sys.executable or 'python'
    try:
        base._download_image(post.get('img'), post.get('title', 'AI news'), img)
        edge_voice = _EDGE_VOICE_ALIASES.get(voice, 'vi-VN-HoaiMyNeural')
        part_files = []
        for idx, seg in enumerate(segments):
            frame = os.path.join(work, f'frame_{idx:02d}.jpg')
            aud = os.path.join(work, f'voice_{idx:02d}.mp3')
            aud_fast = os.path.join(work, f'voice_{idx:02d}_fast.mp3')
            part = os.path.join(work, f'part_{idx:02d}.mp4')
            _make_scene_frame(post, seg, idx, len(segments), img, frame, emotion=emotion)
            spoken = _emotion_script(seg, emotion)
            try:
                subprocess.run([python_exe, '-m', 'edge_tts', '--voice', edge_voice, '--text', spoken, '--write-media', aud], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=120)
            except Exception:
                # edge-tts failed: fall back to gTTS; the second attempt covers
                # gTTS versions whose constructor rejects `tld`.
                tld = 'com.vn' if voice in ('nu', 'female', 'mien-nam', 'hoaimy') else 'com'
                try:
                    base.gTTS(spoken, lang='vi', tld=tld, slow=False).save(aud)
                except TypeError:
                    base.gTTS(spoken, lang='vi', slow=False).save(aud)
            subprocess.run(['ffmpeg', '-y', '-i', aud, '-filter:a', f'atempo={speed}', '-vn', aud_fast], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=90)
            dur = _estimate_audio_duration(aud_fast, fallback=15.0) + 0.35
            subprocess.run(['ffmpeg', '-y', '-loop', '1', '-t', str(dur), '-i', frame, '-i', aud_fast, '-shortest', '-c:v', 'libx264', '-tune', 'stillimage', '-pix_fmt', 'yuv420p', '-c:a', 'aac', '-b:a', '128k', part], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=150)
            part_files.append(part)

        concat = os.path.join(work, 'concat.txt')
        with open(concat, 'w', encoding='utf-8') as f:
            for p in part_files:
                # Single quotes inside a path are escaped for ffmpeg's concat list syntax.
                f.write("file '" + p.replace("'", "'\\''") + "'\n")
        subprocess.run(['ffmpeg', '-y', '-f', 'concat', '-safe', '0', '-i', concat, '-c', 'copy', out_mp4], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=180)
        return _publish()
    except Exception as e:
        return JSONResponse({'error': 'Không tạo được shorts: ' + str(e)[:220]}, status_code=500)
def patched_ai_short_file(file_id: str):
    """Serve a generated short video by id.

    The file is looked up as ``<safe file_id>.mp4`` inside ``base.SHORTS_DIR``.
    Returns the MP4 as a download named ``vnews-ai-<file_id>.mp4``, or a 404
    JSON error when the file does not exist.
    """
    video_path = os.path.join(base.SHORTS_DIR, base._safe_name(file_id) + '.mp4')
    if os.path.exists(video_path):
        return FileResponse(video_path, media_type='video/mp4', filename=f'vnews-ai-{file_id}.mp4')
    return JSONResponse({'error': 'not found'}, status_code=404)
def api_ai_shorts():
    """Return up to 80 wall posts that have a generated video, as ``{'posts': [...]}``."""
    with_video = []
    for wall_post in base._load_ai_wall():
        if wall_post.get('video'):
            with_video.append(wall_post)
    return JSONResponse({'posts': with_video[:80]})
# Remove any already-registered GET route for the root path "/".
# getattr defaults are used because a route object may lack `path` or `methods`.
app.router.routes = [r for r in app.router.routes if not (getattr(r, 'path', None) == '/' and 'GET' in getattr(r, 'methods', set()))]