# Image-translation pipeline configuration (inputs, outputs, overlay/erase tuning).
import base64, copy, hashlib, json, math, os, re, struct, time, unicodedata, cv2, httpx, numpy as np, budoux
from urllib.parse import parse_qs, urlencode, urlparse
from PIL import Image, ImageChops, ImageDraw, ImageFilter, ImageFont

# --- Input / output ---------------------------------------------------------
IMAGE_PATH = "33.jpg"          # source image to process
OUT_JSON = "output.json"       # where the result JSON is written
LANG = "th"                    # target translation language (normalized later by _normalize_lang)
AI_API_KEY = os.getenv("AI_API_KEY", "").strip()  # explicit key wins; other env vars are probed later
FIREBASE_URL = "https://cookie-6e1cd-default-rtdb.asia-southeast1.firebasedatabase.app/lens/cookie.json"

# --- Feature toggles --------------------------------------------------------
WRITE_OUT_JSON = True
DECODE_IMAGEURL_TO_DATAURI = True
DO_ORIGINAL = True
DO_TRANSLATED = True
DO_ORIGINAL_HTML = True
DO_TRANSLATED_HTML = True
DO_AI_HTML = True
HTML_INCLUDE_CSS = True

# --- Overlay rendering ------------------------------------------------------
DRAW_OVERLAY_ORIGINAL = False
DRAW_OVERLAY_TRANSLATED = False
OVERLAY_ORIGINAL_PATH = "overlay_original.png"
OVERLAY_TRANSLATED_PATH = "overlay_translated.png"
TRANSLATED_OVERLAY_FONT_SCALE = 1.0
TRANSLATED_OVERLAY_FIT_TO_BOX = True
AI_OVERLAY_FONT_SCALE = 1.5
AI_OVERLAY_FIT_TO_BOX = True

# --- AI rewrite pass --------------------------------------------------------
DO_AI = True
DO_AI_JSON = False             # False => AI returns plain text (see _active_ai_contract)
DO_AI_OVERLAY = False
AI_CACHE = False
AI_CACHE_PATH = "ai_cache.json"
AI_PATH_OVERLAY = "overlay_ai.png"
AI_PROVIDER = "auto"           # "auto" => detect provider from key prefix
AI_MODEL = "auto"              # "auto" => provider default model
AI_BASE_URL = "auto"           # "auto" => provider default base URL
AI_TEMPERATURE = 0.2
AI_MAX_TOKENS = 1200
AI_TIMEOUT_SEC = 120

# --- Debug box/outline drawing ---------------------------------------------
DRAW_BOX_OUTLINE = True
AUTO_TEXT_COLOR = True         # pick dark/light text based on background
TEXT_COLOR = (0, 0, 0, 255)
TEXT_COLOR_DARK = (0, 0, 0, 255)
TEXT_COLOR_LIGHT = (255, 255, 255, 255)
BOX_OUTLINE = (0, 255, 0, 255)
BOX_OUTLINE_WIDTH = 2
DRAW_OUTLINE_PARA = False
DRAW_OUTLINE_ITEM = False
DRAW_OUTLINE_SPAN = False
PARA_OUTLINE = (0, 0, 255, 255)
ITEM_OUTLINE = (255, 0, 0, 255)
SPAN_OUTLINE = BOX_OUTLINE
PARA_OUTLINE_WIDTH = 3
ITEM_OUTLINE_WIDTH = 2
SPAN_OUTLINE_WIDTH = BOX_OUTLINE_WIDTH

# --- Original-text erasure (inpaint/mosaic/clone/blend) ---------------------
ERASE_OLD_TEXT_WITH_ORIGINAL_BOXES = True
ERASE_PADDING_PX = 2
ERASE_SAMPLE_MARGIN_PX = 6
ERASE_MODE = "inpaint"         # erase strategy selector
ERASE_MOSAIC_BLOCK_PX = 10
ERASE_CLONE_GAP_PX = 4
ERASE_CLONE_BORDER_PX = 6
ERASE_CLONE_FEATHER_PX = 3
ERASE_BLEND_GAP_PX = 3
ERASE_BLEND_FEATHER_PX = 4
INPAINT_RADIUS = 3
INPAINT_METHOD = "telea"       # cv2 inpainting algorithm name
INPAINT_DILATE_PX = 1
BG_SAMPLE_BORDER_PX = 3
BASELINE_SHIFT = True BASELINE_SHIFT_FACTOR = 0.40 FONT_DOWNLOD = True FONT_THAI_PATH = "NotoSansThai-Regular.ttf" FONT_LATIN_PATH = "NotoSans-Regular.ttf" FONT_THAI_URLS = [ "https://github.com/google/fonts/raw/main/ofl/notosansthai/NotoSansThai-Regular.ttf", "https://github.com/google/fonts/raw/main/ofl/notosansthaiui/NotoSansThaiUI-Regular.ttf", ] FONT_LATIN_URLS = [ "https://github.com/google/fonts/raw/main/ofl/notosans/NotoSans-Regular.ttf", ] FONT_JA_PATH = "NotoSansCJKjp-Regular.otf" FONT_JA_URLS = [ "https://raw.githubusercontent.com/googlefonts/noto-cjk/main/Sans/OTF/Japanese/NotoSansCJKjp-Regular.otf", "https://github.com/googlefonts/noto-cjk/raw/main/Sans/OTF/Japanese/NotoSansCJKjp-Regular.otf", ] FONT_ZH_SC_PATH = "NotoSansCJKsc-Regular.otf" FONT_ZH_SC_URLS = [ "https://raw.githubusercontent.com/googlefonts/noto-cjk/main/Sans/OTF/SimplifiedChinese/NotoSansCJKsc-Regular.otf", "https://github.com/googlefonts/noto-cjk/raw/main/Sans/OTF/SimplifiedChinese/NotoSansCJKsc-Regular.otf", ] FONT_ZH_TC_PATH = "NotoSansCJKtc-Regular.otf" FONT_ZH_TC_URLS = [ "https://raw.githubusercontent.com/googlefonts/noto-cjk/main/Sans/OTF/TraditionalChinese/NotoSansCJKtc-Regular.otf", "https://github.com/googlefonts/noto-cjk/raw/main/Sans/OTF/TraditionalChinese/NotoSansCJKtc-Regular.otf", ] UI_LANGUAGES = [ {"code": "en", "name": "English"}, {"code": "th", "name": "Thai"}, {"code": "ja", "name": "Japanese"}, {"code": "ko", "name": "Korean"}, {"code": "zh-CN", "name": "Chinese (Simplified)"}, {"code": "vi", "name": "Vietnamese"}, {"code": "es", "name": "Spanish"}, {"code": "de", "name": "German"}, {"code": "fr", "name": "French"}, ] AI_PROVIDER_DEFAULTS = { "gemini": { "model": "gemini-2.5-flash", "base_url": "", }, "openai": { "model": "gpt-4o-mini", "base_url": "https://api.openai.com/v1", }, "openrouter": { "model": "openai/o4-mini", "base_url": "https://openrouter.ai/api/v1", }, "huggingface": { "model": "google/gemma-2-2b-it", "base_url": "https://router.huggingface.co/v1", 
}, "featherless": { "model": "Qwen/Qwen2.5-7B-Instruct", "base_url": "https://api.featherless.ai/v1", }, "groq": { "model": "openai/gpt-oss-20b", "base_url": "https://api.groq.com/openai/v1", }, "together": { "model": "openai/gpt-oss-20b", "base_url": "https://api.together.xyz/v1", }, "deepseek": { "model": "deepseek-chat", "base_url": "https://api.deepseek.com/v1", }, "anthropic": { "model": "claude-sonnet-4-20250514", "base_url": "https://api.anthropic.com", }, } AI_PROVIDER_ALIASES = { "hf": "huggingface", "huggingface_router": "huggingface", "hf_router": "huggingface", "openai_compat": "openai", "openai-compatible": "openai", "gemini3": "gemini", "gemini-3": "gemini", "google": "gemini", } AI_MODEL_ALIASES = { "gemini": { "flash-lite": "gemini-2.5-flash-lite", "flash": "gemini-2.5-flash", "pro": "gemini-2.5-pro", "3-flash": "gemini-3-flash-preview", "3-pro": "gemini-3-pro-preview", "3-pro-image": "gemini-3-pro-image-preview", "flash-image": "gemini-2.5-flash-image", } } AI_PROMPT_SYSTEM_BASE = ( "You are a professional manga translator and dialogue localizer.\n" "Rewrite each paragraph as natural dialogue in the target language while preserving meaning, tone, intent, and character voice.\n" "Keep lines concise for speech bubbles. Do not add new information. Do not omit meaning. Do not explain.\n" "Preserve emphasis (… ! ?). Avoid excessive punctuation.\n" "If the input is already in the target language, improve it (dialogue polish) without changing meaning." ) AI_LANG_STYLE = { "th": ( "Target language: Thai\\n" "Write Thai manga dialogue that reads like a high-quality Thai scanlation: natural, concise, and in-character.\\n" "Keep lines short for speech bubbles; avoid stiff, literal phrasing.\\n" "Default: omit pronouns and omit gendered polite sentence-final particles unless the source line clearly requires them.\\n" "Never use the word 'ฉัน'. Prefer omitting the subject.\\n" "Never use a male-coded second-person pronoun. 
When addressing someone by name, do not add a second-person pronoun after the name; prefer NAME + clause.\\n" "If a second-person reference is unavoidable, use a neutral/casual form appropriate to tone, but keep it gender-neutral and consistent with the line.\\n" "Use particles/interjections sparingly to match tone; do not overuse.\\n" "Keep names/terms consistent; transliterate when appropriate.\\n" "Output only the translated text." ), "en": ( "Target language: English\n" "Write natural English manga dialogue: concise, conversational, with contractions where natural.\n" "Localize tone and character voice; keep emotion and emphasis.\n" "Keep proper nouns consistent; do not over-explain." ), "ja": ( "Target language: Japanese\n" "Write natural Japanese manga dialogue: concise, spoken.\n" "Choose 丁寧語/タメ口 to match context; keep emotion and emphasis.\n" "Keep proper nouns consistent; keep SFX natural in Japanese." ), "default": ( "Write natural manga dialogue in the target language: concise, spoken, faithful to meaning and tone." ), } AI_PROMPT_RESPONSE_CONTRACT_JSON = ( "Return ONLY valid JSON (no markdown, no extra text).\n" "Output JSON MUST have exactly one key: \"aiTextFull\".\n" "\"aiTextFull\" MUST be a single JSON string WITHOUT raw newlines.\n" "Use literal \\n and \\n\\n to represent line breaks.\n" "You MUST preserve paragraph boundaries and order. Paragraphs are separated by a blank line (\\n\\n).\n" "Do NOT add extra paragraphs. Do NOT remove paragraphs.\n" "Never include code fences or XML/HTML tags.\n" "All string values MUST NOT contain raw newlines." ) AI_PROMPT_RESPONSE_CONTRACT_TEXT = ( "Return ONLY the translated text (no JSON, no markdown, no commentary).\n" "You MUST preserve paragraph boundaries and order. Paragraphs are separated by a blank line.\n" "Use actual newlines for line breaks.\n" "Do NOT add extra paragraphs. Do NOT remove paragraphs.\n" "Never include code fences or XML/HTML tags." 
) AI_PROMPT_DATA_TEMPLATE = ( "Input JSON:\n{input_json}\n\n" "Output JSON schema (MUST match exactly):\n{output_schema}" ) AI_PROMPT_DATA_TEMPLATE_TEXT = ( "Input JSON:\n{input_json}\n\n" "Return the translation as plain text only." ) FIREBASE_COOKIE_TTL_SEC = int(os.getenv("FIREBASE_COOKIE_TTL_SEC", "900")) _FIREBASE_COOKIE_CACHE = {"ts": 0.0, "url": "", "data": None} _FONT_RESOLVE_CACHE = {} _HF_MODELS_CACHE = {} _FONT_PAIR_CACHE = {} _TP_HTML_EPS_PX = 0.0 ZWSP = "\u200b" def _active_ai_contract() -> str: return AI_PROMPT_RESPONSE_CONTRACT_JSON if DO_AI_JSON else AI_PROMPT_RESPONSE_CONTRACT_TEXT def _active_ai_data_template() -> str: return AI_PROMPT_DATA_TEMPLATE if DO_AI_JSON else AI_PROMPT_DATA_TEMPLATE_TEXT def _canonical_provider(provider: str) -> str: p = (provider or "").strip().lower() return AI_PROVIDER_ALIASES.get(p, p) def _resolve_model(provider: str, model: str) -> str: m = (model or "").strip() if not m or m.lower() == "auto": d = AI_PROVIDER_DEFAULTS.get(provider) or {} return (d.get("model") or "").strip() or AI_PROVIDER_DEFAULTS["openai"]["model"] key = m.lower() aliases = AI_MODEL_ALIASES.get(provider) or {} return aliases.get(key) or m def _normalize_lang(lang: str) -> str: t = (lang or "").strip().lower() if t in ("jp", "jpn", "japanese"): return "ja" if t in ("thai",): return "th" if t in ("eng", "english"): return "en" if t.startswith("zh"): return t if len(t) >= 2: return t[:2] return t def _sha1(s: str) -> str: return hashlib.sha1(s.encode("utf-8")).hexdigest() def _hf_router_available_models(api_key: str, base_url: str) -> list[str]: if not api_key or not base_url: return [] key = _sha1(f"{_sha1(api_key)}|{base_url}") now = time.time() cached = _HF_MODELS_CACHE.get(key) or {} if cached.get("ts") and now - float(cached["ts"]) < 3600 and isinstance(cached.get("models"), list): return cached["models"] url = base_url.rstrip("/") + "/models" headers = {"Authorization": f"Bearer {api_key}"} try: with httpx.Client(timeout=float(AI_TIMEOUT_SEC)) 
as client: r = client.get(url, headers=headers) r.raise_for_status() data = r.json() except Exception: return [] models = [] for m in (data.get("data") or []): mid = (m.get("id") if isinstance(m, dict) else None) if isinstance(mid, str) and mid.strip(): models.append(mid.strip()) _HF_MODELS_CACHE[key] = {"ts": now, "models": models} return models def _pick_hf_fallback_model(models: list[str]) -> str: if not models: return "" priority_substrings = ( "gemma-3", "gemma-2", "llama-3.1", "llama-3", "mistral", "qwen", "glm", ) lowered = [(m, m.lower()) for m in models] for sub in priority_substrings: for m, ml in lowered: if sub in ml and ("instruct" in ml or ml.endswith("-it") or ":" in ml): return m for m, ml in lowered: if "instruct" in ml or ml.endswith("-it") or ":" in ml: return m return models[0] def _load_ai_cache(path: str): if not path: return {} if not os.path.exists(path): return {} try: with open(path, "r", encoding="utf-8") as f: d = json.load(f) return d if isinstance(d, dict) else {} except Exception: return {} def _save_ai_cache(path: str, cache: dict): if not path: return tmp = path + ".tmp" with open(tmp, "w", encoding="utf-8") as f: json.dump(cache, f, ensure_ascii=False) os.replace(tmp, path) def _build_ai_prompt_packet(target_lang: str, original_text_full: str): lang = _normalize_lang(target_lang) input_json = json.dumps( {"target_lang": lang, "originalTextFull": original_text_full}, ensure_ascii=False) output_schema = json.dumps({"aiTextFull": "..."}, ensure_ascii=False) data_template = _active_ai_data_template() if DO_AI_JSON: data_text = data_template.format( input_json=input_json, output_schema=output_schema) else: data_text = data_template.format(input_json=input_json) style = AI_LANG_STYLE.get(lang) or AI_LANG_STYLE.get("default") or "" system_parts = [AI_PROMPT_SYSTEM_BASE] if style: system_parts.append(style) system_parts.append(_active_ai_contract()) system_text = "\n\n".join([p for p in system_parts if p]) user_parts = [] 
user_parts.append(data_text) return system_text, user_parts def _gemini_generate_json(api_key: str, model: str, system_text: str, user_parts: list[str]): url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key={api_key}" parts = [{"text": p} for p in user_parts if (p or "").strip()] payload = { "systemInstruction": {"parts": [{"text": system_text}]}, "contents": [{"role": "user", "parts": parts}], "generationConfig": { "temperature": float(AI_TEMPERATURE), "maxOutputTokens": int(AI_MAX_TOKENS), "responseMimeType": "text/plain", }, } with httpx.Client(timeout=float(AI_TIMEOUT_SEC)) as client: r = client.post(url, json=payload) try: r.raise_for_status() except httpx.HTTPStatusError as e: raise Exception(f"Gemini HTTP {r.status_code}: {r.text}") from e data = r.json() candidates = data.get("candidates") or [] if not candidates: raise Exception("Gemini returned no candidates") c = (candidates[0].get("content") or {}) out_parts = c.get("parts") or [] if not out_parts: raise Exception("Gemini returned empty content parts") txt = "".join([str(p.get("text") or "") for p in out_parts]).strip() if not txt: raise Exception("Gemini returned empty text") return txt def _read_first_env(*names: str) -> str: for n in names: v = (os.environ.get(n) or "").strip() if v: return v return "" def _detect_ai_provider_from_key(api_key: str) -> str: k = (api_key or "").strip() if k.startswith("AIza"): return "gemini" if k.startswith("hf_"): return "huggingface" if k.startswith("sk-or-"): return "openrouter" if k.startswith("sk-ant-"): return "anthropic" if k.startswith("gsk_"): return "groq" return "openai" def _resolve_ai_config(): api_key = (AI_API_KEY or _read_first_env( "AI_API_KEY", "OPENAI_API_KEY", "HF_TOKEN", "HUGGINGFACEHUB_API_TOKEN", "GEMINI_API_KEY", "OPENROUTER_API_KEY", "FEATHERLESS_API_KEY", "GROQ_API_KEY", "TOGETHER_API_KEY", "DEEPSEEK_API_KEY", "ANTHROPIC_API_KEY", )).strip() provider = _canonical_provider((AI_PROVIDER or "auto")) model = 
(AI_MODEL or "auto").strip() base_url = (AI_BASE_URL or "auto").strip() if provider in ("", "auto"): provider = _canonical_provider(_detect_ai_provider_from_key(api_key)) preset = AI_PROVIDER_DEFAULTS.get(provider) or {} model = _resolve_model(provider, model) if base_url in ("", "auto"): base_url = (preset.get("base_url") or "").strip() if provider not in ("gemini", "anthropic"): if not base_url: base_url = (AI_PROVIDER_DEFAULTS.get("openai") or {}).get( "base_url") or "https://api.openai.com/v1" return provider, api_key, model, base_url def _openai_compat_generate_json(api_key: str, base_url: str, model: str, system_text: str, user_parts: list[str]): url = (base_url.rstrip("/") + "/chat/completions") messages = [{"role": "system", "content": system_text}] for p in user_parts: if (p or "").strip(): messages.append({"role": "user", "content": p}) payload = { "model": model, "messages": messages, "temperature": float(AI_TEMPERATURE), "max_tokens": int(AI_MAX_TOKENS), } headers = { "Authorization": f"Bearer {api_key}", "Content-Type": "application/json", } used_model = model with httpx.Client(timeout=float(AI_TIMEOUT_SEC)) as client: r = client.post(url, json=payload, headers=headers) try: r.raise_for_status() data = r.json() except httpx.HTTPStatusError as e: if ( r.status_code == 400 and "router.huggingface.co" in (base_url or "") and ((AI_MODEL or "").strip().lower() in ("", "auto") or model == (AI_PROVIDER_DEFAULTS.get("huggingface") or {}).get("model")) ): try: err = r.json().get("error") or {} except Exception: err = {} if (err.get("code") or "") == "model_not_supported": models = _hf_router_available_models(api_key, base_url) fallback = _pick_hf_fallback_model(models) if fallback and fallback != model: payload["model"] = fallback used_model = fallback r2 = client.post(url, json=payload, headers=headers) try: r2.raise_for_status() except httpx.HTTPStatusError as e2: raise Exception( f"AI HTTP {r2.status_code}: {r2.text}") from e2 data = r2.json() else: preview 
= ", ".join(models[:8]) hint = f"\nAvailable models (first 8): {preview}" if preview else "" raise Exception( f"AI HTTP {r.status_code}: {r.text}{hint}") from e else: raise Exception( f"AI HTTP {r.status_code}: {r.text}") from e else: raise Exception(f"AI HTTP {r.status_code}: {r.text}") from e choices = data.get("choices") or [] if not choices: raise Exception("AI returned no choices") msg = (choices[0].get("message") or {}) txt = (msg.get("content") or "").strip() if not txt: raise Exception("AI returned empty text") return txt, used_model def _anthropic_generate_json(api_key: str, model: str, system_text: str, user_parts: list[str]): url = "https://api.anthropic.com/v1/messages" messages = [] for p in user_parts: if (p or "").strip(): messages.append({"role": "user", "content": p}) payload = { "model": model, "max_tokens": int(AI_MAX_TOKENS), "temperature": float(AI_TEMPERATURE), "system": system_text, "messages": messages, } headers = { "x-api-key": api_key, "content-type": "application/json", } with httpx.Client(timeout=float(AI_TIMEOUT_SEC)) as client: r = client.post(url, json=payload, headers=headers) try: r.raise_for_status() except httpx.HTTPStatusError as e: raise Exception(f"Anthropic HTTP {r.status_code}: {r.text}") from e data = r.json() content = data.get("content") or [] txt = "".join([(c.get("text") or "") for c in content if isinstance( c, dict) and c.get("type") == "text"]).strip() if not txt: raise Exception("Anthropic returned empty text") return txt def _strip_wrappers(s: str) -> str: t = (s or "").strip() if not t: return "" t = t.replace("\r\n", "\n").replace("\r", "\n") if "```" in t: t = re.sub(r"```[a-zA-Z0-9_-]*", "", t) t = t.replace("```", "") t = re.sub(r"", "", t, flags=re.IGNORECASE).strip() return t def _sanitize_json_like_text(raw: str) -> str: t = _strip_wrappers(raw) if not t: return "" out = [] in_str = False esc = False run_ch = "" run_len = 0 def _flush_run(): nonlocal run_ch, run_len if run_len: out.append(run_ch * 
min(run_len, 3)) run_ch = "" run_len = 0 for ch in t: if in_str: if esc: _flush_run() out.append(ch) esc = False continue if ch == "\\": _flush_run() out.append(ch) esc = True continue if ch == '"': _flush_run() out.append(ch) in_str = False continue if ch == "\n": _flush_run() out.append("\\n") continue if ch == "\t": _flush_run() out.append("\\t") continue if ch == run_ch: run_len += 1 continue _flush_run() run_ch = ch run_len = 1 continue _flush_run() if ch == '"': out.append(ch) in_str = True esc = False continue out.append(ch) _flush_run() return "".join(out) def _extract_first_json(raw: str): t = _sanitize_json_like_text(raw) if not t: raise Exception("AI returned empty text") start = t.find("{") if start < 0: raise Exception("AI returned no JSON object") in_str = False esc = False depth = 0 json_start = None for i in range(start, len(t)): ch = t[i] if in_str: if esc: esc = False elif ch == "\\": esc = True elif ch == '"': in_str = False continue if ch == '"': in_str = True continue if ch == "{": if depth == 0: json_start = i depth += 1 continue if ch == "}": if depth > 0: depth -= 1 if depth == 0 and json_start is not None: cand = t[json_start: i + 1] return json.loads(cand) raise Exception("Failed to parse AI JSON") def _parse_ai_textfull_only(raw: str) -> str: obj = _extract_first_json(raw) if not isinstance(obj, dict): raise Exception("AI JSON is not an object") txt = obj.get("aiTextFull") if txt is None: txt = obj.get("textFull") if txt is None: raise Exception("AI JSON missing aiTextFull") t = str(txt) if "\\n" in t and "\n" not in t: t = t.replace("\\n", "\n") t = t.replace("\r\n", "\n").replace("\r", "\n").strip() return t def _parse_ai_textfull_text_only(raw: str) -> str: t = _strip_wrappers(raw) if not t: raise Exception("AI returned empty text") if t.lstrip().startswith("{"): return _parse_ai_textfull_only(t) if "\\n" in t and "\n" not in t: t = t.replace("\\n", "\n") t = re.sub(r"^aiTextFull\s*[:=]\s*", "", t, flags=re.IGNORECASE).strip() return t 
def _budoux_parser_for_lang(lang: str): lang = _normalize_lang(lang) if not budoux: return None if lang == "th": return budoux.load_default_thai_parser() if lang == "ja": return budoux.load_default_japanese_parser() if lang in ("zh", "zh-hans", "zh_cn", "zh-cn", "zh_hans"): return budoux.load_default_simplified_chinese_parser() if lang in ("zh-hant", "zh_tw", "zh-tw", "zh_hant"): return budoux.load_default_traditional_chinese_parser() model_path = os.environ.get("BUDOUX_MODEL_PATH") if not model_path: return None with open(model_path, "r", encoding="utf-8") as f: model = json.load(f) return budoux.Parser(model) def _ensure_box_fields(box: dict): if not isinstance(box, dict): return {} b = copy.deepcopy(box) if "rotation_deg" not in b: b["rotation_deg"] = 0.0 if "rotation_deg_css" not in b: b["rotation_deg_css"] = 0.0 if "center" not in b and all(k in b for k in ("left", "top", "width", "height")): b["center"] = {"x": b["left"] + b["width"] / 2.0, "y": b["top"] + b["height"]/2.0} if all(k in b for k in ("left", "top", "width", "height")): if "left_pct" not in b: b["left_pct"] = b["left"] * 100.0 if "top_pct" not in b: b["top_pct"] = b["top"] * 100.0 if "width_pct" not in b: b["width_pct"] = b["width"] * 100.0 if "height_pct" not in b: b["height_pct"] = b["height"] * 100.0 return b def _tokens_with_spaces(text: str, parser, lang: str): t = (text or "") if not t: return [] out = [] parts = re.findall(r"\s+|\S+", t) for part in parts: if not part: continue if part.isspace(): out.append(("space", part)) continue segs = parser.parse(part) if parser else [part] for seg in segs: if seg: out.append(("word", seg)) return out def _line_cap_px_for_item(item: dict, img_w: int, img_h: int) -> float: p1 = item.get("baseline_p1") or {} p2 = item.get("baseline_p2") or {} dx = (float(p2.get("x") or 0.0) - float(p1.get("x") or 0.0)) * float(img_w) dy = (float(p2.get("y") or 0.0) - float(p1.get("y") or 0.0)) * float(img_h) cap = float(math.hypot(dx, dy)) if cap > 1e-6: return cap b = 
_ensure_box_fields(item.get("box") or {}) return float(b.get("width") or 0.0) * float(img_w) def _wrap_tokens_to_lines_px(tokens, items, img_w: int, img_h: int, thai_font: str, latin_font: str, font_size: int, min_lines: int): max_lines = len(items) if max_lines <= 0: return [] caps = [_line_cap_px_for_item(it, img_w, img_h) for it in items] desired = max(1, min(int(min_lines), max_lines)) soft_factor = 0.90 if desired > 1 else 1.0 lines = [[]] cur_w = 0.0 li = 0 last_word_hint = "" pending_space = "" tmp = Image.new("RGBA", (10, 10), (0, 0, 0, 0)) dtmp = ImageDraw.Draw(tmp) def _measure_w(font, txt: str) -> float: try: return float(font.getlength(txt)) except Exception: try: bb = dtmp.textbbox((0, 0), txt, font=font, anchor="ls") return float(bb[2] - bb[0]) except Exception: w, _ = dtmp.textsize(txt, font=font) return float(w) def _cap_for_line(idx: int) -> float: return float(caps[min(idx, max_lines - 1)]) for k, s in (tokens or []): if k == "space": if not lines[-1]: continue pending_space += str(s) continue if k != "word": continue txt = str(s) if not txt: continue font = pick_font(txt, thai_font, latin_font, int(font_size)) w = _measure_w(font, txt) sw = 0.0 if pending_space: hint = last_word_hint or txt font_s = pick_font(hint, thai_font, latin_font, int(font_size)) sw = _measure_w(font_s, pending_space) cap = _cap_for_line(li) soft_cap = cap * soft_factor if (li < desired and cap > 0.0) else cap need_w = cur_w + sw + w if lines[-1] and li < max_lines - 1: if cap > 0.0 and need_w > cap: lines.append([]) li += 1 cur_w = 0.0 pending_space = "" sw = 0.0 elif soft_cap > 0.0 and need_w > soft_cap: lines.append([]) li += 1 cur_w = 0.0 pending_space = "" sw = 0.0 if pending_space and lines[-1]: lines[-1].append(("space", pending_space, sw)) cur_w += sw pending_space = "" lines[-1].append(("word", txt, w)) cur_w += w last_word_hint = txt if len(lines) > max_lines: head = lines[: max_lines - 1] tail = [] for seg in lines[max_lines - 1:]: tail.extend(seg) lines = head 
+ [tail] for i in range(len(lines)): while lines[i] and lines[i][0][0] == "space": lines[i] = lines[i][1:] while lines[i] and lines[i][-1][0] == "space": lines[i] = lines[i][:-1] return lines def _ensure_min_lines_by_split(lines, min_lines: int, max_lines: int): if not lines: return [] min_lines = int(min_lines) max_lines = int(max_lines) if min_lines <= 1: return lines target = min(min_lines, max_lines) lines = [list(seg) for seg in (lines or [])] def _trim(seg): while seg and seg[0][0] == "space": seg.pop(0) while seg and seg[-1][0] == "space": seg.pop() return seg while len(lines) < target: idx = None best = 0 for i, seg in enumerate(lines): n_words = sum(1 for k, s, _ in seg if k == "word" and s != ZWSP) if n_words > best and n_words > 1: best = n_words idx = i if idx is None: break seg = lines[idx] word_pos = [i for i, (k, s, _) in enumerate(seg) if k == "word" and s != ZWSP] if len(word_pos) <= 1: break cut_word = len(word_pos) // 2 cut_pos = word_pos[cut_word] left = _trim(seg[:cut_pos]) right = _trim(seg[cut_pos:]) lines[idx] = left lines.insert(idx + 1, right) if len(lines) >= max_lines: break return lines def _fit_para_size_and_lines(ptext: str, parser, items, img_w: int, img_h: int, thai_font: str, latin_font: str, base_size: int, min_lines: int, lang: str): tokens2 = _tokens_with_spaces(ptext, parser, lang) if not tokens2 or not items: return int(base_size), [[] for _ in range(len(items))] max_lines = len(items) n_words = 0 for k, s in tokens2: if k == "word" and str(s): n_words += 1 desired_lines = max(1, min(max_lines, n_words)) size = max(10, int(base_size)) heights = [] for it in items: b = _ensure_box_fields(it.get("box") or {}) heights.append(float(b.get("height") or 0.0) * float(img_h)) while size >= 10: lines = _wrap_tokens_to_lines_px( tokens2, items, img_w, img_h, thai_font, latin_font, size, min_lines=desired_lines) lines = _ensure_min_lines_by_split( lines, min_lines=desired_lines, max_lines=max_lines) if len(lines) <= max_lines: ok = True 
for ii, seg in enumerate(lines): words = [s for k, s, _ in seg if k == "word" and s != ZWSP] if not words: continue line_text = "".join(words) mline = _line_metrics_px( line_text, thai_font, latin_font, size) if mline is None: continue _, th, _ = mline if ii < len(heights) and heights[ii] > 0.0 and th > heights[ii] * 1.01: ok = False break if ok: return size, lines size -= 1 lines10 = _wrap_tokens_to_lines_px( tokens2, items, img_w, img_h, thai_font, latin_font, 10, min_lines=desired_lines) lines10 = _ensure_min_lines_by_split( lines10, min_lines=desired_lines, max_lines=max_lines) return 10, lines10 def _pad_lines(lines, max_lines: int): max_lines = int(max_lines) if max_lines <= 0: return [] lines = list(lines or []) if len(lines) > max_lines: return lines[:max_lines] if len(lines) < max_lines: lines.extend([[] for _ in range(max_lines - len(lines))]) return lines def _contains_thai(text: str) -> bool: for ch in (text or ""): if _is_thai_char(ch): return True return False def _apply_line_to_item( item: dict, line_tokens, para_index: int, item_index: int, abs_line_start_raw: int, W: int, H: int, thai_path: str, latin_path: str, forced_size_px: int | None, apply_baseline_shift: bool = True, kerning_adjust: bool = False, ): tokens = [] for t in (line_tokens or []): if not isinstance(t, (list, tuple)) or len(t) < 2: continue k = str(t[0]) s = str(t[1]) w = float(t[2]) if len(t) > 2 and isinstance( t[2], (int, float)) else 0.0 tokens.append((k, s, w)) words = [s for k, s, _ in tokens if k == "word" and s != ZWSP] item_text = "".join(s for _, s, _ in tokens if s != ZWSP).strip() item["text"] = item_text item["valid_text"] = bool(item_text) b = _ensure_box_fields(item.get("box") or {}) item["box"] = b base_left = float(b.get("left") or 0.0) base_top = float(b.get("top") or 0.0) base_w = float(b.get("width") or 0.0) base_h = float(b.get("height") or 0.0) if not words or base_w <= 0.0 or base_h <= 0.0 or W <= 0 or H <= 0: item["spans"] = [] return p1 = 
item.get("baseline_p1") or {} p2 = item.get("baseline_p2") or {} x1 = float(p1.get("x") or 0.0) * float(W) y1 = float(p1.get("y") or 0.0) * float(H) x2 = float(p2.get("x") or 0.0) * float(W) y2 = float(p2.get("y") or 0.0) * float(H) dx = x2 - x1 dy = y2 - y1 L = float(math.hypot(dx, dy)) if L <= 1e-9: item["spans"] = [] return ux = dx / L uy = dy / L nx = -uy ny = ux if ny < 0: nx, ny = -nx, -ny base_w_px = L base_h_px = base_h * float(H) base_size = 96 widths_px = [] max_ascent = 0 max_descent = 0 layout_units = [] for k, s, _ in tokens: if s == ZWSP: continue if k == "space": layout_units.append(("space", _sanitize_draw_text(s))) elif k == "word": layout_units.append(("word", _sanitize_draw_text(s))) def _measure_len_px(font, text: str) -> float: try: return float(font.getlength(text)) except Exception: tmp = Image.new("RGBA", (10, 10), (0, 0, 0, 0)) dtmp = ImageDraw.Draw(tmp) try: bb = dtmp.textbbox((0, 0), text, font=font, anchor="ls") return float(bb[2] - bb[0]) except Exception: w, _ = dtmp.textsize(text, font=font) return float(w) for i, (k, t) in enumerate(layout_units): if k == "space": hint = "" for j in range(i - 1, -1, -1): if layout_units[j][0] == "word": hint = layout_units[j][1] break if not hint: for j in range(i + 1, len(layout_units)): if layout_units[j][0] == "word": hint = layout_units[j][1] break font0 = pick_font(hint or "a", thai_path, latin_path, base_size) widths_px.append(max(0.0, _measure_len_px(font0, t))) continue font0 = pick_font(t, thai_path, latin_path, base_size) try: ascent, descent = font0.getmetrics() except Exception: ascent, descent = base_size, int(base_size * 0.25) if ascent > max_ascent: max_ascent = ascent if descent > max_descent: max_descent = descent if kerning_adjust and (i + 1) < len(layout_units) and layout_units[i + 1][0] == "word": nxt = layout_units[i + 1][1] nxt1 = nxt[:1] if nxt else "" if nxt1 and (_contains_thai(t) == _contains_thai(nxt1)): tw = _measure_len_px(font0, t + nxt1) - \ _measure_len_px(font0, nxt1) 
else: tw = _measure_len_px(font0, t) else: tw = _measure_len_px(font0, t) widths_px.append(max(0.0, tw)) line_tw = sum(widths_px) bo_base = _baseline_offset_px_for_text( item_text, thai_path, latin_path, base_size) if bo_base is not None: _, total_h_base = bo_base line_th = float(total_h_base) else: line_th = float(max_ascent + max_descent) if line_tw <= 1e-9 or line_th <= 1e-9: item["spans"] = [] return if forced_size_px is None: scale_line = min((base_w_px * 1.0) / line_tw, (base_h_px * 0.995) / line_th) if scale_line <= 0.0: item["spans"] = [] return final_size = max(10, int(base_size * scale_line)) else: final_size = int(max(10, forced_size_px)) scale_line = float(final_size) / float(base_size) item["font_size_px"] = final_size w_scaled = [w * scale_line for w in widths_px] total_scaled = sum(w_scaled) margin_px = (base_w_px - total_scaled) / \ 2.0 if total_scaled < base_w_px else 0.0 bo = _baseline_offset_px_for_text( item_text, thai_path, latin_path, final_size) if apply_baseline_shift and bo is not None: baseline_offset_px, _ = bo cx = (base_left + (base_w / 2.0)) * float(W) cy = (base_top + (base_h / 2.0)) * float(H) target = (cx + (baseline_offset_px * nx), cy + (baseline_offset_px * ny)) s = ((target[0] - x1) * nx) + ((target[1] - y1) * ny) x1 += nx * s y1 += ny * s x2 += nx * s y2 += ny * s item["baseline_p1"] = {"x": x1 / float(W), "y": y1 / float(H)} item["baseline_p2"] = {"x": x2 / float(W), "y": y2 / float(H)} raw_pos = 0 span_i = 0 unit_i = 0 cum_px = 0.0 spans = [] for kind, s, _ in tokens: if s == ZWSP: continue start_raw = abs_line_start_raw + raw_pos raw_pos += len(s) end_raw = abs_line_start_raw + raw_pos if unit_i >= len(w_scaled): break wpx = w_scaled[unit_i] t0 = (margin_px + cum_px) / base_w_px cum_px += wpx t1 = (margin_px + cum_px) / base_w_px if kind == "space": unit_i += 1 continue span_box = _ensure_box_fields({ "left": base_left + (base_w * t0), "top": base_top, "width": base_w * (t1 - t0), "height": base_h, "rotation_deg": 
float(b.get("rotation_deg") or 0.0), "rotation_deg_css": float(b.get("rotation_deg_css") or 0.0), }) spans.append({ "side": "Ai", "para_index": para_index, "item_index": item_index, "span_index": span_i, "text": s, "valid_text": True, "start_raw": start_raw, "end_raw": end_raw, "t0_raw": t0, "t1_raw": t1, "box": span_box, "height_raw": item.get("height_raw"), "baseline_p1": item.get("baseline_p1"), "baseline_p2": item.get("baseline_p2"), "font_size_px": final_size, }) span_i += 1 unit_i += 1 item["spans"] = spans def patch(payload: dict, img_w: int, img_h: int, thai_font: str, latin_font: str, lang: str | None = None) -> dict: ai = payload.get("Ai") or {} ai_text_full = str(ai.get("aiTextFull") or "") template_tree = ai.get("aiTree") or {} if not isinstance(template_tree, dict): raise ValueError("Ai.aiTree template must be a dict") lang_norm = _normalize_lang(lang or LANG) parser = _budoux_parser_for_lang(lang_norm) out_tree = copy.deepcopy(template_tree) out_tree["side"] = "Ai" paragraphs = out_tree.get("paragraphs") or [] ai_text_full_clean = ai_text_full def _extract_paras_by_markers(txt: str, expected: int) -> tuple[list[str], str, int] | None: if not txt or expected <= 0 or "<>", txt)) if not matches: return None out: list[str] = [""] * expected for mi, m in enumerate(matches): try: idx = int(m.group(1)) except Exception: continue seg_start = m.end() seg_end = matches[mi + 1].start() if (mi + 1) < len(matches) else len(txt) seg = (txt[seg_start:seg_end] or "").lstrip("\r\n").strip() if 0 <= idx < expected and not out[idx]: out[idx] = seg clean = "\n\n".join(out) return out, clean, len(matches) marked = _extract_paras_by_markers(ai_text_full, len(paragraphs)) if marked is not None: ai_paras, ai_text_full_clean, _marker_count = marked else: ai_paras = ai_text_full.split("\n\n") if ai_text_full else [] if len(ai_paras) < len(paragraphs): ai_paras = ai_paras + [""] * (len(paragraphs) - len(ai_paras)) if len(ai_paras) > len(paragraphs): ai_paras = 
ai_paras[:len(paragraphs)] ai_text_full_clean = "\n\n".join(ai_paras) raw_cursor = 0 for pi, (p, ptext) in enumerate(zip(paragraphs, ai_paras)): p["side"] = "Ai" p["para_index"] = int(p.get("para_index", pi)) items = p.get("items") or [] max_lines = len(items) if max_lines <= 0: continue base_size_ref = None if isinstance(p.get("para_font_size_px"), int) and int(p.get("para_font_size_px")) > 0: base_size_ref = int(p.get("para_font_size_px")) else: ref_sizes = [] for it in items: fs = it.get("font_size_px") if isinstance(fs, int) and fs > 0: ref_sizes.append(fs) if ref_sizes: base_size_ref = min(ref_sizes) base_size = int(base_size_ref or 96) min_lines = int(max_lines) para_size, lines = _fit_para_size_and_lines( ptext, parser, items, img_w, img_h, thai_font, latin_font, base_size, min_lines=min_lines, lang=lang_norm, ) lines = _pad_lines(lines, max_lines) p["para_font_size_px"] = int(para_size) p["text"] = ptext p["valid_text"] = bool(ptext) p["start_raw"] = raw_cursor p["end_raw"] = raw_cursor + len(ptext) line_start = raw_cursor for ii in range(max_lines): it = items[ii] it["side"] = "Ai" it["para_index"] = pi it["item_index"] = ii _apply_line_to_item( it, (lines[ii] if ii < len(lines) else []), pi, ii, line_start, img_w, img_h, thai_font, latin_font, para_size, apply_baseline_shift=True, kerning_adjust=True, ) line_raw_len = sum(len(s) for k, s, w in ( lines[ii] if ii < len(lines) else []) if s != ZWSP) line_start += line_raw_len raw_cursor = p["end_raw"] + 2 return {"Ai": {"aiTextFull": ai_text_full_clean, "aiTree": out_tree}} def _uniformize_ai_item_span_font_size(item: dict, img_w: int, img_h: int, thai_font: str, latin_font: str): spans = item.get("spans") or [] if not spans or img_w <= 0 or img_h <= 0: return base_size = item.get("font_size_px") try: base_size = int(base_size) if base_size is not None else None except Exception: base_size = None if not base_size: for sp in spans: fs = sp.get("font_size_px") if isinstance(sp, dict) else None if 
isinstance(fs, int) and fs > 0: base_size = fs break if not base_size or base_size <= 0: return tmp = Image.new("RGBA", (10, 10), (0, 0, 0, 0)) dtmp = ImageDraw.Draw(tmp) font_cache = {} def _font_for(text: str, size: int): key = (int(size), 1 if _contains_thai(text) else 0) f = font_cache.get(key) if f: return f f = pick_font(text, thai_font, latin_font, int(size)) font_cache[key] = f return f min_size = int(base_size) for sp in spans: if not isinstance(sp, dict): continue txt = _sanitize_draw_text(sp.get("text") or "") if txt.strip() == "": continue b = sp.get("box") or {} aw = float(b.get("width") or 0.0) * float(img_w) ah = float(b.get("height") or 0.0) * float(img_h) if aw <= 0.0 or ah <= 0.0: continue font = _font_for(txt, base_size) try: bb = dtmp.textbbox((0, 0), txt, font=font, anchor="ls") tw = float(bb[2] - bb[0]) th = float(bb[3] - bb[1]) except Exception: tw, th = dtmp.textsize(txt, font=font) tw = float(tw) th = float(th) if tw <= 0.0 or th <= 0.0: continue s = min((aw * 0.995) / tw, (ah * 0.995) / th) if s < 1.0: req = max(10, int(base_size * s)) if req < min_size: min_size = req if min_size != base_size: item["font_size_px"] = int(min_size) for sp in spans: if isinstance(sp, dict): sp["font_size_px"] = int(min_size) def _rebuild_ai_spans_after_font_resize(ai_tree: dict, img_w: int, img_h: int, thai_font: str, latin_font: str, lang: str | None = None): if not ai_tree or img_w <= 0 or img_h <= 0: return lang_norm = _normalize_lang(lang or LANG) parser = _budoux_parser_for_lang(lang_norm) for pi, p in _iter_paragraphs(ai_tree): items = p.get("items") or [] for ii, it in enumerate(items): txt = _item_line_text(it) if not str(txt).strip(): it["spans"] = [] continue tokens = _tokens_with_spaces(str(txt), parser, lang_norm) line_tokens = [(k, s, 0.0) for k, s in tokens] forced = it.get("font_size_px") or p.get("para_font_size_px") if isinstance(forced, float): forced = int(forced) elif isinstance(forced, str) and forced.strip().isdigit(): forced = 
int(forced.strip()) _apply_line_to_item( it, line_tokens, int(p.get("para_index", pi)), int(it.get("item_index", ii)), int(it.get("start_raw", 0)), img_w, img_h, thai_font, latin_font, forced, apply_baseline_shift=False, kerning_adjust=True, ) _uniformize_ai_item_span_font_size( it, img_w, img_h, thai_font, latin_font) def ai_translate_original_text(original_text_full: str, target_lang: str): provider, api_key, model, base_url = _resolve_ai_config() if not api_key: raise Exception("AI_API_KEY is required for AI translation") lang = _normalize_lang(target_lang) prompt_sig = _sha1( json.dumps( { "sys": AI_PROMPT_SYSTEM_BASE, "contract": _active_ai_contract(), "data": _active_ai_data_template(), "style": AI_LANG_STYLE.get(lang) or AI_LANG_STYLE.get("default") or "", }, ensure_ascii=False, ) ) cache = None cache_key = None if AI_CACHE: cache = _load_ai_cache(AI_CACHE_PATH) cache_key = _sha1( json.dumps( {"provider": provider, "m": model, "u": base_url, "l": lang, "p": prompt_sig, "t": original_text_full}, ensure_ascii=False, ) ) if cache_key in cache: cached = cache[cache_key] if lang == "th" and cached: t = str(cached.get("aiTextFull") or "") if t: t2 = re.sub( r"(?:(?<=^)|(?<=[\s\"'“”‘’()\[\]{}<>]))\u0e19\u0e32\u0e22(?=(?:\s|$))", "", t) t2 = re.sub(r"[ \t]{2,}", " ", t2) t2 = re.sub(r"^[ \t]+", "", t2, flags=re.MULTILINE) if t2 != t: cached = dict(cached) cached["aiTextFull"] = t2 cache[cache_key] = cached _save_ai_cache(AI_CACHE_PATH, cache) return cached system_text, user_parts = _build_ai_prompt_packet(lang, original_text_full) started = time.time() used_model = model if provider == "gemini": raw = _gemini_generate_json(api_key, model, system_text, user_parts) elif provider == "anthropic": raw = _anthropic_generate_json(api_key, model, system_text, user_parts) else: raw, used_model = _openai_compat_generate_json( api_key, base_url, model, system_text, user_parts) ai_text_full = _parse_ai_textfull_only( raw) if DO_AI_JSON else _parse_ai_textfull_text_only(raw) if 
lang == "th" and ai_text_full: ai_text_full = re.sub( r"(?:(?<=^)|(?<=[\s\"'“”‘’()\[\]{}<>]))\u0e19\u0e32\u0e22(?=(?:\s|$))", "", ai_text_full) ai_text_full = re.sub(r"[ \t]{2,}", " ", ai_text_full) ai_text_full = re.sub(r"^[ \t]+", "", ai_text_full, flags=re.MULTILINE) result = { "aiTextFull": ai_text_full, "meta": {"model": used_model, "provider": provider, "base_url": base_url, "latency_sec": round(time.time() - started, 3)}, } if AI_CACHE and cache is not None and cache_key is not None: cache[cache_key] = result _save_ai_cache(AI_CACHE_PATH, cache) return result def to_translated(u, lang="th"): q = parse_qs(urlparse(u).query) return "https://lens.google.com/translatedimage?" + urlencode( dict( vsrid=q["vsrid"][0], gsessionid=q["gsessionid"][0], sl="auto", tl=lang, se=1, ib="1", ) ) def _b64pad(s: str) -> str: return s + "=" * ((4 - (len(s) % 4)) % 4) def decode_imageurl_to_datauri(imageUrl: str): if not imageUrl: return None if isinstance(imageUrl, str) and imageUrl.startswith("data:image") and "base64," in imageUrl: return imageUrl for fn in (base64.b64decode, base64.urlsafe_b64decode): try: b = fn(_b64pad(imageUrl)) try: t = b.decode("utf-8") except Exception: t = b.decode("utf-8", errors="ignore") if "data:image" in t and "base64," in t: i = t.find("data:image") return t[i:].strip() if i >= 0 else t.strip() except Exception: pass return None def read_varint(buf, i): shift = 0 result = 0 while True: if i >= len(buf): raise ValueError("eof varint") b = buf[i] i += 1 result |= ((b & 0x7F) << shift) if (b & 0x80) == 0: return result, i shift += 7 if shift > 70: raise ValueError("varint too long") def parse_proto(buf, start=0, end=None): if end is None: end = len(buf) i = start out = [] while i < end: key, i = read_varint(buf, i) field = key >> 3 wire = key & 7 if wire == 0: val, i = read_varint(buf, i) out.append((field, wire, val)) elif wire == 1: val = buf[i: i + 8] i += 8 out.append((field, wire, val)) elif wire == 2: l, i = read_varint(buf, i) val = buf[i: i 
+ l] i += l out.append((field, wire, val)) elif wire == 5: val = buf[i: i + 4] i += 4 out.append((field, wire, val)) else: raise ValueError(f"wiretype {wire}") return out def b2f(b4): return struct.unpack("= 2 and height is not None: return pts[0], pts[1], height return None, None, None def _looks_like_geom(geom_bytes): geom_fields = parse_proto(geom_bytes) pts = 0 has_height = False for f, w, v in geom_fields: if f == 1 and w == 2: p_fields = parse_proto(v) if _get_float_field(p_fields, 1) is not None and _get_float_field(p_fields, 2) is not None: pts += 1 elif f == 3 and w == 5: has_height = True return pts >= 2 and has_height def _looks_like_span(span_bytes): span_fields = parse_proto(span_bytes) has_t = False has_range = False for f, w, v in span_fields: if f in (3, 4) and w == 5: has_t = True elif f in (1, 2) and w == 0: has_range = True return has_t and has_range def _is_item_message(msg_bytes): fields = parse_proto(msg_bytes) geom_ok = False span_ok = 0 for f, w, v in fields: if f == 1 and w == 2 and not geom_ok: geom_ok = _looks_like_geom(v) elif f == 2 and w == 2: if _looks_like_span(v): span_ok += 1 return geom_ok and span_ok > 0 def _extract_items_from_paragraph(par_bytes): top = parse_proto(par_bytes) items = [] for _, w, v in top: if w == 2 and _is_item_message(v): items.append(v) if items: return items items = [] seen = set() nodes = 0 def walk(buf, depth): nonlocal nodes if depth >= 4 or nodes > 20000: return for _, w, v in parse_proto(buf): if w != 2: continue nodes += 1 if nodes > 20000: return if _is_item_message(v): if v in seen: continue seen.add(v) items.append(v) else: walk(v, depth + 1) walk(par_bytes, 0) return items def _extract_item_geom_spans(item_bytes): fields = parse_proto(item_bytes) geom_bytes = None spans_bytes = [] for f, w, v in fields: if f == 1 and w == 2: geom_bytes = v if f == 2 and w == 2: spans_bytes.append(v) return geom_bytes, spans_bytes def _extract_span(span_bytes): span_fields = parse_proto(span_bytes) start = None end 
= None t0 = None t1 = None for f, w, v in span_fields: if f == 1 and w == 0: start = int(v) elif f == 2 and w == 0: end = int(v) elif f == 3 and w == 5: t0 = b2f(v) elif f == 4 and w == 5: t1 = b2f(v) return start, end, t0, t1, span_fields def _normalize_angle_deg(angle_deg): while angle_deg <= -180.0: angle_deg += 360.0 while angle_deg > 180.0: angle_deg -= 360.0 if angle_deg < -90.0: angle_deg += 180.0 if angle_deg > 90.0: angle_deg -= 180.0 return angle_deg def _slice_text(full_text, start, end): if start is None or end is None: return "" if start < 0 or end < 0 or start > end or end > len(full_text): return "" return full_text[start:end] def _range_min_max(ranges): if not ranges: return None, None s = min(r[0] for r in ranges) e = max(r[1] for r in ranges) return s, e def decode_tree(paragraphs_b64, full_text, side, img_w, img_h, want_raw=True): raw_dump = [] paragraphs = [] cursor = 0 for para_index, b64s in enumerate(paragraphs_b64): par_bytes = base64.b64decode(b64s) if want_raw: raw_dump.append({"para_index": para_index, "b64": b64s, "bytes_hex": b2hex(par_bytes)}) item_msgs = _extract_items_from_paragraph(par_bytes) items = [] para_ranges = [] para_bounds = None for item_index, item_bytes in enumerate(item_msgs): geom_bytes, spans_bytes = _extract_item_geom_spans(item_bytes) if geom_bytes is None: continue p1, p2, height_norm = _get_points_from_geom(geom_bytes) if p1 is None or p2 is None or height_norm is None: continue x1n, y1n = p1 x2n, y2n = p2 x1 = x1n * img_w y1 = y1n * img_h x2 = x2n * img_w y2 = y2n * img_h dx = x2 - x1 dy = y2 - y1 if dx < 0 or (abs(dx) < 1e-12 and dy < 0): x1, y1, x2, y2 = x2, y2, x1, y1 x1n, y1n, x2n, y2n = x2n, y2n, x1n, y1n dx = x2 - x1 dy = y2 - y1 L = math.hypot(dx, dy) if L <= 1e-12: continue ux = dx / L uy = dy / L angle_deg_raw = math.degrees(math.atan2(dy, dx)) angle_deg = _normalize_angle_deg(angle_deg_raw) angle_deg_css = angle_deg height_px = height_norm * img_h item_spans = [] item_ranges = [] item_bounds = None for 
span_index, sb in enumerate(spans_bytes): start, end, t0, t1, _ = _extract_span(sb) if start is None: start = cursor else: cursor = max(cursor, start) if end is None: continue cursor = max(cursor, end) if t0 is None and t1 is None: continue if t0 is None: t0 = 0.0 if t1 is None: t1 = 1.0 valid_text = False span_text = "" if start is not None and end is not None and 0 <= start <= end <= len(full_text): span_text = full_text[start:end] valid_text = span_text.strip() != "" if valid_text: item_ranges.append((start, end)) e1x = x1 + ux * (t0 * L) e1y = y1 + uy * (t0 * L) e2x = x1 + ux * (t1 * L) e2y = y1 + uy * (t1 * L) cx = (e1x + e2x) / 2.0 cy = (e1y + e2y) / 2.0 width_px = abs(t1 - t0) * L left_px = cx - width_px / 2.0 top_px = cy - height_px / 2.0 left = left_px / img_w top = top_px / img_h width = width_px / img_w height = height_px / img_h span_node = { "side": side, "para_index": para_index, "item_index": item_index, "span_index": span_index, "start_raw": start, "end_raw": end, "t0_raw": t0, "t1_raw": t1, "height_raw": height_norm, "baseline_p1": {"x": x1n, "y": y1n}, "baseline_p2": {"x": x2n, "y": y2n}, "box": { "left": left, "top": top, "width": width, "height": height, "rotation_deg": angle_deg, "rotation_deg_css": angle_deg_css, "center": {"x": cx / img_w, "y": cy / img_h}, "left_pct": left * 100.0, "top_pct": top * 100.0, "width_pct": width * 100.0, "height_pct": height * 100.0, }, "text": span_text, "valid_text": valid_text, } quad = _token_box_quad_px(span_node, img_w, img_h, pad_px=0) if quad: xs = [p[0] for p in quad] ys = [p[1] for p in quad] b = (min(xs), min(ys), max(xs), max(ys)) item_bounds = b if item_bounds is None else (min(item_bounds[0], b[0]), min( item_bounds[1], b[1]), max(item_bounds[2], b[2]), max(item_bounds[3], b[3])) item_bounds = item_bounds item_spans.append(span_node) s0, s1 = _range_min_max(item_ranges) item_text = _slice_text( full_text, s0, s1).strip() if s0 is not None else "" item_valid_text = item_text.strip() != "" if s0 is 
not None: para_ranges.append((s0, s1)) cx = (x1 + x2) / 2.0 cy = (y1 + y2) / 2.0 left_px = cx - L / 2.0 top_px = cy - height_px / 2.0 item_box = { "left": left_px / img_w, "top": top_px / img_h, "width": L / img_w, "height": height_px / img_h, "rotation_deg": angle_deg, "rotation_deg_css": angle_deg_css, "center": {"x": cx / img_w, "y": cy / img_h}, } if item_bounds is not None: para_bounds = item_bounds if para_bounds is None else (min(para_bounds[0], item_bounds[0]), min( para_bounds[1], item_bounds[1]), max(para_bounds[2], item_bounds[2]), max(para_bounds[3], item_bounds[3])) items.append( { "side": side, "para_index": para_index, "item_index": item_index, "start_raw": s0, "end_raw": s1, "text": item_text, "valid_text": item_valid_text, "height_raw": height_norm, "baseline_p1": {"x": x1n, "y": y1n}, "baseline_p2": {"x": x2n, "y": y2n}, "box": item_box, "bounds_px": item_bounds, "spans": item_spans, } ) p0, p1 = _range_min_max(para_ranges) para_text = _slice_text( full_text, p0, p1).strip() if p0 is not None else "" para_valid_text = para_text.strip() != "" paragraphs.append( { "side": side, "para_index": para_index, "start_raw": p0, "end_raw": p1, "text": para_text, "valid_text": para_valid_text, "bounds_px": para_bounds, "items": items, } ) tree = {"side": side, "paragraphs": paragraphs} return tree, raw_dump def flatten_tree_spans(tree): spans = [] for p in tree.get("paragraphs") or []: for it in p.get("items") or []: for sp in it.get("spans") or []: spans.append(sp) return spans def flatten_tree_items_as_tokens(tree, img_w, img_h): toks = [] for p in tree.get("paragraphs") or []: for it in p.get("items") or []: t = { "side": it["side"], "para_index": it["para_index"], "item_index": it["item_index"], "span_index": -1, "start_raw": it.get("start_raw"), "end_raw": it.get("end_raw"), "t0_raw": 0.0, "t1_raw": 1.0, "height_raw": it.get("height_raw"), "baseline_p1": it.get("baseline_p1"), "baseline_p2": it.get("baseline_p2"), "box": it.get("box"), "text": 
it.get("text") or "", "valid_text": it.get("valid_text", False), } toks.append(t) return toks def _mean_angle_deg(angles_deg): vals = [a for a in (angles_deg or []) if a is not None] if not vals: return 0.0 xs = [math.cos(math.radians(a)) for a in vals] ys = [math.sin(math.radians(a)) for a in vals] return math.degrees(math.atan2(sum(ys) / len(ys), sum(xs) / len(xs))) def _rotate_xy(x, y, cos_a, sin_a): return (x * cos_a - y * sin_a, x * sin_a + y * cos_a) def _para_obb_quad_px(para_node, W, H): items = para_node.get("items") or [] if not items: return None angles = [] pts = [] for it in items: b = (it.get("box") or {}) angles.append(b.get("rotation_deg", 0.0)) q = _token_box_quad_px(it, W, H, pad_px=0) if q: pts.extend(q) if len(pts) < 4: return None ang = _mean_angle_deg(angles) cos_a = math.cos(math.radians(ang)) sin_a = math.sin(math.radians(ang)) cos_n = cos_a sin_n = -sin_a rpts = [_rotate_xy(x, y, cos_n, sin_n) for (x, y) in pts] xs = [p[0] for p in rpts] ys = [p[1] for p in rpts] minx, maxx = min(xs), max(xs) miny, maxy = min(ys), max(ys) corners = [(minx, miny), (maxx, miny), (maxx, maxy), (minx, maxy)] return [_rotate_xy(x, y, cos_a, sin_a) for (x, y) in corners] def build_level_outlines(tree, W, H): outlines = [] if not tree: return outlines if DRAW_OUTLINE_PARA: for para in tree.get("paragraphs") or []: q = _para_obb_quad_px(para, W, H) if q: outlines.append( {"quad": q, "color": PARA_OUTLINE, "width": PARA_OUTLINE_WIDTH}) if DRAW_OUTLINE_ITEM: for itok in flatten_tree_items_as_tokens(tree, W, H): q = _token_box_quad_px(itok, W, H, pad_px=0) if q: outlines.append( {"quad": q, "color": ITEM_OUTLINE, "width": ITEM_OUTLINE_WIDTH}) return outlines def tokens_to_html(tokens, container_class="RTMDre"): parts = [] parts.append(f'
') for t in tokens: if not t.get("valid_text"): continue b = t["box"] aria = (t.get("text") or "").replace('"', """).replace("\n", " ") wi = t.get("wi", 0) rot = b.get("rotation_deg_css", b.get("rotation_deg", 0.0)) fs = t.get("font_size_px") or b.get("font_size_px") lh = None if fs: try: lh = max(1, int(round(float(fs) * 1.05))) except Exception: lh = None style = ( f'top: calc({b["top_pct"]}%); ' f'left: calc({b["left_pct"]}%); ' f'width: calc({b["width_pct"]}%); ' f'height: calc({b["height_pct"]}%); ' f"transform: rotate({rot}deg);" ) if fs: style += f" font-size: {float(fs):.4g}px;" if lh: style += f" line-height: {lh}px;" parts.append( f'
' ) parts.append("
") return "".join(parts) def tp_overlay_css(): return ( ".tp-draw-root{position:absolute;inset:0;pointer-events:none;}" ".tp-draw-scope{position:absolute;left:0;top:0;transform-origin:0 0;}" ".tp-para{position:absolute;left:0;top:0;}" ".tp-item{position:absolute;left:0;top:0;display:flex;align-items:center;justify-content:center;" "white-space:pre;pointer-events:none;box-sizing:border-box;overflow:visible;" "font-family:var(--tp-font,system-ui);font-weight:500;" "color:var(--tp-fg,rgba(20,20,20,.98));" "text-shadow:0 0 2px rgba(255,255,255,.90),0 0 2px rgba(0,0,0,.60),0 1px 1px rgba(0,0,0,.35);}" ".tp-item>span{display:inline-block;white-space:pre;transform-origin:center;" "padding:0;border-radius:3px;" "background:var(--tp-bg,rgba(255,255,255,.65));" "box-decoration-break:clone;-webkit-box-decoration-break:clone;}" ".tp-item[data-wrap='1'],.tp-item[data-wrap='1']>span{white-space:pre-wrap;word-break:break-word;}" ".tp-item[data-wrap='1']>span{text-align:center;}" ) def _tp_norm_list(v): if isinstance(v, list): return v if isinstance(v, dict): try: return [v[k] for k in sorted(v.keys(), key=lambda x: int(x) if str(x).isdigit() else str(x))] except Exception: return list(v.values()) return [] def _tp_num(x): try: n = float(x) return n if math.isfinite(n) else None except Exception: return None def _tp_escape_text(s: str) -> str: if not s: return "" s = s.replace("\r", "") s = s.replace("&", "&").replace("<", "<").replace(">", ">") return s def _tp_get_rect(obj: dict, base_w: float, base_h: float): if not isinstance(obj, dict): return None box = obj.get("box") if isinstance(obj.get("box"), dict) else {} l0 = _tp_num(box.get("left")) t0 = _tp_num(box.get("top")) w0 = _tp_num(box.get("width")) h0 = _tp_num(box.get("height")) if None not in (l0, t0, w0, h0) and w0 > 0 and h0 > 0: l = l0 * base_w t = t0 * base_h r = (l0 + w0) * base_w b = (t0 + h0) * base_h deg = _tp_num(box.get("rotation_deg_css")) if deg is None: deg = _tp_num(box.get("rotation_deg")) return {"l": l, 
"t": t, "r": r, "b": b, "deg": deg or 0.0} lp = _tp_num(box.get("left_pct")) tp = _tp_num(box.get("top_pct")) wp = _tp_num(box.get("width_pct")) hp = _tp_num(box.get("height_pct")) if None not in (lp, tp, wp, hp) and wp > 0 and hp > 0: l0p = lp / 100.0 t0p = tp / 100.0 w0p = wp / 100.0 h0p = hp / 100.0 l = l0p * base_w t = t0p * base_h r = (l0p + w0p) * base_w b = (t0p + h0p) * base_h deg = _tp_num(box.get("rotation_deg_css")) if deg is None: deg = _tp_num(box.get("rotation_deg")) return {"l": l, "t": t, "r": r, "b": b, "deg": deg or 0.0} bpx = obj.get("bounds_px") if isinstance(bpx, list) and len(bpx) == 4: l = _tp_num(bpx[0]) t = _tp_num(bpx[1]) r = _tp_num(bpx[2]) bb = _tp_num(bpx[3]) if None not in (l, t, r, bb) and r > l and bb > t: return {"l": l, "t": t, "r": r, "b": bb, "deg": 0.0} return None def _tp_union_rect(items: list, base_w: float, base_h: float): l = float("inf") t = float("inf") r = float("-inf") b = float("-inf") for it in items: bx = _tp_get_rect(it, base_w, base_h) if not bx: continue l = min(l, bx["l"]) t = min(t, bx["t"]) r = max(r, bx["r"]) b = max(b, bx["b"]) if not math.isfinite(l) or not math.isfinite(t) or not math.isfinite(r) or not math.isfinite(b): return None return {"l": l, "t": t, "r": r, "b": b, "deg": 0.0} def _tp_mean_item_deg(items: list, base_w: float, base_h: float) -> float: angles = [] for it in items or []: bx = _tp_get_rect(it, base_w, base_h) if not bx: continue a = _tp_num(bx.get("deg")) if a is None: continue angles.append(float(a)) if not angles: return 0.0 return float(_mean_angle_deg(angles)) def _tp_oriented_rect_from_points(pts: list, para_deg: float) -> dict | None: if len(pts) < 2: return None ang = float(para_deg or 0.0) if not math.isfinite(ang): ang = 0.0 rad_n = math.radians(-ang) cn = math.cos(rad_n) sn = math.sin(rad_n) rpts = [(x * cn - y * sn, x * sn + y * cn) for x, y in pts] xs = [p[0] for p in rpts] ys = [p[1] for p in rpts] minx, maxx = min(xs), max(xs) miny, maxy = min(ys), max(ys) w = float(maxx - 
minx) h = float(maxy - miny) if w <= 0.0 or h <= 0.0: return None cx0 = float((minx + maxx) / 2.0) cy0 = float((miny + maxy) / 2.0) rad_a = math.radians(ang) ca = math.cos(rad_a) sa = math.sin(rad_a) cx = (cx0 * ca) - (cy0 * sa) cy = (cx0 * sa) + (cy0 * ca) l = cx - (w / 2.0) t = cy - (h / 2.0) return {"l": float(l), "t": float(t), "r": float(l + w), "b": float(t + h), "deg": float(ang)} def _tp_rect_corners(l: float, t: float, r: float, b: float, deg: float) -> list: w = float(r - l) h = float(b - t) if w <= 0.0 or h <= 0.0: return [] cx = float((l + r) / 2.0) cy = float((t + b) / 2.0) hw = w / 2.0 hh = h / 2.0 rad = math.radians(float(deg or 0.0)) c = math.cos(rad) s = math.sin(rad) out = [] for x, y in ((-hw, -hh), (hw, -hh), (hw, hh), (-hw, hh)): rx = (x * c) - (y * s) ry = (x * s) + (y * c) out.append((cx + rx, cy + ry)) return out def _tp_para_rect_from_items(items: list, base_w: float, base_h: float, para_deg: float) -> dict | None: if not items: return None pts = [] for it in items: ibx = _tp_get_rect(it, base_w, base_h) if not ibx: continue w = float(ibx["r"] - ibx["l"]) h = float(ibx["b"] - ibx["t"]) if w <= 0.0 or h <= 0.0: continue deg = float(ibx.get("deg") or 0.0) cx = float(ibx["l"] + w / 2.0) cy = float(ibx["t"] + h / 2.0) hw = w / 2.0 hh = h / 2.0 rad = math.radians(deg) c = math.cos(rad) s = math.sin(rad) for x, y in ((-hw, -hh), (hw, -hh), (hw, hh), (-hw, hh)): rx = (x * c) - (y * s) ry = (x * s) + (y * c) pts.append((cx + rx, cy + ry)) return _tp_oriented_rect_from_points(pts, para_deg) def _tp_extract_item_text(it: dict) -> str: if not isinstance(it, dict): return "" for k in ( "text", "translated_text", "translatedText", "ai_text", "aiText", "display_text", "displayText", ): v = it.get(k) if isinstance(v, str) and v: return v spans = _tp_norm_list(it.get("spans")) if spans: return "".join(s.get("text") if isinstance(s, dict) and isinstance(s.get("text"), str) else "" for s in spans) return "" def ai_tree_to_tp_html(tree: dict, base_w: int, 
base_h: int) -> str: base_w = int(base_w or 0) base_h = int(base_h or 0) if base_w <= 0 or base_h <= 0: return "" paras = _tp_norm_list(tree.get("paragraphs") if isinstance(tree, dict) else None) if not paras: return "" parts = [ f'
'] for pi, p in enumerate(paras): if not isinstance(p, dict): continue items = _tp_norm_list(p.get("items")) if len(items) > 1 and any(isinstance(x, dict) and _tp_num(x.get("item_index")) is not None for x in items): items = sorted( items, key=lambda x: _tp_num( x.get("item_index")) if isinstance(x, dict) else 0.0, ) para_idx = int(_tp_num(p.get("para_index")) or pi) pbx = _tp_get_rect(p, base_w, base_h) or _tp_union_rect( items, base_w, base_h) if not pbx: continue para_deg = float(pbx.get("deg") or 0.0) if abs(para_deg) <= 0.01: derived = _tp_mean_item_deg(items, base_w, base_h) if abs(derived) > 0.01: pbx2 = _tp_para_rect_from_items(items, base_w, base_h, derived) if pbx2: pbx = pbx2 para_deg = float(pbx.get("deg") or 0.0) pbx_items = _tp_para_rect_from_items(items, base_w, base_h, para_deg) if pbx_items: pts = _tp_rect_corners( pbx["l"], pbx["t"], pbx["r"], pbx["b"], para_deg) pts += _tp_rect_corners(pbx_items["l"], pbx_items["t"], pbx_items["r"], pbx_items["b"], para_deg) merged = _tp_oriented_rect_from_points(pts, para_deg) if merged: pbx = merged eps = float(_TP_HTML_EPS_PX or 0.0) if eps > 0.0: pbx = { "l": float(pbx["l"] - eps), "t": float(pbx["t"] - eps), "r": float(pbx["r"] + eps), "b": float(pbx["b"] + eps), "deg": float(pbx.get("deg") or para_deg or 0.0), } pw = max(0.0, pbx["r"] - pbx["l"]) ph = max(0.0, pbx["b"] - pbx["t"]) para_style = ( f'left: {pbx["l"]:.6f}px; ' f'top: {pbx["t"]:.6f}px; ' f'width: {pw:.6f}px; ' f'height: {ph:.6f}px;' ) if abs(para_deg) > 0.01: para_style += f' transform: rotate({para_deg:.6g}deg); transform-origin: center center;' parts.append( f'
' ) para_cx = (pbx["l"] + pbx["r"]) / 2.0 para_cy = (pbx["t"] + pbx["b"]) / 2.0 inv_c = inv_s = None if abs(para_deg) > 0.01: rad_inv = math.radians(-para_deg) inv_c = math.cos(rad_inv) inv_s = math.sin(rad_inv) raw_texts = [_tp_extract_item_text(it) for it in items] mapped = list(raw_texts) p_text = p.get("text") if isinstance(p.get("text"), str) else "" non_empty = sum( 1 for t in raw_texts if isinstance(t, str) and t.strip()) any_nl = any(isinstance(t, str) and re.search(r"\r?\n", t) for t in raw_texts) first_nl = bool(raw_texts and isinstance( raw_texts[0], str) and re.search(r"\r?\n", raw_texts[0])) lines = None if p_text and re.search(r"\r?\n", p_text) and (non_empty <= 1 or any_nl): lines = [s.rstrip() for s in re.split(r"\r?\n+", p_text) if s.strip()] elif first_nl and (non_empty <= 1 or all(not (t or "").strip() for t in raw_texts[1:])): lines = [s.rstrip() for s in re.split( r"\r?\n+", raw_texts[0]) if s.strip()] if lines: mapped = [lines[i] if i < len(lines) else ( raw_texts[i] if i < len(raw_texts) else "") for i in range(len(items))] for ii, it in enumerate(items): if not isinstance(it, dict): continue text = (mapped[ii] if ii < len(mapped) else "") or "" if not text.strip(): continue ibx = _tp_get_rect(it, base_w, base_h) if not ibx: continue w0 = max(0.0, ibx["r"] - ibx["l"]) h0 = max(0.0, ibx["b"] - ibx["t"]) if w0 <= 0 or h0 <= 0: continue w = float(w0 + (2.0 * eps)) if eps > 0.0 else float(w0) h = float(h0 + (2.0 * eps)) if eps > 0.0 else float(h0) item_idx = int(_tp_num(it.get("item_index")) or ii) fs_raw = _tp_num(it.get("font_size_px")) fs = int(round(fs_raw)) if fs_raw and fs_raw > 0 else max( 10, int(round(h0 * 0.85))) fs = max(6, min(fs, max(6, int(math.floor(h0 * 0.95))))) lh = max(1, min(int(round(h0)), int(round(fs * 1.12)))) if inv_c is not None and inv_s is not None: icx = (ibx["l"] + ibx["r"]) / 2.0 icy = (ibx["t"] + ibx["b"]) / 2.0 dx = icx - para_cx dy = icy - para_cy rcx = para_cx + (dx * inv_c - dy * inv_s) rcy = para_cy + (dx * 
inv_s + dy * inv_c) left = (rcx - (w / 2.0)) - pbx["l"] top = (rcy - (h / 2.0)) - pbx["t"] else: left = (ibx["l"] - pbx["l"]) - eps top = (ibx["t"] - pbx["t"]) - eps style = ( f'left: {left:.6f}px; ' f'top: {top:.6f}px; ' f'width: {w:.6f}px; ' f'height: {h:.6f}px; ' f'font-size: {fs}px; ' f'line-height: {lh}px; ' 'padding-bottom: 0px;' ) deg = float(ibx.get("deg") or 0.0) if inv_c is not None: deg = deg - para_deg if abs(deg) > 0.01: style += f' transform: rotate({deg:.6g}deg); transform-origin: center center;' wrap_attr = ' data-wrap="1"' if it.get("_tp_wrap") else "" parts.append( f'
' f'{_tp_escape_text(text)}
' ) parts.append("
") parts.append("
") return "".join(parts) def overlay_css(container_class="RTMDre", token_class="IwqbBf"): c = container_class t = token_class return ( f".{c}{{" "position:absolute!important;" "inset:0!important;" "width:100%!important;" "height:100%!important;" "display:block!important;" "opacity:1!important;" "visibility:visible!important;" "pointer-events:none!important;" "overflow:visible!important;" "z-index:2147483647!important;" "transform:none!important;" "contain:layout style paint!important;" "--lens-text-color:#fff;" "--lens-font-family:\"Noto Sans Thai\",\"Noto Sans Thai UI\",\"Noto Sans\",system-ui,-apple-system,BlinkMacSystemFont,\"Segoe UI\",Roboto,Arial,sans-serif;" "--lens-text-shadow:0 1px 2px rgba(0,0,0,.85),0 0 1px rgba(0,0,0,.85);" "}}" f".{c} *{{box-sizing:border-box!important;}}" f".{c} .{t}{{" "position:absolute!important;" "display:flex!important;" "align-items:center!important;" "justify-content:center!important;" "opacity:1!important;" "visibility:visible!important;" "pointer-events:none!important;" "user-select:none!important;" "overflow:visible!important;" "white-space:pre!important;" "transform-origin:top left!important;" "filter:none!important;" "mix-blend-mode:normal!important;" "text-transform:none!important;" "letter-spacing:normal!important;" "}}" f".{c} .{t}::before{{" "content:attr(aria-label)!important;" "display:block!important;" "white-space:pre!important;" "color:var(--lens-text-color)!important;" "font-family:var(--lens-font-family)!important;" "text-shadow:var(--lens-text-shadow)!important;" "font-weight:400!important;" "font-style:normal!important;" "line-height:inherit!important;" "text-rendering:geometricPrecision!important;" "}}" ) def ensure_font(path, urls): key = str(path or "") cached = _FONT_RESOLVE_CACHE.get(key) if cached is not None: return cached or None if path and os.path.isfile(path): _FONT_RESOLVE_CACHE[key] = path return path candidates = [] for root in ("/usr/share/fonts", "/usr/local/share/fonts", 
os.path.expanduser("~/.fonts")): if os.path.isdir(root): for p in os.walk(root): for fn in p[2]: if fn.lower() == os.path.basename(path).lower(): candidates.append(os.path.join(p[0], fn)) if candidates: _FONT_RESOLVE_CACHE[key] = candidates[0] return candidates[0] for url in urls: try: r = httpx.get(url, timeout=30) if r.status_code == 200 and len(r.content) > 10000: with open(path, "wb") as f: f.write(r.content) if os.path.isfile(path): _FONT_RESOLVE_CACHE[key] = path return path except Exception: pass _FONT_RESOLVE_CACHE[key] = "" return None def pick_font(text, thai_path, latin_path, size): def has_thai(s): for ch in s: o = ord(ch) if 0x0E00 <= o <= 0x0E7F: return True return False fp = thai_path if has_thai(text) else latin_path if fp and os.path.isfile(fp): try: return ImageFont.truetype(fp, size=size, layout_engine=getattr(ImageFont, "LAYOUT_RAQM", 0)) except Exception: try: return ImageFont.truetype(fp, size=size) except Exception: pass return ImageFont.load_default() def _get_font_pair(thai_path, latin_path, size): key = (str(thai_path or ""), str(latin_path or ""), int(size)) v = _FONT_PAIR_CACHE.get(key) if v: return v f_th = pick_font("ก", thai_path, latin_path, size) f_lat = pick_font("A", thai_path, latin_path, size) _FONT_PAIR_CACHE[key] = (f_th, f_lat) return f_th, f_lat def _is_thai_char(ch: str) -> bool: if not ch: return False o = ord(ch) return 0x0E00 <= o <= 0x0E7F def _split_runs_for_fallback(text: str): runs = [] cur = [] cur_is_th = None for ch in text: if ch == "\n": if cur: runs.append(("".join(cur), cur_is_th)) cur = [] runs.append(("\n", None)) cur_is_th = None continue is_th = _is_thai_char(ch) if ch.isspace() and cur_is_th is not None: is_th = cur_is_th if cur_is_th is None: cur_is_th = is_th cur = [ch] continue if is_th == cur_is_th: cur.append(ch) else: runs.append(("".join(cur), cur_is_th)) cur = [ch] cur_is_th = is_th if cur: runs.append(("".join(cur), cur_is_th)) return runs def _draw_text_centered_fallback(draw_ctx, center_xy, 
text, thai_path, latin_path, size, fill): t = _sanitize_draw_text(text) if not t: return f_th, f_lat = _get_font_pair(thai_path, latin_path, size) runs = _split_runs_for_fallback(t) x = 0.0 min_t = 0.0 max_b = 0.0 for run, is_th in runs: if run == "\n": continue f = f_th if is_th else f_lat try: bb = draw_ctx.textbbox((x, 0), run, font=f, anchor="ls") min_t = min(min_t, float(bb[1])) max_b = max(max_b, float(bb[3])) x = float(bb[2]) except Exception: try: w, h = draw_ctx.textsize(run, font=f) except Exception: w, h = (len(run) * size * 0.5, size) min_t = min(min_t, -float(h) * 0.8) max_b = max(max_b, float(h) * 0.2) x += float(w) total_w = max(1.0, x) total_h = max(1.0, max_b - min_t) cx, cy = center_xy start_x = float(cx) - (total_w / 2.0) baseline_y = float(cy) - (total_h / 2.0) - min_t x = start_x for run, is_th in runs: if run == "\n": continue f = f_th if is_th else f_lat draw_ctx.text((x, baseline_y), run, font=f, fill=fill, anchor="ls") try: x += float(draw_ctx.textlength(run, font=f)) except Exception: try: w, _ = draw_ctx.textsize(run, font=f) except Exception: w = len(run) * size * 0.5 x += float(w) def _draw_text_baseline_fallback(draw, pos, text, thai_path, latin_path, size, fill): t = _sanitize_draw_text(text) if not t: return 0.0, 0.0 f_th, f_lat = _get_font_pair(thai_path, latin_path, size) runs = _split_runs_for_fallback(t) x0, y0 = pos x = float(x0) max_ascent = 0 max_descent = 0 for run, is_th in runs: if run == "\n": continue f = f_th if is_th else f_lat try: ascent, descent = f.getmetrics() except Exception: ascent, descent = size, int(size * 0.25) max_ascent = max(max_ascent, ascent) max_descent = max(max_descent, descent) draw.text((x, y0), run, font=f, fill=fill, anchor="ls") try: adv = float(f.getlength(run)) except Exception: tmp = Image.new("RGBA", (10, 10), (0, 0, 0, 0)) dtmp = ImageDraw.Draw(tmp) try: bb = dtmp.textbbox((0, 0), run, font=f, anchor="ls") adv = float(bb[2] - bb[0]) except Exception: w, _ = dtmp.textsize(run, font=f) adv = 
float(w) x += adv return float(x - x0), float(max_ascent + max_descent) def _baseline_offset_px_for_text(text: str, thai_path: str, latin_path: str, size: int): t = _sanitize_draw_text(text) if not t: return None f_th, f_lat = _get_font_pair(thai_path, latin_path, size) runs = _split_runs_for_fallback(t) tmp = Image.new("RGBA", (16, 16), (0, 0, 0, 0)) dtmp = ImageDraw.Draw(tmp) x = 0.0 min_t = 0.0 max_b = 0.0 for run, is_th in runs: if run == "\n": continue f = f_th if is_th else f_lat try: bb = dtmp.textbbox((x, 0), run, font=f, anchor="ls") min_t = min(min_t, float(bb[1])) max_b = max(max_b, float(bb[3])) x = float(bb[2]) except Exception: try: w, h = dtmp.textsize(run, font=f) except Exception: w, h = (len(run) * size * 0.5, size) min_t = min(min_t, -float(h) * 0.8) max_b = max(max_b, float(h) * 0.2) x += float(w) total_h = max(1.0, max_b - min_t) baseline_offset = -(total_h / 2.0) - min_t return baseline_offset, total_h def _line_metrics_px(text: str, thai_path: str, latin_path: str, size: int): t = _sanitize_draw_text(text) if not t: return None f_th, f_lat = _get_font_pair(thai_path, latin_path, size) runs = _split_runs_for_fallback(t) tmp = Image.new("RGBA", (16, 16), (0, 0, 0, 0)) dtmp = ImageDraw.Draw(tmp) x = 0.0 min_t = 0.0 max_b = 0.0 for run, is_th in runs: if run == "\n": continue f = f_th if is_th else f_lat try: bb = dtmp.textbbox((x, 0), run, font=f, anchor="ls") min_t = min(min_t, float(bb[1])) max_b = max(max_b, float(bb[3])) x = float(bb[2]) except Exception: try: w, h = dtmp.textsize(run, font=f) except Exception: w, h = (len(run) * size * 0.5, size) min_t = min(min_t, -float(h) * 0.8) max_b = max(max_b, float(h) * 0.2) x += float(w) width = max(1.0, x) total_h = max(1.0, max_b - min_t) baseline_to_center = -((min_t + max_b) / 2.0) return width, total_h, baseline_to_center def _item_avail_w_px(item: dict, W: int, H: int) -> float: b = item.get("box") or {} w_box = float(b.get("width") or 0.0) * float(W) L = 0.0 p1 = item.get("baseline_p1") or 
{} p2 = item.get("baseline_p2") or {} if ("x" in p1 and "y" in p1 and "x" in p2 and "y" in p2): dx = (float(p2.get("x") or 0.0) - float(p1.get("x") or 0.0)) * float(W) dy = (float(p2.get("y") or 0.0) - float(p1.get("y") or 0.0)) * float(H) L = float(math.hypot(dx, dy)) avail = max(w_box, L) return max(1.0, float(avail)) def _item_avail_h_px(item: dict, H: int) -> float: b = item.get("box") or {} return max(1.0, (float(b.get("height") or 0.0) * float(H)) - 2.0) def _item_line_text(item: dict) -> str: t = str(item.get("text") or "") if t.strip(): return t spans = item.get("spans") or [] return "".join(str(s.get("text") or "") for s in spans) def _compute_fit_size_px_for_item(item: dict, thai_path: str, latin_path: str, W: int, H: int, base_size: int = 96) -> int | None: item.pop("_tp_wrap", None) text = _item_line_text(item) if not text.strip(): return None m = _line_metrics_px(text, thai_path, latin_path, base_size) if m is None: return None tw, th, _ = m avail_w = _item_avail_w_px(item, W, H) avail_h = _item_avail_h_px(item, H) if tw <= 1e-6 or th <= 1e-6: return None is_thai = any(_is_thai_char(ch) for ch in text) scale_w = (avail_w * 0.98) / tw scale_h = (avail_h * (0.90 if is_thai else 0.94)) / th scale = min(scale_w, scale_h) if scale <= 0: return None size = max(10, int(base_size * scale)) while size > 10: mm = _line_metrics_px(text, thai_path, latin_path, size) if mm is None: return None tw2, th2, _ = mm if (tw2 <= avail_w * 0.999) and (th2 <= avail_h * 0.999): break size -= 1 if size <= 12 and avail_h >= 24: tw0, th0, _ = m if tw0 > (avail_w * 1.2): def _wrap_fits(s: int) -> bool: if s <= 0: return False k = float(s) / float(base_size) tw = float(tw0) * k th = float(th0) * k lines = int(math.ceil(max(1.0, tw) / max(1.0, avail_w))) return (float(lines) * th) <= float(avail_h) hi = int(min(max(16, avail_h), base_size * 3)) lo = int(size) best = int(size) while lo <= hi: mid = (lo + hi) // 2 if _wrap_fits(mid): best = int(mid) lo = mid + 1 else: hi = mid - 1 if 
best >= int(size * 1.25): item["_tp_wrap"] = True size = int(best) return int(size) def fit_tree_font_sizes_for_tp_html(tree: dict, thai_path: str, latin_path: str, W: int, H: int) -> dict: paras = tree.get("paragraphs") or [] for p in paras: items = p.get("items") or [] if not items: continue per_item_fit: dict[int, int] = {} fits: list[int] = [] for i, it in enumerate(items): s = _compute_fit_size_px_for_item(it, thai_path, latin_path, W, H) if s is None: continue per_item_fit[i] = int(s) fits.append(int(s)) if not fits: continue fits.sort() p["para_font_size_px"] = int(fits[len(fits) // 2]) for i, it in enumerate(items): fs = per_item_fit.get(i) if fs is None: continue it["font_size_px"] = int(fs) for sp in (it.get("spans") or []): sp["font_size_px"] = int(fs) return tree def _iter_paragraphs(tree: dict): ps = (tree or {}).get("paragraphs") or [] for i, p in enumerate(ps): yield i, p def _apply_para_font_size(tree: dict, para_sizes: dict[int, int]): if not tree: return for pi, p in _iter_paragraphs(tree): sz = para_sizes.get(pi) if not sz: continue p["para_font_size_px"] = int(sz) for it in (p.get("items") or []): it["font_size_px"] = int(sz) for sp in (it.get("spans") or []): sp["font_size_px"] = int(sz) def _compute_shared_para_sizes(trees: list[dict], thai_path: str, latin_path: str, W: int, H: int) -> dict[int, int]: sizes: dict[int, int] = {} for tree in trees: if not tree: continue for pi, p in _iter_paragraphs(tree): for it in (p.get("items") or []): fit = _compute_fit_size_px_for_item( it, thai_path, latin_path, W, H) if fit is None: continue cur = sizes.get(pi) sizes[pi] = fit if cur is None else min(cur, fit) vals = [v for v in sizes.values() if isinstance(v, int) and v > 0] if not vals: return sizes vals.sort() mid = len(vals) // 2 target = vals[mid] if (len(vals) % 2 == 1) else int( round((vals[mid - 1] + vals[mid]) / 2)) for k in list(sizes.keys()): try: sizes[k] = int(min(int(sizes[k]), int(target))) except Exception: pass return sizes def 
_sanitize_draw_text(s: str) -> str: t = (s or "").replace("\r\n", "\n").replace("\r", "\n") t = t.replace("\u200b", "").replace("\ufeff", "") t = "".join(ch for ch in t if (ch == "\n") or ( unicodedata.category(ch)[0] != "C")) return t def _token_box_px(t, W, H, pad_px=0): b = t.get("box") or {} left = int(round(float(b.get("left", 0.0)) * W)) - pad_px top = int(round(float(b.get("top", 0.0)) * H)) - pad_px right = int(round((float(b.get("left", 0.0)) + float(b.get("width", 0.0))) * W)) + pad_px bottom = int( round((float(b.get("top", 0.0)) + float(b.get("height", 0.0))) * H)) + pad_px left = max(0, min(W, left)) top = max(0, min(H, top)) right = max(0, min(W, right)) bottom = max(0, min(H, bottom)) if right <= left or bottom <= top: return None return left, top, right, bottom def _token_quad_px(t, W, H, pad_px=0, apply_baseline_shift=True): if not t.get("valid_text"): return None p1 = t.get("baseline_p1") or {} p2 = t.get("baseline_p2") or {} x1 = float(p1.get("x", 0.0)) * W y1 = float(p1.get("y", 0.0)) * H x2 = float(p2.get("x", 0.0)) * W y2 = float(p2.get("y", 0.0)) * H dx = x2 - x1 dy = y2 - y1 if dx < 0 or (abs(dx) < 1e-12 and dy < 0): x1, y1, x2, y2 = x2, y2, x1, y1 dx = x2 - x1 dy = y2 - y1 L = math.hypot(dx, dy) if L <= 1e-9: return None ux = dx / L uy = dy / L nx = -uy ny = ux if ny < 0: nx, ny = -nx, -ny t0 = float(t.get("t0_raw") if t.get("t0_raw") is not None else 0.0) t1 = float(t.get("t1_raw") if t.get("t1_raw") is not None else 1.0) sx = x1 + ux * (t0 * L) sy = y1 + uy * (t0 * L) ex = x1 + ux * (t1 * L) ey = y1 + uy * (t1 * L) h = max(1.0, float(t.get("height_raw") or 0.0) * H) if apply_baseline_shift and BASELINE_SHIFT: shift = h * BASELINE_SHIFT_FACTOR sx += nx * shift sy += ny * shift ex += nx * shift ey += ny * shift pad = max(0.0, float(pad_px)) sx -= ux * pad sy -= uy * pad ex += ux * pad ey += uy * pad hh = (h / 2.0) + pad ox = nx * hh oy = ny * hh return [(sx - ox, sy - oy), (ex - ox, ey - oy), (ex + ox, ey + oy), (sx + ox, sy + oy)] def 
_token_box_quad_px(t, W, H, pad_px=0): b = t.get("box") or {} w = float(b.get("width", 0.0)) * W h = float(b.get("height", 0.0)) * H if w <= 0.0 or h <= 0.0: return None left = float(b.get("left", 0.0)) * W top = float(b.get("top", 0.0)) * H cx = left + (w / 2.0) cy = top + (h / 2.0) hw = (w / 2.0) + float(pad_px) hh = (h / 2.0) + float(pad_px) angle_deg = float(b.get("rotation_deg", 0.0)) rad = math.radians(angle_deg) c = math.cos(rad) s = math.sin(rad) corners = [(-hw, -hh), (hw, -hh), (hw, hh), (-hw, hh)] out = [] for x, y in corners: rx = (x * c) - (y * s) ry = (x * s) + (y * c) out.append((cx + rx, cy + ry)) return out def _quad_bbox(quad, W, H): xs = [p[0] for p in quad] ys = [p[1] for p in quad] l = max(0, min(W, int(math.floor(min(xs))))) t = max(0, min(H, int(math.floor(min(ys))))) r = max(0, min(W, int(math.ceil(max(xs))))) b = max(0, min(H, int(math.ceil(max(ys))))) if r <= l or b <= t: return None return l, t, r, b def _median_rgba(pixels): if not pixels: return None rs = sorted(p[0] for p in pixels) gs = sorted(p[1] for p in pixels) bs = sorted(p[2] for p in pixels) a = 255 mid = len(rs) // 2 return (rs[mid], gs[mid], bs[mid], a) def _rel_luminance(rgb): r, g, b = rgb def lin(c): c = c / 255.0 return c / 12.92 if c <= 0.04045 else ((c + 0.055) / 1.055) ** 2.4 return 0.2126 * lin(r) + 0.7152 * lin(g) + 0.0722 * lin(b) def _contrast_ratio(l1, l2): a = max(l1, l2) + 0.05 b = min(l1, l2) + 0.05 return a / b def _pick_bw_text_color(bg_rgb): Lb = _rel_luminance(bg_rgb) c_black = _contrast_ratio(Lb, 0.0) c_white = _contrast_ratio(Lb, 1.0) return TEXT_COLOR_LIGHT if c_white >= c_black else TEXT_COLOR_DARK def _sample_bg_color_from_quad(base_rgb, quad, rect, border_px=3, margin_px=6): l, t, r, b = rect w = r - l h = b - t if w <= 0 or h <= 0: return _sample_bg_color(base_rgb, rect, margin_px) mask = Image.new("L", (w, h), 0) d = ImageDraw.Draw(mask) qrel = [(x - l, y - t) for x, y in quad] d.polygon(qrel, fill=255) bp = int(max(0, border_px or 0)) if bp > 0: k 
= min(w, h) bp = min(bp, max(1, (k - 1) // 2)) if bp > 0: er = mask.filter(ImageFilter.MinFilter(size=bp * 2 + 1)) border = ImageChops.subtract(mask, er) else: border = mask region = base_rgb.crop((l, t, r, b)) rp = list(region.getdata()) mp = list(border.getdata()) samples = [p for p, m in zip(rp, mp) if m > 0] if len(samples) < 24: ext = _sample_bg_color(base_rgb, rect, margin_px) return ext med = _median_rgba(samples) if med: return med[:3] return _sample_bg_color(base_rgb, rect, margin_px) def _sample_bg_color(base_rgb, rect, margin_px): W, H = base_rgb.size l, t, r, b = rect m = max(1, int(margin_px)) samples = [] def add_strip(x0, y0, x1, y1): x0 = max(0, min(W, x0)) y0 = max(0, min(H, y0)) x1 = max(0, min(W, x1)) y1 = max(0, min(H, y1)) if x1 <= x0 or y1 <= y0: return samples.extend(list(base_rgb.crop((x0, y0, x1, y1)).getdata())) add_strip(l, t - m, r, t) add_strip(l, b, r, b + m) add_strip(l - m, t, l, b) add_strip(r, t, r + m, b) med = _median_rgba(samples) if med: return med[:3] return base_rgb.getpixel((max(0, min(W - 1, l)), max(0, min(H - 1, t)))) def _sample_bg_color_from_quad_ring(base_rgb, quad, rect, ring_px=4): W, H = base_rgb.size l, t, r, b = rect w = r - l h = b - t if w <= 0 or h <= 0: return None mask = np.zeros((h, w), dtype=np.uint8) pts = np.array([[(x - l, y - t) for x, y in quad]], dtype=np.int32) cv2.fillPoly(mask, pts, 255) rp = int(max(1, ring_px or 1)) k = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (rp * 2 + 1, rp * 2 + 1)) dil = cv2.dilate(mask, k, iterations=1) ring = cv2.bitwise_and(dil, cv2.bitwise_not(mask)) rgb = np.array(base_rgb.crop((l, t, r, b)).convert("RGB"), dtype=np.uint8) sel = rgb[ring > 0] if sel.size < 24: return None med = np.median(sel, axis=0) return (int(med[0]), int(med[1]), int(med[2])) def _pixelate(img, block_px): w, h = img.size if w <= 1 or h <= 1: return img block_px = int(block_px or 1) if block_px < 1: block_px = 1 sw = max(1, w // block_px) sh = max(1, h // block_px) return img.resize((sw, sh), 
resample=Image.NEAREST).resize((w, h), resample=Image.NEAREST) def _mean_abs_diff(a, b): if a.size != b.size: return 1e18 a = a.convert("RGB") b = b.convert("RGB") da = list(a.getdata()) db = list(b.getdata()) if not da: return 1e18 s = 0 for (ar, ag, ab), (br, bg, bb) in zip(da, db): s += abs(ar - br) + abs(ag - bg) + abs(ab - bb) return s / (len(da) * 3) def _resize_small(img, max_w=64, max_h=64): w, h = img.size if w <= 0 or h <= 0: return img scale = min(max_w / w, max_h / h, 1.0) nw = max(1, int(w * scale)) nh = max(1, int(h * scale)) return img.resize((nw, nh), resample=Image.BILINEAR) def _clone_candidate_score(base, rect, cand_rect, direction, border_px): W, H = base.size l, t, r, b = rect cl, ct, cr, cb = cand_rect w = r - l h = b - t if w <= 1 or h <= 1: return 1e18 border_px = max(1, int(border_px or 1)) if direction == "up": a = base.crop((l, max(0, t - border_px), r, t)) d = base.crop((cl, max(0, cb - border_px), cr, cb)) elif direction == "down": a = base.crop((l, b, r, min(H, b + border_px))) d = base.crop((cl, ct, cr, min(H, ct + border_px))) elif direction == "left": a = base.crop((max(0, l - border_px), t, l, b)) d = base.crop((max(0, cr - border_px), ct, cr, cb)) else: a = base.crop((r, t, min(W, r + border_px), b)) d = base.crop((cl, ct, min(W, cl + border_px), cb)) a = _resize_small(a, 64, 16) d = _resize_small(d, 64, 16) return _mean_abs_diff(a, d) def _choose_clone_rect(base, rect, gap_px, border_px): W, H = base.size l, t, r, b = rect w = r - l h = b - t gap_px = max(0, int(gap_px or 0)) cands = [] up = (l, t - gap_px - h, r, t - gap_px) down = (l, b + gap_px, r, b + gap_px + h) left = (l - gap_px - w, t, l - gap_px, b) right = (r + gap_px, t, r + gap_px + w, b) for direction, (cl, ct, cr, cb) in [("up", up), ("down", down), ("left", left), ("right", right)]: if cl < 0 or ct < 0 or cr > W or cb > H: continue cand_rect = (cl, ct, cr, cb) score = _clone_candidate_score( base, rect, cand_rect, direction, border_px) cands.append((score, 
cand_rect)) if not cands: return None cands.sort(key=lambda x: x[0]) return cands[0][1] def _erase_with_clone(base, rect, mask, gap_px, border_px, feather_px): l, t, r, b = rect cand = _choose_clone_rect(base, rect, gap_px, border_px) if not cand: return False cl, ct, cr, cb = cand donor = base.crop((cl, ct, cr, cb)) region = base.crop((l, t, r, b)) feather_px = max(0, int(feather_px or 0)) if feather_px > 0: m = mask.filter(ImageFilter.GaussianBlur(radius=feather_px)) else: m = mask merged = Image.composite(donor, region, m) base.paste(merged, (l, t)) return True def _erase_with_blend_patches(base, rect, mask, gap_px=3, feather_px=4): l, t, r, b = rect W, H = base.size w = r - l h = b - t if w <= 2 or h <= 2: return False gap = int(max(0, gap_px)) candidates = [] dirs = [(0, -(h + gap)), (0, (h + gap)), (-(w + gap), 0), ((w + gap), 0), (-(w + gap), -(h + gap)), ((w + gap), -(h + gap)), (-(w + gap), (h + gap)), ((w + gap), (h + gap))] for dx, dy in dirs: ll = l + dx tt = t + dy rr = ll + w bb = tt + h if ll < 0 or tt < 0 or rr > W or bb > H: continue candidates.append(base.crop((ll, tt, rr, bb)).convert("RGB")) if not candidates: return False acc = candidates[0] for c in candidates[1:]: acc = ImageChops.add(acc, c, scale=1.0, offset=0) n = len(candidates) blended = acc.point(lambda p: int(p / n)) m = mask fp = int(max(0, feather_px)) if fp > 0: m = m.filter(ImageFilter.GaussianBlur(radius=fp)) region = base.crop((l, t, r, b)).convert("RGB") merged = Image.composite(blended, region, m) base.paste(merged, (l, t)) return True def _erase_with_inpaint(base, box_tokens, pad_px=2): if not box_tokens: return base rgb = base.convert("RGB") W, H = rgb.size mask = Image.new("L", (W, H), 0) d = ImageDraw.Draw(mask) for t in box_tokens: quad = _token_box_quad_px(t, W, H, pad_px=pad_px) if not quad: quad = _token_quad_px(t, W, H, pad_px=pad_px, apply_baseline_shift=True) if not quad: rect = _token_box_px(t, W, H, pad_px=pad_px) if not rect: continue l, tt, r, bb = rect quad = 
[(l, tt), (r, tt), (r, bb), (l, bb)] d.polygon(quad, fill=255) m = np.array(mask, dtype=np.uint8) ys, xs = np.where(m > 0) if xs.size == 0 or ys.size == 0: return rgb l = int(max(0, xs.min() - 8)) t = int(max(0, ys.min() - 8)) r = int(min(W, xs.max() + 1 + 8)) b = int(min(H, ys.max() + 1 + 8)) if r <= l or b <= t: return rgb crop_rgb = np.array(rgb.crop((l, t, r, b)), dtype=np.uint8) crop_m = m[t:b, l:r] dpx = int(max(0, INPAINT_DILATE_PX or 0)) if dpx > 0: k = cv2.getStructuringElement( cv2.MORPH_ELLIPSE, (dpx * 2 + 1, dpx * 2 + 1)) crop_m = cv2.dilate(crop_m, k, iterations=1) bgr = cv2.cvtColor(crop_rgb, cv2.COLOR_RGB2BGR) method = (INPAINT_METHOD or "telea").strip().lower() flag = cv2.INPAINT_TELEA if method in ("telea", "t") else cv2.INPAINT_NS radius = float(INPAINT_RADIUS or 3) out_bgr = cv2.inpaint(bgr, crop_m, radius, flag) out_rgb = cv2.cvtColor(out_bgr, cv2.COLOR_BGR2RGB) out = rgb.copy() out.paste(Image.fromarray(out_rgb), (l, t)) return out def erase_text_with_boxes(img, box_tokens, pad_px=2, sample_margin_px=6, mode=None, mosaic_block_px=None): if not box_tokens: return img mode = (mode or ERASE_MODE or "solid").strip().lower() mosaic_block_px = int(mosaic_block_px or ERASE_MOSAIC_BLOCK_PX or 10) base = img.convert("RGB").copy() if mode in ("inpaint", "cv2", "opencv"): return _erase_with_inpaint(base, box_tokens, pad_px=pad_px) W, H = base.size for t in box_tokens: quad = _token_box_quad_px(t, W, H, pad_px=pad_px) if not quad: quad = _token_quad_px(t, W, H, pad_px=pad_px, apply_baseline_shift=True) if not quad: rect = _token_box_px(t, W, H, pad_px=pad_px) if not rect: continue l, tt, r, bb = rect quad = [(l, tt), (r, tt), (r, bb), (l, bb)] rect = _quad_bbox(quad, W, H) if not rect: continue l, tt, r, bb = rect region = base.crop((l, tt, r, bb)) mask = Image.new("L", (r - l, bb - tt), 0) mdraw = ImageDraw.Draw(mask) qrel = [(x - l, y - tt) for x, y in quad] mdraw.polygon(qrel, fill=255) if mode in ("blend_patch", "blend", "avg_patch", "patch"): ok = 
_erase_with_blend_patches( base, rect, mask, ERASE_BLEND_GAP_PX, ERASE_BLEND_FEATHER_PX) if ok: continue mode = "solid" if mode == "clone": ok = _erase_with_clone( base, rect, mask, ERASE_CLONE_GAP_PX, ERASE_CLONE_BORDER_PX, ERASE_CLONE_FEATHER_PX) if ok: continue mode = "solid" if mode == "mosaic": pixelated = _pixelate(region, mosaic_block_px) merged = Image.composite(pixelated, region, mask) base.paste(merged, (l, tt)) else: color = _sample_bg_color_from_quad( base, quad, rect, BG_SAMPLE_BORDER_PX, sample_margin_px) region.paste(color, mask=mask) base.paste(region, (l, tt)) return base def draw_overlay(img, tokens, out_path, thai_path, latin_path, level_outlines=None, font_scale: float = 1.0, fit_to_box: bool = True): base = img.convert("RGBA") base_rgb = img.convert("RGB") overlay = Image.new("RGBA", base.size, (0, 0, 0, 0)) draw = ImageDraw.Draw(overlay) for ol in (level_outlines or []): q = ol.get("quad") if not q: continue col = ol.get("color", BOX_OUTLINE) w = int(ol.get("width", 2)) draw.line(q + [q[0]], fill=col, width=w) W, H = base.size for t in tokens: b = t.get("box") or {} box_quad = _token_box_quad_px(t, W, H, pad_px=0) use_box_center = False if box_quad: lq, tq, rq, bq = _quad_bbox(box_quad, W, H) box_cx = (lq + rq) / 2.0 box_cy = (tq + bq) / 2.0 box_w = max(1.0, float(rq - lq)) box_h = max(1.0, float(bq - tq)) use_box_center = True else: left0 = float(b.get("left", 0.0)) * W top0 = float(b.get("top", 0.0)) * H box_w = max(1.0, float(b.get("width", 0.0)) * W) box_h = max(1.0, float(b.get("height", 0.0)) * H) box_cx = left0 + (box_w / 2.0) box_cy = top0 + (box_h / 2.0) if DRAW_OUTLINE_SPAN and DRAW_BOX_OUTLINE: quad = _token_box_quad_px(t, W, H, pad_px=0) if quad: draw.line(quad + [quad[0]], fill=SPAN_OUTLINE, width=SPAN_OUTLINE_WIDTH) else: left = b["left"] * W top = b["top"] * H width = b["width"] * W height = b["height"] * H draw.rectangle([left, top, left + width, top + height], outline=SPAN_OUTLINE, width=SPAN_OUTLINE_WIDTH) text = 
_sanitize_draw_text(t.get("text") or "") if text.strip() == "": continue p1 = t["baseline_p1"] p2 = t["baseline_p2"] x1 = float(p1["x"]) * W y1 = float(p1["y"]) * H x2 = float(p2["x"]) * W y2 = float(p2["y"]) * H dx = x2 - x1 dy = y2 - y1 if dx < 0 or (abs(dx) < 1e-12 and dy < 0): x1, y1, x2, y2 = x2, y2, x1, y1 dx = x2 - x1 dy = y2 - y1 L = math.hypot(dx, dy) if L <= 1e-9: continue ux = dx / L uy = dy / L t0 = float(t.get("t0_raw") if t.get("t0_raw") is not None else 0.0) t1 = float(t.get("t1_raw") if t.get("t1_raw") is not None else 1.0) sx = x1 + ux * (t0 * L) sy = y1 + uy * (t0 * L) ex = x1 + ux * (t1 * L) ey = y1 + uy * (t1 * L) avail_w = box_w avail_h = box_h if BASELINE_SHIFT and (not use_box_center): nx, ny = -uy, ux shift = avail_h * BASELINE_SHIFT_FACTOR sx += nx * shift sy += ny * shift angle_deg = float(b.get("rotation_deg", 0.0)) forced_size = t.get("font_size_px") if forced_size is not None: final_size = int( max(10, round(float(forced_size) * float(font_scale)))) font = pick_font(text, thai_path, latin_path, final_size) if fit_to_box: tmpc = Image.new("RGBA", (10, 10), (0, 0, 0, 0)) dc = ImageDraw.Draw(tmpc) try: bbc = dc.textbbox((0, 0), text, font=font, anchor="ls") twc = float(bbc[2] - bbc[0]) thc = float(bbc[3] - bbc[1]) except Exception: twc, thc = dc.textsize(text, font=font) twc = float(twc) thc = float(thc) if twc > 0 and thc > 0 and (twc > avail_w or thc > avail_h): s = min(avail_w / twc, avail_h / thc) if s < 1.0: final_size = max(10, int(final_size * s)) font = pick_font( text, thai_path, latin_path, final_size) else: base_size = 96 font0 = pick_font(text, thai_path, latin_path, base_size) tmp = Image.new("RGBA", (10, 10), (0, 0, 0, 0)) dtmp = ImageDraw.Draw(tmp) try: bb = dtmp.textbbox((0, 0), text, font=font0, anchor="ls") tw = bb[2] - bb[0] th = bb[3] - bb[1] except Exception: tw, th = dtmp.textsize(text, font=font0) if tw <= 0 or th <= 0: continue scale = min(avail_w / tw, avail_h / th) final_size = max(10, int(base_size * scale)) if 
not fit_to_box: final_size = max(10, int(final_size * float(font_scale))) font = pick_font(text, thai_path, latin_path, final_size) tmp2 = Image.new("RGBA", (10, 10), (0, 0, 0, 0)) d2 = ImageDraw.Draw(tmp2) try: bb2 = d2.textbbox((0, 0), text, font=font, anchor="ls") tw2 = bb2[2] - bb2[0] th2 = bb2[3] - bb2[1] except Exception: tw2, th2 = d2.textsize(text, font=font) side = int(max(tw2, th2, avail_h, avail_w) * 2.2 + 40) side = min(side, int(max(W, H) * 4)) if side < 128: side = 128 canvas = Image.new("RGBA", (side, side), (0, 0, 0, 0)) dc = ImageDraw.Draw(canvas) fill = TEXT_COLOR if AUTO_TEXT_COLOR: q = _token_box_quad_px(t, W, H, pad_px=0) if q: rr = _quad_bbox(q, W, H) if rr: bg = _sample_bg_color_from_quad_ring( base_rgb, q, rr, ring_px=max(2, BG_SAMPLE_BORDER_PX)) if bg is None: bg = _sample_bg_color_from_quad( base_rgb, q, rr, BG_SAMPLE_BORDER_PX, ERASE_SAMPLE_MARGIN_PX) fill = _pick_bw_text_color(bg) else: rr = _token_box_px(t, W, H, pad_px=0) if rr: bg = _sample_bg_color(base_rgb, rr, ERASE_SAMPLE_MARGIN_PX) fill = _pick_bw_text_color(bg) origin = (side // 2, side // 2) p1 = t.get("baseline_p1") or {} p2 = t.get("baseline_p2") or {} has_baseline = ("x" in p1 and "y" in p1 and "x" in p2 and "y" in p2) if has_baseline: x1 = float(p1.get("x") or 0.0) * float(W) y1 = float(p1.get("y") or 0.0) * float(H) x2 = float(p2.get("x") or 0.0) * float(W) y2 = float(p2.get("y") or 0.0) * float(H) dx = x2 - x1 dy = y2 - y1 Lb = float(math.hypot(dx, dy)) if Lb <= 1e-6: Lb = 1.0 ux = dx / Lb uy = dy / Lb nx = -uy ny = ux bb = t.get("box") or {} cx = (float(bb.get("left") or 0.0) + float(bb.get("width") or 0.0) / 2.0) * float(W) cy = (float(bb.get("top") or 0.0) + float(bb.get("height") or 0.0) / 2.0) * float(H) tt = _sanitize_draw_text(text) if not tt: continue font_m = pick_font(tt, thai_path, latin_path, final_size) try: tw = float(font_m.getlength(tt)) except Exception: tmp = Image.new("RGBA", (10, 10), (0, 0, 0, 0)) dtmp = ImageDraw.Draw(tmp) try: bbm = 
dtmp.textbbox((0, 0), tt, font=font_m, anchor="ls") tw = float(bbm[2] - bbm[0]) except Exception: tw, _ = dtmp.textsize(tt, font=font_m) tw = float(tw) f_th, f_lat = _get_font_pair(thai_path, latin_path, final_size) try: a_th, d_th = f_th.getmetrics() except Exception: a_th, d_th = final_size, int(final_size * 0.25) try: a_lat, d_lat = f_lat.getmetrics() except Exception: a_lat, d_lat = final_size, int(final_size * 0.25) ascent = float(max(a_th, a_lat)) descent = float(max(d_th, d_lat)) center_y_rel = (-ascent + descent) / 2.0 bx = cx - ux * (tw / 2.0) - nx * center_y_rel by = cy - uy * (tw / 2.0) - ny * center_y_rel angle_deg = float(math.degrees(math.atan2(dy, dx))) _draw_text_baseline_fallback( dc, origin, text, thai_path, latin_path, final_size, fill) rotated = canvas.rotate(-angle_deg, resample=Image.BICUBIC, expand=False, center=origin) paste_x = int(round(bx - origin[0])) paste_y = int(round(by - origin[1])) overlay.alpha_composite(rotated, dest=(paste_x, paste_y)) else: _draw_text_centered_fallback( dc, origin, text, thai_path, latin_path, final_size, fill) rotated = canvas.rotate(-angle_deg, resample=Image.BICUBIC, expand=False, center=origin) paste_x = int(round(box_cx - origin[0])) paste_y = int(round(box_cy - origin[1])) overlay.alpha_composite(rotated, dest=(paste_x, paste_y)) out = Image.alpha_composite(base, overlay).convert("RGB") out.save(out_path) def get_lens_data_from_image(image_path, firebase_url, lang): ck = _get_firebase_cookie(firebase_url) with open(image_path, "rb") as f: img_bytes = f.read() hdr = {"User-Agent": "Mozilla/5.0", "Referer": "https://lens.google.com/"} with httpx.Client(cookies=ck, headers=hdr, follow_redirects=False, timeout=60) as c: r = c.post( "https://lens.google.com/v3/upload", files={"encoded_image": ("file.jpg", img_bytes, "image/jpeg")}, ) if r.status_code not in (302, 303): raise Exception(f"Upload failed: {r.status_code}\n{r.text}") redirect = r.headers["location"] u = to_translated(redirect, lang=lang) with 
httpx.Client(cookies=ck, headers=hdr, timeout=60) as c: j = c.get(u).text data = json.loads(j[5:] if j.startswith(")]}'") else j) return data def _get_firebase_cookie(firebase_url: str): u = (firebase_url or '').strip() now = time.time() cache = _FIREBASE_COOKIE_CACHE if cache.get('data') and cache.get('url') == u and (now - float(cache.get('ts') or 0)) < float(FIREBASE_COOKIE_TTL_SEC): return cache.get('data') r = httpx.get(u, timeout=30) ck = r.json() cache['ts'] = now cache['url'] = u cache['data'] = ck return ck def warmup(lang: str = "th") -> dict: l = _normalize_lang(lang) cookie_ok = False try: _get_firebase_cookie(FIREBASE_URL) cookie_ok = True except Exception: pass thai_font = FONT_THAI_PATH latin_font = FONT_LATIN_PATH if l == "ja": latin_font = FONT_JA_PATH elif l in ("zh", "zh-hans", "zh_cn", "zh-cn", "zh_hans"): latin_font = FONT_ZH_SC_PATH elif l in ("zh-hant", "zh_tw", "zh-tw", "zh_hant"): latin_font = FONT_ZH_TC_PATH if FONT_DOWNLOD: thai_font = ensure_font(thai_font, FONT_THAI_URLS) if l == "ja": latin_font = ensure_font(latin_font, FONT_JA_URLS) elif l in ("zh", "zh-hans", "zh_cn", "zh-cn", "zh_hans"): latin_font = ensure_font(latin_font, FONT_ZH_SC_URLS) elif l in ("zh-hant", "zh_tw", "zh-tw", "zh_hant"): latin_font = ensure_font(latin_font, FONT_ZH_TC_URLS) else: latin_font = ensure_font(latin_font, FONT_LATIN_URLS) _get_font_pair(thai_font or "", latin_font or "", 22) _get_font_pair(thai_font or "", latin_font or "", 28) return {"ok": True, "lang": l, "thai_font": thai_font or "", "latin_font": latin_font or "", "cookie_ok": cookie_ok} def main(): data = get_lens_data_from_image(IMAGE_PATH, FIREBASE_URL, LANG) img = Image.open(IMAGE_PATH).convert("RGB") W, H = img.size thai_font = FONT_THAI_PATH latin_font = FONT_LATIN_PATH lang = _normalize_lang(LANG) if lang == "ja": latin_font = FONT_JA_PATH elif lang in ("zh", "zh-hans", "zh_cn", "zh-cn", "zh_hans"): latin_font = FONT_ZH_SC_PATH elif lang in ("zh-hant", "zh_tw", "zh-tw", "zh_hant"): 
latin_font = FONT_ZH_TC_PATH if FONT_DOWNLOD: thai_font = ensure_font(thai_font, FONT_THAI_URLS) if lang == "ja": latin_font = ensure_font(latin_font, FONT_JA_URLS) elif lang in ("zh", "zh-hans", "zh_cn", "zh-cn", "zh_hans"): latin_font = ensure_font(latin_font, FONT_ZH_SC_URLS) elif lang in ("zh-hant", "zh_tw", "zh-tw", "zh_hant"): latin_font = ensure_font(latin_font, FONT_ZH_TC_URLS) else: latin_font = ensure_font(latin_font, FONT_LATIN_URLS) image_url = data.get("imageUrl") if isinstance(data, dict) else None image_datauri = "" if DECODE_IMAGEURL_TO_DATAURI and image_url: image_datauri = decode_imageurl_to_datauri(image_url) out = { "imageUrl": image_url, "imageDataUri": image_datauri, "originalContentLanguage": data.get("originalContentLanguage"), "originalTextFull": data.get("originalTextFull"), "translatedTextFull": data.get("translatedTextFull"), "AiTextFull": "", "originalParagraphs": data.get("originalParagraphs") or [], "translatedParagraphs": data.get("translatedParagraphs") or [], "original": {}, "translated": {}, "Ai": {}, } original_span_tokens = None original_tree = None translated_tree = None def _base_img_for_overlay() -> Image.Image: if not (ERASE_OLD_TEXT_WITH_ORIGINAL_BOXES and original_span_tokens): return img return erase_text_with_boxes( img, original_span_tokens, pad_px=ERASE_PADDING_PX, sample_margin_px=ERASE_SAMPLE_MARGIN_PX, ) if DO_ORIGINAL: tree, _ = decode_tree( data.get("originalParagraphs") or [], data.get("originalTextFull") or "", "original", W, H, want_raw=False, ) original_tree = tree original_span_tokens = flatten_tree_spans(tree) out["original"] = {"originalTree": tree} if DO_ORIGINAL_HTML: out["original"]["originalhtml"] = tokens_to_html( original_span_tokens) if DRAW_OVERLAY_ORIGINAL: base_img = _base_img_for_overlay() draw_overlay( base_img, original_span_tokens, OVERLAY_ORIGINAL_PATH, thai_font or "", latin_font or "", level_outlines=build_level_outlines(original_tree, W, H), ) if DO_AI and original_tree is None: tree0, _ = 
decode_tree( data.get("originalParagraphs") or [], data.get("originalTextFull") or "", "original", W, H, want_raw=False, ) original_tree = tree0 if DO_TRANSLATED: tree, _ = decode_tree( data.get("translatedParagraphs") or [], data.get("translatedTextFull") or "", "translated", W, H, want_raw=False, ) translated_tree = tree out["translated"] = {"translatedTree": tree} translated_span_tokens = flatten_tree_spans(tree) if DO_TRANSLATED_HTML: out["translated"]["translatedhtml"] = tokens_to_html( translated_span_tokens) if DRAW_OVERLAY_TRANSLATED: base_img = _base_img_for_overlay() draw_overlay( base_img, translated_span_tokens, OVERLAY_TRANSLATED_PATH, thai_font or "", latin_font or "", level_outlines=build_level_outlines(tree, W, H), font_scale=TRANSLATED_OVERLAY_FONT_SCALE, fit_to_box=TRANSLATED_OVERLAY_FIT_TO_BOX, ) ai = None if DO_AI: src_text = out.get("originalTextFull") or "" if not src_text: src_text = data.get("originalTextFull") or "" tree_for_boxes = translated_tree or original_tree if tree_for_boxes is None: tree_for_boxes, _ = decode_tree( data.get("originalParagraphs") or [], data.get("originalTextFull") or "", "original", W, H, want_raw=False, ) original_tree = tree_for_boxes ai = ai_translate_original_text( src_text, LANG, ) template_tree = translated_tree patched = patch({"Ai": {"aiTextFull": str(ai.get( "aiTextFull") or ""), "aiTree": template_tree}}, W, H, thai_font, latin_font) ai_tree = (patched.get("Ai") or {}).get("aiTree") or {} ai["aiTree"] = ai_tree shared_para_sizes = _compute_shared_para_sizes( [original_tree or {}, translated_tree or {}, ai_tree or {}], thai_font or "", latin_font or "", W, H, ) _apply_para_font_size(original_tree or {}, shared_para_sizes) _apply_para_font_size(translated_tree or {}, shared_para_sizes) _apply_para_font_size(ai_tree or {}, shared_para_sizes) _rebuild_ai_spans_after_font_resize( ai_tree or {}, W, H, thai_font or "", latin_font or "") out["AiTextFull"] = str(ai.get("aiTextFull") or "") out["Ai"] = { 
"aiTextFull": str(ai.get("aiTextFull") or ""), "aiTree": ai_tree, } if DO_AI_HTML: if AI_OVERLAY_FIT_TO_BOX: fit_tree_font_sizes_for_tp_html( ai_tree or {}, thai_font or "", latin_font or "", W, H) out["Ai"]["aihtml"] = ai_tree_to_tp_html(ai_tree, W, H) out["Ai"]["aihtmlCss"] = tp_overlay_css() out["Ai"]["aihtmlMeta"] = { "baseW": int(W), "baseH": int(H), "format": "tp", } if DO_AI_OVERLAY and translated_tree is not None: base_img = _base_img_for_overlay() tokens_for_draw = flatten_tree_spans(ai_tree) draw_overlay( base_img, tokens_for_draw, AI_PATH_OVERLAY, thai_font or "", latin_font or "", level_outlines=build_level_outlines(ai_tree, W, H), font_scale=AI_OVERLAY_FONT_SCALE, fit_to_box=AI_OVERLAY_FIT_TO_BOX, ) if HTML_INCLUDE_CSS and (DO_ORIGINAL_HTML or DO_TRANSLATED_HTML or DO_AI_HTML): out["htmlCss"] = overlay_css() out["htmlMeta"] = { "containerClass": "RTMDre", "tokenClass": "IwqbBf", "sourceWidth": int(W), "sourceHeight": int(H), } if "htmlMeta" not in out: out["htmlMeta"] = { "containerClass": "RTMDre", "tokenClass": "IwqbBf", "sourceWidth": int(W), "sourceHeight": int(H), } if WRITE_OUT_JSON: with open(OUT_JSON, "w", encoding="utf-8") as f: json.dump(out, f, ensure_ascii=False, indent=2) if __name__ == "__main__": main()