translate-test / app.py
plan291037's picture
Update app.py
2cd769f verified
# app.py — version that forces TH↔EN output to match the requested target language
import asyncio, re
from typing import Literal

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from openai import AsyncOpenAI
from langdetect import detect, DetectorFactory
# Pin langdetect's seed so repeated detections of the same text agree
# (langdetect's README recommends this for consistent results).
DetectorFactory.seed = 0
# OpenAI-compatible client pointed at a server on localhost:11434.
# NOTE(review): "ollama" looks like a placeholder key for the local server — confirm.
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
# Model identifier sent with every chat completion request.
MODEL = "scb10x/typhoon-translate-4b"
app = FastAPI(title="HF Space · Ollama Translator")
# CORS is fully open: any origin, method and header is allowed.
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], allow_methods=["*"], allow_headers=["*"],
)
class TranslateReq(BaseModel):
    """Request body for ``POST /translate``."""

    # Text to translate.
    text: str
    # Target language code. Only "en" and "th" have an entry in LANG_NAME,
    # so anything else is rejected at validation time instead of surfacing
    # later as a KeyError (HTTP 500) while the prompt is being built.
    target: Literal["en", "th"] = "en"
    # Source language code, or "auto" to have the service guess it.
    source: str = "auto"
# Language code -> English language name, as spelled in the model prompts.
LANG_NAME = {"en": "English", "th": "Thai"}
# Inclusive code point bounds of the Unicode Thai block (U+0E00..U+0E7F).
TH_RANGE = (0x0E00, 0x0E7F)
def guess_lang(text: str) -> str:
    """Best-effort guess of the language code of *text*.

    Returns "th" or "en" when the detected code starts with that prefix,
    the detector's raw code for any other language, and "auto" when
    detection raises for any reason.
    """
    try:
        code = detect(text)
        for known in ("th", "en"):
            if code.startswith(known):
                return known
        return code
    except Exception:
        # Detection is only a hint; the caller has its own fallback.
        return "auto"
def target_ratio(text: str, tgt: str) -> float:
    """Return the share of alphabetic characters in *text* that belong to *tgt*.

    For "th" a character counts when its code point lies inside TH_RANGE;
    for "en" when it is an ASCII letter. Any other target counts nothing.
    Text without alphabetic characters yields 0.0.
    """
    total = 0
    hits = 0
    for ch in text:
        if not ch.isalpha():
            continue
        total += 1
        if tgt == "th":
            hits += TH_RANGE[0] <= ord(ch) <= TH_RANGE[1]
        elif tgt == "en":
            hits += 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'
    if total == 0:
        return 0.0
    return hits / total
def is_target_strict(text: str, tgt: str, threshold: float = 0.85) -> bool:
    """Tell whether *text* is written (almost) entirely in the target script.

    True when at least *threshold* of the alphabetic characters in *text*
    are target-language letters, as measured by ``target_ratio``.
    """
    share = target_ratio(text, tgt)
    return threshold <= share
def split_text(text: str, max_chars: int = 1200):
    """Split *text* into chunks of at most *max_chars* characters.

    Chunks are cut after sentence terminators or runs of newlines where
    possible. A single sentence longer than the limit (common for Thai,
    which rarely uses full stops) is cut at the last space inside the
    window, or hard-cut when it contains no space. Concatenating the
    returned chunks always reproduces *text* exactly.

    Args:
        text: The text to split.
        max_chars: Maximum chunk length; values below 1 are treated as 1.

    Returns:
        A list of non-empty chunks (empty for empty input).
    """
    limit = max(1, max_chars)
    tokens = re.split(r'(\n+|[.!?。!?])', text)
    # With one capture group re.split alternates text, delimiter, text, ...
    # Glue each delimiter onto the text before it so that punctuation never
    # ends up at the start of the following chunk.
    units = [
        tokens[i] + (tokens[i + 1] if i + 1 < len(tokens) else "")
        for i in range(0, len(tokens), 2)
    ]
    parts, buf = [], ""
    for unit in units:
        if len(buf) + len(unit) <= limit:
            buf += unit
            continue
        if buf:
            parts.append(buf)
        # The unit alone may still exceed the limit: peel off full windows.
        while len(unit) > limit:
            cut = unit.rfind(" ", 0, limit) + 1  # keep the space on the left
            if cut <= 0:
                cut = limit
            parts.append(unit[:cut])
            unit = unit[cut:]
        buf = unit
    if buf:
        parts.append(buf)
    return parts
def build_system(src: str, tgt: str) -> str:
    """Build the system prompt for translating from *src* into *tgt*.

    Args:
        src: Source language code; codes missing from LANG_NAME are used
            verbatim in the prompt.
        tgt: Target language code; must be a key of LANG_NAME.

    Returns:
        The system prompt: task statement, requirements and few-shot examples.

    Raises:
        KeyError: If *tgt* has no entry in LANG_NAME.
    """
    src_name = LANG_NAME.get(src, src)
    tgt_name = LANG_NAME[tgt]
    ex = (
        # Few-shot examples discourage the model from echoing the source language.
        "EXAMPLES:\n"
        "Input (English): Hello!\n"
        "Output (Thai): สวัสดี!\n\n"
        "Input (English): The quick brown fox jumps over the lazy dog.\n"
        "Output (Thai): สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ\n\n"
        "Input (Thai): ยินดีที่ได้รู้จัก\n"
        "Output (English): Nice to meet you\n"
    )
    return (
        f"You are a professional MT engine for Thai↔English.\n"
        # The two codes used to be concatenated with nothing in between
        # ("Translate enth."); spell the direction out with language names.
        f"TASK: Translate from {src_name} to {tgt_name}.\n"
        f"REQUIREMENTS:\n"
        f"- Output MUST be 100% in {tgt_name}.\n"
        f"- Do NOT echo the source language. Do NOT mix languages.\n"
        f"- Preserve meaning, punctuation, numbers, line breaks.\n"
        f"- Translate pangrams naturally (do not keep them in source language).\n\n"
        f"{ex}"
    )
async def ask_ollama(system: str, content: str):
    """Send one system + user message pair to the model and return the raw completion.

    The request uses MODEL at temperature 0 and forwards the
    num_ctx / num_predict options through ``extra_body``.
    """
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": content},
    ]
    passthrough = {"options": {"num_ctx": 4096, "num_predict": -1}}
    completion = await client.chat.completions.create(
        model=MODEL,
        temperature=0,
        messages=conversation,
        extra_body=passthrough,
    )
    return completion
async def translate_chunk(chunk: str, src: str, tgt: str) -> str:
    """Translate one chunk, retrying with stricter prompts until it is in *tgt*.

    Up to three prompts are tried in order: a normal one, a "STRICT MODE"
    one, and one phrased in the target language. The first reply that
    passes ``is_target_strict`` is returned. If none passes, the non-empty
    reply with the highest ``target_ratio`` is returned (later attempts win
    ties), or "" when every reply was empty.

    Args:
        chunk: Text to translate.
        src: Source language code (used in the system prompt).
        tgt: Target language code; must be a key of LANG_NAME.

    Returns:
        The translated text with surrounding whitespace stripped, or *chunk*
        unchanged when it contains no alphabetic characters.
    """
    # Nothing to translate (numbers, punctuation, blank lines). Such output
    # could never pass the letter-ratio check, so asking the model would
    # only burn three round trips.
    if not any(ch.isalpha() for ch in chunk):
        return chunk

    system = build_system(src, tgt)
    if tgt == "th":
        # Repeating the instruction in the target language helps force pure Thai.
        final_prompt = f"โปรดแปลเป็นภาษาไทยเท่านั้น ห้ามมีอักษรอังกฤษ:\n{chunk}"
    else:
        final_prompt = f"Translate into English only. No Thai letters:\n{chunk}"
    prompts = (
        # Attempt 1: plain instruction.
        f"Translate strictly into {LANG_NAME[tgt]} ONLY:\n\n{chunk}",
        # Attempt 2: stricter wording.
        (
            f"STRICT MODE: Output MUST be in {LANG_NAME[tgt]} ONLY.\n"
            f"NO {('Thai' if tgt=='en' else 'English')} letters.\n"
            f"Translate the following:\n{chunk}"
        ),
        # Attempt 3: instruction in the target language.
        final_prompt,
    )

    attempts = []
    for prompt in prompts:
        reply = await ask_ollama(system, prompt)
        # message.content may be None; treat that as an empty reply.
        text = (reply.choices[0].message.content or "").strip()
        if is_target_strict(text, tgt):
            return text
        attempts.append(text)

    # No attempt met the threshold: return the one closest to the target
    # script. Iterating newest-first makes max() prefer later attempts on ties.
    non_empty = [text for text in reversed(attempts) if text]
    if not non_empty:
        return ""
    return max(non_empty, key=lambda text: target_ratio(text, tgt))
@app.post("/translate")
async def translate(req: TranslateReq):
    """Translate ``req.text`` into ``req.target`` and return the result.

    The text is split into chunks that are translated concurrently and
    re-joined in order. Whitespace at the edges of each chunk is put back
    around its translation so that spaces and line breaks between sentences
    survive the round trip.

    Returns:
        A dict with the translation, the model name, the source and target
        language codes that were used, and the number of chunks.
    """
    src = req.source
    if src == "auto":
        src = guess_lang(req.text)
    # Detection failed, or the source came out equal to the target: fall back
    # to a script check (any ASCII letter -> "en", otherwise "th").
    if src == "auto" or src == req.target:
        src = "en" if re.search(r"[A-Za-z]", req.text) else "th"
    chunks = split_text(req.text, max_chars=1200)
    # At most two chunks of this request are in flight at once;
    # 3-4 is fine on a stronger machine.
    sem = asyncio.Semaphore(2)

    async def run(c):
        core = c.strip()
        if not core:
            # Whitespace-only chunk: keep it verbatim, no model call needed.
            return c
        head = c[: len(c) - len(c.lstrip())]
        tail = c[len(c.rstrip()):]
        async with sem:
            body = await translate_chunk(core, src, req.target)
        # Translations come back stripped; restore the chunk's own edges so
        # adjacent chunks do not run together when joined.
        return head + body.strip() + tail

    outs = await asyncio.gather(*[run(c) for c in chunks])
    return {
        "translation": "".join(outs),
        "model": MODEL,
        "source": src,
        "target": req.target,
        "chunks": len(chunks)
    }