Spaces:
Paused
Paused
File size: 5,806 Bytes
# app.py — TH↔EN translator that forces the output to be in the requested target language
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from openai import AsyncOpenAI
from langdetect import detect, DetectorFactory
import asyncio, re
# Pin langdetect's seed so repeated detection of the same text gives the same answer.
DetectorFactory.seed = 0
# Async OpenAI-compatible client aimed at a server on localhost:11434
# (presumably a local Ollama instance, per the app title — confirm for your deployment).
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
# Model identifier sent with every chat-completion request.
MODEL = "scb10x/typhoon-translate-4b"
app = FastAPI(title="HF Space · Ollama Translator")
# CORS is fully open: any origin, any method, any header.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"], allow_methods=["*"], allow_headers=["*"],
)
class TranslateReq(BaseModel):
    # Request body accepted by POST /translate.
    text: str  # text to translate
    target: str = "en"  # "en" or "th"
    source: str = "auto"  # "auto" = guess the language from the text
# Language code -> English language name, as inserted into the prompts.
LANG_NAME = {"en": "English", "th": "Thai"}
# Inclusive code-point bounds used to recognise Thai characters (U+0E00..U+0E7F).
TH_RANGE = (0x0E00, 0x0E7F)
def guess_lang(text: str) -> str:
    """Return a best-effort language code for *text*.

    Codes starting with "th" or "en" are collapsed to exactly "th" / "en";
    any other detected code is passed through unchanged. If detection fails
    for any reason the sentinel "auto" is returned instead of raising.
    """
    try:
        code = detect(text)
        for known in ("th", "en"):
            if code.startswith(known):
                return known
        return code
    except Exception:
        # Best-effort by design: the caller treats "auto" as "could not tell".
        return "auto"
def target_ratio(text: str, tgt: str) -> float:
    """Return the share of alphabetic characters in *text* written in *tgt*'s script.

    Only characters for which ``str.isalpha()`` is true are considered, so
    digits, punctuation and whitespace never affect the result.

    Args:
        text: Text to inspect.
        tgt: "th" counts characters inside TH_RANGE, "en" counts ASCII
            letters; any other value counts nothing.

    Returns:
        A value in [0.0, 1.0]; 0.0 when *text* has no alphabetic characters.
    """
    alpha = [ch for ch in text if ch.isalpha()]
    total = len(alpha)
    if total == 0:
        return 0.0

    hits = 0
    if tgt == "th":
        low, high = TH_RANGE
        hits = sum(1 for ch in alpha if low <= ord(ch) <= high)
    elif tgt == "en":
        # Every entry is already alphabetic, and the only ASCII alphabetic
        # characters are A-Z and a-z.
        hits = sum(1 for ch in alpha if ch.isascii())
    return hits / total
def is_target_strict(text: str, tgt: str, threshold: float = 0.85) -> bool:
    """Tell whether *text* is written (almost) entirely in the target language.

    The text passes when letters of the target script make up at least
    *threshold* of all alphabetic characters.
    """
    share = target_ratio(text, tgt)
    return share >= threshold
def split_text(text: str, max_chars: int = 1200):
    """Split *text* into consecutive chunks of at most *max_chars* characters.

    The text is first cut at newline runs and sentence-ending punctuation
    (the separators are kept), and the pieces are packed greedily into
    chunks. A piece that is longer than *max_chars* on its own — common for
    Thai, which rarely uses sentence punctuation — is hard-split so that no
    chunk ever exceeds the limit. Concatenating the returned chunks
    reproduces *text* exactly.

    Args:
        text: Text to split.
        max_chars: Maximum length of each chunk; must be at least 1.

    Returns:
        List of non-empty chunks, in order. Empty for empty *text*.

    Raises:
        ValueError: If *max_chars* is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    parts, buf = [], ""
    # The capturing group makes re.split keep the separators as tokens.
    for tok in re.split(r'(\n+|[.!?。!?])', text):
        if not tok:
            continue
        # A token within the limit yields exactly one piece (itself); an
        # oversized token is cut into max_chars-sized slices.
        for start in range(0, len(tok), max_chars):
            piece = tok[start:start + max_chars]
            if len(buf) + len(piece) <= max_chars:
                buf += piece
            else:
                if buf:
                    parts.append(buf)
                buf = piece
    if buf:
        parts.append(buf)
    return parts
def build_system(src: str, tgt: str) -> str:
    """Compose the system prompt for a *src* -> *tgt* translation request.

    The prompt states the task and output requirements, then appends a fixed
    set of few-shot examples.

    Args:
        src: Source language label, inserted verbatim into the task line.
        tgt: Target language code; must be a key of LANG_NAME.

    Raises:
        KeyError: If *tgt* is not a key of LANG_NAME.
    """
    target_name = LANG_NAME[tgt]

    # Few-shot examples help keep the model from staying in the source language.
    few_shot = "".join((
        "EXAMPLES:\n",
        "Input (English): Hello!\n",
        "Output (Thai): สวัสดี!\n\n",
        "Input (English): The quick brown fox jumps over the lazy dog.\n",
        "Output (Thai): สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ\n\n",
        "Input (Thai): ยินดีที่ได้รู้จัก\n",
        "Output (English): Nice to meet you\n",
    ))

    instructions = "".join((
        "You are a professional MT engine for Thai↔English.\n",
        f"TASK: Translate {src} → {tgt}.\n",
        "REQUIREMENTS:\n",
        f"- Output MUST be 100% in {target_name}.\n",
        "- Do NOT echo the source language. Do NOT mix languages.\n",
        "- Preserve meaning, punctuation, numbers, line breaks.\n",
        "- Translate pangrams naturally (do not keep them in source language).\n\n",
    ))
    return instructions + few_shot
async def ask_ollama(system: str, content: str):
    """Send one system + user message pair to the model and return the raw completion.

    The request uses temperature 0 and forwards ``num_ctx`` / ``num_predict``
    to the server through ``extra_body["options"]``.
    """
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": content},
    ]
    runtime_options = {"num_ctx": 4096, "num_predict": -1}
    response = await client.chat.completions.create(
        model=MODEL,
        temperature=0,
        messages=conversation,
        extra_body={"options": runtime_options},
    )
    return response
async def _reply_text(system: str, prompt: str) -> str:
    """Send one prompt and return the stripped reply text ("" if the reply has no content)."""
    resp = await ask_ollama(system, prompt)
    # message.content may be None; treat that as an empty reply instead of
    # crashing on None.strip().
    return (resp.choices[0].message.content or "").strip()


async def translate_chunk(chunk: str, src: str, tgt: str) -> str:
    """Translate one chunk, retrying with stricter prompts until the output is in *tgt*.

    Up to three prompts are tried in order. The first reply that passes
    ``is_target_strict`` is returned immediately. If none passes, the
    non-empty reply with the highest ``target_ratio`` is returned (later
    attempts win ties), so the caller always gets the best available answer.

    Args:
        chunk: Text to translate.
        src: Source language label, used in the system prompt.
        tgt: Target language code; must be a key of LANG_NAME.

    Returns:
        The translated text, stripped of surrounding whitespace; "" only if
        every attempt produced an empty reply.

    Raises:
        KeyError: If *tgt* is not a key of LANG_NAME.
    """
    system = build_system(src, tgt)

    # Attempt 1 — plain instruction.
    user = f"Translate strictly into {LANG_NAME[tgt]} ONLY:\n\n{chunk}"
    out = await _reply_text(system, user)
    if is_target_strict(out, tgt):
        return out

    # Attempt 2 — stricter wording.
    user2 = (
        f"STRICT MODE: Output MUST be in {LANG_NAME[tgt]} ONLY.\n"
        f"NO {('Thai' if tgt=='en' else 'English')} letters.\n"
        f"Translate the following:\n{chunk}"
    )
    out2 = await _reply_text(system, user2)
    if is_target_strict(out2, tgt):
        return out2

    # Attempt 3 — repeat the instruction in the target language itself
    # (helps squeeze out pure-Thai output).
    if tgt == "th":
        user3 = f"โปรดแปลเป็นภาษาไทยเท่านั้น ห้ามมีอักษรอังกฤษ:\n{chunk}"
    else:
        user3 = f"Translate into English only. No Thai letters:\n{chunk}"
    out3 = await _reply_text(system, user3)
    if is_target_strict(out3, tgt):
        return out3

    # Nothing passed the strict check: return the best of the three rounds.
    # max() keeps the first maximum, so listing out3 first favours the
    # latest attempt on ties.
    candidates = [cand for cand in (out3, out2, out) if cand]
    if not candidates:
        return ""
    return max(candidates, key=lambda cand: target_ratio(cand, tgt))
@app.post("/translate")
async def translate(req: TranslateReq):
    """Translate ``req.text`` into ``req.target`` and return the result with metadata.

    The source language is taken from the request, or guessed when it is
    "auto". The text is split into chunks that are translated concurrently
    (at most two at a time) and concatenated in their original order.

    Returns:
        A dict with keys "translation", "model", "source", "target" and
        "chunks" (the number of chunks translated).

    Raises:
        HTTPException: 400 when ``req.target`` is not a supported language code.
    """
    # Imported locally because the file-level import only brings in FastAPI.
    from fastapi import HTTPException

    # Reject unsupported targets up front; otherwise prompt construction
    # fails with a KeyError and the client only sees an opaque 500.
    if req.target not in LANG_NAME:
        raise HTTPException(
            status_code=400,
            detail=(
                f"Unsupported target language {req.target!r}; "
                f"expected one of {sorted(LANG_NAME)}"
            ),
        )

    src = req.source
    if src == "auto":
        src = guess_lang(req.text)
    if src == "auto" or src == req.target:
        # Detection failed or claims the text is already in the target
        # language: fall back to a script heuristic (any Latin letter -> "en").
        src = "en" if re.search(r"[A-Za-z]", req.text) else "th"

    chunks = split_text(req.text, max_chars=1200)
    sem = asyncio.Semaphore(2)  # can be raised to 3–4 on a stronger machine

    async def run(c):
        async with sem:
            return await translate_chunk(c, src, req.target)

    # gather() preserves input order, so the outputs line up with the chunks.
    outs = await asyncio.gather(*[run(c) for c in chunks])
    # NOTE(review): each chunk's output is stripped by translate_chunk, so
    # whitespace at chunk boundaries is lost in this join.
    return {
        "translation": "".join(outs),
        "model": MODEL,
        "source": src,
        "target": req.target,
        "chunks": len(chunks),
    }
|