Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
# app.py
|
| 2 |
from fastapi import FastAPI
|
| 3 |
from fastapi.middleware.cors import CORSMiddleware
|
| 4 |
from pydantic import BaseModel
|
|
@@ -7,7 +7,6 @@ from langdetect import detect, DetectorFactory
|
|
| 7 |
import asyncio, re
|
| 8 |
|
| 9 |
DetectorFactory.seed = 0
|
| 10 |
-
|
| 11 |
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
|
| 12 |
MODEL = "scb10x/typhoon-translate-4b"
|
| 13 |
|
|
@@ -20,10 +19,10 @@ app.add_middleware(
|
|
| 20 |
class TranslateReq(BaseModel):
|
| 21 |
text: str
|
| 22 |
target: str = "en" # "en" หรือ "th"
|
| 23 |
-
source: str = "auto" # "auto"
|
| 24 |
|
| 25 |
LANG_NAME = {"en": "English", "th": "Thai"}
|
| 26 |
-
|
| 27 |
|
| 28 |
def guess_lang(text: str) -> str:
|
| 29 |
try:
|
|
@@ -32,19 +31,21 @@ def guess_lang(text: str) -> str:
|
|
| 32 |
except Exception:
|
| 33 |
return "auto"
|
| 34 |
|
| 35 |
-
def
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
| 48 |
|
| 49 |
def split_text(text: str, max_chars: int = 1200):
|
| 50 |
parts, buf = [], ""
|
|
@@ -58,74 +59,79 @@ def split_text(text: str, max_chars: int = 1200):
|
|
| 58 |
if buf: parts.append(buf)
|
| 59 |
return parts
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
extra_body={
|
| 72 |
-
"options": {
|
| 73 |
-
"num_ctx": 4096,
|
| 74 |
-
"num_predict": -1
|
| 75 |
-
}
|
| 76 |
-
}
|
| 77 |
)
|
| 78 |
-
|
| 79 |
-
async def translate_chunk(chunk: str, src: str, tgt: str) -> str:
|
| 80 |
-
# พรอมป์ตแบบเข้ม: ห้ามออกภาษาอื่น
|
| 81 |
-
system = (
|
| 82 |
f"You are a professional MT engine for Thai↔English.\n"
|
| 83 |
f"TASK: Translate {src} → {tgt}.\n"
|
| 84 |
f"REQUIREMENTS:\n"
|
| 85 |
f"- Output MUST be 100% in {LANG_NAME[tgt]}.\n"
|
| 86 |
-
f"- Do NOT
|
| 87 |
-
f"- Preserve meaning,
|
| 88 |
-
f"-
|
|
|
|
| 89 |
)
|
| 90 |
-
user = f"Translate strictly into {LANG_NAME[tgt]} ONLY:\n\n{chunk}"
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
@app.post("/translate")
|
| 116 |
async def translate(req: TranslateReq):
|
| 117 |
-
# กำหนด src ให้ชัด
|
| 118 |
src = req.source
|
| 119 |
if src == "auto":
|
| 120 |
src = guess_lang(req.text)
|
| 121 |
if src == "auto" or src == req.target:
|
| 122 |
-
# heuristic ถ้าดูไม่ออก
|
| 123 |
src = "en" if re.search(r"[A-Za-z]", req.text) else "th"
|
| 124 |
|
| 125 |
chunks = split_text(req.text, max_chars=1200)
|
| 126 |
|
| 127 |
-
|
| 128 |
-
sem = asyncio.Semaphore(2)
|
| 129 |
async def run(c):
|
| 130 |
async with sem:
|
| 131 |
return await translate_chunk(c, src, req.target)
|
|
|
|
| 1 |
+
# app.py — เวอร์ชันบังคับ TH↔EN ให้ตรงภาษาเอาต์พุต
|
| 2 |
from fastapi import FastAPI
|
| 3 |
from fastapi.middleware.cors import CORSMiddleware
|
| 4 |
from pydantic import BaseModel
|
|
|
|
| 7 |
import asyncio, re
|
| 8 |
|
| 9 |
# Fix langdetect's seed so repeated detections of the same text agree.
DetectorFactory.seed = 0
# OpenAI-compatible async client aimed at a local server on port 11434
# (presumably an Ollama instance, given the placeholder API key — confirm
# against the deployment).
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
# Model identifier sent with every chat-completion request.
MODEL = "scb10x/typhoon-translate-4b"
|
| 12 |
|
|
|
|
| 19 |
class TranslateReq(BaseModel):
    """Request body accepted by the POST /translate endpoint."""

    # Text to translate.
    text: str
    # Target language code.
    target: str = "en" # "en" or "th"
    # Source language code.
    source: str = "auto" # "auto" = guess the language automatically
|
| 23 |
|
| 24 |
# Human-readable language names, keyed by language code; interpolated into
# the prompts sent to the model.
LANG_NAME = {"en": "English", "th": "Thai"}
# Inclusive code-point bounds of the Unicode Thai block (U+0E00–U+0E7F).
TH_RANGE = (0x0E00, 0x0E7F)
|
| 26 |
|
| 27 |
def guess_lang(text: str) -> str:
|
| 28 |
try:
|
|
|
|
| 31 |
except Exception:
|
| 32 |
return "auto"
|
| 33 |
|
| 34 |
+
def target_ratio(text: str, tgt: str) -> float:
    """Return the share of alphabetic characters in *text* written in *tgt*.

    Only characters for which ``str.isalpha()`` is true are counted, so
    digits, punctuation and whitespace never affect the result.

    Args:
        text: The string to inspect.
        tgt: Target language code. ``"th"`` counts characters whose code
            point lies inside ``TH_RANGE``; ``"en"`` counts ASCII letters
            A-Z / a-z; any other code counts nothing.

    Returns:
        A value between 0.0 and 1.0. Returns 0.0 when *text* contains no
        alphabetic characters at all.
    """
    alphabetic = [ch for ch in text if ch.isalpha()]
    total = len(alphabetic)
    if total == 0:
        return 0.0

    if tgt == "th":
        low = TH_RANGE[0]
        high = TH_RANGE[1]
        hits = len([ch for ch in alphabetic if low <= ord(ch) <= high])
    elif tgt == "en":
        hits = len(
            [ch for ch in alphabetic if "A" <= ch <= "Z" or "a" <= ch <= "z"]
        )
    else:
        # Unknown target code: nothing can match.
        hits = 0
    return hits / total
|
| 45 |
+
|
| 46 |
+
def is_target_strict(text: str, tgt: str, threshold: float = 0.85) -> bool:
    """Tell whether *text* is written (almost) entirely in the target language.

    Args:
        text: Candidate translation to check.
        tgt: Target language code, passed through to ``target_ratio``.
        threshold: Minimum share of alphabetic characters that must belong
            to the target script for the text to be accepted.

    Returns:
        True when ``target_ratio(text, tgt)`` is at least *threshold*.
    """
    ratio = target_ratio(text, tgt)
    return True if ratio >= threshold else False
|
| 49 |
|
| 50 |
def split_text(text: str, max_chars: int = 1200):
|
| 51 |
parts, buf = [], ""
|
|
|
|
| 59 |
if buf: parts.append(buf)
|
| 60 |
return parts
|
| 61 |
|
| 62 |
+
def build_system(src: str, tgt: str) -> str:
    """Build the system prompt for a translation request.

    The prompt states the task and the output requirements, then appends a
    fixed set of few-shot examples intended to stop the model from answering
    in the source language.

    Args:
        src: Source language code, inserted verbatim into the TASK line.
        tgt: Target language code; must be a key of ``LANG_NAME``.

    Returns:
        The complete system prompt as a single string.

    Raises:
        KeyError: If *tgt* is not a key of ``LANG_NAME``.
    """
    # Few-shot examples covering both translation directions.
    few_shot = "".join(
        [
            "EXAMPLES:\n",
            "Input (English): Hello!\n",
            "Output (Thai): สวัสดี!\n\n",
            "Input (English): The quick brown fox jumps over the lazy dog.\n",
            "Output (Thai): สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ\n\n",
            "Input (Thai): ยินดีที่ได้รู้จัก\n",
            "Output (English): Nice to meet you\n",
        ]
    )
    target_name = LANG_NAME[tgt]
    sections = [
        "You are a professional MT engine for Thai↔English.\n",
        f"TASK: Translate {src} → {tgt}.\n",
        "REQUIREMENTS:\n",
        f"- Output MUST be 100% in {target_name}.\n",
        "- Do NOT echo the source language. Do NOT mix languages.\n",
        "- Preserve meaning, punctuation, numbers, line breaks.\n",
        "- Translate pangrams naturally (do not keep them in source language).\n\n",
        few_shot,
    ]
    return "".join(sections)
|
|
|
|
| 83 |
|
| 84 |
+
async def ask_ollama(system: str, content: str):
    """Send one system + user exchange to the model and return the response.

    Args:
        system: Text of the system message.
        content: Text of the user message.

    Returns:
        Whatever ``client.chat.completions.create`` returns for the request.
    """
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": content},
    ]
    # Extra request-body fields forwarded as-is to the server.
    # NOTE(review): presumably Ollama runtime options (context window size,
    # unlimited generation length) — confirm the endpoint honours them.
    server_options = {"options": {"num_ctx": 4096, "num_predict": -1}}
    response = await client.chat.completions.create(
        model=MODEL,
        temperature=0,
        messages=conversation,
        extra_body=server_options,
    )
    return response
|
| 92 |
+
|
| 93 |
+
async def translate_chunk(chunk: str, src: str, tgt: str) -> str:
    """Translate one chunk, retrying with stricter prompts if needed.

    Up to three requests are made, each with a more insistent user prompt.
    The first reply that passes ``is_target_strict`` is returned at once, so
    later requests are only sent when earlier ones fail the check.

    If no reply passes, the non-empty reply with the highest
    ``target_ratio`` is returned (ties go to the latest attempt), so the
    caller still gets the closest available answer.

    Args:
        chunk: Text fragment to translate.
        src: Source language code, used in the system prompt.
        tgt: Target language code; must be a key of ``LANG_NAME``.

    Returns:
        The translated text, or an empty string if every reply was empty.

    Raises:
        KeyError: If *tgt* is not a key of ``LANG_NAME``.
    """
    system = build_system(src, tgt)
    target_name = LANG_NAME[tgt]
    forbidden_script = "Thai" if tgt == "en" else "English"

    # User prompts ordered from normal to strictest. The last one restates
    # the instruction in the target language itself.
    prompts = (
        f"Translate strictly into {target_name} ONLY:\n\n{chunk}",
        (
            f"STRICT MODE: Output MUST be in {target_name} ONLY.\n"
            f"NO {forbidden_script} letters.\n"
            f"Translate the following:\n{chunk}"
        ),
        (
            f"โปรดแปลเป็นภาษาไทยเท่านั้น ห้ามมีอักษรอังกฤษ:\n{chunk}"
            if tgt == "th"
            else f"Translate into English only. No Thai letters:\n{chunk}"
        ),
    )

    attempts = []
    for prompt in prompts:
        response = await ask_ollama(system, prompt)
        # The message content is optional in the chat-completions schema;
        # treat a missing body as an empty reply instead of crashing.
        candidate = (response.choices[0].message.content or "").strip()
        if is_target_strict(candidate, tgt):
            return candidate
        attempts.append(candidate)

    # Nothing passed the strict check: fall back to the best non-empty
    # reply. Latest attempts come first so max() prefers them on ties.
    usable = [reply for reply in reversed(attempts) if reply]
    if not usable:
        return ""
    return max(usable, key=lambda reply: target_ratio(reply, tgt))
|
| 123 |
|
| 124 |
@app.post("/translate")
|
| 125 |
async def translate(req: TranslateReq):
|
|
|
|
| 126 |
src = req.source
|
| 127 |
if src == "auto":
|
| 128 |
src = guess_lang(req.text)
|
| 129 |
if src == "auto" or src == req.target:
|
|
|
|
| 130 |
src = "en" if re.search(r"[A-Za-z]", req.text) else "th"
|
| 131 |
|
| 132 |
chunks = split_text(req.text, max_chars=1200)
|
| 133 |
|
| 134 |
+
sem = asyncio.Semaphore(2) # ปรับเป็น 3–4 ได้ถ้าเครื่องแรง
|
|
|
|
| 135 |
async def run(c):
|
| 136 |
async with sem:
|
| 137 |
return await translate_chunk(c, src, req.target)
|