File size: 5,806 Bytes
2cd769f
82bebe6
 
 
 
 
 
fd7fe2c
82bebe6
 
 
fd7fe2c
 
82bebe6
 
9b3ecdf
fd7fe2c
 
82bebe6
 
9b3ecdf
2cd769f
9b3ecdf
 
2cd769f
82bebe6
 
 
 
9b3ecdf
82bebe6
 
 
2cd769f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b3ecdf
82bebe6
 
 
9b3ecdf
82bebe6
 
 
9b3ecdf
82bebe6
9b3ecdf
82bebe6
 
2cd769f
 
 
 
 
 
 
 
 
 
fd7fe2c
2cd769f
9b3ecdf
 
 
 
2cd769f
 
 
 
9b3ecdf
 
2cd769f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82bebe6
 
 
 
 
 
 
 
 
 
 
2cd769f
82bebe6
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# app.py — เวอร์ชันบังคับ TH↔EN ให้ตรงภาษาเอาต์พุต
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from openai import AsyncOpenAI
from langdetect import detect, DetectorFactory
import asyncio, re

# Seed langdetect so detection is deterministic across runs.
DetectorFactory.seed = 0
# OpenAI-compatible client pointed at a local Ollama server; the api_key
# value is a placeholder — Ollama ignores it.
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
# Ollama model tag used for every translation request.
MODEL = "scb10x/typhoon-translate-4b"

app = FastAPI(title="HF Space · Ollama Translator")
# NOTE(review): wildcard CORS (any origin/method/header) — acceptable for a
# demo Space, tighten before production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"], allow_methods=["*"], allow_headers=["*"],
)

class TranslateReq(BaseModel):
    """Request body for POST /translate."""
    text: str
    target: str = "en"   # "en" or "th"
    source: str = "auto" # "auto" = detect the source language

# Human-readable language names interpolated into the prompts.
LANG_NAME = {"en": "English", "th": "Thai"}
# Inclusive Unicode code-point range of the Thai script block.
TH_RANGE = (0x0E00, 0x0E7F)

def guess_lang(text: str) -> str:
    """Best-effort source-language guess via langdetect.

    Returns "th" or "en" for those languages, the raw langdetect code for
    anything else, and "auto" when detection raises.
    """
    try:
        code = detect(text)
    except Exception:
        # langdetect raises on empty / non-linguistic input.
        return "auto"
    if code.startswith("th"):
        return "th"
    if code.startswith("en"):
        return "en"
    return code

def target_ratio(text: str, tgt: str) -> float:
    """Fraction of alphabetic characters in *text* written in the target
    script (Thai block for "th", ASCII A-Z/a-z for "en"; 0.0 otherwise)."""
    letters = [ch for ch in text if ch.isalpha()]
    if not letters:
        return 0.0
    if tgt == "th":
        lo, hi = TH_RANGE
        hits = sum(1 for ch in letters if lo <= ord(ch) <= hi)
    elif tgt == "en":
        hits = sum(1 for ch in letters if "A" <= ch <= "Z" or "a" <= ch <= "z")
    else:
        hits = 0
    return hits / len(letters)

def is_target_strict(text: str, tgt: str, threshold: float = 0.85) -> bool:
    """True when at least *threshold* of the alphabetic characters in *text*
    belong to the target language's script."""
    ratio = target_ratio(text, tgt)
    return ratio >= threshold

def split_text(text: str, max_chars: int = 1200):
    """Split *text* into chunks of at most *max_chars* characters, breaking
    preferentially at newlines and sentence-ending punctuation.

    Fixes a defect in the original: a single token longer than *max_chars*
    (e.g. a huge unbroken paragraph with no punctuation) was emitted whole,
    producing a chunk that could exceed the model's context budget. Such
    tokens are now hard-split at *max_chars*. The invariant
    ``"".join(split_text(t)) == t`` holds for every input.
    """
    parts, buf = [], ""
    # Capturing group keeps the separators in the token stream so nothing
    # is lost when chunks are re-joined.
    for tok in re.split(r'(\n+|[.!?。!?])', text):
        if not tok:
            continue
        # Hard-split any token that cannot fit in a chunk by itself.
        while len(tok) > max_chars:
            if buf:
                parts.append(buf)
                buf = ""
            parts.append(tok[:max_chars])
            tok = tok[max_chars:]
        if len(buf) + len(tok) <= max_chars:
            buf += tok
        else:
            if buf:
                parts.append(buf)
            buf = tok
    if buf:
        parts.append(buf)
    return parts

def build_system(src: str, tgt: str) -> str:
    """Build the system prompt for a src→tgt translation request.

    A few-shot EXAMPLES section is included because it helps the model
    switch languages instead of echoing the source text back.
    """
    ex = (
        "EXAMPLES:\n"
        "Input (English): Hello!\n"
        "Output (Thai): สวัสดี!\n\n"
        "Input (English): The quick brown fox jumps over the lazy dog.\n"
        "Output (Thai): สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ\n\n"
        "Input (Thai): ยินดีที่ได้รู้จัก\n"
        "Output (English): Nice to meet you\n"
    )
    return (
        f"You are a professional MT engine for Thai↔English.\n"
        # Fix: the original emitted "Translate enth." with no separator
        # between the codes; "en→th" states the direction unambiguously.
        f"TASK: Translate {src}→{tgt}.\n"
        f"REQUIREMENTS:\n"
        f"- Output MUST be 100% in {LANG_NAME[tgt]}.\n"
        f"- Do NOT echo the source language. Do NOT mix languages.\n"
        f"- Preserve meaning, punctuation, numbers, line breaks.\n"
        f"- Translate pangrams naturally (do not keep them in source language).\n\n"
        f"{ex}"
    )

async def ask_ollama(system: str, content: str):
    """Send one system+user message pair to the local Ollama server and
    return the raw chat-completion response (temperature 0 for determinism)."""
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": content},
    ]
    response = await client.chat.completions.create(
        model=MODEL,
        temperature=0,
        messages=messages,
        # Ollama-specific options: 4k context, unlimited generation length.
        extra_body={"options": {"num_ctx": 4096, "num_predict": -1}},
    )
    return response

async def translate_chunk(chunk: str, src: str, tgt: str) -> str:
    """Translate one chunk src→tgt, retrying with progressively stricter
    prompts until the output is (almost) purely in the target script.

    Returns the first attempt that passes is_target_strict; if none do,
    the last non-empty attempt so the caller always gets something back.
    """
    system = build_system(src, tgt)
    wrong_lang = "Thai" if tgt == "en" else "English"
    if tgt == "th":
        # Final attempt restates the instruction in the target language
        # itself, which helps squeeze out pure-Thai output.
        native_prompt = f"โปรดแปลเป็นภาษาไทยเท่านั้น ห้ามมีอักษรอังกฤษ:\n{chunk}"
    else:
        native_prompt = f"Translate into English only. No Thai letters:\n{chunk}"

    prompts = [
        # Attempt 1 — plain instruction.
        f"Translate strictly into {LANG_NAME[tgt]} ONLY:\n\n{chunk}",
        # Attempt 2 — stricter wording.
        (
            f"STRICT MODE: Output MUST be in {LANG_NAME[tgt]} ONLY.\n"
            f"NO {wrong_lang} letters.\n"
            f"Translate the following:\n{chunk}"
        ),
        # Attempt 3 — instruction in the target language.
        native_prompt,
    ]

    attempts: list[str] = []
    for prompt in prompts:
        reply = await ask_ollama(system, prompt)
        candidate = reply.choices[0].message.content.strip()
        if is_target_strict(candidate, tgt):
            return candidate
        attempts.append(candidate)

    # Nothing passed the strict check: prefer the latest non-empty attempt
    # (equivalent to `out3 or out2 or out`), falling back to attempt 1.
    for candidate in reversed(attempts):
        if candidate:
            return candidate
    return attempts[0]

@app.post("/translate")
async def translate(req: TranslateReq):
    src = req.source
    if src == "auto":
        src = guess_lang(req.text)
        if src == "auto" or src == req.target:
            src = "en" if re.search(r"[A-Za-z]", req.text) else "th"

    chunks = split_text(req.text, max_chars=1200)

    sem = asyncio.Semaphore(2)  # ปรับเป็น 3–4 ได้ถ้าเครื่องแรง
    async def run(c):
        async with sem:
            return await translate_chunk(c, src, req.target)

    outs = await asyncio.gather(*[run(c) for c in chunks])
    return {
        "translation": "".join(outs),
        "model": MODEL,
        "source": src,
        "target": req.target,
        "chunks": len(chunks)
    }