# translator / app.py
# seawolf2357's picture
# Update app.py
# 75330da verified
import gradio as gr
import torch
import fitz # PyMuPDF
from transformers import AutoModelForCausalLM, AutoTokenizer
# Model ID on the Hugging Face Hub.
model_id = "tencent/HY-MT1.5-1.8B"

# Pick the device and precision automatically for the current environment:
# float16 on CUDA, float32 on CPU.
device, dtype = (
    ("cuda", torch.float16)
    if torch.cuda.is_available()
    else ("cpu", torch.float32)
)

print(f"Loading model on {device} with {dtype}...")

# Load the tokenizer and the model once at import time.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, torch_dtype=dtype)
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_file: Uploaded file object exposing the path as ``.name``, a
            plain path string, or ``None``.

    Returns:
        The text of all non-empty pages, each preceded by a
        ``--- Page N ---`` marker, with surrounding whitespace stripped.
        Returns ``""`` when ``pdf_file`` is ``None``. On any failure a
        message string with the error prefix used below is returned instead
        of raising, so callers can show it directly in the UI.
    """
    if pdf_file is None:
        return ""

    # Accept both an object carrying the path in ``.name`` and a bare path.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    try:
        # The context manager closes the document even if a page fails to
        # parse, which the previous explicit close() did not guarantee.
        with fitz.open(pdf_path) as doc:
            page_blocks = []
            for page_num, page in enumerate(doc, 1):
                text = page.get_text("text").strip()
                if text:
                    page_blocks.append(f"\n--- Page {page_num} ---\n{text}\n")
        return "".join(page_blocks).strip()
    except Exception as e:
        # Deliberately broad: callers detect failure via the message prefix
        # rather than by catching exceptions.
        return f"โŒ PDF ์ถ”์ถœ ์˜ค๋ฅ˜: {e}"
def translate_text(source_text, target_lang):
    """Translate one piece of text into ``target_lang`` with the loaded model.

    Args:
        source_text: Text to translate.
        target_lang: Target language label; it is interpolated into the
            prompt and also selects which prompt template is used.

    Returns:
        The decoded model output with special tokens removed, or a fixed
        notice string when ``source_text`` is empty or only whitespace.
    """
    has_content = bool(source_text) and bool(source_text.strip())
    if not has_content:
        return "์ž…๋ ฅ ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค."

    # Choose the prompt template: one template when the target label
    # contains either marker below, the English template otherwise.
    use_chinese_prompt = any(marker in target_lang for marker in ("Chinese", "ไธญๆ–‡"))
    if use_chinese_prompt:
        prompt = f"ๅฐ†ไปฅไธ‹ๆ–‡ๆœฌ็ฟป่ฏ‘ไธบ{target_lang}๏ผŒๆณจๆ„ๅช้œ€่ฆ่พ“ๅ‡บ็ฟป่ฏ‘ๅŽ็š„็ป“ๆžœ๏ผŒไธ่ฆ้ขๅค–่งฃ้‡Š๏ผš\n{source_text}"
    else:
        prompt = f"Translate the following segment into {target_lang}, without additional explanation.\n{source_text}"

    # Input processing: render the single-turn chat and tokenize it.
    conversation = [{"role": "user", "content": prompt}]
    prompt_ids = tokenizer.apply_chat_template(
        conversation,
        tokenize=True,
        add_generation_prompt=False,
        return_tensors="pt",
    )
    prompt_ids = prompt_ids.to(device)

    # Generation, with gradient tracking disabled.
    generation_options = {
        "max_new_tokens": 1024,
        "temperature": 0.7,
        "top_p": 0.6,
        "repetition_penalty": 1.05,
    }
    with torch.no_grad():
        output_ids = model.generate(prompt_ids, **generation_options)

    # Output processing: drop the first prompt_len positions so that only the
    # tokens after the prompt are decoded.
    prompt_len = prompt_ids.shape[1]
    new_token_ids = output_ids[0][prompt_len:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True)
def _hard_wrap_paragraph(paragraph, chunk_size):
    """Split one oversized paragraph into pieces of at most ``chunk_size`` characters, preferring line breaks."""
    pieces = []
    current = ""
    for line in paragraph.split("\n"):
        # A single line longer than the limit is sliced by character count.
        segments = [line[i:i + chunk_size] for i in range(0, len(line), chunk_size)] or [""]
        for segment in segments:
            candidate = f"{current}\n{segment}" if current else segment
            if len(candidate) <= chunk_size:
                current = candidate
            else:
                pieces.append(current)
                current = segment
    pieces.append(current)
    # Whitespace-only pieces carry nothing to translate.
    return [piece.strip() for piece in pieces if piece.strip()]


def _split_into_chunks(source_text, chunk_size):
    """Greedily pack blank-line-separated paragraphs into chunks, splitting paragraphs that exceed ``chunk_size``."""
    chunks = []
    current_chunk = ""
    for para in source_text.split("\n\n"):
        if len(current_chunk) + len(para) < chunk_size:
            current_chunk += para + "\n\n"
            continue

        # The paragraph does not fit: close the chunk being built. Skipping
        # whitespace-only chunks keeps the "no input" notice out of the result.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        if chunk_size > 0 and len(para) > chunk_size:
            # Text without blank lines (common in PDF extraction) would
            # otherwise become a single chunk of unbounded size.
            chunks.extend(_hard_wrap_paragraph(para, chunk_size))
            current_chunk = ""
        else:
            current_chunk = para + "\n\n"

    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks


def translate_long_text(source_text, target_lang, chunk_size=1500):
    """Translate text of any length by splitting it into chunks.

    Text no longer than ``chunk_size`` characters is translated in a single
    call. Longer text is split on blank lines and packed into chunks;
    a paragraph that alone exceeds ``chunk_size`` is further split on line
    breaks and, as a last resort, by character count.

    Args:
        source_text: Text to translate.
        target_lang: Target language label passed through to
            ``translate_text``.
        chunk_size: Chunk size limit in characters.

    Returns:
        The translated chunks joined by blank lines, or a fixed notice
        string when ``source_text`` is empty or only whitespace.
    """
    if not source_text or not source_text.strip():
        return "์ž…๋ ฅ ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค."

    # Short text is translated directly.
    if len(source_text) <= chunk_size:
        return translate_text(source_text, target_lang)

    chunks = _split_into_chunks(source_text, chunk_size)

    # Translate each chunk in order, logging progress to stdout.
    translated_chunks = []
    for index, chunk in enumerate(chunks, 1):
        print(f"Translating chunk {index}/{len(chunks)}...")
        translated_chunks.append(translate_text(chunk, target_lang))

    return "\n\n".join(translated_chunks)
def process_pdf_and_translate(pdf_file, target_lang):
    """Run the PDF pipeline: upload -> text extraction -> translation.

    Args:
        pdf_file: Uploaded PDF as delivered by the file input, or ``None``.
        target_lang: Target language label.

    Returns:
        A ``(extracted_text, translated_text)`` tuple. When nothing can be
        translated the first element is ``""`` and the second carries a
        user-facing message instead of a translation.
    """
    if pdf_file is None:
        return "", "PDF ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด์ฃผ์„ธ์š”."

    # Text extraction. A result starting with the error prefix is an error
    # message, which is passed through in the translation slot.
    source = extract_text_from_pdf(pdf_file)
    extraction_failed = source.startswith("โŒ")
    if extraction_failed:
        return "", source

    if source.strip() == "":
        return "", "PDF์—์„œ ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."

    # Translation.
    return source, translate_long_text(source, target_lang)
def translate_input_text(source_text, target_lang):
    """Translate text typed into the UI.

    Delegates to ``translate_long_text`` with its default chunk size and
    returns that result unchanged.
    """
    translation = translate_long_text(source_text, target_lang)
    return translation
# Build the UI.
langs = ["Japanese", "English", "Chinese", "Korean", "French", "German", "Spanish", "ํ•œ๊ตญ์–ด", "ๆ—ฅๆœฌ่ชž", "ไธญๆ–‡"]

with gr.Blocks(title="HY-MT1.5 Translator") as demo:
    gr.Markdown("# ๐Ÿš€ HY-MT1.5-1.8B Translator")
    gr.Markdown("Tencent์˜ 1.8B ๋ฒˆ์—ญ ๋ชจ๋ธ์„ ์‚ฌ์šฉํ•œ ํ…์ŠคํŠธ/PDF ๋ฒˆ์—ญ ๋ฐ๋ชจ์ž…๋‹ˆ๋‹ค.")

    with gr.Tabs():
        # ---------------- Tab 1: text translation ----------------
        with gr.TabItem("๐Ÿ“ Text Translation"):
            with gr.Row():
                # Left column: source text, target language, action button.
                with gr.Column():
                    input_text = gr.Textbox(
                        label="์›๋ฌธ (Source Text)",
                        lines=10,
                        placeholder="๋ฒˆ์—ญํ•  ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”...",
                    )
                    target_lang_text = gr.Dropdown(
                        choices=langs,
                        value="English",
                        label="๋ฒˆ์—ญ ์–ธ์–ด (Target Language)",
                    )
                    translate_btn = gr.Button("๐Ÿ”„ ๋ฒˆ์—ญ (Translate)", variant="primary")
                # Right column: read-only result.
                with gr.Column():
                    output_text = gr.Textbox(label="๋ฒˆ์—ญ ๊ฒฐ๊ณผ (Result)", lines=10, interactive=False)

            translate_btn.click(
                fn=translate_input_text,
                inputs=[input_text, target_lang_text],
                outputs=output_text,
            )

        # ---------------- Tab 2: PDF translation ----------------
        with gr.TabItem("๐Ÿ“„ PDF Translation"):
            gr.Markdown("### PDF ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•˜๋ฉด ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•˜๊ณ  ๋ฒˆ์—ญํ•ฉ๋‹ˆ๋‹ค.")

            # Upload, target language and action button.
            with gr.Row():
                with gr.Column():
                    pdf_input = gr.File(label="๐Ÿ“„ PDF ํŒŒ์ผ ์—…๋กœ๋“œ", file_types=[".pdf"])
                    target_lang_pdf = gr.Dropdown(
                        choices=langs,
                        value="English",
                        label="๋ฒˆ์—ญ ์–ธ์–ด (Target Language)",
                    )
                    translate_pdf_btn = gr.Button("๐Ÿ”„ PDF ๋ฒˆ์—ญ", variant="primary")

            # Extracted source text next to its translation, both read-only.
            with gr.Row():
                with gr.Column():
                    extracted_text = gr.Textbox(
                        label="๐Ÿ“‹ ์ถ”์ถœ๋œ ์›๋ฌธ (Extracted Text)",
                        lines=15,
                        interactive=False,
                    )
                with gr.Column():
                    translated_pdf_text = gr.Textbox(
                        label="๐Ÿ“‹ ๋ฒˆ์—ญ ๊ฒฐ๊ณผ (Translated Text)",
                        lines=15,
                        interactive=False,
                    )

            translate_pdf_btn.click(
                fn=process_pdf_and_translate,
                inputs=[pdf_input, target_lang_pdf],
                outputs=[extracted_text, translated_pdf_text],
            )

# Launch the app.
demo.launch()