Spaces:
Running
Running
| import gradio as gr | |
| import torch | |
| import fitz # PyMuPDF | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
# Hugging Face Hub identifier of the translation model.
model_id = "tencent/HY-MT1.5-1.8B"

# Pick the device and numeric precision from the runtime environment:
# float16 when CUDA is available, float32 on CPU otherwise.
device, dtype = (
    ("cuda", torch.float16) if torch.cuda.is_available() else ("cpu", torch.float32)
)

print(f"Loading model on {device} with {dtype}...")

# Load the tokenizer and the causal-LM weights once, at import time.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, torch_dtype=dtype)
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_file: The uploaded file. Either an object exposing its path as
            ``.name`` or a plain path string. ``None`` means nothing was
            uploaded.

    Returns:
        The text of all non-blank pages, each preceded by a
        ``--- Page N ---`` header (pages are numbered from 1). An empty
        string when ``pdf_file`` is ``None``. When opening or reading the
        document fails, an error message string is returned instead of
        raising, so the UI can display it.

    NOTE(review): the error-message literal looks mis-encoded in this copy
    of the file; it is kept verbatim because callers match on its first
    character. Confirm against the upstream source.
    """
    if pdf_file is None:
        return ""

    try:
        # Accept both file-like wrappers (path in ``.name``) and plain paths.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            page_texts = (
                (page_num, page.get_text("text").strip())
                for page_num, page in enumerate(doc, 1)
            )
            sections = [
                f"\n--- Page {page_num} ---\n{text}\n"
                for page_num, text in page_texts
                if text
            ]
    except Exception as e:
        return f"โ PDF ์ถ์ถ ์ค๋ฅ: {e}"

    return "".join(sections).strip()
def translate_text(source_text, target_lang):
    """Translate one piece of text into ``target_lang`` with the loaded model.

    Returns a fixed notice string when ``source_text`` is empty or contains
    only whitespace. Otherwise returns the decoded model output with the
    prompt tokens removed.

    NOTE(review): several string literals below look mis-encoded in this
    copy of the file; they are reproduced verbatim. Confirm against the
    upstream source.
    """
    if not (source_text and source_text.strip()):
        return "์ ๋ ฅ ํ ์คํธ๊ฐ ์์ต๋๋ค."

    # Two prompt templates: one for Chinese targets, one for everything else.
    wants_chinese = any(marker in target_lang for marker in ("Chinese", "ไธญๆ"))
    if wants_chinese:
        instruction = f"ๅฐไปฅไธๆๆฌ็ฟป่ฏไธบ{target_lang}๏ผๆณจๆๅช้่ฆ่พๅบ็ฟป่ฏๅ็็ปๆ๏ผไธ่ฆ้ขๅค่งฃ้๏ผ\n{source_text}"
    else:
        instruction = f"Translate the following segment into {target_lang}, without additional explanation.\n{source_text}"

    # Build the chat-formatted prompt and move it to the model's device.
    prompt_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": instruction}],
        tokenize=True,
        add_generation_prompt=False,
        return_tensors="pt",
    ).to(device)

    # Run generation without tracking gradients.
    generation_options = {
        "max_new_tokens": 1024,
        "temperature": 0.7,
        "top_p": 0.6,
        "repetition_penalty": 1.05,
    }
    with torch.no_grad():
        output_ids = model.generate(prompt_ids, **generation_options)

    # The output starts with the prompt tokens; keep only what follows them.
    prompt_length = prompt_ids.shape[1]
    return tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)
def _pack_chunks(text, chunk_size, separators=("\n\n", "\n", " ")):
    """Greedily split ``text`` into non-empty, stripped chunks of at most ``chunk_size`` characters."""
    if len(text) <= chunk_size:
        stripped = text.strip()
        return [stripped] if stripped else []

    if not separators:
        # No boundary left to split on: fall back to fixed-width slices.
        slices = (
            text[start:start + chunk_size].strip()
            for start in range(0, len(text), chunk_size)
        )
        return [piece for piece in slices if piece]

    sep, finer_separators = separators[0], separators[1:]
    chunks = []
    current = ""
    for part in text.split(sep):
        part = part.strip()
        if not part:
            # Skip blank parts so no empty chunk is ever sent for translation.
            continue
        candidate = f"{current}{sep}{part}" if current else part
        if len(candidate) <= chunk_size:
            current = candidate
            continue
        if current:
            chunks.append(current)
        if len(part) <= chunk_size:
            current = part
        else:
            # The part alone is too large: split it on the next finer boundary.
            current = ""
            chunks.extend(_pack_chunks(part, chunk_size, finer_separators))
    if current:
        chunks.append(current)
    return chunks


def translate_long_text(source_text, target_lang, chunk_size=1500, *, translate_fn=None):
    """Translate text of any length by translating it chunk by chunk.

    Text no longer than ``chunk_size`` characters is translated in one call.
    Longer text is split on blank lines first; a paragraph that is still too
    long is split on line breaks, then on spaces, and finally cut at fixed
    width, so every chunk handed to the translator has at most
    ``chunk_size`` characters.

    Args:
        source_text: Text to translate.
        target_lang: Target language name, passed through to the translator.
        chunk_size: Maximum number of characters per chunk.
        translate_fn: Optional callable ``(text, target_lang) -> str`` used
            to translate each chunk. Defaults to ``translate_text``.

    Returns:
        The translated chunks joined by blank lines, or a fixed notice string
        when ``source_text`` is empty or whitespace only.

    Raises:
        ValueError: If the text needs splitting and ``chunk_size`` is less
            than 1.

    NOTE(review): the notice literal looks mis-encoded in this copy of the
    file; it is kept verbatim. Confirm against the upstream source.
    """
    if not source_text or not source_text.strip():
        return "์ ๋ ฅ ํ ์คํธ๊ฐ ์์ต๋๋ค."

    if translate_fn is None:
        translate_fn = translate_text

    # Short text is translated directly.
    if len(source_text) <= chunk_size:
        return translate_fn(source_text, target_lang)

    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    chunks = _pack_chunks(source_text, chunk_size)

    translated_chunks = []
    for index, chunk in enumerate(chunks, 1):
        print(f"Translating chunk {index}/{len(chunks)}...")
        translated_chunks.append(translate_fn(chunk, target_lang))
    return "\n\n".join(translated_chunks)
def process_pdf_and_translate(pdf_file, target_lang):
    """Run the PDF pipeline: extract the text, then translate it.

    Returns a ``(extracted_text, translated_text)`` pair. When no file was
    given, extraction reported an error, or no text was found, the first
    element is an empty string and the second carries the message to show.

    NOTE(review): the message literals look mis-encoded in this copy of the
    file; they are reproduced verbatim. Confirm against the upstream source.
    """
    if pdf_file is None:
        return "", "PDF ํ์ผ์ ์ ๋ก๋ํด์ฃผ์ธ์."

    source = extract_text_from_pdf(pdf_file)

    # The extractor signals failure through a message with a marker prefix.
    if source.startswith("โ"):
        failure = source
    elif not source.strip():
        failure = "PDF์์ ํ ์คํธ๋ฅผ ์ถ์ถํ ์ ์์ต๋๋ค."
    else:
        return source, translate_long_text(source, target_lang)

    return "", failure
def translate_input_text(source_text, target_lang):
    """Translate text typed into the UI; delegates to ``translate_long_text``."""
    translation = translate_long_text(source_text, target_lang)
    return translation
# Build the UI.
# NOTE(review): this copy of the file lost its indentation; the nesting below
# is reconstructed from the statement order. Confirm against the upstream
# layout. String literals look mis-encoded here and are kept verbatim.
langs = ["Japanese", "English", "Chinese", "Korean", "French", "German", "Spanish", "ํ๊ตญ์ด", "ๆฅๆฌ่ช", "ไธญๆ"]
with gr.Blocks(title="HY-MT1.5 Translator") as demo:
    gr.Markdown("# ๐ HY-MT1.5-1.8B Translator")
    gr.Markdown("Tencent์ 1.8B ๋ฒ์ญ ๋ชจ๋ธ์ ์ฌ์ฉํ ํ ์คํธ/PDF ๋ฒ์ญ ๋ฐ๋ชจ์ ๋๋ค.")
    with gr.Tabs():
        # ============ Tab 1: text translation ============
        with gr.TabItem("๐ Text Translation"):
            with gr.Row():
                # Left column: source text, target language, trigger button.
                with gr.Column():
                    input_text = gr.Textbox(
                        label="์๋ฌธ (Source Text)",
                        lines=10,
                        placeholder="๋ฒ์ญํ ํ ์คํธ๋ฅผ ์ ๋ ฅํ์ธ์..."
                    )
                    target_lang_text = gr.Dropdown(
                        choices=langs,
                        value="English",
                        label="๋ฒ์ญ ์ธ์ด (Target Language)"
                    )
                    translate_btn = gr.Button("๐ ๋ฒ์ญ (Translate)", variant="primary")
                # Right column: read-only translation result.
                with gr.Column():
                    output_text = gr.Textbox(
                        label="๋ฒ์ญ ๊ฒฐ๊ณผ (Result)",
                        lines=10,
                        interactive=False
                    )
            # Clicking the button translates the textbox content.
            translate_btn.click(
                fn=translate_input_text,
                inputs=[input_text, target_lang_text],
                outputs=output_text
            )
        # ============ Tab 2: PDF translation ============
        with gr.TabItem("๐ PDF Translation"):
            gr.Markdown("### PDF ํ์ผ์ ์ ๋ก๋ํ๋ฉด ํ ์คํธ๋ฅผ ์ถ์ถํ๊ณ ๋ฒ์ญํฉ๋๋ค.")
            with gr.Row():
                # Upload widget (restricted to .pdf), target language, trigger button.
                with gr.Column():
                    pdf_input = gr.File(
                        label="๐ PDF ํ์ผ ์ ๋ก๋",
                        file_types=[".pdf"]
                    )
                    target_lang_pdf = gr.Dropdown(
                        choices=langs,
                        value="English",
                        label="๋ฒ์ญ ์ธ์ด (Target Language)"
                    )
                    translate_pdf_btn = gr.Button("๐ PDF ๋ฒ์ญ", variant="primary")
            with gr.Row():
                # Side-by-side read-only boxes: extracted source and its translation.
                with gr.Column():
                    extracted_text = gr.Textbox(
                        label="๐ ์ถ์ถ๋ ์๋ฌธ (Extracted Text)",
                        lines=15,
                        interactive=False
                    )
                with gr.Column():
                    translated_pdf_text = gr.Textbox(
                        label="๐ ๋ฒ์ญ ๊ฒฐ๊ณผ (Translated Text)",
                        lines=15,
                        interactive=False
                    )
            # Clicking the button fills both boxes from the uploaded PDF.
            translate_pdf_btn.click(
                fn=process_pdf_and_translate,
                inputs=[pdf_input, target_lang_pdf],
                outputs=[extracted_text, translated_pdf_text]
            )
# Launch the app.
demo.launch()