Spaces:
Sleeping
Sleeping
Piyazon committed on
Commit ·
b58e253
1
Parent(s): 5309fbc
fix the code in sentence
Browse files- .gitignore +2 -1
- app.py +10 -4
.gitignore
CHANGED
|
@@ -2,4 +2,5 @@ __pycache__/
|
|
| 2 |
DS_Store
|
| 3 |
.env
|
| 4 |
.vscode/
|
| 5 |
-
*.pyc
|
|
|
|
|
|
| 2 |
DS_Store
|
| 3 |
.env
|
| 4 |
.vscode/
|
| 5 |
+
*.pyc
|
| 6 |
+
.gradio/
|
app.py
CHANGED
|
@@ -4,6 +4,7 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
|
| 4 |
import logging
|
| 5 |
from detect_language import detect_language
|
| 6 |
import traceback
|
|
|
|
| 7 |
|
| 8 |
# --- 1. SETUP ---
|
| 9 |
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
|
|
@@ -48,10 +49,13 @@ def predict(text, tgt_choice):
|
|
| 48 |
# B. GET TARGET
|
| 49 |
tgt_lang = LANG_CODES.get(tgt_choice, "uig_Arab")
|
| 50 |
tokenizer.src_lang = src_lang
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
# C. GENERATE
|
| 53 |
inputs = tokenizer(
|
| 54 |
-
|
| 55 |
return_tensors="pt",
|
| 56 |
padding=True,
|
| 57 |
truncation=True,
|
|
@@ -69,8 +73,10 @@ def predict(text, tgt_choice):
|
|
| 69 |
no_repeat_ngram_size=3
|
| 70 |
)
|
| 71 |
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
| 74 |
print(f"From {src_lang} to {tgt_lang}: {translation}")
|
| 75 |
|
| 76 |
# D. HANDLE UI DIRECTION
|
|
@@ -161,7 +167,7 @@ js_logic = """
|
|
| 161 |
setTimeout(adjustTextareaHeight, 1000);
|
| 162 |
"""
|
| 163 |
|
| 164 |
-
demo.launch(css=custom_css, js=js_logic, theme=gr.themes.Ocean()
|
| 165 |
|
| 166 |
|
| 167 |
|
|
|
|
| 4 |
import logging
|
| 5 |
from detect_language import detect_language
|
| 6 |
import traceback
|
| 7 |
+
import re
|
| 8 |
|
| 9 |
# --- 1. SETUP ---
|
| 10 |
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
|
|
|
|
| 49 |
# B. GET TARGET
|
| 50 |
tgt_lang = LANG_CODES.get(tgt_choice, "uig_Arab")
|
| 51 |
tokenizer.src_lang = src_lang
|
| 52 |
+
|
| 53 |
+
sentences = re.split(r'(?<=[.!?؟])\s+|(?<=[。!?])', text.strip())
|
| 54 |
+
print(f"Splitting paragraph into {len(sentences)} sentences to translate...")
|
| 55 |
|
| 56 |
# C. GENERATE
|
| 57 |
inputs = tokenizer(
|
| 58 |
+
sentences,
|
| 59 |
return_tensors="pt",
|
| 60 |
padding=True,
|
| 61 |
truncation=True,
|
|
|
|
| 73 |
no_repeat_ngram_size=3
|
| 74 |
)
|
| 75 |
|
| 76 |
+
translated_sentences = tokenizer.batch_decode(out, skip_special_tokens=True)
|
| 77 |
+
|
| 78 |
+
translation = " ".join(translated_sentences)
|
| 79 |
+
|
| 80 |
print(f"From {src_lang} to {tgt_lang}: {translation}")
|
| 81 |
|
| 82 |
# D. HANDLE UI DIRECTION
|
|
|
|
| 167 |
setTimeout(adjustTextareaHeight, 1000);
|
| 168 |
"""
|
| 169 |
|
| 170 |
+
demo.launch(css=custom_css, js=js_logic, theme=gr.themes.Ocean())
|
| 171 |
|
| 172 |
|
| 173 |
|