Update app.py
Browse files
app.py
CHANGED
|
@@ -701,22 +701,63 @@ async def translate_endpoint(
|
|
| 701 |
try:
|
| 702 |
target_code = LANGUAGE_CODES[target_language_lower]
|
| 703 |
|
| 704 |
-
# Split text into
|
| 705 |
-
|
| 706 |
-
logger.info(f"Split text into {len(
|
| 707 |
|
| 708 |
-
|
| 709 |
|
| 710 |
-
for i,
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 714 |
|
| 715 |
-
|
| 716 |
-
|
| 717 |
|
| 718 |
-
|
| 719 |
-
|
| 720 |
|
| 721 |
# Save to cache
|
| 722 |
await save_translation_to_cache(text_hash, target_language_lower, translated_text)
|
|
|
|
| 701 |
try:
|
| 702 |
target_code = LANGUAGE_CODES[target_language_lower]
|
| 703 |
|
| 704 |
+
# Split text into sentences for better translation
|
| 705 |
+
sentences = re.split(r'(?<=[.!?])\s+', text)
|
| 706 |
+
logger.info(f"Split text into {len(sentences)} sentences")
|
| 707 |
|
| 708 |
+
translated_sentences = []
|
| 709 |
|
| 710 |
+
for i, sentence in enumerate(sentences):
|
| 711 |
+
if not sentence.strip():
|
| 712 |
+
continue
|
| 713 |
+
|
| 714 |
+
logger.info(f"Translating sentence {i+1}/{len(sentences)}: '{sentence[:50]}...'")
|
| 715 |
+
|
| 716 |
+
try:
|
| 717 |
+
# Tokenize input
|
| 718 |
+
inputs = translation_tokenizer(
|
| 719 |
+
sentence,
|
| 720 |
+
return_tensors="pt",
|
| 721 |
+
padding=True,
|
| 722 |
+
truncation=True,
|
| 723 |
+
max_length=512
|
| 724 |
+
)
|
| 725 |
+
|
| 726 |
+
if device == "cuda":
|
| 727 |
+
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 728 |
+
|
| 729 |
+
# Generate translation
|
| 730 |
+
translated_tokens = translation_model.generate(
|
| 731 |
+
**inputs,
|
| 732 |
+
forced_bos_token_id=translation_tokenizer.lang_code_to_id[target_code],
|
| 733 |
+
max_length=512,
|
| 734 |
+
num_beams=4,
|
| 735 |
+
early_stopping=True
|
| 736 |
+
)
|
| 737 |
+
|
| 738 |
+
# Decode output
|
| 739 |
+
translated_sentence = translation_tokenizer.batch_decode(
|
| 740 |
+
translated_tokens,
|
| 741 |
+
skip_special_tokens=True
|
| 742 |
+
)[0].strip()
|
| 743 |
+
|
| 744 |
+
if translated_sentence:
|
| 745 |
+
translated_sentences.append(translated_sentence)
|
| 746 |
+
else:
|
| 747 |
+
translated_sentences.append(f"[Translation failed for: {sentence}]")
|
| 748 |
+
|
| 749 |
+
except Exception as e:
|
| 750 |
+
logger.error(f"Error translating sentence {i+1}: {e}")
|
| 751 |
+
translated_sentences.append(f"[Translation error: {str(e)}]")
|
| 752 |
+
|
| 753 |
+
# Combine translated sentences
|
| 754 |
+
translated_text = ' '.join(translated_sentences)
|
| 755 |
|
| 756 |
+
if not translated_text or translated_text.startswith("[Translation error"):
|
| 757 |
+
raise HTTPException(status_code=500, detail="Translation returned empty result or encountered errors")
|
| 758 |
|
| 759 |
+
# Clean up translation
|
| 760 |
+
translated_text = re.sub(r'\s+', ' ', translated_text).strip()
|
| 761 |
|
| 762 |
# Save to cache
|
| 763 |
await save_translation_to_cache(text_hash, target_language_lower, translated_text)
|