danicor committed on
Commit
7463e7e
·
verified ·
1 Parent(s): 53d6860

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -12
app.py CHANGED
@@ -701,22 +701,63 @@ async def translate_endpoint(
701
  try:
702
  target_code = LANGUAGE_CODES[target_language_lower]
703
 
704
- # Split text into chunks for translation
705
- chunks = split_text_into_chunks(text)
706
- logger.info(f"Split text into {len(chunks)} chunks for translation")
707
 
708
- translated_chunks = []
709
 
710
- for i, chunk in enumerate(chunks):
711
- logger.info(f"Translating chunk {i+1}/{len(chunks)} (length: {len(chunk)} chars)")
712
- translated_chunk = translate_text_chunk(chunk, target_code)
713
- translated_chunks.append(translated_chunk)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
714
 
715
- # Combine translated chunks
716
- translated_text = ' '.join(translated_chunks)
717
 
718
- if not translated_text:
719
- raise HTTPException(status_code=500, detail="Translation returned empty result")
720
 
721
  # Save to cache
722
  await save_translation_to_cache(text_hash, target_language_lower, translated_text)
 
701
  try:
702
  target_code = LANGUAGE_CODES[target_language_lower]
703
 
704
+ # Split text into sentences for better translation
705
+ sentences = re.split(r'(?<=[.!?])\s+', text)
706
+ logger.info(f"Split text into {len(sentences)} sentences")
707
 
708
+ translated_sentences = []
709
 
710
+ for i, sentence in enumerate(sentences):
711
+ if not sentence.strip():
712
+ continue
713
+
714
+ logger.info(f"Translating sentence {i+1}/{len(sentences)}: '{sentence[:50]}...'")
715
+
716
+ try:
717
+ # Tokenize input
718
+ inputs = translation_tokenizer(
719
+ sentence,
720
+ return_tensors="pt",
721
+ padding=True,
722
+ truncation=True,
723
+ max_length=512
724
+ )
725
+
726
+ if device == "cuda":
727
+ inputs = {k: v.to(device) for k, v in inputs.items()}
728
+
729
+ # Generate translation
730
+ translated_tokens = translation_model.generate(
731
+ **inputs,
732
+ forced_bos_token_id=translation_tokenizer.lang_code_to_id[target_code],
733
+ max_length=512,
734
+ num_beams=4,
735
+ early_stopping=True
736
+ )
737
+
738
+ # Decode output
739
+ translated_sentence = translation_tokenizer.batch_decode(
740
+ translated_tokens,
741
+ skip_special_tokens=True
742
+ )[0].strip()
743
+
744
+ if translated_sentence:
745
+ translated_sentences.append(translated_sentence)
746
+ else:
747
+ translated_sentences.append(f"[Translation failed for: {sentence}]")
748
+
749
+ except Exception as e:
750
+ logger.error(f"Error translating sentence {i+1}: {e}")
751
+ translated_sentences.append(f"[Translation error: {str(e)}]")
752
+
753
+ # Combine translated sentences
754
+ translated_text = ' '.join(translated_sentences)
755
 
756
+ if not translated_text or translated_text.startswith("[Translation error"):
757
+ raise HTTPException(status_code=500, detail="Translation returned empty result or encountered errors")
758
 
759
+ # Clean up translation
760
+ translated_text = re.sub(r'\s+', ' ', translated_text).strip()
761
 
762
  # Save to cache
763
  await save_translation_to_cache(text_hash, target_language_lower, translated_text)