danicor committed on
Commit
7463e7e
·
verified ·
1 Parent(s): 53d6860

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -12
app.py CHANGED
@@ -701,22 +701,63 @@ async def translate_endpoint(
701
  try:
702
  target_code = LANGUAGE_CODES[target_language_lower]
703
 
704
- # Split text into chunks for translation
705
- chunks = split_text_into_chunks(text)
706
- logger.info(f"Split text into {len(chunks)} chunks for translation")
707
 
708
- translated_chunks = []
709
 
710
- for i, chunk in enumerate(chunks):
711
- logger.info(f"Translating chunk {i+1}/{len(chunks)} (length: {len(chunk)} chars)")
712
- translated_chunk = translate_text_chunk(chunk, target_code)
713
- translated_chunks.append(translated_chunk)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
714
 
715
- # Combine translated chunks
716
- translated_text = ' '.join(translated_chunks)
717
 
718
- if not translated_text:
719
- raise HTTPException(status_code=500, detail="Translation returned empty result")
720
 
721
  # Save to cache
722
  await save_translation_to_cache(text_hash, target_language_lower, translated_text)
 
701
  try:
702
  target_code = LANGUAGE_CODES[target_language_lower]
703
 
704
+ # Split text into sentences for better translation
705
+ sentences = re.split(r'(?<=[.!?])\s+', text)
706
+ logger.info(f"Split text into {len(sentences)} sentences")
707
 
708
+ translated_sentences = []
709
 
710
+ for i, sentence in enumerate(sentences):
711
+ if not sentence.strip():
712
+ continue
713
+
714
+ logger.info(f"Translating sentence {i+1}/{len(sentences)}: '{sentence[:50]}...'")
715
+
716
+ try:
717
+ # Tokenize input
718
+ inputs = translation_tokenizer(
719
+ sentence,
720
+ return_tensors="pt",
721
+ padding=True,
722
+ truncation=True,
723
+ max_length=512
724
+ )
725
+
726
+ if device == "cuda":
727
+ inputs = {k: v.to(device) for k, v in inputs.items()}
728
+
729
+ # Generate translation
730
+ translated_tokens = translation_model.generate(
731
+ **inputs,
732
+ forced_bos_token_id=translation_tokenizer.lang_code_to_id[target_code],
733
+ max_length=512,
734
+ num_beams=4,
735
+ early_stopping=True
736
+ )
737
+
738
+ # Decode output
739
+ translated_sentence = translation_tokenizer.batch_decode(
740
+ translated_tokens,
741
+ skip_special_tokens=True
742
+ )[0].strip()
743
+
744
+ if translated_sentence:
745
+ translated_sentences.append(translated_sentence)
746
+ else:
747
+ translated_sentences.append(f"[Translation failed for: {sentence}]")
748
+
749
+ except Exception as e:
750
+ logger.error(f"Error translating sentence {i+1}: {e}")
751
+ translated_sentences.append(f"[Translation error: {str(e)}]")
752
+
753
+ # Combine translated sentences
754
+ translated_text = ' '.join(translated_sentences)
755
 
756
+ if not translated_text or translated_text.startswith("[Translation error"):
757
+ raise HTTPException(status_code=500, detail="Translation returned empty result or encountered errors")
758
 
759
+ # Clean up translation
760
+ translated_text = re.sub(r'\s+', ' ', translated_text).strip()
761
 
762
  # Save to cache
763
  await save_translation_to_cache(text_hash, target_language_lower, translated_text)