Update app.py
Browse files
app.py
CHANGED
|
@@ -433,68 +433,236 @@ async def background_transcription(file_path: str, file_hash: str, filename: str
|
|
| 433 |
except Exception as e:
|
| 434 |
logger.error(f"Error deleting temp file: {e}")
|
| 435 |
|
| 436 |
-
def
|
| 437 |
-
"""
|
| 438 |
-
#
|
| 439 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 440 |
|
|
|
|
| 441 |
chunks = []
|
| 442 |
current_chunk = []
|
| 443 |
current_length = 0
|
| 444 |
|
| 445 |
for sentence in sentences:
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
current_chunk = [sentence]
|
| 452 |
-
current_length =
|
| 453 |
else:
|
| 454 |
current_chunk.append(sentence)
|
| 455 |
-
current_length +=
|
| 456 |
|
| 457 |
-
# Add the last chunk
|
| 458 |
if current_chunk:
|
| 459 |
-
chunks.append(' '.join(current_chunk))
|
| 460 |
|
| 461 |
-
|
|
|
|
|
|
|
|
|
|
| 462 |
|
| 463 |
-
def translate_text_chunk(text: str, target_code: str) -> str:
|
| 464 |
-
"""
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
|
| 475 |
-
|
| 476 |
-
|
|
|
|
| 477 |
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
**inputs,
|
| 481 |
-
forced_bos_token_id=translation_tokenizer.lang_code_to_id[target_code],
|
| 482 |
-
max_length=512,
|
| 483 |
-
num_beams=5,
|
| 484 |
-
early_stopping=True
|
| 485 |
-
)
|
| 486 |
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
|
|
|
|
|
|
| 492 |
|
| 493 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
except Exception as e:
|
| 496 |
-
logger.error(f"
|
| 497 |
-
|
| 498 |
|
| 499 |
@app.get("/")
|
| 500 |
async def root():
|
|
|
|
| 433 |
except Exception as e:
|
| 434 |
logger.error(f"Error deleting temp file: {e}")
|
| 435 |
|
| 436 |
+
def split_text_smartly(text: str, max_tokens: int = 400) -> list:
    """Split text into chunks that each fit within a token budget.

    Tries sentence-boundary splitting first (terminator patterns for
    English, Chinese/Japanese, and Persian/Arabic punctuation); if no
    pattern yields more than one piece, falls back to fixed-length
    character slices. Sentences are then greedily packed into chunks of
    at most ``max_tokens`` tokens, measured with the module-level
    ``translation_tokenizer`` when it works and estimated from the word
    count (~1.3 tokens/word) otherwise.

    Args:
        text: The text to split.
        max_tokens: Approximate per-chunk token budget.

    Returns:
        A list of non-empty chunk strings, or ``[text]`` when nothing
        could be produced (e.g. empty input).
    """
    # Sentence-terminator patterns, tried in order until one splits.
    sentence_patterns = [
        r'(?<=[.!?])\s+',    # English
        r'(?<=[。!?])\s*',  # Chinese/Japanese
        r'(?<=[۔؟!])\s+',    # Persian/Arabic
        r'(?<=[\.!?])\s+',   # Fallback
    ]

    sentences = []
    for pattern in sentence_patterns:
        try:
            potential_sentences = re.split(pattern, text)
        except re.error:
            # Malformed pattern — move on to the next candidate.
            continue
        if len(potential_sentences) > 1:
            sentences = potential_sentences
            break

    # No sentence boundary matched: fall back to raw length slicing.
    if not sentences or len(sentences) == 1:
        chunk_size = 200  # conservative character chunk size
        sentences = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    # Greedily pack sentences into chunks that fit the token limit.
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        if not sentence.strip():
            continue

        try:
            sentence_tokens = len(translation_tokenizer.tokenize(sentence))
        except Exception:
            # Tokenizer unavailable or failed — estimate from word count.
            sentence_tokens = len(sentence.split()) * 1.3

        if current_length + sentence_tokens > max_tokens and current_chunk:
            chunks.append(' '.join(current_chunk).strip())
            current_chunk = [sentence]
            current_length = sentence_tokens
        else:
            current_chunk.append(sentence)
            current_length += sentence_tokens

    # Flush the final partial chunk.
    if current_chunk:
        chunks.append(' '.join(current_chunk).strip())

    # Drop empty chunks; guarantee a non-empty return value.
    chunks = [chunk for chunk in chunks if chunk.strip()]

    return chunks if chunks else [text]
|
| 493 |
|
| 494 |
+
def translate_text_chunk(text: str, target_code: str, max_retries: int = 3) -> str:
    """Translate one chunk of text, retrying on failure.

    Tokenizes *text*, runs the module-level ``translation_model`` with
    beam search, and decodes the first beam. Retries up to
    ``max_retries`` times; whitespace-only input short-circuits to an
    empty string, and persistent failure yields a bracketed
    ``[Translation failed...]`` marker that callers can detect.
    """
    # Nothing to translate — avoid touching the model at all.
    if not text.strip():
        return ""

    for retry_idx in range(max_retries):
        try:
            # Dynamic output budget: roughly twice the input length,
            # capped at 1024 tokens for quality/latency balance.
            output_limit = min(1024, len(text) * 2)

            encoded = translation_tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,  # encoder-side input limit
            )

            # Move tensors onto the GPU when one is in use.
            if device == "cuda":
                encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

            generated = translation_model.generate(
                **encoded,
                forced_bos_token_id=translation_tokenizer.lang_code_to_id[target_code],
                max_length=output_limit,
                min_length=5,            # ensure a minimal output
                num_beams=4,
                early_stopping=True,
                do_sample=False,         # deterministic decoding
                temperature=1.0,
                repetition_penalty=1.1,
            )

            candidate = translation_tokenizer.batch_decode(
                generated,
                skip_special_tokens=True,
            )[0].strip()

            # Accept only a non-trivial result; otherwise retry.
            if candidate and len(candidate) > 2:
                return candidate
            logger.warning(f"Empty translation on attempt {retry_idx + 1}")

        except Exception as e:
            logger.error(f"Translation attempt {retry_idx + 1} failed: {e}")
            if retry_idx == max_retries - 1:
                return f"[Translation failed: {text[:50]}...]"

    # Every attempt produced an empty/too-short translation.
    return f"[Translation failed after {max_retries} attempts]"
|
| 544 |
+
|
| 545 |
+
# NOTE(review): no @app.post(...) decorator is visible in this hunk —
# confirm the route is registered elsewhere in the file.
async def debug_translate_endpoint(
    text: str = Form(..., min_length=1),
    target_language: str = Form(...)
):
    """Debug version of translation endpoint with detailed logging.

    Validates the target language, serves from the translation cache
    when possible, otherwise splits the text into chunks, translates
    each chunk, recombines the results, caches them, and returns a JSON
    payload including per-chunk debug information.

    Raises:
        HTTPException: 503 if the model is unavailable, 400 for an
            unsupported language, 500 if every chunk fails or an
            unexpected error occurs.
    """

    # Model is loaded at startup; without it the endpoint cannot work.
    if not translation_model:
        raise HTTPException(status_code=503, detail="Translation service not available")

    text = text.strip()
    logger.info(f"=== TRANSLATION DEBUG START ===")
    logger.info(f"Original text length: {len(text)} characters")
    logger.info(f"Original text preview: {text[:200]}...")
    logger.info(f"Target language: {target_language}")

    # Language names are matched case-insensitively against LANGUAGE_CODES.
    target_language_lower = target_language.lower()
    if target_language_lower not in LANGUAGE_CODES:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported language. Supported: {list(LANGUAGE_CODES.keys())}"
        )

    # Check cache — keyed on a hash of the stripped text plus language.
    text_hash = calculate_text_hash(text)
    cached_translation = await get_translation_from_cache(text_hash, target_language_lower)
    if cached_translation:
        logger.info("Returning cached translation")
        return JSONResponse({
            "text": text,
            "translated_text": cached_translation,
            "target_language": target_language,
            "from_cache": True
        })

    try:
        target_code = LANGUAGE_CODES[target_language_lower]

        # Smart text splitting with debug info (350 < model limit for headroom).
        chunks = split_text_smartly(text, max_tokens=350)
        logger.info(f"Text split into {len(chunks)} chunks")

        # Log every chunk up front so a failure later is easy to locate.
        for i, chunk in enumerate(chunks):
            logger.info(f"Chunk {i+1}: length={len(chunk)}, preview='{chunk[:100]}...'")

        translated_chunks = []
        # Aggregate stats returned to the caller alongside the translation.
        debug_info = {
            "total_chunks": len(chunks),
            "successful_chunks": 0,
            "failed_chunks": 0,
            "chunk_details": []
        }

        for i, chunk in enumerate(chunks):
            if not chunk.strip():
                continue

            chunk_info = {
                "chunk_id": i+1,
                "original_length": len(chunk),
                "original_preview": chunk[:100]
            }

            logger.info(f"Processing chunk {i+1}/{len(chunks)}")

            translated_chunk = translate_text_chunk(chunk, target_code)

            chunk_info["translated_preview"] = translated_chunk[:100] if translated_chunk else "FAILED"
            chunk_info["translated_length"] = len(translated_chunk) if translated_chunk else 0

            # translate_text_chunk signals failure with a "[Translation..." marker.
            if translated_chunk and not translated_chunk.startswith("[Translation"):
                translated_chunks.append(translated_chunk)
                debug_info["successful_chunks"] += 1
                chunk_info["status"] = "success"
                logger.info(f"Chunk {i+1} translated successfully: {len(translated_chunk)} chars")
            else:
                debug_info["failed_chunks"] += 1
                chunk_info["status"] = "failed"
                logger.error(f"Chunk {i+1} translation failed: {translated_chunk}")

            debug_info["chunk_details"].append(chunk_info)

        if not translated_chunks:
            logger.error("All translation chunks failed!")
            raise HTTPException(status_code=500, detail="Translation failed for all text chunks")

        # Combine translated chunks (partial success still returns a result).
        translated_text = ' '.join(translated_chunks)
        logger.info(f"Combined translation length: {len(translated_text)} characters")
        logger.info(f"Translation preview: {translated_text[:200]}...")

        # Clean up the translation: collapse runs of whitespace.
        original_length = len(translated_text)
        translated_text = re.sub(r'\s+', ' ', translated_text).strip()
        logger.info(f"After cleanup: {len(translated_text)} characters (was {original_length})")

        # Quality check: a very low output/input length ratio suggests loss.
        translation_ratio = len(translated_text) / len(text) if len(text) > 0 else 0
        logger.info(f"Translation ratio: {translation_ratio:.2f} (translated/original)")

        if translation_ratio < 0.1:
            logger.warning(f"Translation seems too short! Ratio: {translation_ratio}")

        # Save to cache for subsequent identical requests.
        await save_translation_to_cache(text_hash, target_language_lower, translated_text)

        logger.info("=== TRANSLATION DEBUG END ===")

        return JSONResponse({
            "text": text,
            "translated_text": translated_text,
            "target_language": target_language,
            "from_cache": False,
            "debug_info": debug_info,
            "translation_ratio": translation_ratio
        })

    except HTTPException:
        # Re-raise deliberate HTTP errors untouched.
        raise
    except Exception as e:
        logger.error(f"Translation error: {e}")
        raise HTTPException(status_code=500, detail=f"Translation failed: {str(e)}")
|
| 666 |
|
| 667 |
@app.get("/")
|
| 668 |
async def root():
|