danicor committed on
Commit
2e6183c
·
verified ·
1 Parent(s): 7463e7e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +210 -42
app.py CHANGED
@@ -433,68 +433,236 @@ async def background_transcription(file_path: str, file_hash: str, filename: str
433
  except Exception as e:
434
  logger.error(f"Error deleting temp file: {e}")
435
 
436
- def split_text_into_chunks(text: str, max_tokens: int = 400) -> list:
437
- """Split text into chunks for translation"""
438
- # Split by sentences first
439
- sentences = re.split(r'(?<=[.!?])\s+', text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
 
 
441
  chunks = []
442
  current_chunk = []
443
  current_length = 0
444
 
445
  for sentence in sentences:
446
- sentence_length = len(translation_tokenizer.tokenize(sentence))
447
-
448
- if current_length + sentence_length > max_tokens and current_chunk:
449
- # Save current chunk and start new one
450
- chunks.append(' '.join(current_chunk))
 
 
 
 
 
 
451
  current_chunk = [sentence]
452
- current_length = sentence_length
453
  else:
454
  current_chunk.append(sentence)
455
- current_length += sentence_length
456
 
457
- # Add the last chunk
458
  if current_chunk:
459
- chunks.append(' '.join(current_chunk))
460
 
461
- return chunks
 
 
 
462
 
463
- def translate_text_chunk(text: str, target_code: str) -> str:
464
- """Translate a single chunk of text"""
465
- try:
466
- # Tokenize input
467
- inputs = translation_tokenizer(
468
- text,
469
- return_tensors="pt",
470
- padding=True,
471
- truncation=True,
472
- max_length=512
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
 
475
- if device == "cuda":
476
- inputs = {k: v.to(device) for k, v in inputs.items()}
 
477
 
478
- # Generate translation
479
- translated_tokens = translation_model.generate(
480
- **inputs,
481
- forced_bos_token_id=translation_tokenizer.lang_code_to_id[target_code],
482
- max_length=512,
483
- num_beams=5,
484
- early_stopping=True
485
- )
486
 
487
- # Decode output
488
- translated_text = translation_tokenizer.batch_decode(
489
- translated_tokens,
490
- skip_special_tokens=True
491
- )[0].strip()
 
 
492
 
493
- return translated_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
494
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
  except Exception as e:
496
- logger.error(f"Error translating chunk: {e}")
497
- return f"[Translation error: {str(e)}]"
498
 
499
  @app.get("/")
500
  async def root():
 
433
  except Exception as e:
434
  logger.error(f"Error deleting temp file: {e}")
435
 
436
def split_text_smartly(text: str, max_tokens: int = 400) -> list:
    """Split *text* into translation-sized chunks.

    Tries sentence-boundary splitting for several scripts first; if no
    boundary is found, falls back to fixed-size character slices. Sentences
    are then greedily packed into chunks whose token count stays under
    ``max_tokens``.

    Args:
        text: The text to split.
        max_tokens: Upper bound on tokens per chunk.

    Returns:
        A non-empty list of chunk strings (``[text]`` if nothing could be
        split or every chunk came out empty).
    """
    # First pattern that actually splits the text wins.
    sentence_patterns = [
        r'(?<=[.!?])\s+',    # English / Latin punctuation
        r'(?<=[。!?])\s*',  # Chinese / Japanese full-width punctuation
        r'(?<=[۔؟!])\s+',    # Persian / Arabic punctuation
        r'(?<=[\.!?])\s+',   # Fallback (equivalent to the Latin pattern)
    ]

    sentences = []
    for pattern in sentence_patterns:
        try:
            potential_sentences = re.split(pattern, text)
            if len(potential_sentences) > 1:
                sentences = potential_sentences
                break
        except re.error:  # Fix: was a bare except; only guard bad patterns
            continue

    # No sentence boundary found: fall back to fixed-size character slices.
    if not sentences or len(sentences) == 1:
        chunk_size = 200  # conservative, comfortably under the token limit
        sentences = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    # Greedily pack sentences into chunks that fit the token limit.
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        if not sentence.strip():
            continue

        try:
            sentence_tokens = len(translation_tokenizer.tokenize(sentence))
        except Exception:  # Fix: was a bare except; tokenizer may be unavailable
            # Rough estimate: subword tokenizers average ~1.3 tokens per word.
            sentence_tokens = len(sentence.split()) * 1.3

        if current_length + sentence_tokens > max_tokens and current_chunk:
            # Current chunk is full: flush it and start a new one.
            chunks.append(' '.join(current_chunk).strip())
            current_chunk = [sentence]
            current_length = sentence_tokens
        else:
            current_chunk.append(sentence)
            current_length += sentence_tokens

    # Flush the trailing chunk.
    if current_chunk:
        chunks.append(' '.join(current_chunk).strip())

    # Drop empty chunks; never return an empty list so callers always
    # have something to translate.
    chunks = [chunk for chunk in chunks if chunk.strip()]
    return chunks if chunks else [text]
493
 
494
def translate_text_chunk(text: str, target_code: str, max_retries: int = 3) -> str:
    """Translate one chunk of text into the target language, retrying on failure.

    Args:
        text: The chunk to translate; whitespace-only input returns "".
        target_code: Tokenizer language code used as the forced BOS token.
        max_retries: How many generation attempts before giving up.

    Returns:
        The translated text, or a ``"[Translation failed ...]"`` marker
        string if every attempt failed or produced an empty result.
    """
    if not text.strip():
        return ""

    for attempt in range(max_retries):
        try:
            # Dynamic output budget, capped at 1024.
            # NOTE(review): len(text) counts characters while max_length counts
            # tokens — confirm this heuristic produces a large enough budget.
            max_length = min(1024, len(text) * 2)

            inputs = translation_tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512  # Input limit
            )

            if device == "cuda":
                inputs = {k: v.to(device) for k, v in inputs.items()}

            # Deterministic beam search.
            # Fix: dropped temperature=1.0 — it is ignored when do_sample=False
            # (1.0 is the default anyway) and triggers a transformers warning.
            translated_tokens = translation_model.generate(
                **inputs,
                forced_bos_token_id=translation_tokenizer.lang_code_to_id[target_code],
                max_length=max_length,  # Use dynamic max length
                min_length=5,  # Ensure minimum output
                num_beams=4,
                early_stopping=True,
                do_sample=False,  # Deterministic output
                repetition_penalty=1.1
            )

            translated_text = translation_tokenizer.batch_decode(
                translated_tokens,
                skip_special_tokens=True
            )[0].strip()

            # Treat near-empty output as a failed attempt and retry.
            if translated_text and len(translated_text) > 2:
                return translated_text
            else:
                logger.warning(f"Empty translation on attempt {attempt + 1}")

        except Exception as e:
            logger.error(f"Translation attempt {attempt + 1} failed: {e}")
            if attempt == max_retries - 1:
                return f"[Translation failed: {text[:50]}...]"

    return f"[Translation failed after {max_retries} attempts]"
544
+
545
# NOTE(review): no @app.post(...) decorator is visible in this chunk — confirm
# the route is registered elsewhere, otherwise this endpoint is unreachable.
async def debug_translate_endpoint(
    text: str = Form(..., min_length=1),
    target_language: str = Form(...)
):
    """Debug version of translation endpoint with detailed logging.

    Validates the target language, serves cached translations when available,
    otherwise splits the text into chunks, translates each chunk, combines the
    results, caches them, and returns a JSON payload including per-chunk
    debug information and the translated/original length ratio.
    """

    # The translation model is loaded at startup; without it we cannot serve.
    if not translation_model:
        raise HTTPException(status_code=503, detail="Translation service not available")

    text = text.strip()
    logger.info(f"=== TRANSLATION DEBUG START ===")
    logger.info(f"Original text length: {len(text)} characters")
    logger.info(f"Original text preview: {text[:200]}...")
    logger.info(f"Target language: {target_language}")

    # Language lookup is case-insensitive against the LANGUAGE_CODES mapping.
    target_language_lower = target_language.lower()
    if target_language_lower not in LANGUAGE_CODES:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported language. Supported: {list(LANGUAGE_CODES.keys())}"
        )

    # Check cache keyed by (text hash, language) before doing any model work.
    text_hash = calculate_text_hash(text)
    cached_translation = await get_translation_from_cache(text_hash, target_language_lower)
    if cached_translation:
        logger.info("Returning cached translation")
        return JSONResponse({
            "text": text,
            "translated_text": cached_translation,
            "target_language": target_language,
            "from_cache": True
        })

    try:
        target_code = LANGUAGE_CODES[target_language_lower]

        # Smart text splitting with debug info; 350 leaves headroom below the
        # tokenizer's 512-token input limit.
        chunks = split_text_smartly(text, max_tokens=350)
        logger.info(f"Text split into {len(chunks)} chunks")

        for i, chunk in enumerate(chunks):
            logger.info(f"Chunk {i+1}: length={len(chunk)}, preview='{chunk[:100]}...'")

        translated_chunks = []
        # Per-request bookkeeping returned to the caller for debugging.
        debug_info = {
            "total_chunks": len(chunks),
            "successful_chunks": 0,
            "failed_chunks": 0,
            "chunk_details": []
        }

        for i, chunk in enumerate(chunks):
            if not chunk.strip():
                continue

            chunk_info = {
                "chunk_id": i+1,
                "original_length": len(chunk),
                "original_preview": chunk[:100]
            }

            logger.info(f"Processing chunk {i+1}/{len(chunks)}")

            translated_chunk = translate_text_chunk(chunk, target_code)

            chunk_info["translated_preview"] = translated_chunk[:100] if translated_chunk else "FAILED"
            chunk_info["translated_length"] = len(translated_chunk) if translated_chunk else 0

            # translate_text_chunk signals failure with a "[Translation ..." marker
            # string rather than raising, so detect it by prefix here.
            if translated_chunk and not translated_chunk.startswith("[Translation"):
                translated_chunks.append(translated_chunk)
                debug_info["successful_chunks"] += 1
                chunk_info["status"] = "success"
                logger.info(f"Chunk {i+1} translated successfully: {len(translated_chunk)} chars")
            else:
                debug_info["failed_chunks"] += 1
                chunk_info["status"] = "failed"
                logger.error(f"Chunk {i+1} translation failed: {translated_chunk}")

            debug_info["chunk_details"].append(chunk_info)

        # Partial success is tolerated; only a total failure becomes a 500.
        if not translated_chunks:
            logger.error("All translation chunks failed!")
            raise HTTPException(status_code=500, detail="Translation failed for all text chunks")

        # Combine translated chunks
        translated_text = ' '.join(translated_chunks)
        logger.info(f"Combined translation length: {len(translated_text)} characters")
        logger.info(f"Translation preview: {translated_text[:200]}...")

        # Clean up the translation: collapse runs of whitespace.
        original_length = len(translated_text)
        translated_text = re.sub(r'\s+', ' ', translated_text).strip()
        logger.info(f"After cleanup: {len(translated_text)} characters (was {original_length})")

        # Quality check: character-length ratio of translation to original.
        translation_ratio = len(translated_text) / len(text) if len(text) > 0 else 0
        logger.info(f"Translation ratio: {translation_ratio:.2f} (translated/original)")

        # Heuristic sanity threshold; a very short translation is logged but
        # still returned to the caller.
        if translation_ratio < 0.1:
            logger.warning(f"Translation seems too short! Ratio: {translation_ratio}")

        # Save to cache so repeated requests skip the model entirely.
        await save_translation_to_cache(text_hash, target_language_lower, translated_text)

        logger.info("=== TRANSLATION DEBUG END ===")

        return JSONResponse({
            "text": text,
            "translated_text": translated_text,
            "target_language": target_language,
            "from_cache": False,
            "debug_info": debug_info,
            "translation_ratio": translation_ratio
        })

    except HTTPException:
        # Re-raise HTTP errors untouched so FastAPI preserves their status codes.
        raise
    except Exception as e:
        logger.error(f"Translation error: {e}")
        raise HTTPException(status_code=500, detail=f"Translation failed: {str(e)}")
666
 
667
  @app.get("/")
668
  async def root():