Update app.py
Browse files
app.py
CHANGED
|
@@ -433,68 +433,236 @@ async def background_transcription(file_path: str, file_hash: str, filename: str
|
|
| 433 |
except Exception as e:
|
| 434 |
logger.error(f"Error deleting temp file: {e}")
|
| 435 |
|
| 436 |
-
def
|
| 437 |
-
"""
|
| 438 |
-
#
|
| 439 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 440 |
|
|
|
|
| 441 |
chunks = []
|
| 442 |
current_chunk = []
|
| 443 |
current_length = 0
|
| 444 |
|
| 445 |
for sentence in sentences:
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
current_chunk = [sentence]
|
| 452 |
-
current_length =
|
| 453 |
else:
|
| 454 |
current_chunk.append(sentence)
|
| 455 |
-
current_length +=
|
| 456 |
|
| 457 |
-
# Add the last chunk
|
| 458 |
if current_chunk:
|
| 459 |
-
chunks.append(' '.join(current_chunk))
|
| 460 |
|
| 461 |
-
|
|
|
|
|
|
|
|
|
|
| 462 |
|
| 463 |
-
def translate_text_chunk(text: str, target_code: str) -> str:
|
| 464 |
-
"""
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
|
| 475 |
-
|
| 476 |
-
|
|
|
|
| 477 |
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
**inputs,
|
| 481 |
-
forced_bos_token_id=translation_tokenizer.lang_code_to_id[target_code],
|
| 482 |
-
max_length=512,
|
| 483 |
-
num_beams=5,
|
| 484 |
-
early_stopping=True
|
| 485 |
-
)
|
| 486 |
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
|
|
|
|
|
|
| 492 |
|
| 493 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
except Exception as e:
|
| 496 |
-
logger.error(f"
|
| 497 |
-
|
| 498 |
|
| 499 |
@app.get("/")
|
| 500 |
async def root():
|
|
|
|
| 433 |
except Exception as e:
|
| 434 |
logger.error(f"Error deleting temp file: {e}")
|
| 435 |
|
| 436 |
+
def split_text_smartly(text: str, max_tokens: int = 400) -> list:
    """Split text into chunks that each fit within a token budget.

    Tries sentence-boundary splitting first (terminator patterns for
    English, Chinese/Japanese, and Persian/Arabic punctuation); if no
    pattern yields more than one piece, falls back to fixed-length
    character slices. Sentences are then greedily packed into chunks of
    at most ``max_tokens`` tokens, measured with the module-level
    ``translation_tokenizer`` when it works and estimated from the word
    count (~1.3 tokens/word) otherwise.

    Args:
        text: The text to split.
        max_tokens: Approximate per-chunk token budget.

    Returns:
        A list of non-empty chunk strings, or ``[text]`` when nothing
        could be produced (e.g. empty input).
    """
    # Sentence-terminator patterns, tried in order until one splits.
    sentence_patterns = [
        r'(?<=[.!?])\s+',    # English
        r'(?<=[。!?])\s*',  # Chinese/Japanese
        r'(?<=[۔؟!])\s+',    # Persian/Arabic
        r'(?<=[\.!?])\s+',   # Fallback
    ]

    sentences = []
    for pattern in sentence_patterns:
        try:
            potential_sentences = re.split(pattern, text)
        except re.error:
            # Malformed pattern — move on to the next candidate.
            continue
        if len(potential_sentences) > 1:
            sentences = potential_sentences
            break

    # No sentence boundary matched: fall back to raw length slicing.
    if not sentences or len(sentences) == 1:
        chunk_size = 200  # conservative character chunk size
        sentences = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    # Greedily pack sentences into chunks that fit the token limit.
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        if not sentence.strip():
            continue

        try:
            sentence_tokens = len(translation_tokenizer.tokenize(sentence))
        except Exception:
            # Tokenizer unavailable or failed — estimate from word count.
            sentence_tokens = len(sentence.split()) * 1.3

        if current_length + sentence_tokens > max_tokens and current_chunk:
            chunks.append(' '.join(current_chunk).strip())
            current_chunk = [sentence]
            current_length = sentence_tokens
        else:
            current_chunk.append(sentence)
            current_length += sentence_tokens

    # Flush the final partial chunk.
    if current_chunk:
        chunks.append(' '.join(current_chunk).strip())

    # Drop empty chunks; guarantee a non-empty return value.
    chunks = [chunk for chunk in chunks if chunk.strip()]

    return chunks if chunks else [text]
|
| 493 |
|
| 494 |
+
def translate_text_chunk(text: str, target_code: str, max_retries: int = 3) -> str:
    """Translate one chunk of text, retrying on failure.

    Tokenizes *text*, runs the module-level ``translation_model`` with
    beam search, and decodes the first beam. Retries up to
    ``max_retries`` times; whitespace-only input short-circuits to an
    empty string, and persistent failure yields a bracketed
    ``[Translation failed...]`` marker that callers can detect.
    """
    # Nothing to translate — avoid touching the model at all.
    if not text.strip():
        return ""

    for retry_idx in range(max_retries):
        try:
            # Dynamic output budget: roughly twice the input length,
            # capped at 1024 tokens for quality/latency balance.
            output_limit = min(1024, len(text) * 2)

            encoded = translation_tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,  # encoder-side input limit
            )

            # Move tensors onto the GPU when one is in use.
            if device == "cuda":
                encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

            generated = translation_model.generate(
                **encoded,
                forced_bos_token_id=translation_tokenizer.lang_code_to_id[target_code],
                max_length=output_limit,
                min_length=5,            # ensure a minimal output
                num_beams=4,
                early_stopping=True,
                do_sample=False,         # deterministic decoding
                temperature=1.0,
                repetition_penalty=1.1,
            )

            candidate = translation_tokenizer.batch_decode(
                generated,
                skip_special_tokens=True,
            )[0].strip()

            # Accept only a non-trivial result; otherwise retry.
            if candidate and len(candidate) > 2:
                return candidate
            logger.warning(f"Empty translation on attempt {retry_idx + 1}")

        except Exception as e:
            logger.error(f"Translation attempt {retry_idx + 1} failed: {e}")
            if retry_idx == max_retries - 1:
                return f"[Translation failed: {text[:50]}...]"

    # Every attempt produced an empty/too-short translation.
    return f"[Translation failed after {max_retries} attempts]"
|
| 544 |
+
|
| 545 |
+
# NOTE(review): no @app.post(...) decorator is visible in this hunk —
# confirm the route is registered elsewhere in the file.
async def debug_translate_endpoint(
    text: str = Form(..., min_length=1),
    target_language: str = Form(...)
):
    """Debug version of translation endpoint with detailed logging.

    Validates the target language, serves from the translation cache
    when possible, otherwise splits the text into chunks, translates
    each chunk, recombines the results, caches them, and returns a JSON
    payload including per-chunk debug information.

    Raises:
        HTTPException: 503 if the model is unavailable, 400 for an
            unsupported language, 500 if every chunk fails or an
            unexpected error occurs.
    """

    # Model is loaded at startup; without it the endpoint cannot work.
    if not translation_model:
        raise HTTPException(status_code=503, detail="Translation service not available")

    text = text.strip()
    logger.info(f"=== TRANSLATION DEBUG START ===")
    logger.info(f"Original text length: {len(text)} characters")
    logger.info(f"Original text preview: {text[:200]}...")
    logger.info(f"Target language: {target_language}")

    # Language names are matched case-insensitively against LANGUAGE_CODES.
    target_language_lower = target_language.lower()
    if target_language_lower not in LANGUAGE_CODES:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported language. Supported: {list(LANGUAGE_CODES.keys())}"
        )

    # Check cache — keyed on a hash of the stripped text plus language.
    text_hash = calculate_text_hash(text)
    cached_translation = await get_translation_from_cache(text_hash, target_language_lower)
    if cached_translation:
        logger.info("Returning cached translation")
        return JSONResponse({
            "text": text,
            "translated_text": cached_translation,
            "target_language": target_language,
            "from_cache": True
        })

    try:
        target_code = LANGUAGE_CODES[target_language_lower]

        # Smart text splitting with debug info (350 < model limit for headroom).
        chunks = split_text_smartly(text, max_tokens=350)
        logger.info(f"Text split into {len(chunks)} chunks")

        # Log every chunk up front so a failure later is easy to locate.
        for i, chunk in enumerate(chunks):
            logger.info(f"Chunk {i+1}: length={len(chunk)}, preview='{chunk[:100]}...'")

        translated_chunks = []
        # Aggregate stats returned to the caller alongside the translation.
        debug_info = {
            "total_chunks": len(chunks),
            "successful_chunks": 0,
            "failed_chunks": 0,
            "chunk_details": []
        }

        for i, chunk in enumerate(chunks):
            if not chunk.strip():
                continue

            chunk_info = {
                "chunk_id": i+1,
                "original_length": len(chunk),
                "original_preview": chunk[:100]
            }

            logger.info(f"Processing chunk {i+1}/{len(chunks)}")

            translated_chunk = translate_text_chunk(chunk, target_code)

            chunk_info["translated_preview"] = translated_chunk[:100] if translated_chunk else "FAILED"
            chunk_info["translated_length"] = len(translated_chunk) if translated_chunk else 0

            # translate_text_chunk signals failure with a "[Translation..." marker.
            if translated_chunk and not translated_chunk.startswith("[Translation"):
                translated_chunks.append(translated_chunk)
                debug_info["successful_chunks"] += 1
                chunk_info["status"] = "success"
                logger.info(f"Chunk {i+1} translated successfully: {len(translated_chunk)} chars")
            else:
                debug_info["failed_chunks"] += 1
                chunk_info["status"] = "failed"
                logger.error(f"Chunk {i+1} translation failed: {translated_chunk}")

            debug_info["chunk_details"].append(chunk_info)

        if not translated_chunks:
            logger.error("All translation chunks failed!")
            raise HTTPException(status_code=500, detail="Translation failed for all text chunks")

        # Combine translated chunks (partial success still returns a result).
        translated_text = ' '.join(translated_chunks)
        logger.info(f"Combined translation length: {len(translated_text)} characters")
        logger.info(f"Translation preview: {translated_text[:200]}...")

        # Clean up the translation: collapse runs of whitespace.
        original_length = len(translated_text)
        translated_text = re.sub(r'\s+', ' ', translated_text).strip()
        logger.info(f"After cleanup: {len(translated_text)} characters (was {original_length})")

        # Quality check: a very low output/input length ratio suggests loss.
        translation_ratio = len(translated_text) / len(text) if len(text) > 0 else 0
        logger.info(f"Translation ratio: {translation_ratio:.2f} (translated/original)")

        if translation_ratio < 0.1:
            logger.warning(f"Translation seems too short! Ratio: {translation_ratio}")

        # Save to cache for subsequent identical requests.
        await save_translation_to_cache(text_hash, target_language_lower, translated_text)

        logger.info("=== TRANSLATION DEBUG END ===")

        return JSONResponse({
            "text": text,
            "translated_text": translated_text,
            "target_language": target_language,
            "from_cache": False,
            "debug_info": debug_info,
            "translation_ratio": translation_ratio
        })

    except HTTPException:
        # Re-raise deliberate HTTP errors untouched.
        raise
    except Exception as e:
        logger.error(f"Translation error: {e}")
        raise HTTPException(status_code=500, detail=f"Translation failed: {str(e)}")
|
| 666 |
|
| 667 |
@app.get("/")
|
| 668 |
async def root():
|