Spaces:

danicor
/

TR

Sleeping

App Files Files Community

danicor committed on Sep 15, 2025

Commit

b250f6c

verified ·

1 Parent(s): 49a587c

Update app.py

Browse files

Files changed (1) hide show

app.py +257 -29

app.py CHANGED Viewed

@@ -1,9 +1,9 @@
-# app.py
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import time
 import json
 import hashlib
 from datetime import datetime, timedelta
 import threading
 from queue import Queue
@@ -32,6 +32,7 @@ class TranslationResponse(BaseModel):
     processing_time: float
     character_count: int
     status: str
 class TranslationCache:
     def __init__(self, cache_duration_minutes: int = 60):
@@ -99,6 +100,127 @@ class TranslationQueue:
                     thread = threading.Thread(target=worker)
                     thread.start()
 class MultilingualTranslator:
     def __init__(self, cache_duration_minutes: int = 60):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -120,46 +242,148 @@ class MultilingualTranslator:
         except Exception as e:
             logger.error(f"Error loading model: {e}")
             raise
-    def translate_text(self, text: str, source_lang: str, target_lang: str) -> Tuple[str, float]:
-        """Translate text from source to target language"""
-        start_time = time.time()
-        # Check cache first
-        cached_result = self.cache.get(text, source_lang, target_lang)
-        if cached_result:
-            return cached_result, time.time() - start_time
         try:
             # Set source language for tokenizer
             self.tokenizer.src_lang = source_lang
             # Encode input
-            encoded = self.tokenizer(text, return_tensors="pt").to(self.device)
-            # Generate translation
             generated_tokens = self.model.generate(
                 **encoded,
                 forced_bos_token_id=self.tokenizer.get_lang_id(target_lang),
-                max_length=512,
-                num_beams=4,
-                early_stopping=True
             )
             # Decode result
             translation = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
-            # Cache the result
-            self.cache.set(text, source_lang, target_lang, translation)
             processing_time = time.time() - start_time
-            logger.info(f"Translation completed in {processing_time:.2f} seconds")
-            return translation, processing_time
         except Exception as e:
             logger.error(f"Translation error: {e}")
-            return f"Translation error: {str(e)}", time.time() - start_time
 # Language mappings for M2M100 model
 LANGUAGE_MAP = {
@@ -236,7 +460,7 @@ LANGUAGE_MAP = {
 translator = MultilingualTranslator(60)
 # Create FastAPI app
-app = FastAPI(title="Multilingual Translation API", version="1.0.0")
 # Add CORS middleware
 app.add_middleware(
@@ -249,11 +473,11 @@ app.add_middleware(
 @app.get("/")
 async def root():
-    return {"message": "Multilingual Translation API", "status": "active"}
 @app.post("/api/translate")
 async def api_translate(request: TranslationRequest):
-    """API endpoint for translation"""
     if not request.text.strip():
         raise HTTPException(status_code=400, detail="No text provided")
@@ -264,7 +488,7 @@ async def api_translate(request: TranslationRequest):
         raise HTTPException(status_code=400, detail="Invalid language codes")
     try:
-        translation, processing_time = translator.translate_text(request.text, source_code, target_code)
         return TranslationResponse(
             translation=translation,
@@ -272,7 +496,8 @@ async def api_translate(request: TranslationRequest):
             target_language=request.target_lang,
             processing_time=processing_time,
             character_count=len(request.text),
-            status="success"
         )
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Translation error: {str(e)}")
@@ -280,7 +505,7 @@ async def api_translate(request: TranslationRequest):
 # Alternative endpoint for form data (compatibility with WordPress)
 @app.post("/api/translate/form")
 async def api_translate_form(request: Request):
-    """Alternative endpoint that accepts form data"""
     try:
         form_data = await request.form()
         text = form_data.get("text", "")
@@ -308,7 +533,7 @@ async def api_translate_form(request: Request):
         raise HTTPException(status_code=400, detail="Invalid language codes")
     try:
-        translation, processing_time = translator.translate_text(text, source_code, target_code)
         return {
             "translation": translation,
@@ -316,7 +541,8 @@ async def api_translate_form(request: Request):
             "target_language": target_lang,
             "processing_time": processing_time,
             "character_count": len(text),
-            "status": "success"
         }
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Translation error: {str(e)}")
@@ -337,7 +563,9 @@ async def health_check():
         "status": "healthy",
         "device": str(translator.device),
         "model": translator.model_name,
-        "cache_size": len(translator.cache.cache)
     }
 if __name__ == "__main__":

 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import time
 import json
 import hashlib
+import re
 from datetime import datetime, timedelta
 import threading
 from queue import Queue
     processing_time: float
     character_count: int
     status: str
+    chunks_processed: Optional[int] = None
 class TranslationCache:
     def __init__(self, cache_duration_minutes: int = 60):
                     thread = threading.Thread(target=worker)
                     thread.start()
class TextChunker:
    """Split long text into pieces small enough to translate one at a time.

    Splitting is hierarchical: paragraphs first, then sentences, then
    comma-separated clauses, and finally a hard cut by length.  At every
    level neighbouring pieces are packed greedily back together as long as
    the result stays within ``max_chunk_size`` characters.  The original
    punctuation is always kept; nothing is appended to the text.
    """

    # Whitespace following a sentence terminator (Latin '.', '!', '?', the
    # Arabic/Persian question mark and the Urdu full stop).  The look-behind
    # keeps the terminator attached to the sentence it ends.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?؟۔])\s+')
    # Whitespace following a Latin or Arabic/Persian comma.
    _CLAUSE_BOUNDARY = re.compile(r'(?<=[,،])\s+')

    @staticmethod
    def split_text_smart(text: str, max_chunk_size: int = 400) -> List[str]:
        """Split ``text`` into chunks of at most ``max_chunk_size`` characters.

        Args:
            text: The text to split.
            max_chunk_size: Upper bound on the length of every returned chunk.

        Returns:
            ``[text]`` unchanged when it already fits; otherwise a list of
            stripped, non-empty chunks in reading order.

        Raises:
            ValueError: If the text needs splitting and ``max_chunk_size`` is
                not positive (no progress could ever be made).
        """
        if len(text) <= max_chunk_size:
            return [text]
        if max_chunk_size <= 0:
            raise ValueError("max_chunk_size must be a positive integer")
        return TextChunker._pack(
            text.split('\n\n'), max_chunk_size, '\n\n', TextChunker._split_paragraph
        )

    @staticmethod
    def _split_paragraph(paragraph: str, max_chunk_size: int) -> List[str]:
        """Split one oversized paragraph on sentence boundaries."""
        sentences = TextChunker._SENTENCE_BOUNDARY.split(paragraph)
        return TextChunker._pack(
            sentences, max_chunk_size, ' ', TextChunker._split_by_comma
        )

    @staticmethod
    def _split_by_comma(sentence: str, max_chunk_size: int) -> List[str]:
        """Split one oversized sentence on comma boundaries."""
        clauses = TextChunker._CLAUSE_BOUNDARY.split(sentence)
        return TextChunker._pack(
            clauses, max_chunk_size, ' ', TextChunker._hard_split
        )

    @staticmethod
    def _hard_split(fragment: str, max_chunk_size: int) -> List[str]:
        """Cut a fragment with no usable punctuation purely by length.

        The cut is made at the last space inside the window when there is
        one, so words are only broken when a single word exceeds the limit.
        """
        if max_chunk_size <= 0:
            raise ValueError("max_chunk_size must be a positive integer")
        pieces = []
        rest = fragment.strip()
        while len(rest) > max_chunk_size:
            # Search from index 1 so the cut always consumes at least one char.
            cut = rest.rfind(' ', 1, max_chunk_size + 1)
            if cut == -1:
                cut = max_chunk_size
            pieces.append(rest[:cut].rstrip())
            rest = rest[cut:].lstrip()
        if rest:
            pieces.append(rest)
        return pieces

    @staticmethod
    def _pack(pieces: List[str], max_chunk_size: int, joiner: str, split_oversized) -> List[str]:
        """Greedily merge ``pieces`` into chunks no longer than the limit.

        Args:
            pieces: Candidate pieces in reading order; blank ones are skipped.
            max_chunk_size: Upper bound on the length of every chunk.
            joiner: Text placed between two pieces merged into one chunk.
            split_oversized: Callable ``(piece, max_chunk_size) -> List[str]``
                used for a piece that is itself longer than the limit.

        Returns:
            The packed chunks, each stripped and non-empty.
        """
        chunks: List[str] = []
        current = ""
        for piece in pieces:
            piece = piece.strip()
            if not piece:
                continue
            if len(piece) > max_chunk_size:
                # Too big for any chunk: emit what we have, then delegate.
                if current:
                    chunks.append(current)
                    current = ""
                chunks.extend(split_oversized(piece, max_chunk_size))
            elif current and len(current) + len(joiner) + len(piece) > max_chunk_size:
                chunks.append(current)
                current = piece
            else:
                current = f"{current}{joiner}{piece}" if current else piece
        if current:
            chunks.append(current)
        return chunks
 class MultilingualTranslator:
     def __init__(self, cache_duration_minutes: int = 60):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         except Exception as e:
             logger.error(f"Error loading model: {e}")
             raise
+        # تنظیمات بهینه برای ترجمه متن‌های بلند
+        self.max_chunk_size = 350  # حداکثر طول هر قسمت
+        self.min_chunk_overlap = 20  # همپوشانی بین قسمت‌ها
def translate_chunk(self, text: str, source_lang: str, target_lang: str) -> str:
    """Translate a single short piece of text.

    Args:
        text: The text to translate; input beyond 512 tokens is truncated
            by the tokenizer call below.
        source_lang: Language code assigned to ``tokenizer.src_lang``.
        target_lang: Language code passed to ``tokenizer.get_lang_id`` to
            force the first generated token.

    Returns:
        The stripped translation, ``""`` for blank input, or a
        ``"[Translation Error: ...]"`` marker string when anything fails
        (this method logs the error and never raises).
    """
    # Nothing to translate; do not ask the model to invent output.
    if not text or not text.strip():
        return ""

    try:
        # Set source language for tokenizer
        self.tokenizer.src_lang = source_lang

        # Encode input
        encoded = self.tokenizer(
            text, return_tensors="pt", truncation=True, max_length=512
        ).to(self.device)

        # Deterministic beam search.  No `temperature` (it only affects
        # sampling, which is disabled) and no `min_length` (a floor on the
        # output length pads the translation of short inputs).
        generated_tokens = self.model.generate(
            **encoded,
            forced_bos_token_id=self.tokenizer.get_lang_id(target_lang),
            max_length=1024,
            num_beams=5,
            early_stopping=True,
            no_repeat_ngram_size=3,
            length_penalty=1.0,
            repetition_penalty=1.2,
            do_sample=False,
            pad_token_id=self.tokenizer.pad_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
        )

        # Decode result
        translation = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        return translation.strip()

    except Exception as e:
        logger.error(f"Chunk translation error: {e}")
        return f"[Translation Error: {str(e)}]"
def translate_text(self, text: str, source_lang: str, target_lang: str) -> Tuple[str, float, int]:
    """Translate text of any length, chunking it when it is too long.

    Text no longer than ``self.max_chunk_size`` characters is translated in
    one call.  Longer text is split with ``TextChunker.split_text_smart``,
    each chunk is translated (or taken from the cache) and the pieces are
    re-joined with ``self._combine_translations``.

    Failed translations are never cached: ``translate_chunk`` reports a
    failure by returning a ``"[Translation Error: ...]"`` marker string, and
    storing that marker would replay the failure on every later request for
    the same text.

    Args:
        text: The text to translate.
        source_lang: Source language code passed through to ``translate_chunk``.
        target_lang: Target language code passed through to ``translate_chunk``.

    Returns:
        ``(translation, seconds_elapsed, chunks_processed)``.  The chunk
        count is 1 for a cache hit or a short text and 0 when an unexpected
        exception was caught, in which case the translation is a
        ``"Translation error: ..."`` message.
    """
    # Prefix of the marker string translate_chunk returns instead of raising.
    error_prefix = "[Translation Error:"
    start_time = time.time()

    # Whole-text cache lookup
    cached_result = self.cache.get(text, source_lang, target_lang)
    if cached_result:
        return cached_result, time.time() - start_time, 1

    try:
        # Short text: translate directly
        if len(text) <= self.max_chunk_size:
            translation = self.translate_chunk(text, source_lang, target_lang)
            if not translation.startswith(error_prefix):
                self.cache.set(text, source_lang, target_lang, translation)

            processing_time = time.time() - start_time
            logger.info(f"Short text translation completed in {processing_time:.2f} seconds")
            return translation, processing_time, 1

        # Long text: split into smaller pieces
        chunks = TextChunker.split_text_smart(text, self.max_chunk_size)
        logger.info(f"Split long text into {len(chunks)} chunks")

        translated_chunks = []
        any_failed = False
        last_index = len(chunks) - 1
        for i, chunk in enumerate(chunks):
            logger.info(f"Translating chunk {i+1}/{len(chunks)} (length: {len(chunk)})")

            # Per-chunk cache lookup
            chunk_translation = self.cache.get(chunk, source_lang, target_lang)
            if not chunk_translation:
                chunk_translation = self.translate_chunk(chunk, source_lang, target_lang)
                if chunk_translation.startswith(error_prefix):
                    any_failed = True
                else:
                    self.cache.set(chunk, source_lang, target_lang, chunk_translation)

                # Short pause between model calls to limit load; cache hits
                # cost nothing, so they are not followed by a pause.
                if i < last_index:
                    time.sleep(0.1)

            translated_chunks.append(chunk_translation)

        # Re-assemble the translated pieces
        final_translation = self._combine_translations(translated_chunks, text)

        # Only a fully successful result is worth remembering.
        if any_failed:
            logger.warning("One or more chunks failed to translate; result not cached")
        else:
            self.cache.set(text, source_lang, target_lang, final_translation)

        processing_time = time.time() - start_time
        logger.info(f"Long text translation completed in {processing_time:.2f} seconds ({len(chunks)} chunks)")
        return final_translation, processing_time, len(chunks)

    except Exception as e:
        logger.error(f"Translation error: {e}")
        return f"Translation error: {str(e)}", time.time() - start_time, 0
+    def _combine_translations(self, translated_chunks: List[str], original_text: str) -> str:
+        """ترکیب قسمت‌های ترجمه شده به یک متن یکپارچه"""
+        if not translated_chunks:
+            return ""
+        if len(translated_chunks) == 1:
+            return translated_chunks[0]
+        # ترکیب قسمت‌ها با در نظر گیری ساختار اصلی متن
+        combined = []
+        for i, chunk in enumerate(translated_chunks):
+            # پاک‌سازی قسمت
+            chunk = chunk.strip()
+            if not chunk:
+                continue
+            # اضافه کردن فاصله مناسب بین قسمت‌ها
+            if i > 0 and combined:
+                # اگر قسمت قبلی با نقطه تمام نمی‌شود، نقطه اضافه کن
+                if not combined[-1].rstrip().endswith(('.', '!', '?', ':', '؛', '.')):
+                    combined[-1] += '.'
+                # بررسی اینکه آیا نیاز به پاراگراف جدید داریم
+                if '\n\n' in original_text:
+                    combined.append('\n\n' + chunk)
+                else:
+                    combined.append(' ' + chunk)
+            else:
+                combined.append(chunk)
+        result = ''.join(combined)
+        # پاک‌سازی نهایی
+        result = re.sub(r'\s+', ' ', result)  # حذف فاصله‌های اضافی
+        result = re.sub(r'\.+', '.', result)  # حذف نقطه‌های تکراری
+        result = result.strip()
+        return result
 # Language mappings for M2M100 model
 LANGUAGE_MAP = {
 translator = MultilingualTranslator(60)
 # Create FastAPI app
+app = FastAPI(title="Multilingual Translation API", version="2.0.0")
 # Add CORS middleware
 app.add_middleware(
 @app.get("/")
 async def root():
+    return {"message": "Multilingual Translation API v2.0", "status": "active", "features": ["long_text_support", "smart_chunking", "cache_optimization"]}
 @app.post("/api/translate")
 async def api_translate(request: TranslationRequest):
+    """API endpoint for translation with long text support"""
     if not request.text.strip():
         raise HTTPException(status_code=400, detail="No text provided")
         raise HTTPException(status_code=400, detail="Invalid language codes")
     try:
+        translation, processing_time, chunks_count = translator.translate_text(request.text, source_code, target_code)
         return TranslationResponse(
             translation=translation,
             target_language=request.target_lang,
             processing_time=processing_time,
             character_count=len(request.text),
+            status="success",
+            chunks_processed=chunks_count
         )
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Translation error: {str(e)}")
 # Alternative endpoint for form data (compatibility with WordPress)
 @app.post("/api/translate/form")
 async def api_translate_form(request: Request):
+    """Alternative endpoint that accepts form data with long text support"""
     try:
         form_data = await request.form()
         text = form_data.get("text", "")
         raise HTTPException(status_code=400, detail="Invalid language codes")
     try:
+        translation, processing_time, chunks_count = translator.translate_text(text, source_code, target_code)
         return {
             "translation": translation,
             "target_language": target_lang,
             "processing_time": processing_time,
             "character_count": len(text),
+            "status": "success",
+            "chunks_processed": chunks_count
         }
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Translation error: {str(e)}")
         "status": "healthy",
         "device": str(translator.device),
         "model": translator.model_name,
+        "cache_size": len(translator.cache.cache),
+        "max_chunk_size": translator.max_chunk_size,
+        "version": "2.0.0"
     }
 if __name__ == "__main__":