Spaces:

kevansoon
/

backend

Sleeping

App Files Files Community

KevanSoon committed on Aug 18, 2025

Commit

9b421db

1 Parent(s): 9bab5c0

temp change to nllb

Browse files

Files changed (1) hide show

app.py +38 -26

app.py CHANGED Viewed

@@ -831,7 +831,7 @@ async def get_user_documents(
 # ----------------------------------Start OF PYTESSERACT workflow-----------------------------------
-# --- SEA-LION API HELPER --- #
 async def call_sealion_for_translation(prompt: str) -> str:
     """Send one prompt to Sea-Lion and return raw text output."""
@@ -849,7 +849,7 @@ async def call_sealion_for_translation(prompt: str) -> str:
     payload = {
         "max_completion_tokens": 2048,
         "messages": [{"role": "user", "content": prompt}],
-        "model": "aisingapore/Llama-SEA-LION-v3-70B-IT",
     }
     async with httpx.AsyncClient() as client:
@@ -866,29 +866,42 @@ async def call_sealion_for_translation(prompt: str) -> str:
             return f"Translation Error"
-async def batch_translate(texts: list[str], lang: str) -> list[str]:
-    """Batch texts into one request, return aligned translations as list."""
-    if not texts:
-        return []
-    numbered_texts = "\n".join(
-        [f"{i+1}. {t}" for i, t in enumerate(texts) if t.strip()]
-    )
-    prompt = f"""Translate the following texts to {lang}.
-Return ONLY the translations as a numbered list, same order.
-{numbered_texts}
-"""
-    raw_output = await call_sealion_for_translation(prompt)
     translations = []
-    for line in raw_output.splitlines():
-        if ". " in line:
-            translations.append(line.partition(". ")[2].strip())
-    # Ensure lengths align (pad with original if mismatch)
-    while len(translations) < len(texts):
-        translations.append(texts[len(translations)])
     return translations
@@ -922,7 +935,7 @@ async def extract_text_and_boxes_with_paddle(image_bytes: bytes) -> list[dict]:
     try:
         def do_ocr() -> list[dict]:
-            client = Client("kevansoon/PaddleOCR")
             result = client.predict(
                 img=handle_file(temp_filepath),
                 lang="en",
@@ -939,7 +952,7 @@ async def extract_text_and_boxes_with_paddle(image_bytes: bytes) -> list[dict]:
 # --- TRANSLATION FUNCTIONS --- #
 async def translate_hocr_html_batched(hocr_html: str, target_language: str) -> str:
-    """Batch translate all hOCR words/lines at once."""
     soup = BeautifulSoup(hocr_html, "html.parser")
     elements_to_translate = soup.find_all(class_="ocrx_word")
     if not elements_to_translate:
@@ -958,7 +971,7 @@ async def translate_hocr_html_batched(hocr_html: str, target_language: str) -> s
 async def translate_paddle_data_batched(
     paddle_data: list[dict], target_language: str
 ) -> list[dict]:
-    """Batch translate PaddleOCR text fields at once."""
     original_texts = [item.get("text", "") for item in paddle_data]
     translated_texts = await batch_translate(original_texts, target_language)
@@ -1039,7 +1052,7 @@ async def generate_html_from_dual_ocr(
 async def translate_document_dual_ocr(
     target_language: str = Form(...), file: UploadFile = File(...)
 ):
-    """Full dual OCR + translation pipeline with batching."""
     if file.content_type not in ["image/png", "image/jpeg", "image/bmp", "image/tiff"]:
         raise HTTPException(
             status_code=400,
@@ -1063,7 +1076,7 @@ async def translate_document_dual_ocr(
                 detail="Neither OCR engine could extract text.",
             )
-        # Step 2: Batched translation
         translated_hocr_task = translate_hocr_html_batched(hocr_html, target_language)
         translated_paddle_task = translate_paddle_data_batched(
             paddle_data, target_language
@@ -1088,5 +1101,4 @@ async def translate_document_dual_ocr(
 # ----------------------------------End OF PYTESSERACT + PADDLEOCR workflow-----------------------------------

 # ----------------------------------Start OF PYTESSERACT workflow-----------------------------------
+# --- SEA-LION API HELPER (kept, but not used) --- #
 async def call_sealion_for_translation(prompt: str) -> str:
     """Send one prompt to Sea-Lion and return raw text output."""
     payload = {
         "max_completion_tokens": 2048,
         "messages": [{"role": "user", "content": prompt}],
+        "model": "aisingapore/Llama-SEA-LION-v3.5-8B-R",
     }
     async with httpx.AsyncClient() as client:
             return f"Translation Error"
+# --- NLLB HELPER --- #
+# Module-level client for "UNESCO/nllb", created once when this module is
+# loaded and reused by call_nllb_for_translation for every request.
+# NOTE(review): GradioClient is presumably an alias of gradio_client.Client;
+# confirm against the file's imports.
+nllb_client = GradioClient("UNESCO/nllb")
+def call_nllb_for_translation(text: str, src_lang: str, tgt_lang: str) -> str:
+    """Translate one string through the `/translate` endpoint of `nllb_client`.
+
+    Args:
+        text: The string to translate.
+        src_lang: Source language, forwarded unchanged to the endpoint.
+        tgt_lang: Target language, forwarded unchanged to the endpoint.
+
+    Returns:
+        The endpoint result converted to `str` and stripped. If the call or
+        the conversion raises, the error is printed and the string
+        "Translation Error: <text>" is returned instead of propagating.
+    """
+    request = {"text": text, "src_lang": src_lang, "tgt_lang": tgt_lang}
+    try:
+        raw = nllb_client.predict(api_name="/translate", **request)
+        translated = str(raw).strip()
+    except Exception as err:
+        # Best-effort: callers always get a string back, never an exception.
+        print(f"NLLB translation failed: {err}")
+        translated = f"Translation Error: {text}"
+    return translated
+# --- BATCH TRANSLATION (NLLB ONLY) --- #
+async def batch_translate(texts: list[str], tgt_lang: str, src_lang: str = "English") -> list[str]:
+    """Translate every entry of `texts`, returning one result per input in order.
+
+    Args:
+        texts: Strings to translate; a falsy value yields an empty list.
+        tgt_lang: Target language passed to call_nllb_for_translation.
+        src_lang: Source language passed to call_nllb_for_translation.
+
+    Returns:
+        A list aligned with `texts`. Blank or whitespace-only entries become ""
+        without a translation call.
+    """
+    results = []
+    if texts:
+        for source_text in texts:
+            if source_text.strip():
+                # The helper is synchronous, so each call is handed to
+                # asyncio.to_thread and awaited before the next one starts.
+                outcome = await asyncio.to_thread(
+                    call_nllb_for_translation, source_text, src_lang, tgt_lang
+                )
+            else:
+                outcome = ""
+            results.append(outcome)
+    return results
     try:
         def do_ocr() -> list[dict]:
+            client = HFClient("kevansoon/PaddleOCR")
             result = client.predict(
                 img=handle_file(temp_filepath),
                 lang="en",
 # --- TRANSLATION FUNCTIONS --- #
 async def translate_hocr_html_batched(hocr_html: str, target_language: str) -> str:
+    """Batch translate all hOCR words/lines at once with NLLB."""
     soup = BeautifulSoup(hocr_html, "html.parser")
     elements_to_translate = soup.find_all(class_="ocrx_word")
     if not elements_to_translate:
 async def translate_paddle_data_batched(
     paddle_data: list[dict], target_language: str
 ) -> list[dict]:
+    """Batch translate PaddleOCR text fields with NLLB."""
     original_texts = [item.get("text", "") for item in paddle_data]
     translated_texts = await batch_translate(original_texts, target_language)
 async def translate_document_dual_ocr(
     target_language: str = Form(...), file: UploadFile = File(...)
 ):
+    """Full dual OCR + translation pipeline with NLLB (Sea-Lion calls commented out)."""
     if file.content_type not in ["image/png", "image/jpeg", "image/bmp", "image/tiff"]:
         raise HTTPException(
             status_code=400,
                 detail="Neither OCR engine could extract text.",
             )
+        # Step 2: Translation (NLLB)
         translated_hocr_task = translate_hocr_html_batched(hocr_html, target_language)
         translated_paddle_task = translate_paddle_data_batched(
             paddle_data, target_language
 # ----------------------------------End OF PYTESSERACT + PADDLEOCR workflow-----------------------------------