KevanSoon committed on
Commit
9b421db
·
1 Parent(s): 9bab5c0

temp change to nllb

Browse files
Files changed (1) hide show
  1. app.py +38 -26
app.py CHANGED
@@ -831,7 +831,7 @@ async def get_user_documents(
831
 
832
  # ----------------------------------Start OF PYTESSERACT workflow-----------------------------------
833
 
834
- # --- SEA-LION API HELPER --- #
835
 
836
  async def call_sealion_for_translation(prompt: str) -> str:
837
  """Send one prompt to Sea-Lion and return raw text output."""
@@ -849,7 +849,7 @@ async def call_sealion_for_translation(prompt: str) -> str:
849
  payload = {
850
  "max_completion_tokens": 2048,
851
  "messages": [{"role": "user", "content": prompt}],
852
- "model": "aisingapore/Llama-SEA-LION-v3-70B-IT",
853
  }
854
 
855
  async with httpx.AsyncClient() as client:
@@ -866,29 +866,42 @@ async def call_sealion_for_translation(prompt: str) -> str:
866
  return f"Translation Error"
867
 
868
 
869
async def batch_translate(texts: list[str], lang: str) -> list[str]:
    """Translate ``texts`` to ``lang`` with a single Sea-Lion request.

    Every non-blank text is sent in one prompt as a numbered list, where the
    number is the text's 1-based position in ``texts``. The reply is mapped
    back by the number at the start of each line rather than by line order,
    so a skipped blank entry or an extra line in the reply cannot shift a
    translation onto the wrong input.

    Args:
        texts: Strings to translate. Blank / whitespace-only entries are
            never sent to the model and are returned unchanged.
        lang: Target language name, interpolated into the prompt.

    Returns:
        A list with exactly one entry per input, in input order. Any entry
        for which no usable numbered line was found in the reply keeps its
        original text.
    """
    import re  # local import: only this helper needs it

    if not texts:
        return []

    # Start from the originals so anything the model omits falls back to
    # the untranslated text instead of leaving a gap.
    results = list(texts)

    # (index, text) pairs that actually need translating.
    pending = [(idx, text) for idx, text in enumerate(texts) if text.strip()]
    if not pending:
        # Nothing translatable: do not spend an API call on an empty list.
        return results

    numbered_texts = "\n".join(f"{idx + 1}. {text}" for idx, text in pending)
    prompt = (
        f"Translate the following texts to {lang}.\n"
        "Return ONLY the translations as a numbered list, same order.\n"
        "\n"
        f"{numbered_texts}\n"
    )

    raw_output = await call_sealion_for_translation(prompt)

    wanted = {idx for idx, _ in pending}
    filled = set()
    numbered_line = re.compile(r"\s*(\d+)\.\s+(.*)")
    for line in raw_output.splitlines():
        match = numbered_line.match(line)
        if match is None:
            # Preamble, blank line, or an error string: not a list item.
            continue
        idx = int(match.group(1)) - 1
        translated = match.group(2).strip()
        # Accept only numbers we asked for, only the first time they appear,
        # and only when there is actual text after the number.
        if idx in wanted and idx not in filled and translated:
            results[idx] = translated
            filled.add(idx)

    return results
893
 
894
 
@@ -922,7 +935,7 @@ async def extract_text_and_boxes_with_paddle(image_bytes: bytes) -> list[dict]:
922
 
923
  try:
924
  def do_ocr() -> list[dict]:
925
- client = Client("kevansoon/PaddleOCR")
926
  result = client.predict(
927
  img=handle_file(temp_filepath),
928
  lang="en",
@@ -939,7 +952,7 @@ async def extract_text_and_boxes_with_paddle(image_bytes: bytes) -> list[dict]:
939
  # --- TRANSLATION FUNCTIONS --- #
940
 
941
  async def translate_hocr_html_batched(hocr_html: str, target_language: str) -> str:
942
- """Batch translate all hOCR words/lines at once."""
943
  soup = BeautifulSoup(hocr_html, "html.parser")
944
  elements_to_translate = soup.find_all(class_="ocrx_word")
945
  if not elements_to_translate:
@@ -958,7 +971,7 @@ async def translate_hocr_html_batched(hocr_html: str, target_language: str) -> s
958
  async def translate_paddle_data_batched(
959
  paddle_data: list[dict], target_language: str
960
  ) -> list[dict]:
961
- """Batch translate PaddleOCR text fields at once."""
962
  original_texts = [item.get("text", "") for item in paddle_data]
963
  translated_texts = await batch_translate(original_texts, target_language)
964
 
@@ -1039,7 +1052,7 @@ async def generate_html_from_dual_ocr(
1039
  async def translate_document_dual_ocr(
1040
  target_language: str = Form(...), file: UploadFile = File(...)
1041
  ):
1042
- """Full dual OCR + translation pipeline with batching."""
1043
  if file.content_type not in ["image/png", "image/jpeg", "image/bmp", "image/tiff"]:
1044
  raise HTTPException(
1045
  status_code=400,
@@ -1063,7 +1076,7 @@ async def translate_document_dual_ocr(
1063
  detail="Neither OCR engine could extract text.",
1064
  )
1065
 
1066
- # Step 2: Batched translation
1067
  translated_hocr_task = translate_hocr_html_batched(hocr_html, target_language)
1068
  translated_paddle_task = translate_paddle_data_batched(
1069
  paddle_data, target_language
@@ -1088,5 +1101,4 @@ async def translate_document_dual_ocr(
1088
 
1089
 
1090
 
1091
-
1092
  # ----------------------------------End OF PYTESSERACT + PADDLEOCR workflow-----------------------------------
 
831
 
832
  # ----------------------------------Start OF PYTESSERACT workflow-----------------------------------
833
 
834
+ # --- SEA-LION API HELPER (kept, but not used) --- #
835
 
836
  async def call_sealion_for_translation(prompt: str) -> str:
837
  """Send one prompt to Sea-Lion and return raw text output."""
 
849
  payload = {
850
  "max_completion_tokens": 2048,
851
  "messages": [{"role": "user", "content": prompt}],
852
+ "model": "aisingapore/Llama-SEA-LION-v3.5-8B-R",
853
  }
854
 
855
  async with httpx.AsyncClient() as client:
 
866
  return f"Translation Error"
867
 
868
 
869
+ # --- NLLB HELPER --- #
 
 
 
870
 
871
+ nllb_client = GradioClient("UNESCO/nllb")
 
 
 
 
872
 
873
def call_nllb_for_translation(text: str, src_lang: str, tgt_lang: str) -> str:
    """Translate one string through the module-level ``nllb_client``.

    Sends ``text`` with the given source and target languages to the
    client's ``/translate`` endpoint and returns the reply converted to
    ``str`` with surrounding whitespace stripped.

    This is a blocking call. Any exception raised while calling the client
    or converting its reply is printed, and the string
    ``"Translation Error: <text>"`` is returned instead of raising.
    """
    request = {
        "text": text,
        "src_lang": src_lang,
        "tgt_lang": tgt_lang,
        "api_name": "/translate",
    }
    try:
        return str(nllb_client.predict(**request)).strip()
    except Exception as exc:
        # Best-effort: report the failure and hand back a marked string so
        # the caller still receives one value per input.
        print(f"NLLB translation failed: {exc}")
        return f"Translation Error: {text}"
886
 
887
+
888
+ # --- BATCH TRANSLATION (NLLB ONLY) --- #
889
+
890
async def batch_translate(texts: list[str], tgt_lang: str, src_lang: str = "English") -> list[str]:
    """Translate each entry of ``texts`` with NLLB, preserving order and length.

    NLLB is queried one string at a time through
    ``call_nllb_for_translation``; each blocking call runs in a worker thread
    via ``asyncio.to_thread`` so the event loop stays free while it waits.
    A string that occurs more than once in ``texts`` is translated only once
    and the result is reused for its other occurrences.

    Args:
        texts: Strings to translate. Blank / whitespace-only entries are not
            sent to the model and come back as ``""``.
        tgt_lang: Target language, passed through to the NLLB helper.
        src_lang: Source language, passed through to the NLLB helper.

    Returns:
        A list with one entry per input, in input order.
    """
    if not texts:
        return []

    # Callers pass every OCR word / text field, so the same string can recur
    # many times; remember finished translations so each distinct string
    # costs a single remote call. Whatever the helper returned the first
    # time (including its error string) is reused for the repeats.
    seen: dict[str, str] = {}
    translations = []
    for text in texts:
        if not text.strip():
            translations.append("")
            continue
        if text not in seen:
            seen[text] = await asyncio.to_thread(
                call_nllb_for_translation, text, src_lang, tgt_lang
            )
        translations.append(seen[text])

    return translations
906
 
907
 
 
935
 
936
  try:
937
  def do_ocr() -> list[dict]:
938
+ client = HFClient("kevansoon/PaddleOCR")
939
  result = client.predict(
940
  img=handle_file(temp_filepath),
941
  lang="en",
 
952
  # --- TRANSLATION FUNCTIONS --- #
953
 
954
  async def translate_hocr_html_batched(hocr_html: str, target_language: str) -> str:
955
+ """Batch translate all hOCR words/lines at once with NLLB."""
956
  soup = BeautifulSoup(hocr_html, "html.parser")
957
  elements_to_translate = soup.find_all(class_="ocrx_word")
958
  if not elements_to_translate:
 
971
  async def translate_paddle_data_batched(
972
  paddle_data: list[dict], target_language: str
973
  ) -> list[dict]:
974
+ """Batch translate PaddleOCR text fields with NLLB."""
975
  original_texts = [item.get("text", "") for item in paddle_data]
976
  translated_texts = await batch_translate(original_texts, target_language)
977
 
 
1052
  async def translate_document_dual_ocr(
1053
  target_language: str = Form(...), file: UploadFile = File(...)
1054
  ):
1055
+ """Full dual OCR + translation pipeline with NLLB (Sea-Lion calls commented out)."""
1056
  if file.content_type not in ["image/png", "image/jpeg", "image/bmp", "image/tiff"]:
1057
  raise HTTPException(
1058
  status_code=400,
 
1076
  detail="Neither OCR engine could extract text.",
1077
  )
1078
 
1079
+ # Step 2: Translation (NLLB)
1080
  translated_hocr_task = translate_hocr_html_batched(hocr_html, target_language)
1081
  translated_paddle_task = translate_paddle_data_batched(
1082
  paddle_data, target_language
 
1101
 
1102
 
1103
 
 
1104
  # ----------------------------------End OF PYTESSERACT + PADDLEOCR workflow-----------------------------------