KevanSoon
committed on
Commit
·
9b421db
1
Parent(s):
9bab5c0
temp change to nllb
Browse files
app.py
CHANGED
|
@@ -831,7 +831,7 @@ async def get_user_documents(
|
|
| 831 |
|
| 832 |
# ----------------------------------Start OF PYTESSERACT workflow-----------------------------------
|
| 833 |
|
| 834 |
-
# --- SEA-LION API HELPER --- #
|
| 835 |
|
| 836 |
async def call_sealion_for_translation(prompt: str) -> str:
|
| 837 |
"""Send one prompt to Sea-Lion and return raw text output."""
|
|
@@ -849,7 +849,7 @@ async def call_sealion_for_translation(prompt: str) -> str:
|
|
| 849 |
payload = {
|
| 850 |
"max_completion_tokens": 2048,
|
| 851 |
"messages": [{"role": "user", "content": prompt}],
|
| 852 |
-
"model": "aisingapore/Llama-SEA-LION-v3-
|
| 853 |
}
|
| 854 |
|
| 855 |
async with httpx.AsyncClient() as client:
|
|
@@ -866,29 +866,42 @@ async def call_sealion_for_translation(prompt: str) -> str:
|
|
| 866 |
return f"Translation Error"
|
| 867 |
|
| 868 |
|
| 869 |
-
|
| 870 |
-
"""Batch texts into one request, return aligned translations as list."""
|
| 871 |
-
if not texts:
|
| 872 |
-
return []
|
| 873 |
|
| 874 |
-
|
| 875 |
-
[f"{i+1}. {t}" for i, t in enumerate(texts) if t.strip()]
|
| 876 |
-
)
|
| 877 |
-
prompt = f"""Translate the following texts to {lang}.
|
| 878 |
-
Return ONLY the translations as a numbered list, same order.
|
| 879 |
|
| 880 |
-
|
| 881 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 882 |
|
| 883 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 884 |
|
| 885 |
translations = []
|
| 886 |
-
for
|
| 887 |
-
if
|
| 888 |
-
translations.append(
|
| 889 |
-
|
| 890 |
-
|
| 891 |
-
|
|
|
|
|
|
|
|
|
|
| 892 |
return translations
|
| 893 |
|
| 894 |
|
|
@@ -922,7 +935,7 @@ async def extract_text_and_boxes_with_paddle(image_bytes: bytes) -> list[dict]:
|
|
| 922 |
|
| 923 |
try:
|
| 924 |
def do_ocr() -> list[dict]:
|
| 925 |
-
client =
|
| 926 |
result = client.predict(
|
| 927 |
img=handle_file(temp_filepath),
|
| 928 |
lang="en",
|
|
@@ -939,7 +952,7 @@ async def extract_text_and_boxes_with_paddle(image_bytes: bytes) -> list[dict]:
|
|
| 939 |
# --- TRANSLATION FUNCTIONS --- #
|
| 940 |
|
| 941 |
async def translate_hocr_html_batched(hocr_html: str, target_language: str) -> str:
|
| 942 |
-
"""Batch translate all hOCR words/lines at once."""
|
| 943 |
soup = BeautifulSoup(hocr_html, "html.parser")
|
| 944 |
elements_to_translate = soup.find_all(class_="ocrx_word")
|
| 945 |
if not elements_to_translate:
|
|
@@ -958,7 +971,7 @@ async def translate_hocr_html_batched(hocr_html: str, target_language: str) -> s
|
|
| 958 |
async def translate_paddle_data_batched(
|
| 959 |
paddle_data: list[dict], target_language: str
|
| 960 |
) -> list[dict]:
|
| 961 |
-
"""Batch translate PaddleOCR text fields
|
| 962 |
original_texts = [item.get("text", "") for item in paddle_data]
|
| 963 |
translated_texts = await batch_translate(original_texts, target_language)
|
| 964 |
|
|
@@ -1039,7 +1052,7 @@ async def generate_html_from_dual_ocr(
|
|
| 1039 |
async def translate_document_dual_ocr(
|
| 1040 |
target_language: str = Form(...), file: UploadFile = File(...)
|
| 1041 |
):
|
| 1042 |
-
"""Full dual OCR + translation pipeline with
|
| 1043 |
if file.content_type not in ["image/png", "image/jpeg", "image/bmp", "image/tiff"]:
|
| 1044 |
raise HTTPException(
|
| 1045 |
status_code=400,
|
|
@@ -1063,7 +1076,7 @@ async def translate_document_dual_ocr(
|
|
| 1063 |
detail="Neither OCR engine could extract text.",
|
| 1064 |
)
|
| 1065 |
|
| 1066 |
-
# Step 2:
|
| 1067 |
translated_hocr_task = translate_hocr_html_batched(hocr_html, target_language)
|
| 1068 |
translated_paddle_task = translate_paddle_data_batched(
|
| 1069 |
paddle_data, target_language
|
|
@@ -1088,5 +1101,4 @@ async def translate_document_dual_ocr(
|
|
| 1088 |
|
| 1089 |
|
| 1090 |
|
| 1091 |
-
|
| 1092 |
# ----------------------------------End OF PYTESSERACT + PADDLEOCR workflow-----------------------------------
|
|
|
|
| 831 |
|
| 832 |
# ----------------------------------Start OF PYTESSERACT workflow-----------------------------------
|
| 833 |
|
| 834 |
+
# --- SEA-LION API HELPER (kept, but not used) --- #
|
| 835 |
|
| 836 |
async def call_sealion_for_translation(prompt: str) -> str:
|
| 837 |
"""Send one prompt to Sea-Lion and return raw text output."""
|
|
|
|
| 849 |
payload = {
|
| 850 |
"max_completion_tokens": 2048,
|
| 851 |
"messages": [{"role": "user", "content": prompt}],
|
| 852 |
+
"model": "aisingapore/Llama-SEA-LION-v3.5-8B-R",
|
| 853 |
}
|
| 854 |
|
| 855 |
async with httpx.AsyncClient() as client:
|
|
|
|
| 866 |
return f"Translation Error"
|
| 867 |
|
| 868 |
|
| 869 |
+
# --- NLLB HELPER --- #
|
|
|
|
|
|
|
|
|
|
| 870 |
|
| 871 |
+
# Shared client for the "UNESCO/nllb" Gradio app; call_nllb_for_translation
# sends its /translate requests through this single instance.
# NOTE(review): GradioClient is presumably an alias for gradio_client.Client,
# and constructing it likely contacts the remote app when this statement runs
# -- confirm against the file's imports.
nllb_client = GradioClient("UNESCO/nllb")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 872 |
|
| 873 |
+
def call_nllb_for_translation(text: str, src_lang: str, tgt_lang: str) -> str:
    """Translate ``text`` through the shared NLLB Gradio client.

    The request goes to the client's ``/translate`` endpoint. This function is
    synchronous and blocks until the client returns.

    Args:
        text: The text to translate.
        src_lang: Source language, forwarded unchanged to the endpoint.
        tgt_lang: Target language, forwarded unchanged to the endpoint.

    Returns:
        The endpoint's reply converted to ``str`` with surrounding whitespace
        removed. If the call raises, the failure is printed and the string
        ``"Translation Error: <text>"`` is returned instead.
    """
    request = {
        "text": text,
        "src_lang": src_lang,
        "tgt_lang": tgt_lang,
        "api_name": "/translate",
    }
    try:
        return str(nllb_client.predict(**request)).strip()
    except Exception as exc:
        # Deliberate best-effort: a failed call must not abort the caller, so
        # report it and fall through to the placeholder result below.
        print(f"NLLB translation failed: {exc}")
    return f"Translation Error: {text}"
|
| 886 |
|
| 887 |
+
|
| 888 |
+
# --- BATCH TRANSLATION (NLLB ONLY) --- #
|
| 889 |
+
|
| 890 |
+
async def batch_translate(texts: list[str], tgt_lang: str, src_lang: str = "English") -> list[str]:
    """Translate every entry of ``texts`` with NLLB, preserving order.

    Entries are translated one at a time. Each call to the blocking
    ``call_nllb_for_translation`` helper is run via ``asyncio.to_thread`` so
    the event loop stays responsive while waiting.

    Args:
        texts: Texts to translate. Entries that are ``None``, empty, or
            whitespace-only are not sent for translation.
        tgt_lang: Target language, forwarded to the NLLB helper.
        src_lang: Source language, forwarded to the NLLB helper.

    Returns:
        A list with the same length and order as ``texts``. Skipped entries
        map to ``""``; all other entries map to whatever the NLLB helper
        returned for them.
    """
    if not texts:
        return []

    translations = []
    for text in texts:
        # A None entry (e.g. a null "text" field in OCR output) is treated as
        # blank instead of raising AttributeError on .strip().
        if not (text and text.strip()):
            translations.append("")
            continue
        translated = await asyncio.to_thread(
            call_nllb_for_translation, text, src_lang, tgt_lang
        )
        translations.append(translated)

    return translations
|
| 906 |
|
| 907 |
|
|
|
|
| 935 |
|
| 936 |
try:
|
| 937 |
def do_ocr() -> list[dict]:
|
| 938 |
+
client = HFClient("kevansoon/PaddleOCR")
|
| 939 |
result = client.predict(
|
| 940 |
img=handle_file(temp_filepath),
|
| 941 |
lang="en",
|
|
|
|
| 952 |
# --- TRANSLATION FUNCTIONS --- #
|
| 953 |
|
| 954 |
async def translate_hocr_html_batched(hocr_html: str, target_language: str) -> str:
|
| 955 |
+
"""Batch translate all hOCR words/lines at once with NLLB."""
|
| 956 |
soup = BeautifulSoup(hocr_html, "html.parser")
|
| 957 |
elements_to_translate = soup.find_all(class_="ocrx_word")
|
| 958 |
if not elements_to_translate:
|
|
|
|
| 971 |
async def translate_paddle_data_batched(
|
| 972 |
paddle_data: list[dict], target_language: str
|
| 973 |
) -> list[dict]:
|
| 974 |
+
"""Batch translate PaddleOCR text fields with NLLB."""
|
| 975 |
original_texts = [item.get("text", "") for item in paddle_data]
|
| 976 |
translated_texts = await batch_translate(original_texts, target_language)
|
| 977 |
|
|
|
|
| 1052 |
async def translate_document_dual_ocr(
|
| 1053 |
target_language: str = Form(...), file: UploadFile = File(...)
|
| 1054 |
):
|
| 1055 |
+
"""Full dual OCR + translation pipeline with NLLB (Sea-Lion calls commented out)."""
|
| 1056 |
if file.content_type not in ["image/png", "image/jpeg", "image/bmp", "image/tiff"]:
|
| 1057 |
raise HTTPException(
|
| 1058 |
status_code=400,
|
|
|
|
| 1076 |
detail="Neither OCR engine could extract text.",
|
| 1077 |
)
|
| 1078 |
|
| 1079 |
+
# Step 2: Translation (NLLB)
|
| 1080 |
translated_hocr_task = translate_hocr_html_batched(hocr_html, target_language)
|
| 1081 |
translated_paddle_task = translate_paddle_data_batched(
|
| 1082 |
paddle_data, target_language
|
|
|
|
| 1101 |
|
| 1102 |
|
| 1103 |
|
|
|
|
| 1104 |
# ----------------------------------End OF PYTESSERACT + PADDLEOCR workflow-----------------------------------
|