KevanSoon committed on
Commit ·
6e28bd0
1
Parent(s): 820a928
adjusted pytesseract workflow
Browse files
app.py
CHANGED
|
@@ -920,7 +920,7 @@ async def ocr_and_parse_hocr(file_content: bytes) -> list[dict]:
|
|
| 920 |
# --- END: New hOCR Functions ---
|
| 921 |
|
| 922 |
|
| 923 |
-
async def
|
| 924 |
paddle_data: list[dict], target_language: str
|
| 925 |
) -> list[dict]:
|
| 926 |
"""
|
|
@@ -982,7 +982,7 @@ def inject_dropdown_script(html_content):
|
|
| 982 |
return f"{html_content}{script}"
|
| 983 |
|
| 984 |
|
| 985 |
-
async def
|
| 986 |
"""
|
| 987 |
Receives translated OCR data (text with coordinates) and uses Gemini
|
| 988 |
to generate a layout-aware HTML document.
|
|
@@ -1062,13 +1062,13 @@ async def translate_document_mvp(
|
|
| 1062 |
print(f"***** Step 1 Done: Extracted {len(ocr_data)} words ******")
|
| 1063 |
|
| 1064 |
# === MVP STEP 2: Translate each text block concurrently ===
|
| 1065 |
-
translated_data = await
|
| 1066 |
ocr_data, target_language
|
| 1067 |
)
|
| 1068 |
print("***** Step 2 Done: Translated data ******")
|
| 1069 |
|
| 1070 |
# === MVP STEP 3: Generate final, layout-aware HTML from Gemini ===
|
| 1071 |
-
final_html = await
|
| 1072 |
print("***** Step 3 Done: Generated HTML ******")
|
| 1073 |
return HTMLResponse(content=final_html)
|
| 1074 |
|
|
|
|
| 920 |
# --- END: New hOCR Functions ---
|
| 921 |
|
| 922 |
|
| 923 |
+
async def translate_tesseract_data_concurrently(
|
| 924 |
paddle_data: list[dict], target_language: str
|
| 925 |
) -> list[dict]:
|
| 926 |
"""
|
|
|
|
| 982 |
return f"{html_content}{script}"
|
| 983 |
|
| 984 |
|
| 985 |
+
async def generate_html_from_tesseract_data(translated_data: list[dict]) -> str:
|
| 986 |
"""
|
| 987 |
Receives translated OCR data (text with coordinates) and uses Gemini
|
| 988 |
to generate a layout-aware HTML document.
|
|
|
|
| 1062 |
print(f"***** Step 1 Done: Extracted {len(ocr_data)} words ******")
|
| 1063 |
|
| 1064 |
# === MVP STEP 2: Translate each text block concurrently ===
|
| 1065 |
+
translated_data = await translate_tesseract_data_concurrently(
|
| 1066 |
ocr_data, target_language
|
| 1067 |
)
|
| 1068 |
print("***** Step 2 Done: Translated data ******")
|
| 1069 |
|
| 1070 |
# === MVP STEP 3: Generate final, layout-aware HTML from Gemini ===
|
| 1071 |
+
final_html = await generate_html_from_tesseract_data(translated_data)
|
| 1072 |
print("***** Step 3 Done: Generated HTML ******")
|
| 1073 |
return HTMLResponse(content=final_html)
|
| 1074 |
|