KevanSoon
committed on
Commit
·
68fca2a
1
Parent(s):
ea8fc4a
added pytesseract workflow
Browse files
app.py
CHANGED
|
@@ -7,6 +7,8 @@ import html
|
|
| 7 |
import requests
|
| 8 |
import httpx
|
| 9 |
import uuid
|
|
|
|
|
|
|
| 10 |
from fastapi import FastAPI, File, Form, UploadFile, HTTPException, Request, Header
|
| 11 |
from fastapi.middleware.cors import CORSMiddleware
|
| 12 |
from fastapi.responses import HTMLResponse
|
|
@@ -19,7 +21,7 @@ import google.generativeai as genai
|
|
| 19 |
from google.api_core import exceptions as google_exceptions
|
| 20 |
from pydantic import BaseModel
|
| 21 |
from gradio_client import Client, handle_file
|
| 22 |
-
|
| 23 |
|
| 24 |
from auth.clerk import verify_clerk_jwt
|
| 25 |
from tools.tools import (
|
|
@@ -848,3 +850,240 @@ async def get_user_documents(
|
|
| 848 |
print(documents)
|
| 849 |
|
| 850 |
return documents
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
import requests
|
| 8 |
import httpx
|
| 9 |
import uuid
|
| 10 |
+
import tempfile
|
| 11 |
+
from bs4 import BeautifulSoup
|
| 12 |
from fastapi import FastAPI, File, Form, UploadFile, HTTPException, Request, Header
|
| 13 |
from fastapi.middleware.cors import CORSMiddleware
|
| 14 |
from fastapi.responses import HTMLResponse
|
|
|
|
| 21 |
from google.api_core import exceptions as google_exceptions
|
| 22 |
from pydantic import BaseModel
|
| 23 |
from gradio_client import Client, handle_file
|
| 24 |
+
|
| 25 |
|
| 26 |
from auth.clerk import verify_clerk_jwt
|
| 27 |
from tools.tools import (
|
|
|
|
| 850 |
print(documents)
|
| 851 |
|
| 852 |
return documents
|
| 853 |
+
|
| 854 |
+
|
| 855 |
+
#----------------------------------Start OF PYTESSERACT workflow-----------------------------------
|
| 856 |
+
|
| 857 |
+
# --- START: New hOCR Functions ---
|
| 858 |
+
|
| 859 |
+
def parse_hocr_to_data(hocr_html: str) -> list[dict]:
    """
    Extract word-level text and bounding boxes from Tesseract hOCR output.

    Args:
        hocr_html: hOCR HTML string produced by Tesseract.

    Returns:
        A list of dictionaries, each with a 'text' key and a 'box' key
        (four [x, y] corner points), matching the format expected by the
        downstream translation/layout pipeline.
    """
    soup = BeautifulSoup(hocr_html, 'html.parser')
    results = []
    # ocrx_word spans carry the most granular bounding-box information.
    for word_span in soup.find_all('span', class_='ocrx_word'):
        word_text = word_span.get_text().strip()
        if not word_text:
            continue
        # Coordinates live in the title attribute, e.g. "bbox 123 456 789 1011".
        match = re.search(r'bbox (\d+) (\d+) (\d+) (\d+)', word_span.get('title', ''))
        if match is None:
            # Words without a parsable bbox are dropped, as in the original flow.
            continue
        x1, y1, x2, y2 = (int(coord) for coord in match.groups())
        results.append({
            'text': word_text,
            # Four corner points, clockwise from top-left.
            'box': [[x1, y1], [x2, y1], [x2, y2], [x1, y2]],
        })
    return results
|
| 889 |
+
|
| 890 |
+
async def ocr_and_parse_hocr(file_content: bytes) -> list[dict]:
    """
    Replaces extract_text_and_boxes_with_paddle.
    Performs OCR using Tesseract to get hOCR, then parses it into the
    pipeline's expected format.

    Args:
        file_content: The raw bytes of the image file.

    Returns:
        A list of {'text', 'box'} dictionaries (see parse_hocr_to_data).

    Raises:
        HTTPException: 400 if the bytes cannot be opened as an image.
    """
    try:
        image = Image.open(io.BytesIO(file_content))
    except Exception:
        raise HTTPException(status_code=400, detail="Cannot open image from bytes")

    # Tesseract is blocking and CPU-bound; run it off the event loop.
    # asyncio.to_thread is used instead of loop.run_in_executor for
    # consistency with the Gemini call elsewhere in this workflow.
    hocr_bytes = await asyncio.to_thread(
        pytesseract.image_to_pdf_or_hocr, image, extension='hocr'
    )
    hocr_html = hocr_bytes.decode('utf-8')

    # hOCR parsing is also CPU-bound, so keep it off the loop as well.
    return await asyncio.to_thread(parse_hocr_to_data, hocr_html)
|
| 917 |
+
|
| 918 |
+
|
| 919 |
+
# --- END: New hOCR Functions ---
|
| 920 |
+
|
| 921 |
+
|
| 922 |
+
async def translate_paddle_data_concurrently(
    paddle_data: list[dict], target_language: str
) -> list[dict]:
    """
    Translates the 'text' field of each item in the paddle_data list
    concurrently via the Sea-Lion chat-completions API, preserving each
    item's bounding box.

    Args:
        paddle_data: List of {'text', 'box'} dicts from the OCR step.
        target_language: Language to translate each text fragment into.

    Returns:
        A new list of {'text': translated, 'box': original} dicts, in the
        same order as the input.
    """
    # Hoisted out of the per-item helper: these are loop-invariant.
    url = "https://api.sea-lion.ai/v1/chat/completions"
    api_key = os.getenv("SEALION_API_KEY")

    async def call_sealion_for_translation(
        client: httpx.AsyncClient, text_to_translate: str, lang: str
    ) -> str:
        """Helper function to call the translation API for a single piece of text."""
        if not api_key:
            # In a real scenario, handle this gracefully
            return f"{text_to_translate} (SEALION_API_KEY not set)"
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        prompt = f'Translate the following phrase to {lang} and return ONLY the translated text without explanations or extra formatting:\n\n"{text_to_translate}"'
        payload = {
            "max_completion_tokens": 256,
            "messages": [{"role": "user", "content": prompt}],
            "model": "aisingapore/Gemma-SEA-LION-v3-9B-IT",
        }
        try:
            response = await client.post(
                url, headers=headers, json=payload, timeout=30.0
            )
            response.raise_for_status()
            response_json = response.json()
            return response_json["choices"][0]["message"]["content"].strip()
        except httpx.RequestError as e:
            return f"Translation Error: {e}"

    # PERF FIX: the original opened a fresh AsyncClient per text fragment.
    # Share one client (and its connection pool) across all concurrent calls.
    async with httpx.AsyncClient() as client:
        translated_texts = await asyncio.gather(
            *(
                call_sealion_for_translation(client, item["text"], target_language)
                for item in paddle_data
            )
        )

    return [
        {"text": new_text, "box": item["box"]}
        for new_text, item in zip(translated_texts, paddle_data)
    ]
|
| 973 |
+
|
| 974 |
+
|
| 975 |
+
# Helper functions for HTML generation - assumed to exist
|
| 976 |
+
def wrap_words_with_spans(html_content):
    """Wrap generated HTML in the container div targeted by the front-end script."""
    return "<div id='word-wrapper'>" + html_content + "</div>"
|
| 978 |
+
|
| 979 |
+
def inject_dropdown_script(html_content):
    """Append the (placeholder) dropdown <script> block after the given HTML."""
    return html_content + "<script>/* Dropdown script here */</script>"
|
| 982 |
+
|
| 983 |
+
|
| 984 |
+
async def generate_html_from_paddle_data(translated_data: list[dict]) -> str:
    """
    Receives translated OCR data (text with coordinates) and uses Gemini
    to generate a layout-aware HTML document.
    (This function remains unchanged as its input format is still valid)

    Args:
        translated_data: List of {'text', 'box'} dicts after translation.

    Returns:
        The final interactive HTML string, or a minimal HTML error page if
        generation fails for any reason (this function never raises).
    """
    try:
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")  # Updated model name

        # ensure_ascii=False keeps translated (non-Latin) text readable in the prompt.
        json_data_for_prompt = json.dumps(translated_data, indent=2, ensure_ascii=False)

        prompt = f"""
You are an expert system specializing in converting structured OCR data into a well-formatted HTML document that preserves the original layout.
**Your Task:**
1. Analyze the following JSON array. Each object contains a `text` field (pre-translated) and a `box` field (four [x, y] coordinates of its bounding box).
2. Use the `box` coordinates to understand the document's spatial structure.
3. Reconstruct the visual layout using semantic HTML. Use `<table>` for grid-like data. Use `<h1>`, `<h2>`, `<p>` for headings and paragraphs.
4. Do NOT use absolute positioning. Create a clean, flowing HTML structure.
5. Your final output must ONLY be the raw HTML code. Do not add comments, markdown backticks, or any other explanatory text.
**OCR Data to process:**
```json
{json_data_for_prompt}
```
"""

        def do_request():
            """Synchronous function to be run in a separate thread."""
            response = model.generate_content(prompt)
            # A simple regex to strip markdown, might need adjustment
            match = re.search(r"```html\n(.*?)\n```", response.text, re.DOTALL)
            raw_html = match.group(1).strip() if match else response.text.strip()
            # Reuse existing functions to make the HTML interactive
            wrapped_html = wrap_words_with_spans(raw_html)
            final_html = inject_dropdown_script(wrapped_html)
            return final_html

        # Gemini SDK calls are blocking; run in a worker thread.
        return await asyncio.to_thread(do_request)
    except Exception as e:
        # Fail soft: callers always get renderable HTML, even on error.
        error_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
|
| 1029 |
+
|
| 1030 |
+
|
| 1031 |
+
# BUGFIX: do NOT re-instantiate FastAPI here. `app` is already created near
# the top of this module (the routes defined above, e.g. the user-documents
# endpoint, are registered on it). Rebinding `app = FastAPI()` at this point
# would silently drop every previously registered route and the CORS
# middleware; the @app.post decorator below must attach to the existing app.
|
| 1032 |
+
|
| 1033 |
+
@app.post("/api/translate_file_mvp", response_class=HTMLResponse)
async def translate_document_mvp(
    target_language: str = Form(...), file: UploadFile = File(...)
):
    """
    Processes a document using the Layout-Aware MVP pipeline:
    1. Tesseract hOCR extracts text and coordinates.
    2. Sea-Lion translates each text block concurrently.
    3. Gemini uses the translated text and original coordinates to generate layout-aware HTML.

    Raises:
        HTTPException: 400 for unsupported/unreadable files or empty OCR
            output, the downstream status code for AI-service errors, and
            500 for anything unexpected.
    """
    content_type = file.content_type
    if content_type not in ["image/png", "image/jpeg", "image/bmp", "image/tiff"]:
        raise HTTPException(
            status_code=400,
            detail="Unsupported file type for MVP pipeline. Please use PNG, JPG, BMP or TIFF.",
        )

    try:
        file_content = await file.read()

        # === MVP STEP 1: Extract text and coordinates with Tesseract hOCR ===
        ocr_data = await ocr_and_parse_hocr(file_content)
        if not ocr_data:
            raise HTTPException(
                status_code=400,
                detail="Tesseract hOCR could not extract any text from the image.",
            )
        print(f"***** Step 1 Done: Extracted {len(ocr_data)} words ******")

        # === MVP STEP 2: Translate each text block concurrently ===
        translated_data = await translate_paddle_data_concurrently(
            ocr_data, target_language
        )
        print("***** Step 2 Done: Translated data ******")

        # === MVP STEP 3: Generate final, layout-aware HTML from Gemini ===
        final_html = await generate_html_from_paddle_data(translated_data)
        print("***** Step 3 Done: Generated HTML ******")
        return HTMLResponse(content=final_html)

    except HTTPException:
        # BUGFIX: HTTPExceptions raised inside the try block (the 400 for
        # empty OCR output, and the 400 from ocr_and_parse_hocr) were
        # previously caught by the generic `except Exception` below and
        # re-reported as 500s. Re-raise them unchanged.
        raise
    except httpx.HTTPStatusError as e:
        raise HTTPException(
            status_code=e.response.status_code,
            detail=f"Error from a downstream AI service: {e.response.text}",
        )
    except Exception as e:
        # Provide a more specific error for debugging
        import traceback
        traceback.print_exc()
        raise HTTPException(
            status_code=500,
            detail=f"An unexpected error occurred during MVP processing: {str(e)}",
        )
|
| 1087 |
+
|
| 1088 |
+
|
| 1089 |
+
#----------------------------------END OF PYTESSERACT workflow-----------------------------------
|