KevanSoon committed on
Commit
68fca2a
·
1 Parent(s): ea8fc4a

added pytesseract workflow

Browse files
Files changed (1) hide show
  1. app.py +240 -1
app.py CHANGED
@@ -7,6 +7,8 @@ import html
7
  import requests
8
  import httpx
9
  import uuid
 
 
10
  from fastapi import FastAPI, File, Form, UploadFile, HTTPException, Request, Header
11
  from fastapi.middleware.cors import CORSMiddleware
12
  from fastapi.responses import HTMLResponse
@@ -19,7 +21,7 @@ import google.generativeai as genai
19
  from google.api_core import exceptions as google_exceptions
20
  from pydantic import BaseModel
21
  from gradio_client import Client, handle_file
22
- import tempfile
23
 
24
  from auth.clerk import verify_clerk_jwt
25
  from tools.tools import (
@@ -848,3 +850,240 @@ async def get_user_documents(
848
  print(documents)
849
 
850
  return documents
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  import requests
8
  import httpx
9
  import uuid
10
+ import tempfile
11
+ from bs4 import BeautifulSoup
12
  from fastapi import FastAPI, File, Form, UploadFile, HTTPException, Request, Header
13
  from fastapi.middleware.cors import CORSMiddleware
14
  from fastapi.responses import HTMLResponse
 
21
  from google.api_core import exceptions as google_exceptions
22
  from pydantic import BaseModel
23
  from gradio_client import Client, handle_file
24
+
25
 
26
  from auth.clerk import verify_clerk_jwt
27
  from tools.tools import (
 
850
  print(documents)
851
 
852
  return documents
853
+
854
+
855
#----------------------------------Start OF PYTESSERACT workflow-----------------------------------

# --- START: New hOCR Functions ---

def parse_hocr_to_data(hocr_html: str) -> list[dict]:
    """
    Parses hOCR HTML output to extract text and bounding boxes.

    Args:
        hocr_html: A string containing the hOCR output from Tesseract.

    Returns:
        A list of dictionaries, where each dictionary has 'text' and 'box' keys,
        matching the format expected by the downstream pipeline.
    """
    soup = BeautifulSoup(hocr_html, 'html.parser')
    # Compile once, outside the per-word loop (loop-invariant work).
    bbox_pattern = re.compile(r'bbox (\d+) (\d+) (\d+) (\d+)')
    data = []
    # ocrx_word elements carry the most granular bounding-box info.
    for word in soup.find_all('span', class_='ocrx_word'):
        text = word.get_text().strip()
        if not text:
            continue

        # The bounding box is in the 'title' attribute, e.g. "bbox 123 456 789 1011".
        bbox_match = bbox_pattern.search(word.get('title', ''))
        if bbox_match:
            x1, y1, x2, y2 = map(int, bbox_match.groups())
            # Downstream expects four [x, y] corner coordinates.
            box = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
            data.append({'text': text, 'box': box})
    return data
889
+
890
async def ocr_and_parse_hocr(file_content: bytes) -> list[dict]:
    """
    Replaces extract_text_and_boxes_with_paddle.
    Performs OCR using Tesseract to get hOCR, then parses it into the
    pipeline's expected format.

    Args:
        file_content: The raw bytes of the image file.

    Returns:
        A list of dictionaries with text and bounding box data.

    Raises:
        HTTPException: 400 if the bytes cannot be decoded as an image.
    """
    try:
        image = Image.open(io.BytesIO(file_content))
    except Exception:
        raise HTTPException(status_code=400, detail="Cannot open image from bytes")

    loop = asyncio.get_running_loop()
    try:
        # pytesseract is blocking; run it in the default thread pool so the
        # event loop stays responsive.
        hocr_bytes = await loop.run_in_executor(
            None, lambda: pytesseract.image_to_pdf_or_hocr(image, extension='hocr')
        )
    finally:
        # BUGFIX: release the PIL image deterministically instead of leaking it
        # until garbage collection.
        image.close()
    hocr_html = hocr_bytes.decode('utf-8')

    # Parsing is CPU-bound too, so keep it off the event loop as well.
    return await loop.run_in_executor(None, parse_hocr_to_data, hocr_html)
917
+
918
+
919
+ # --- END: New hOCR Functions ---
920
+
921
+
922
async def translate_paddle_data_concurrently(
    paddle_data: list[dict], target_language: str
) -> list[dict]:
    """
    Translates the 'text' field of each item in the paddle_data list concurrently.

    Args:
        paddle_data: OCR items, each a dict with 'text' and 'box' keys.
        target_language: Language to translate each text fragment into.

    Returns:
        A new list with the original 'box' values and translated 'text' values,
        in the same order as the input.
    """
    url = "https://api.sea-lion.ai/v1/chat/completions"
    api_key = os.getenv("SEALION_API_KEY")
    if not api_key:
        # No key configured: degrade gracefully (same placeholder text as before)
        # instead of failing the whole request.
        return [
            {"text": f"{item['text']} (SEALION_API_KEY not set)", "box": item["box"]}
            for item in paddle_data
        ]

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    async def call_sealion_for_translation(client, text_to_translate, lang):
        """Helper to call the translation API for a single piece of text."""
        prompt = f'Translate the following phrase to {lang} and return ONLY the translated text without explanations or extra formatting:\n\n"{text_to_translate}"'
        payload = {
            "max_completion_tokens": 256,
            "messages": [{"role": "user", "content": prompt}],
            "model": "aisingapore/Gemma-SEA-LION-v3-9B-IT",
        }
        try:
            response = await client.post(
                url, headers=headers, json=payload, timeout=30.0
            )
            response.raise_for_status()
            response_json = response.json()
            return response_json["choices"][0]["message"]["content"].strip()
        except httpx.RequestError as e:
            return f"Translation Error: {e}"

    # BUGFIX/PERF: share ONE AsyncClient (one connection pool) across all
    # concurrent tasks instead of opening a brand-new client per word.
    async with httpx.AsyncClient() as client:
        translated_texts = await asyncio.gather(
            *(
                call_sealion_for_translation(client, item["text"], target_language)
                for item in paddle_data
            )
        )

    return [
        {"text": translated, "box": item["box"]}
        for translated, item in zip(translated_texts, paddle_data)
    ]
973
+
974
+
975
# Helper functions for HTML generation - assumed to exist
def wrap_words_with_spans(html_content):
    """Wrap generated HTML in the container div the front-end expects."""
    return "<div id='word-wrapper'>" + html_content + "</div>"
978
+
979
def inject_dropdown_script(html_content):
    """Append the interactive dropdown script block to the given HTML."""
    return html_content + "<script>/* Dropdown script here */</script>"
982
+
983
+
984
async def generate_html_from_paddle_data(translated_data: list[dict]) -> str:
    """
    Receives translated OCR data (text with coordinates) and uses Gemini
    to generate a layout-aware HTML document.

    Args:
        translated_data: Items with translated 'text' and original 'box'
            coordinates (four [x, y] corner points each).

    Returns:
        The generated HTML string, or a self-contained error page if any step
        fails (this function never raises to the caller).
    """
    try:
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")  # Updated model name

        # Serialize the items verbatim (non-ASCII preserved) so the model sees
        # exact translated text and coordinates.
        json_data_for_prompt = json.dumps(translated_data, indent=2, ensure_ascii=False)

        prompt = f"""
        You are an expert system specializing in converting structured OCR data into a well-formatted HTML document that preserves the original layout.
        **Your Task:**
        1. Analyze the following JSON array. Each object contains a `text` field (pre-translated) and a `box` field (four [x, y] coordinates of its bounding box).
        2. Use the `box` coordinates to understand the document's spatial structure.
        3. Reconstruct the visual layout using semantic HTML. Use `<table>` for grid-like data. Use `<h1>`, `<h2>`, `<p>` for headings and paragraphs.
        4. Do NOT use absolute positioning. Create a clean, flowing HTML structure.
        5. Your final output must ONLY be the raw HTML code. Do not add comments, markdown backticks, or any other explanatory text.
        **OCR Data to process:**
        ```json
        {json_data_for_prompt}
        ```
        """

        def do_request():
            """Synchronous function to be run in a separate thread."""
            response = model.generate_content(prompt)
            # A simple regex to strip markdown fences the model may add despite
            # instructions; falls back to the raw text when no fence is found.
            match = re.search(r"```html\n(.*?)\n```", response.text, re.DOTALL)
            raw_html = match.group(1).strip() if match else response.text.strip()
            # Reuse existing functions to make the HTML interactive.
            wrapped_html = wrap_words_with_spans(raw_html)
            final_html = inject_dropdown_script(wrapped_html)
            return final_html

        # Gemini SDK call is blocking; run it off the event loop.
        return await asyncio.to_thread(do_request)
    except Exception as e:
        # Deliberate catch-all: the endpoint always gets renderable HTML back.
        error_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
1029
+
1030
+
1031
# BUGFIX: removed `app = FastAPI()`. The module-level `app` is already created
# earlier in app.py (the existing routes such as get_user_documents are
# registered on it); rebinding it here would silently drop every previously
# registered endpoint and any middleware. The route below attaches to the
# original app instead.
1032
+
1033
@app.post("/api/translate_file_mvp", response_class=HTMLResponse)
async def translate_document_mvp(
    target_language: str = Form(...), file: UploadFile = File(...)
):
    """
    Processes a document using the Layout-Aware MVP pipeline:
    1. Tesseract hOCR extracts text and coordinates.
    2. Sea-Lion translates each text block concurrently.
    3. Gemini uses the translated text and original coordinates to generate layout-aware HTML.
    """
    content_type = file.content_type
    if content_type not in ["image/png", "image/jpeg", "image/bmp", "image/tiff"]:
        raise HTTPException(
            status_code=400,
            detail="Unsupported file type for MVP pipeline. Please use PNG, JPG, BMP or TIFF.",
        )

    try:
        file_content = await file.read()

        # === MVP STEP 1: Extract text and coordinates with Tesseract hOCR ===
        ocr_data = await ocr_and_parse_hocr(file_content)
        if not ocr_data:
            raise HTTPException(
                status_code=400,
                detail="Tesseract hOCR could not extract any text from the image.",
            )
        print(f"***** Step 1 Done: Extracted {len(ocr_data)} words ******")

        # === MVP STEP 2: Translate each text block concurrently ===
        translated_data = await translate_paddle_data_concurrently(
            ocr_data, target_language
        )
        print("***** Step 2 Done: Translated data ******")

        # === MVP STEP 3: Generate final, layout-aware HTML from Gemini ===
        final_html = await generate_html_from_paddle_data(translated_data)
        print("***** Step 3 Done: Generated HTML ******")
        return HTMLResponse(content=final_html)

    except HTTPException:
        # BUGFIX: the deliberate 400 responses raised above were previously
        # swallowed by the generic `except Exception` below and re-raised as a
        # misleading 500. Let them propagate unchanged.
        raise
    except httpx.HTTPStatusError as e:
        raise HTTPException(
            status_code=e.response.status_code,
            detail=f"Error from a downstream AI service: {e.response.text}",
        )
    except Exception as e:
        # Provide a more specific error for debugging
        import traceback
        traceback.print_exc()
        raise HTTPException(
            status_code=500,
            detail=f"An unexpected error occurred during MVP processing: {str(e)}",
        )
1087
+
1088
+
1089
+ #----------------------------------END OF PYTESSERACT workflow-----------------------------------