KevanSoon committed on
Commit
68fca2a
·
1 Parent(s): ea8fc4a

added pytesseract workflow

Browse files
Files changed (1) hide show
  1. app.py +240 -1
app.py CHANGED
@@ -7,6 +7,8 @@ import html
7
  import requests
8
  import httpx
9
  import uuid
 
 
10
  from fastapi import FastAPI, File, Form, UploadFile, HTTPException, Request, Header
11
  from fastapi.middleware.cors import CORSMiddleware
12
  from fastapi.responses import HTMLResponse
@@ -19,7 +21,7 @@ import google.generativeai as genai
19
  from google.api_core import exceptions as google_exceptions
20
  from pydantic import BaseModel
21
  from gradio_client import Client, handle_file
22
- import tempfile
23
 
24
  from auth.clerk import verify_clerk_jwt
25
  from tools.tools import (
@@ -848,3 +850,240 @@ async def get_user_documents(
848
  print(documents)
849
 
850
  return documents
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  import requests
8
  import httpx
9
  import uuid
10
+ import tempfile
11
+ from bs4 import BeautifulSoup
12
  from fastapi import FastAPI, File, Form, UploadFile, HTTPException, Request, Header
13
  from fastapi.middleware.cors import CORSMiddleware
14
  from fastapi.responses import HTMLResponse
 
21
  from google.api_core import exceptions as google_exceptions
22
  from pydantic import BaseModel
23
  from gradio_client import Client, handle_file
24
+
25
 
26
  from auth.clerk import verify_clerk_jwt
27
  from tools.tools import (
 
850
  print(documents)
851
 
852
  return documents
853
+
854
+
855
#----------------------------------Start OF PYTESSERACT workflow-----------------------------------

# --- START: New hOCR Functions ---

def parse_hocr_to_data(hocr_html: str) -> list[dict]:
    """
    Parses hOCR HTML output to extract text and bounding boxes.

    Args:
        hocr_html: A string containing the hOCR output from Tesseract.

    Returns:
        A list of dictionaries, where each dictionary has 'text' and 'box' keys,
        matching the format expected by the downstream pipeline.
    """
    soup = BeautifulSoup(hocr_html, 'html.parser')
    # Compile once, outside the per-word loop (loop-invariant work).
    bbox_pattern = re.compile(r'bbox (\d+) (\d+) (\d+) (\d+)')
    data = []
    # ocrx_word elements carry the most granular bounding-box info.
    for word in soup.find_all('span', class_='ocrx_word'):
        text = word.get_text().strip()
        if not text:
            continue

        # The bounding box is in the 'title' attribute, e.g. "bbox 123 456 789 1011".
        bbox_match = bbox_pattern.search(word.get('title', ''))
        if bbox_match:
            x1, y1, x2, y2 = map(int, bbox_match.groups())
            # Downstream expects four [x, y] corner coordinates.
            box = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
            data.append({'text': text, 'box': box})
    return data
889
+
890
async def ocr_and_parse_hocr(file_content: bytes) -> list[dict]:
    """
    Replaces extract_text_and_boxes_with_paddle.
    Performs OCR using Tesseract to get hOCR, then parses it into the
    pipeline's expected format.

    Args:
        file_content: The raw bytes of the image file.

    Returns:
        A list of dictionaries with text and bounding box data.

    Raises:
        HTTPException: 400 if the bytes cannot be decoded as an image.
    """
    try:
        image = Image.open(io.BytesIO(file_content))
    except Exception:
        raise HTTPException(status_code=400, detail="Cannot open image from bytes")

    loop = asyncio.get_running_loop()
    try:
        # pytesseract is blocking; run it in the default thread pool so the
        # event loop stays responsive.
        hocr_bytes = await loop.run_in_executor(
            None, lambda: pytesseract.image_to_pdf_or_hocr(image, extension='hocr')
        )
    finally:
        # BUGFIX: release the PIL image deterministically instead of leaking it
        # until garbage collection.
        image.close()
    hocr_html = hocr_bytes.decode('utf-8')

    # Parsing is CPU-bound too, so keep it off the event loop as well.
    return await loop.run_in_executor(None, parse_hocr_to_data, hocr_html)
917
+
918
+
919
+ # --- END: New hOCR Functions ---
920
+
921
+
922
async def translate_paddle_data_concurrently(
    paddle_data: list[dict], target_language: str
) -> list[dict]:
    """
    Translates the 'text' field of each item in the paddle_data list concurrently.

    Args:
        paddle_data: OCR items, each a dict with 'text' and 'box' keys.
        target_language: Language to translate each text fragment into.

    Returns:
        A new list with the original 'box' values and translated 'text' values,
        in the same order as the input.
    """
    url = "https://api.sea-lion.ai/v1/chat/completions"
    api_key = os.getenv("SEALION_API_KEY")
    if not api_key:
        # No key configured: degrade gracefully (same placeholder text as before)
        # instead of failing the whole request.
        return [
            {"text": f"{item['text']} (SEALION_API_KEY not set)", "box": item["box"]}
            for item in paddle_data
        ]

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    async def call_sealion_for_translation(client, text_to_translate, lang):
        """Helper to call the translation API for a single piece of text."""
        prompt = f'Translate the following phrase to {lang} and return ONLY the translated text without explanations or extra formatting:\n\n"{text_to_translate}"'
        payload = {
            "max_completion_tokens": 256,
            "messages": [{"role": "user", "content": prompt}],
            "model": "aisingapore/Gemma-SEA-LION-v3-9B-IT",
        }
        try:
            response = await client.post(
                url, headers=headers, json=payload, timeout=30.0
            )
            response.raise_for_status()
            response_json = response.json()
            return response_json["choices"][0]["message"]["content"].strip()
        except httpx.RequestError as e:
            return f"Translation Error: {e}"

    # BUGFIX/PERF: share ONE AsyncClient (one connection pool) across all
    # concurrent tasks instead of opening a brand-new client per word.
    async with httpx.AsyncClient() as client:
        translated_texts = await asyncio.gather(
            *(
                call_sealion_for_translation(client, item["text"], target_language)
                for item in paddle_data
            )
        )

    return [
        {"text": translated, "box": item["box"]}
        for translated, item in zip(translated_texts, paddle_data)
    ]
973
+
974
+
975
# Helper functions for HTML generation - assumed to exist
def wrap_words_with_spans(html_content):
    """Wrap generated HTML in the container div the front-end expects."""
    return "<div id='word-wrapper'>" + html_content + "</div>"
978
+
979
def inject_dropdown_script(html_content):
    """Append the interactive dropdown script block to the given HTML."""
    return html_content + "<script>/* Dropdown script here */</script>"
982
+
983
+
984
async def generate_html_from_paddle_data(translated_data: list[dict]) -> str:
    """
    Receives translated OCR data (text with coordinates) and uses Gemini
    to generate a layout-aware HTML document.

    Args:
        translated_data: Items with translated 'text' and original 'box'
            coordinates (four [x, y] corner points each).

    Returns:
        The generated HTML string, or a self-contained error page if any step
        fails (this function never raises to the caller).
    """
    try:
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")  # Updated model name

        # Serialize the items verbatim (non-ASCII preserved) so the model sees
        # exact translated text and coordinates.
        json_data_for_prompt = json.dumps(translated_data, indent=2, ensure_ascii=False)

        prompt = f"""
        You are an expert system specializing in converting structured OCR data into a well-formatted HTML document that preserves the original layout.
        **Your Task:**
        1. Analyze the following JSON array. Each object contains a `text` field (pre-translated) and a `box` field (four [x, y] coordinates of its bounding box).
        2. Use the `box` coordinates to understand the document's spatial structure.
        3. Reconstruct the visual layout using semantic HTML. Use `<table>` for grid-like data. Use `<h1>`, `<h2>`, `<p>` for headings and paragraphs.
        4. Do NOT use absolute positioning. Create a clean, flowing HTML structure.
        5. Your final output must ONLY be the raw HTML code. Do not add comments, markdown backticks, or any other explanatory text.
        **OCR Data to process:**
        ```json
        {json_data_for_prompt}
        ```
        """

        def do_request():
            """Synchronous function to be run in a separate thread."""
            response = model.generate_content(prompt)
            # A simple regex to strip markdown fences the model may add despite
            # instructions; falls back to the raw text when no fence is found.
            match = re.search(r"```html\n(.*?)\n```", response.text, re.DOTALL)
            raw_html = match.group(1).strip() if match else response.text.strip()
            # Reuse existing functions to make the HTML interactive.
            wrapped_html = wrap_words_with_spans(raw_html)
            final_html = inject_dropdown_script(wrapped_html)
            return final_html

        # Gemini SDK call is blocking; run it off the event loop.
        return await asyncio.to_thread(do_request)
    except Exception as e:
        # Deliberate catch-all: the endpoint always gets renderable HTML back.
        error_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
1029
+
1030
+
1031
# BUGFIX: removed `app = FastAPI()`. The module-level `app` is already created
# earlier in app.py (the existing routes such as get_user_documents are
# registered on it); rebinding it here would silently drop every previously
# registered endpoint and any middleware. The route below attaches to the
# original app instead.
1032
+
1033
@app.post("/api/translate_file_mvp", response_class=HTMLResponse)
async def translate_document_mvp(
    target_language: str = Form(...), file: UploadFile = File(...)
):
    """
    Processes a document using the Layout-Aware MVP pipeline:
    1. Tesseract hOCR extracts text and coordinates.
    2. Sea-Lion translates each text block concurrently.
    3. Gemini uses the translated text and original coordinates to generate layout-aware HTML.
    """
    content_type = file.content_type
    if content_type not in ["image/png", "image/jpeg", "image/bmp", "image/tiff"]:
        raise HTTPException(
            status_code=400,
            detail="Unsupported file type for MVP pipeline. Please use PNG, JPG, BMP or TIFF.",
        )

    try:
        file_content = await file.read()

        # === MVP STEP 1: Extract text and coordinates with Tesseract hOCR ===
        ocr_data = await ocr_and_parse_hocr(file_content)
        if not ocr_data:
            raise HTTPException(
                status_code=400,
                detail="Tesseract hOCR could not extract any text from the image.",
            )
        print(f"***** Step 1 Done: Extracted {len(ocr_data)} words ******")

        # === MVP STEP 2: Translate each text block concurrently ===
        translated_data = await translate_paddle_data_concurrently(
            ocr_data, target_language
        )
        print("***** Step 2 Done: Translated data ******")

        # === MVP STEP 3: Generate final, layout-aware HTML from Gemini ===
        final_html = await generate_html_from_paddle_data(translated_data)
        print("***** Step 3 Done: Generated HTML ******")
        return HTMLResponse(content=final_html)

    except HTTPException:
        # BUGFIX: the deliberate 400 responses raised above were previously
        # swallowed by the generic `except Exception` below and re-raised as a
        # misleading 500. Let them propagate unchanged.
        raise
    except httpx.HTTPStatusError as e:
        raise HTTPException(
            status_code=e.response.status_code,
            detail=f"Error from a downstream AI service: {e.response.text}",
        )
    except Exception as e:
        # Provide a more specific error for debugging
        import traceback
        traceback.print_exc()
        raise HTTPException(
            status_code=500,
            detail=f"An unexpected error occurred during MVP processing: {str(e)}",
        )
1087
+
1088
+
1089
+ #----------------------------------END OF PYTESSERACT workflow-----------------------------------