Spaces:

kevansoon
/

backend

Sleeping

App Files Files Community

KevanSoon committed on Aug 19, 2025

Commit

1a92019

1 Parent(s): 6ac8032

paddle OCR only endpoint

Browse files

Files changed (1) hide show

app.py +173 -106

app.py CHANGED Viewed

@@ -620,114 +620,179 @@ async def translate_document_dual_ocr(
 #-------------------------- start of updated gemini workflow ----------------------------------
-# --- OCR EXTRACTION FUNCTION (Tesseract only) ---
-async def get_hocr_from_image(image_bytes: bytes) -> str:
     """
-    Performs OCR using Tesseract to get raw hOCR HTML output.
-    This function accepts image bytes.
     """
-    if not image_bytes:
-        raise ValueError("Image bytes cannot be empty.")
     try:
-        image = Image.open(io.BytesIO(image_bytes))
     except Exception as e:
-        raise HTTPException(
-            status_code=400,
-            detail=f"Cannot open image for Tesseract. It may be corrupted or unsupported. Error: {e}",
-        )
-    # Run Tesseract OCR in a thread to avoid blocking the asyncio event loop
-    loop = asyncio.get_running_loop()
-    hocr_bytes = await loop.run_in_executor(
-        None, lambda: pytesseract.image_to_pdf_or_hocr(image, extension="hocr")
-    )
-    return hocr_bytes.decode("utf-8")
-# --- FINAL HTML GENERATION (GEMINI) ---
-async def generate_final_html_from_hocr_with_gemini(
-    hocr_html: str, target_language: str
-) -> str:
     """
-    Receives raw hOCR data, sends it to Gemini for translation, and asks Gemini
-    to generate a final, layout-aware HTML document.
     """
     try:
         api_key = os.getenv("GEMINI_API_KEY")
         if not api_key:
             raise ValueError("GEMINI_API_KEY not found in environment variables.")
-        # This would be where you configure your generative AI library
-        # genai.configure(api_key=api_key)
-        # model = genai.GenerativeModel(model_name="gemini-1.5-flash") # Using Flash for speed
         prompt = f"""
-                You are an expert web developer and translator. Your task is to take raw hOCR input,
-                translate all the text within it to {target_language}, and then generate a single,
-                clean, and well-styled HTML document that visually represents the original document layout.
-                Input: Raw hOCR HTML
-                --- HOCR START ---
-                {hocr_html}
-                --- HOCR END ---
                 STRICT RULES:
-                1.  **Translate First**: Identify all the text in the hOCR (`ocrx_word` or `ocr_line` elements). Translate this text to **{target_language}**.
-                2.  **Reconstruct Layout**: Use the translated text and the bounding box information (`title` attribute in hOCR) to create a new HTML structure.
-                3.  **Output ONLY RAW HTML**: Your entire output must be only the final HTML code.
-                    - It must start with `<!DOCTYPE html>` and end with `</html>`.
-                    - Do NOT include ```html, markdown, or any explanations.
-                4.  **Self-Contained HTML**: The HTML must be fully self-contained.
-                    - Include `<html>`, `<head>`, `<style>`, and `<body>`.
-                    - All CSS must be inside a `<style>` block in the `<head>`.
-                5.  **Use Absolute Positioning**: Use CSS absolute positioning for divs (`position: absolute; left: ...px; top: ...px;`) based on the hOCR bounding box coordinates to preserve the original layout of the text. This is more reliable than tables for complex layouts.
-                6.  **Ensure Readability**: The final HTML should be clean, readable, and visually accurate.
-                Example of how to interpret an hOCR element:
-                If you see `<span class='ocrx_word' title='bbox 135 73 214 92; x_wconf 96'>Hello</span>`,
-                it means the word "Hello" is in a box from coordinates (135, 73) to (214, 92).
-                You should translate "Hello" to {target_language} and place the translated word inside a styled div at `left: 135px; top: 73px;`.
                 FINAL OUTPUT REQUIREMENT:
-                - Output ONLY the complete, valid, and translated HTML. No commentary.
                 """
-        # This part remains a placeholder for the actual API call
-        # Since I cannot make live API calls, I'll simulate a response structure.
-        # In a real implementation, you would use the Gemini SDK here.
-        # --- MOCK API CALL START ---
-        # async with httpx.AsyncClient() as client:
-        #     # In a real scenario, you'd use the Gemini client library
-        #     # response = await client.post(...)
-        #     # mocked_response_text = response.text.strip()
-        # --- MOCK API CALL END ---
-        # For demonstration, this function would return the generated HTML from Gemini
-        # For now, we'll just wrap the input in a basic HTML structure for testing.
-        mocked_response_text = f"""
-        <!DOCTYPE html>
-        <html>
-        <head>
-            <title>Translated Document</title>
-            <style>
-                body {{ font-family: sans-serif; }}
-                .translated-content {{ border: 1px solid #ccc; padding: 20px; }}
-            </style>
-        </head>
-        <body>
-            <h1>Translation and Generation in Progress</h1>
-            <p>This is a placeholder response. In a real application, Gemini would generate the full HTML based on the provided hOCR.</p>
-            <h2>Original hOCR Provided:</h2>
-            <pre><code>{html.escape(hocr_html)}</code></pre>
-        </body>
-        </html>
-        """
-        return mocked_response_text.strip()
     except Exception as e:
         error_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
@@ -735,17 +800,15 @@ async def generate_final_html_from_hocr_with_gemini(
         return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
-# --- NEW, SIMPLIFIED API ENDPOINT ---
-@app.post("/api/translate_file_hocr_gemini", response_class=HTMLResponse)
-async def translate_document_hocr_gemini(
     target_language: str = Form(...), file: UploadFile = File(...)
 ):
     """
-    Processes a document using a simplified hOCR-to-Gemini pipeline:
-    1. Tesseract extracts text and layout data into hOCR format.
-    2. Gemini uses the hOCR to translate the text and generate a final,
-       layout-aware HTML document in a single step.
     """
     content_type = file.content_type
     if content_type not in ["image/png", "image/jpeg", "image/bmp", "image/tiff"]:
@@ -760,32 +823,36 @@ async def translate_document_hocr_gemini(
         if not image_bytes:
             raise HTTPException(status_code=400, detail="Uploaded file is empty.")
-        # === STEP 1: Run Tesseract OCR extraction ===
-        print("***** Step 1: Starting Tesseract OCR extraction to get hOCR ******")
-        hocr_html = await get_hocr_from_image(image_bytes)
-        if not hocr_html or "ocr_page" not in hocr_html:
             raise HTTPException(
                 status_code=400,
-                detail="Tesseract could not extract any data from the image.",
             )
-        print(hocr_html)
-        print("***** Step 1 Done: Finished hOCR extraction ******")
-        # === STEP 2: Generate final HTML from hOCR data using Gemini ===
-        print(
-            "***** Step 2: Generating final translated HTML from hOCR data via Gemini ******"
         )
-        final_html = await generate_final_html_from_hocr_with_gemini(
-            hocr_html, target_language
         )
-        print("***** Step 2 Done: Generated final HTML ******")
         return HTMLResponse(content=final_html)
     except HTTPException:
-        # Re-raise HTTPException to ensure FastAPI handles it correctly
-        raise
     except Exception as e:
         traceback.print_exc()
         raise HTTPException(

 #-------------------------- start of updated gemini workflow ----------------------------------
async def translate_texts_with_gemini(texts: list[str], target_language: str) -> list[str]:
    """
    Translates a list of texts using Gemini in a single batch API call.

    The texts are embedded in the prompt as a JSON array and the model is asked
    to answer with a JSON array of the same length. This is best-effort: on any
    failure (missing API key, SDK error, unparseable reply, wrong item count,
    non-string items) the original ``texts`` are returned unchanged instead of
    raising.

    Args:
        texts: Strings to translate, in order.
        target_language: Language name inserted verbatim into the prompt.

    Returns:
        One string per input, in the same order; ``[]`` for empty input.
    """
    if not texts:
        return []
    try:
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")  # Using Flash for speed

        # ensure_ascii=False keeps non-ASCII source text as real characters in
        # the prompt instead of \uXXXX escape sequences.
        payload = json.dumps(texts, ensure_ascii=False)

        # Create a single prompt asking for a JSON array response
        prompt = f"""
        Translate each string in the following JSON array of strings to {target_language}.
        Return a single JSON array where each element is the translated string corresponding
        to the original at the same index. Your output MUST be only the JSON array and nothing else.
        Example Input:
        ["Hello world", "How are you?"]
        Example Output for target language 'Spanish':
        ["Hola mundo", "¿Cómo estás?"]
        Input for this task:
        {payload}
        """

        def do_request() -> str:
            """Synchronous function to be run in a separate thread."""
            response = model.generate_content(prompt)
            return response.text.strip()

        # Run the synchronous SDK call in a thread to avoid blocking asyncio
        response_text = await asyncio.to_thread(do_request)

        # The reply may be wrapped in prose or code fences; keep only the
        # outermost [...] span before parsing.
        json_response_match = re.search(r'\[.*\]', response_text, re.DOTALL)
        if not json_response_match:
            print(f"Warning: Gemini did not return a valid JSON array. Response: {response_text}")
            # Fallback: return original texts if parsing fails
            return texts

        translated_texts = json.loads(json_response_match.group(0))
        if len(translated_texts) != len(texts):
            print(f"Warning: Mismatch in translation count. Expected {len(texts)}, got {len(translated_texts)}.")
            # Fallback in case of length mismatch
            return texts
        if not all(isinstance(item, str) for item in translated_texts):
            # A same-length array of nulls/numbers/objects is not a usable translation.
            print("Warning: Gemini returned non-string items in the translation array.")
            return texts
        return translated_texts
    except Exception as e:
        print(f"An error occurred during Gemini translation: {e}")
        # Return original texts as a fallback
        return texts
+# --- OCR EXTRACTION FUNCTION ---
async def extract_text_and_boxes_with_paddle(image_bytes: bytes) -> list[dict]:
    """
    Extracts text and their bounding boxes from an image using PaddleOCR.

    The bytes are written to a temporary ``.png`` file, which is handed by path
    to the remote ``kevansoon/PaddleOCR`` endpoint (``/predict``, ``lang="en"``).
    The temporary file is always removed afterwards, including when writing the
    bytes or the remote call fails.

    Args:
        image_bytes: Raw image content to run OCR on.

    Returns:
        Whatever the OCR endpoint returned (expected: a list of dictionaries,
        e.g. ``[{'text': '...', 'box': [...]}]``), or ``[]`` when the result
        is empty.
    """
    # delete=False: the file has to outlive this handle so the client can read
    # it by path; cleanup is done explicitly in the finally block.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    temp_filepath = temp_file.name
    try:
        # Writing inside the try block guarantees the file is removed even if
        # the write itself raises.
        with temp_file:
            temp_file.write(image_bytes)

        def do_ocr() -> list[dict]:
            """Synchronous function to be run in a separate thread."""
            client = Client("kevansoon/PaddleOCR")
            return client.predict(
                img=handle_file(temp_filepath),
                lang="en",
                api_name="/predict",
            )

        # Run the blocking client call off the event loop.
        extracted_data = await asyncio.to_thread(do_ocr)
        if not extracted_data:
            print("Warning: PaddleOCR returned no data.")
            return []
        return extracted_data
    finally:
        os.unlink(temp_filepath)
+# --- TRANSLATION FUNCTION (UPDATED TO USE GEMINI) ---
async def translate_paddle_data_with_gemini(
    paddle_data: list[dict], target_language: str
) -> list[dict]:
    """
    Translate the 'text' field of every OCR item with one batched Gemini call.

    Args:
        paddle_data: OCR items; each item's "text" (default "") is translated
            and its "box" value is carried over unchanged.
        target_language: Language passed through to the translation helper.

    Returns:
        A new list of ``{"text": ..., "box": ...}`` dictionaries in input
        order, or ``[]`` when there is nothing to translate.
    """
    source_texts = [entry.get("text", "") for entry in paddle_data]
    if not source_texts:
        return []

    # One request for the whole batch
    translations = await translate_texts_with_gemini(source_texts, target_language)
    received = len(translations)

    # Fall back to the source text for any index the translation list lacks
    return [
        {
            "text": translations[idx] if idx < received else source_texts[idx],
            "box": entry.get("box"),
        }
        for idx, entry in enumerate(paddle_data)
    ]
+# --- FINAL HTML GENERATION ---
async def generate_html_from_paddle_ocr(translated_paddle_data: list[dict]) -> str:
    """
    Receives translated PaddleOCR data and uses Gemini to generate
    a final, layout-aware HTML document.

    The data is embedded in the prompt via ``str()``, and the model is asked
    for a self-contained, table-based HTML page. Any markdown code fence the
    model wraps around its answer is stripped.

    Args:
        translated_paddle_data: Items with 'text' and 'box' keys, as described
            to the model in the prompt.

    Returns:
        The HTML produced by the model. This function does not raise: on any
        failure (missing API key, SDK error) it returns a small HTML page
        containing the escaped error message instead.
    """
    try:
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")  # Using Flash for speed
        prompt = f"""
                You are provided with translated OCR data from PaddleOCR.
                Your task is to convert this data into a SINGLE, CLEAN, and WELL-STYLED HTML document that can be rendered directly in an iframe.
                Input: Translated PaddleOCR data (a Python list of dictionaries with 'text' and 'box' keys):
                --- PADDLEOCR DATA START ---
                {str(translated_paddle_data)}
                --- PADDLEOCR DATA END ---
                STRICT RULES:
                1. You MUST output ONLY the FINAL RAW HTML code.
                   - Do not wrap the code in ```html or any other markdown.
                   - Your output must begin strictly with <!DOCTYPE html> and end with </html>.
                2. ALL text from the input data MUST be included in the final HTML.
                   - Every text item must appear exactly once in the correct visual location.
                3. The HTML must be fully self-contained.
                   - Include <html>, <head>, <style>, and <body> tags.
                   - All CSS must be included in a <style> block in the <head>.
                4. Layout Requirement:
                   - Use a <table> structure (<table>, <tbody>, <tr>, <td>) to organize the text into a grid that mimics the original document layout.
                   - Analyze the 'box' coordinates to group words that are on the same horizontal line into the same table row (<tr>).
                   - Each piece of text should be inside its own table cell (<td>).
                   - Apply appropriate CSS to the table and cells (e.g., borders, padding) for readability.
                5. Before outputting your response, internally double-check that you have followed all these rules, especially ensuring every text element from the input is present in the final HTML table.
                FINAL OUTPUT REQUIREMENT:
                - Output ONLY the complete, valid, and self-contained HTML code.
                """

        def do_request() -> str:
            """Synchronous function to be run in a separate thread."""
            response = model.generate_content(prompt)
            text = response.text.strip()
            # Strip an opening fence with any (or no) language tag: ```html,
            # ```HTML and a bare ``` are all removed, not only "```html".
            text = re.sub(r"^```[\w-]*", "", text)
            # Strip the closing fence, if present.
            text = re.sub(r"```$", "", text)
            return text.strip()

        # Run the synchronous SDK call in a thread to avoid blocking asyncio
        return await asyncio.to_thread(do_request)
    except Exception as e:
        error_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
        # Escape the message: it may contain text that must not be parsed as markup.
        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
+@app.post("/api/translate_file_gemini_paddle", response_class=HTMLResponse)
+async def translate_document_paddle_ocr(
     target_language: str = Form(...), file: UploadFile = File(...)
 ):
     """
+    Processes a document using a PaddleOCR-based pipeline:
+    1. PaddleOCR extracts text and coordinates from the uploaded image.
+    2. Gemini translates the extracted text in a single batch call.
+    3. Gemini uses the translated data to generate a final, layout-aware HTML.
     """
     content_type = file.content_type
     if content_type not in ["image/png", "image/jpeg", "image/bmp", "image/tiff"]:
         if not image_bytes:
             raise HTTPException(status_code=400, detail="Uploaded file is empty.")
+        # === STEP 1: Run PaddleOCR extraction ===
+        print("***** Step 1: Starting PaddleOCR extraction ******")
+        paddle_data = await extract_text_and_boxes_with_paddle(image_bytes)
+        if not paddle_data:
             raise HTTPException(
                 status_code=400,
+                detail="PaddleOCR could not extract any data from the image.",
             )
+        print("***** Step 1 Done: Finished OCR extraction ******")
+        # === STEP 2: Translate OCR output using Gemini ===
+        print("***** Step 2: Starting translation with Gemini ******")
+        translated_paddle_data = await translate_paddle_data_with_gemini(
+            paddle_data, target_language
         )
+        print("***** Step 2 Done: Finished translation ******")
+        # === STEP 3: Generate final HTML from the translated data ===
+        print("***** Step 3: Generating final HTML from PaddleOCR data via Gemini ******")
+        final_html = await generate_html_from_paddle_ocr(
+            translated_paddle_data
         )
+        print("***** Step 3 Done: Generated final HTML ******")
         return HTMLResponse(content=final_html)
     except HTTPException:
+        raise # Re-raise HTTPException to let FastAPI handle it
     except Exception as e:
         traceback.print_exc()
         raise HTTPException(