KevanSoon committed on
Commit
1a92019
·
1 Parent(s): 6ac8032

paddle OCR only endpoint

Browse files
Files changed (1) hide show
  1. app.py +173 -106
app.py CHANGED
@@ -620,114 +620,179 @@ async def translate_document_dual_ocr(
620
 
621
  #-------------------------- start of updated gemini workflow ----------------------------------
622
 
623
- # --- OCR EXTRACTION FUNCTION (Tesseract only) ---
624
-
625
async def get_hocr_from_image(image_bytes: bytes) -> str:
    """
    Run Tesseract OCR on an image and return the raw hOCR HTML output.

    Args:
        image_bytes: Contents of the image file to OCR.

    Returns:
        The hOCR markup produced by Tesseract, decoded as UTF-8.

    Raises:
        ValueError: If ``image_bytes`` is empty.
        HTTPException: With status 400 if the image cannot be opened or decoded.
    """
    if not image_bytes:
        raise ValueError("Image bytes cannot be empty.")

    loop = asyncio.get_running_loop()

    def open_and_decode():
        """Open the image and force a full decode of its pixel data."""
        opened = Image.open(io.BytesIO(image_bytes))
        # NOTE(review): assuming `Image` is Pillow's PIL.Image, open() is lazy and
        # only identifies the file. load() makes truncated/corrupt data fail here,
        # inside the guarded section, rather than later inside Tesseract.
        opened.load()
        return opened

    try:
        # Decoding can be slow for large images, so keep it off the event loop.
        image = await loop.run_in_executor(None, open_and_decode)
    except Exception as e:
        raise HTTPException(
            status_code=400,
            detail=f"Cannot open image for Tesseract. It may be corrupted or unsupported. Error: {e}",
        ) from e

    # Run Tesseract OCR in a thread to avoid blocking the asyncio event loop
    hocr_bytes = await loop.run_in_executor(
        None, lambda: pytesseract.image_to_pdf_or_hocr(image, extension="hocr")
    )
    return hocr_bytes.decode("utf-8")
 
 
 
 
647
 
 
 
 
 
 
 
 
 
 
 
 
648
 
649
- # --- FINAL HTML GENERATION (GEMINI) ---
 
 
 
 
 
 
 
650
 
651
- async def generate_final_html_from_hocr_with_gemini(
652
- hocr_html: str, target_language: str
653
- ) -> str:
 
 
654
  """
655
- Receives raw hOCR data, sends it to Gemini for translation, and asks Gemini
656
- to generate a final, layout-aware HTML document.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
657
  """
658
  try:
659
  api_key = os.getenv("GEMINI_API_KEY")
660
  if not api_key:
661
  raise ValueError("GEMINI_API_KEY not found in environment variables.")
662
 
663
- # This would be where you configure your generative AI library
664
- # genai.configure(api_key=api_key)
665
- # model = genai.GenerativeModel(model_name="gemini-1.5-flash") # Using Flash for speed
666
 
667
  prompt = f"""
668
- You are an expert web developer and translator. Your task is to take raw hOCR input,
669
- translate all the text within it to {target_language}, and then generate a single,
670
- clean, and well-styled HTML document that visually represents the original document layout.
671
 
672
- Input: Raw hOCR HTML
673
- --- HOCR START ---
674
- {hocr_html}
675
- --- HOCR END ---
676
 
677
  STRICT RULES:
678
- 1. **Translate First**: Identify all the text in the hOCR (`ocrx_word` or `ocr_line` elements). Translate this text to **{target_language}**.
679
- 2. **Reconstruct Layout**: Use the translated text and the bounding box information (`title` attribute in hOCR) to create a new HTML structure.
680
- 3. **Output ONLY RAW HTML**: Your entire output must be only the final HTML code.
681
- - It must start with `<!DOCTYPE html>` and end with `</html>`.
682
- - Do NOT include ```html, markdown, or any explanations.
683
- 4. **Self-Contained HTML**: The HTML must be fully self-contained.
684
- - Include `<html>`, `<head>`, `<style>`, and `<body>`.
685
- - All CSS must be inside a `<style>` block in the `<head>`.
686
- 5. **Use Absolute Positioning**: Use CSS absolute positioning for divs (`position: absolute; left: ...px; top: ...px;`) based on the hOCR bounding box coordinates to preserve the original layout of the text. This is more reliable than tables for complex layouts.
687
- 6. **Ensure Readability**: The final HTML should be clean, readable, and visually accurate.
688
-
689
- Example of how to interpret an hOCR element:
690
- If you see `<span class='ocrx_word' title='bbox 135 73 214 92; x_wconf 96'>Hello</span>`,
691
- it means the word "Hello" is in a box from coordinates (135, 73) to (214, 92).
692
- You should translate "Hello" to {target_language} and place the translated word inside a styled div at `left: 135px; top: 73px;`.
693
 
694
  FINAL OUTPUT REQUIREMENT:
695
- - Output ONLY the complete, valid, and translated HTML. No commentary.
696
  """
697
-
698
- # This part remains a placeholder for the actual API call
699
- # Since I cannot make live API calls, I'll simulate a response structure.
700
- # In a real implementation, you would use the Gemini SDK here.
701
-
702
- # --- MOCK API CALL START ---
703
- # async with httpx.AsyncClient() as client:
704
- # # In a real scenario, you'd use the Gemini client library
705
- # # response = await client.post(...)
706
- # # mocked_response_text = response.text.strip()
707
- # --- MOCK API CALL END ---
708
-
709
- # For demonstration, this function would return the generated HTML from Gemini
710
- # For now, we'll just wrap the input in a basic HTML structure for testing.
711
- mocked_response_text = f"""
712
- <!DOCTYPE html>
713
- <html>
714
- <head>
715
- <title>Translated Document</title>
716
- <style>
717
- body {{ font-family: sans-serif; }}
718
- .translated-content {{ border: 1px solid #ccc; padding: 20px; }}
719
- </style>
720
- </head>
721
- <body>
722
- <h1>Translation and Generation in Progress</h1>
723
- <p>This is a placeholder response. In a real application, Gemini would generate the full HTML based on the provided hOCR.</p>
724
- <h2>Original hOCR Provided:</h2>
725
- <pre><code>{html.escape(hocr_html)}</code></pre>
726
- </body>
727
- </html>
728
- """
729
- return mocked_response_text.strip()
730
 
 
 
 
 
 
 
 
 
 
 
 
 
731
 
732
  except Exception as e:
733
  error_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
@@ -735,17 +800,15 @@ async def generate_final_html_from_hocr_with_gemini(
735
  return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
736
 
737
 
738
- # --- NEW, SIMPLIFIED API ENDPOINT ---
739
-
740
- @app.post("/api/translate_file_hocr_gemini", response_class=HTMLResponse)
741
- async def translate_document_hocr_gemini(
742
  target_language: str = Form(...), file: UploadFile = File(...)
743
  ):
744
  """
745
- Processes a document using a simplified hOCR-to-Gemini pipeline:
746
- 1. Tesseract extracts text and layout data into hOCR format.
747
- 2. Gemini uses the hOCR to translate the text and generate a final,
748
- layout-aware HTML document in a single step.
749
  """
750
  content_type = file.content_type
751
  if content_type not in ["image/png", "image/jpeg", "image/bmp", "image/tiff"]:
@@ -760,32 +823,36 @@ async def translate_document_hocr_gemini(
760
  if not image_bytes:
761
  raise HTTPException(status_code=400, detail="Uploaded file is empty.")
762
 
763
- # === STEP 1: Run Tesseract OCR extraction ===
764
- print("***** Step 1: Starting Tesseract OCR extraction to get hOCR ******")
765
- hocr_html = await get_hocr_from_image(image_bytes)
766
 
767
- if not hocr_html or "ocr_page" not in hocr_html:
768
  raise HTTPException(
769
  status_code=400,
770
- detail="Tesseract could not extract any data from the image.",
771
  )
772
- print(hocr_html)
773
- print("***** Step 1 Done: Finished hOCR extraction ******")
774
 
775
- # === STEP 2: Generate final HTML from hOCR data using Gemini ===
776
- print(
777
- "***** Step 2: Generating final translated HTML from hOCR data via Gemini ******"
 
778
  )
779
- final_html = await generate_final_html_from_hocr_with_gemini(
780
- hocr_html, target_language
 
 
 
 
781
  )
782
- print("***** Step 2 Done: Generated final HTML ******")
 
783
 
784
  return HTMLResponse(content=final_html)
785
 
786
  except HTTPException:
787
- # Re-raise HTTPException to ensure FastAPI handles it correctly
788
- raise
789
  except Exception as e:
790
  traceback.print_exc()
791
  raise HTTPException(
 
620
 
621
  #-------------------------- start of updated gemini workflow ----------------------------------
622
 
623
async def translate_texts_with_gemini(texts: list[str], target_language: str) -> list[str]:
    """
    Translate a list of texts using Gemini in a single batch API call.

    The texts are sent as one JSON array and the model is asked to answer with
    a JSON array of the same length. This function does not raise: on any
    failure (missing API key, SDK error, unparsable reply, wrong item count)
    it prints a message and returns the original ``texts`` unchanged.

    Args:
        texts: Strings to translate.
        target_language: Language name interpolated into the prompt.

    Returns:
        The translated strings, index-aligned with ``texts``. Items for which
        the model did not return a string keep their original text. Returns
        ``[]`` for empty input.
    """
    if not texts:
        return []

    try:
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")  # Using Flash for speed

        # Create a single prompt asking for a JSON array response
        prompt = f"""
        Translate each string in the following JSON array of strings to {target_language}.
        Return a single JSON array where each element is the translated string corresponding
        to the original at the same index. Your output MUST be only the JSON array and nothing else.

        Example Input:
        ["Hello world", "How are you?"]

        Example Output for target language 'Spanish':
        ["Hola mundo", "¿Cómo estás?"]

        Input for this task:
        {json.dumps(texts)}
        """

        def do_request():
            """Synchronous function to be run in a separate thread."""
            response = model.generate_content(prompt)
            return response.text.strip()

        # Run the synchronous SDK call in a thread to avoid blocking asyncio
        response_text = await asyncio.to_thread(do_request)

        # The model may wrap the array in prose or markdown fences; keep only
        # the outermost [...] span before parsing.
        json_response_match = re.search(r'\[.*\]', response_text, re.DOTALL)
        if not json_response_match:
            print(f"Warning: Gemini did not return a valid JSON array. Response: {response_text}")
            # Fallback: return original texts if parsing fails
            return texts

        translated_texts = json.loads(json_response_match.group(0))

        if len(translated_texts) != len(texts):
            print(f"Warning: Mismatch in translation count. Expected {len(texts)}, got {len(translated_texts)}.")
            # Fallback in case of length mismatch
            return texts

        # A well-formed JSON array can still hold null, numbers or nested
        # values. Never hand those back as "translations": keep the original
        # text for any slot that is not a string.
        if not all(isinstance(item, str) for item in translated_texts):
            print("Warning: Gemini returned non-string items; keeping original text for those entries.")
            translated_texts = [
                item if isinstance(item, str) else original
                for item, original in zip(translated_texts, texts)
            ]

        return translated_texts

    except Exception as e:
        print(f"An error occurred during Gemini translation: {e}")
        # Return original texts as a fallback
        return texts
 
683
 
684
+ # --- OCR EXTRACTION FUNCTION ---
685
+
686
async def extract_text_and_boxes_with_paddle(image_bytes: bytes) -> list[dict]:
    """
    Extract text and bounding boxes from an image using PaddleOCR.

    The bytes are written to a temporary ``.png``-suffixed file, which is
    passed to the remote ``kevansoon/PaddleOCR`` predictor via ``Client`` and
    ``handle_file``. The temporary file is always removed afterwards.

    Args:
        image_bytes: Contents of the image file to OCR.

    Returns:
        Whatever the predictor returned, or ``[]`` if it returned nothing.
        Per the original author's note this is a list of dictionaries such as
        ``[{'text': '...', 'box': [...]}]``; the schema is not verified here.
    """
    temp_filepath = None
    try:
        # delete=False: the file must outlive the `with` block so the worker
        # thread can reopen it by path. Creation happens inside the try so a
        # failed write no longer leaves the file behind.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
            temp_filepath = temp_file.name
            temp_file.write(image_bytes)

        def do_ocr() -> list[dict]:
            """Synchronous function to be run in a separate thread."""
            client = Client("kevansoon/PaddleOCR")
            return client.predict(
                img=handle_file(temp_filepath),
                lang="en",
                api_name="/predict",
            )

        # Same thread-offloading helper as the Gemini calls in this file.
        extracted_data = await asyncio.to_thread(do_ocr)
        if not extracted_data:
            print("Warning: PaddleOCR returned no data.")
            return []
        return extracted_data
    finally:
        if temp_filepath is not None:
            os.unlink(temp_filepath)
715
 
716
+ # --- TRANSLATION FUNCTION (UPDATED TO USE GEMINI) ---
717
+
718
async def translate_paddle_data_with_gemini(
    paddle_data: list[dict], target_language: str
) -> list[dict]:
    """
    Translate the 'text' field of every OCR item with one batch Gemini call.

    Args:
        paddle_data: OCR items; each is read via ``.get("text", "")`` and
            ``.get("box")``.
        target_language: Language passed through to the translation call.

    Returns:
        A new list of ``{"text": ..., "box": ...}`` dictionaries in the same
        order as ``paddle_data``, or ``[]`` when there is nothing to translate.
    """
    source_texts = []
    for entry in paddle_data:
        source_texts.append(entry.get("text", ""))

    if not source_texts:
        return []

    # One request covers every text item.
    results = await translate_texts_with_gemini(source_texts, target_language)
    available = len(results)

    # If fewer translations came back than items, the remaining items keep
    # their original text.
    return [
        {
            "text": results[idx] if idx < available else source_texts[idx],
            "box": entry.get("box"),
        }
        for idx, entry in enumerate(paddle_data)
    ]
739
+
740
+ # --- FINAL HTML GENERATION ---
741
+
742
+ async def generate_html_from_paddle_ocr(translated_paddle_data: list[dict]) -> str:
743
+ """
744
+ Receives translated PaddleOCR data and uses Gemini to generate
745
+ a final, layout-aware HTML document.
746
  """
747
  try:
748
  api_key = os.getenv("GEMINI_API_KEY")
749
  if not api_key:
750
  raise ValueError("GEMINI_API_KEY not found in environment variables.")
751
 
752
+ genai.configure(api_key=api_key)
753
+ model = genai.GenerativeModel(model_name="gemini-1.5-flash") # Using Flash for speed
 
754
 
755
  prompt = f"""
756
+ You are provided with translated OCR data from PaddleOCR.
757
+ Your task is to convert this data into a SINGLE, CLEAN, and WELL-STYLED HTML document that can be rendered directly in an iframe.
 
758
 
759
+ Input: Translated PaddleOCR data (a Python list of dictionaries with 'text' and 'box' keys):
760
+ --- PADDLEOCR DATA START ---
761
+ {str(translated_paddle_data)}
762
+ --- PADDLEOCR DATA END ---
763
 
764
  STRICT RULES:
765
+ 1. You MUST output ONLY the FINAL RAW HTML code.
766
+ - Do not wrap the code in ```html or any other markdown.
767
+ - Your output must begin strictly with <!DOCTYPE html> and end with </html>.
768
+ 2. ALL text from the input data MUST be included in the final HTML.
769
+ - Every text item must appear exactly once in the correct visual location.
770
+ 3. The HTML must be fully self-contained.
771
+ - Include <html>, <head>, <style>, and <body> tags.
772
+ - All CSS must be included in a <style> block in the <head>.
773
+ 4. Layout Requirement:
774
+ - Use a <table> structure (<table>, <tbody>, <tr>, <td>) to organize the text into a grid that mimics the original document layout.
775
+ - Analyze the 'box' coordinates to group words that are on the same horizontal line into the same table row (<tr>).
776
+ - Each piece of text should be inside its own table cell (<td>).
777
+ - Apply appropriate CSS to the table and cells (e.g., borders, padding) for readability.
778
+ 5. Before outputting your response, internally double-check that you have followed all these rules, especially ensuring every text element from the input is present in the final HTML table.
 
779
 
780
  FINAL OUTPUT REQUIREMENT:
781
+ - Output ONLY the complete, valid, and self-contained HTML code.
782
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
783
 
784
+ def do_request():
785
+ """Synchronous function to be run in a separate thread."""
786
+ response = model.generate_content(prompt)
787
+ # Clean potential markdown fences
788
+ text = response.text.strip()
789
+ if text.startswith("```html"):
790
+ text = text[7:]
791
+ if text.endswith("```"):
792
+ text = text[:-3]
793
+ return text.strip()
794
+
795
+ return await asyncio.to_thread(do_request)
796
 
797
  except Exception as e:
798
  error_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
 
800
  return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
801
 
802
 
803
+ @app.post("/api/translate_file_gemini_paddle", response_class=HTMLResponse)
804
+ async def translate_document_paddle_ocr(
 
 
805
  target_language: str = Form(...), file: UploadFile = File(...)
806
  ):
807
  """
808
+ Processes a document using a PaddleOCR-based pipeline:
809
+ 1. PaddleOCR extracts text and coordinates from the uploaded image.
810
+ 2. Gemini translates the extracted text in a single batch call.
811
+ 3. Gemini uses the translated data to generate a final, layout-aware HTML.
812
  """
813
  content_type = file.content_type
814
  if content_type not in ["image/png", "image/jpeg", "image/bmp", "image/tiff"]:
 
823
  if not image_bytes:
824
  raise HTTPException(status_code=400, detail="Uploaded file is empty.")
825
 
826
+ # === STEP 1: Run PaddleOCR extraction ===
827
+ print("***** Step 1: Starting PaddleOCR extraction ******")
828
+ paddle_data = await extract_text_and_boxes_with_paddle(image_bytes)
829
 
830
+ if not paddle_data:
831
  raise HTTPException(
832
  status_code=400,
833
+ detail="PaddleOCR could not extract any data from the image.",
834
  )
835
+ print("***** Step 1 Done: Finished OCR extraction ******")
 
836
 
837
+ # === STEP 2: Translate OCR output using Gemini ===
838
+ print("***** Step 2: Starting translation with Gemini ******")
839
+ translated_paddle_data = await translate_paddle_data_with_gemini(
840
+ paddle_data, target_language
841
  )
842
+ print("***** Step 2 Done: Finished translation ******")
843
+
844
+ # === STEP 3: Generate final HTML from the translated data ===
845
+ print("***** Step 3: Generating final HTML from PaddleOCR data via Gemini ******")
846
+ final_html = await generate_html_from_paddle_ocr(
847
+ translated_paddle_data
848
  )
849
+
850
+ print("***** Step 3 Done: Generated final HTML ******")
851
 
852
  return HTMLResponse(content=final_html)
853
 
854
  except HTTPException:
855
+ raise # Re-raise HTTPException to let FastAPI handle it
 
856
  except Exception as e:
857
  traceback.print_exc()
858
  raise HTTPException(