KevanSoon
committed on
Commit
·
1a92019
1
Parent(s):
6ac8032
paddle OCR only endpoint
Browse files
app.py
CHANGED
|
@@ -620,114 +620,179 @@ async def translate_document_dual_ocr(
|
|
| 620 |
|
| 621 |
#-------------------------- start of updated gemini workflow ----------------------------------
|
| 622 |
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
async def get_hocr_from_image(image_bytes: bytes) -> str:
|
| 626 |
"""
|
| 627 |
-
|
| 628 |
-
This function accepts image bytes.
|
| 629 |
"""
|
| 630 |
-
if not
|
| 631 |
-
|
| 632 |
|
| 633 |
try:
|
| 634 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 635 |
except Exception as e:
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
)
|
| 640 |
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 647 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 648 |
|
| 649 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 650 |
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
|
|
|
|
|
|
| 654 |
"""
|
| 655 |
-
|
| 656 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 657 |
"""
|
| 658 |
try:
|
| 659 |
api_key = os.getenv("GEMINI_API_KEY")
|
| 660 |
if not api_key:
|
| 661 |
raise ValueError("GEMINI_API_KEY not found in environment variables.")
|
| 662 |
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
# model = genai.GenerativeModel(model_name="gemini-1.5-flash") # Using Flash for speed
|
| 666 |
|
| 667 |
prompt = f"""
|
| 668 |
-
You are
|
| 669 |
-
|
| 670 |
-
clean, and well-styled HTML document that visually represents the original document layout.
|
| 671 |
|
| 672 |
-
Input:
|
| 673 |
-
---
|
| 674 |
-
{
|
| 675 |
-
---
|
| 676 |
|
| 677 |
STRICT RULES:
|
| 678 |
-
1.
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
You should translate "Hello" to {target_language} and place the translated word inside a styled div at `left: 135px; top: 73px;`.
|
| 693 |
|
| 694 |
FINAL OUTPUT REQUIREMENT:
|
| 695 |
-
- Output ONLY the complete, valid, and
|
| 696 |
"""
|
| 697 |
-
|
| 698 |
-
# This part remains a placeholder for the actual API call
|
| 699 |
-
# Since I cannot make live API calls, I'll simulate a response structure.
|
| 700 |
-
# In a real implementation, you would use the Gemini SDK here.
|
| 701 |
-
|
| 702 |
-
# --- MOCK API CALL START ---
|
| 703 |
-
# async with httpx.AsyncClient() as client:
|
| 704 |
-
# # In a real scenario, you'd use the Gemini client library
|
| 705 |
-
# # response = await client.post(...)
|
| 706 |
-
# # mocked_response_text = response.text.strip()
|
| 707 |
-
# --- MOCK API CALL END ---
|
| 708 |
-
|
| 709 |
-
# For demonstration, this function would return the generated HTML from Gemini
|
| 710 |
-
# For now, we'll just wrap the input in a basic HTML structure for testing.
|
| 711 |
-
mocked_response_text = f"""
|
| 712 |
-
<!DOCTYPE html>
|
| 713 |
-
<html>
|
| 714 |
-
<head>
|
| 715 |
-
<title>Translated Document</title>
|
| 716 |
-
<style>
|
| 717 |
-
body {{ font-family: sans-serif; }}
|
| 718 |
-
.translated-content {{ border: 1px solid #ccc; padding: 20px; }}
|
| 719 |
-
</style>
|
| 720 |
-
</head>
|
| 721 |
-
<body>
|
| 722 |
-
<h1>Translation and Generation in Progress</h1>
|
| 723 |
-
<p>This is a placeholder response. In a real application, Gemini would generate the full HTML based on the provided hOCR.</p>
|
| 724 |
-
<h2>Original hOCR Provided:</h2>
|
| 725 |
-
<pre><code>{html.escape(hocr_html)}</code></pre>
|
| 726 |
-
</body>
|
| 727 |
-
</html>
|
| 728 |
-
"""
|
| 729 |
-
return mocked_response_text.strip()
|
| 730 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 731 |
|
| 732 |
except Exception as e:
|
| 733 |
error_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
|
|
@@ -735,17 +800,15 @@ async def generate_final_html_from_hocr_with_gemini(
|
|
| 735 |
return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
|
| 736 |
|
| 737 |
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
@app.post("/api/translate_file_hocr_gemini", response_class=HTMLResponse)
|
| 741 |
-
async def translate_document_hocr_gemini(
|
| 742 |
target_language: str = Form(...), file: UploadFile = File(...)
|
| 743 |
):
|
| 744 |
"""
|
| 745 |
-
Processes a document using a
|
| 746 |
-
1.
|
| 747 |
-
2. Gemini
|
| 748 |
-
|
| 749 |
"""
|
| 750 |
content_type = file.content_type
|
| 751 |
if content_type not in ["image/png", "image/jpeg", "image/bmp", "image/tiff"]:
|
|
@@ -760,32 +823,36 @@ async def translate_document_hocr_gemini(
|
|
| 760 |
if not image_bytes:
|
| 761 |
raise HTTPException(status_code=400, detail="Uploaded file is empty.")
|
| 762 |
|
| 763 |
-
# === STEP 1: Run
|
| 764 |
-
print("***** Step 1: Starting
|
| 765 |
-
|
| 766 |
|
| 767 |
-
if not
|
| 768 |
raise HTTPException(
|
| 769 |
status_code=400,
|
| 770 |
-
detail="
|
| 771 |
)
|
| 772 |
-
print(
|
| 773 |
-
print("***** Step 1 Done: Finished hOCR extraction ******")
|
| 774 |
|
| 775 |
-
# === STEP 2:
|
| 776 |
-
print(
|
| 777 |
-
|
|
|
|
| 778 |
)
|
| 779 |
-
|
| 780 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 781 |
)
|
| 782 |
-
|
|
|
|
| 783 |
|
| 784 |
return HTMLResponse(content=final_html)
|
| 785 |
|
| 786 |
except HTTPException:
|
| 787 |
-
# Re-raise HTTPException to
|
| 788 |
-
raise
|
| 789 |
except Exception as e:
|
| 790 |
traceback.print_exc()
|
| 791 |
raise HTTPException(
|
|
|
|
| 620 |
|
| 621 |
#-------------------------- start of updated gemini workflow ----------------------------------
|
| 622 |
|
| 623 |
+
async def translate_texts_with_gemini(texts: list[str], target_language: str) -> list[str]:
    """
    Translate a list of texts with Gemini in a single batch API call.

    Args:
        texts: Strings to translate. Their order is significant because the
            reply is matched back to the input by index.
        target_language: Language name interpolated into the prompt.

    Returns:
        A list of translated strings, index-aligned with ``texts``. As a
        best-effort fallback the original ``texts`` list is returned
        unchanged when the API key is missing, the request fails, or the
        reply is not a JSON array of exactly ``len(texts)`` strings.
    """
    if not texts:
        return []

    try:
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")  # Using Flash for speed

        # ensure_ascii=False keeps non-ASCII source text (CJK, accents, ...)
        # as real characters in the prompt instead of \uXXXX escapes.
        texts_as_json = json.dumps(texts, ensure_ascii=False)

        # Create a single prompt asking for a JSON array response
        prompt = f"""
        Translate each string in the following JSON array of strings to {target_language}.
        Return a single JSON array where each element is the translated string corresponding
        to the original at the same index. Your output MUST be only the JSON array and nothing else.

        Example Input:
        ["Hello world", "How are you?"]

        Example Output for target language 'Spanish':
        ["Hola mundo", "¿Cómo estás?"]

        Input for this task:
        {texts_as_json}
        """

        def do_request():
            """Synchronous function to be run in a separate thread."""
            response = model.generate_content(prompt)
            return response.text.strip()

        # Run the synchronous SDK call in a thread to avoid blocking asyncio
        response_text = await asyncio.to_thread(do_request)

        # Greedy match from the first "[" to the last "]" so that prose or
        # markdown fences around the array are tolerated.
        json_response_match = re.search(r'\[.*\]', response_text, re.DOTALL)
        if not json_response_match:
            print(f"Warning: Gemini did not return a valid JSON array. Response: {response_text}")
            # Fallback: return original texts if parsing fails
            return texts

        # The matched span starts with "[" and ends with "]", so a successful
        # parse always yields a list; malformed JSON raises and is handled by
        # the outer except.
        translated_texts = json.loads(json_response_match.group(0))

        if len(translated_texts) != len(texts):
            print(f"Warning: Mismatch in translation count. Expected {len(texts)}, got {len(translated_texts)}.")
            # Fallback in case of length mismatch
            return texts

        # A length match alone does not guarantee usable output: the model
        # may emit null, numbers or nested arrays in place of strings.
        if not all(isinstance(entry, str) for entry in translated_texts):
            print(f"Warning: Gemini returned non-string entries in the translation array. Response: {response_text}")
            return texts

        return translated_texts

    except Exception as e:
        print(f"An error occurred during Gemini translation: {e}")
        # Return original texts as a fallback
        return texts
|
|
|
|
| 683 |
|
| 684 |
+
# --- OCR EXTRACTION FUNCTION ---
|
| 685 |
+
|
| 686 |
+
async def extract_text_and_boxes_with_paddle(image_bytes: bytes) -> list[dict]:
    """
    Extract text and bounding boxes from an image using the PaddleOCR Space.

    The bytes are written to a temporary ``.png`` file, which is handed to
    the ``kevansoon/PaddleOCR`` client's ``/predict`` endpoint from a worker
    thread so the event loop is not blocked.

    Args:
        image_bytes: Raw bytes of the uploaded image.

    Returns:
        Whatever ``client.predict`` returns, passed through unchanged, or an
        empty list when the result is falsy. Per the original author's note
        this is expected to be a list of dicts such as
        ``[{'text': '...', 'box': [...]}]`` (not verified here).
    """
    # delete=False: the file has to outlive this handle so the OCR client can
    # read it by path; it is removed explicitly in the finally block.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    temp_filepath = temp_file.name

    try:
        # The write happens inside the try so that a failed write still
        # reaches the cleanup below instead of leaking the file.
        with temp_file:
            temp_file.write(image_bytes)

        def do_ocr() -> list[dict]:
            """Synchronous function to be run in a separate thread."""
            client = Client("kevansoon/PaddleOCR")
            # Returns a list of dictionaries, e.g., [{'text': '...', 'box': [...]}]
            return client.predict(
                img=handle_file(temp_filepath),
                lang="en",
                api_name="/predict",
            )

        # Same threading helper as the Gemini functions in this file.
        extracted_data = await asyncio.to_thread(do_ocr)
        if not extracted_data:
            print("Warning: PaddleOCR returned no data.")
            return []
        return extracted_data
    finally:
        try:
            os.unlink(temp_filepath)
        except FileNotFoundError:
            # Already gone; do not let cleanup mask an error from the OCR call.
            pass
|
| 715 |
|
| 716 |
+
# --- TRANSLATION FUNCTION (UPDATED TO USE GEMINI) ---
|
| 717 |
+
|
| 718 |
+
async def translate_paddle_data_with_gemini(
    paddle_data: list[dict], target_language: str
) -> list[dict]:
    """
    Translate the 'text' field of every OCR item with one batched Gemini call.

    Args:
        paddle_data: OCR items; each item's 'text' (default "") and 'box'
            entries are read via ``dict.get``.
        target_language: Language passed on to ``translate_texts_with_gemini``.

    Returns:
        A new list of ``{"text": ..., "box": ...}`` dicts in the same order as
        ``paddle_data``. Only these two keys are carried over. Returns an
        empty list when there is nothing to translate.
    """
    source_texts = []
    for entry in paddle_data:
        source_texts.append(entry.get("text", ""))

    if len(source_texts) == 0:
        return []

    # One request for the whole batch.
    gemini_output = await translate_texts_with_gemini(source_texts, target_language)

    result = []
    for position, entry in enumerate(paddle_data):
        # Keep the original text if the translated list is shorter than expected.
        if position < len(gemini_output):
            chosen_text = gemini_output[position]
        else:
            chosen_text = source_texts[position]
        result.append({"text": chosen_text, "box": entry.get("box")})

    return result
|
| 739 |
+
|
| 740 |
+
# --- FINAL HTML GENERATION ---
|
| 741 |
+
|
| 742 |
+
async def generate_html_from_paddle_ocr(translated_paddle_data: list[dict]) -> str:
|
| 743 |
+
"""
|
| 744 |
+
Receives translated PaddleOCR data and uses Gemini to generate
|
| 745 |
+
a final, layout-aware HTML document.
|
| 746 |
"""
|
| 747 |
try:
|
| 748 |
api_key = os.getenv("GEMINI_API_KEY")
|
| 749 |
if not api_key:
|
| 750 |
raise ValueError("GEMINI_API_KEY not found in environment variables.")
|
| 751 |
|
| 752 |
+
genai.configure(api_key=api_key)
|
| 753 |
+
model = genai.GenerativeModel(model_name="gemini-1.5-flash") # Using Flash for speed
|
|
|
|
| 754 |
|
| 755 |
prompt = f"""
|
| 756 |
+
You are provided with translated OCR data from PaddleOCR.
|
| 757 |
+
Your task is to convert this data into a SINGLE, CLEAN, and WELL-STYLED HTML document that can be rendered directly in an iframe.
|
|
|
|
| 758 |
|
| 759 |
+
Input: Translated PaddleOCR data (a Python list of dictionaries with 'text' and 'box' keys):
|
| 760 |
+
--- PADDLEOCR DATA START ---
|
| 761 |
+
{str(translated_paddle_data)}
|
| 762 |
+
--- PADDLEOCR DATA END ---
|
| 763 |
|
| 764 |
STRICT RULES:
|
| 765 |
+
1. You MUST output ONLY the FINAL RAW HTML code.
|
| 766 |
+
- Do not wrap the code in ```html or any other markdown.
|
| 767 |
+
- Your output must begin strictly with <!DOCTYPE html> and end with </html>.
|
| 768 |
+
2. ALL text from the input data MUST be included in the final HTML.
|
| 769 |
+
- Every text item must appear exactly once in the correct visual location.
|
| 770 |
+
3. The HTML must be fully self-contained.
|
| 771 |
+
- Include <html>, <head>, <style>, and <body> tags.
|
| 772 |
+
- All CSS must be included in a <style> block in the <head>.
|
| 773 |
+
4. Layout Requirement:
|
| 774 |
+
- Use a <table> structure (<table>, <tbody>, <tr>, <td>) to organize the text into a grid that mimics the original document layout.
|
| 775 |
+
- Analyze the 'box' coordinates to group words that are on the same horizontal line into the same table row (<tr>).
|
| 776 |
+
- Each piece of text should be inside its own table cell (<td>).
|
| 777 |
+
- Apply appropriate CSS to the table and cells (e.g., borders, padding) for readability.
|
| 778 |
+
5. Before outputting your response, internally double-check that you have followed all these rules, especially ensuring every text element from the input is present in the final HTML table.
|
|
|
|
| 779 |
|
| 780 |
FINAL OUTPUT REQUIREMENT:
|
| 781 |
+
- Output ONLY the complete, valid, and self-contained HTML code.
|
| 782 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 783 |
|
| 784 |
+
def do_request():
|
| 785 |
+
"""Synchronous function to be run in a separate thread."""
|
| 786 |
+
response = model.generate_content(prompt)
|
| 787 |
+
# Clean potential markdown fences
|
| 788 |
+
text = response.text.strip()
|
| 789 |
+
if text.startswith("```html"):
|
| 790 |
+
text = text[7:]
|
| 791 |
+
if text.endswith("```"):
|
| 792 |
+
text = text[:-3]
|
| 793 |
+
return text.strip()
|
| 794 |
+
|
| 795 |
+
return await asyncio.to_thread(do_request)
|
| 796 |
|
| 797 |
except Exception as e:
|
| 798 |
error_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
|
|
|
|
| 800 |
return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
|
| 801 |
|
| 802 |
|
| 803 |
+
@app.post("/api/translate_file_gemini_paddle", response_class=HTMLResponse)
|
| 804 |
+
async def translate_document_paddle_ocr(
|
|
|
|
|
|
|
| 805 |
target_language: str = Form(...), file: UploadFile = File(...)
|
| 806 |
):
|
| 807 |
"""
|
| 808 |
+
Processes a document using a PaddleOCR-based pipeline:
|
| 809 |
+
1. PaddleOCR extracts text and coordinates from the uploaded image.
|
| 810 |
+
2. Gemini translates the extracted text in a single batch call.
|
| 811 |
+
3. Gemini uses the translated data to generate a final, layout-aware HTML.
|
| 812 |
"""
|
| 813 |
content_type = file.content_type
|
| 814 |
if content_type not in ["image/png", "image/jpeg", "image/bmp", "image/tiff"]:
|
|
|
|
| 823 |
if not image_bytes:
|
| 824 |
raise HTTPException(status_code=400, detail="Uploaded file is empty.")
|
| 825 |
|
| 826 |
+
# === STEP 1: Run PaddleOCR extraction ===
|
| 827 |
+
print("***** Step 1: Starting PaddleOCR extraction ******")
|
| 828 |
+
paddle_data = await extract_text_and_boxes_with_paddle(image_bytes)
|
| 829 |
|
| 830 |
+
if not paddle_data:
|
| 831 |
raise HTTPException(
|
| 832 |
status_code=400,
|
| 833 |
+
detail="PaddleOCR could not extract any data from the image.",
|
| 834 |
)
|
| 835 |
+
print("***** Step 1 Done: Finished OCR extraction ******")
|
|
|
|
| 836 |
|
| 837 |
+
# === STEP 2: Translate OCR output using Gemini ===
|
| 838 |
+
print("***** Step 2: Starting translation with Gemini ******")
|
| 839 |
+
translated_paddle_data = await translate_paddle_data_with_gemini(
|
| 840 |
+
paddle_data, target_language
|
| 841 |
)
|
| 842 |
+
print("***** Step 2 Done: Finished translation ******")
|
| 843 |
+
|
| 844 |
+
# === STEP 3: Generate final HTML from the translated data ===
|
| 845 |
+
print("***** Step 3: Generating final HTML from PaddleOCR data via Gemini ******")
|
| 846 |
+
final_html = await generate_html_from_paddle_ocr(
|
| 847 |
+
translated_paddle_data
|
| 848 |
)
|
| 849 |
+
|
| 850 |
+
print("***** Step 3 Done: Generated final HTML ******")
|
| 851 |
|
| 852 |
return HTMLResponse(content=final_html)
|
| 853 |
|
| 854 |
except HTTPException:
|
| 855 |
+
raise # Re-raise HTTPException to let FastAPI handle it
|
|
|
|
| 856 |
except Exception as e:
|
| 857 |
traceback.print_exc()
|
| 858 |
raise HTTPException(
|