Spaces:

brestok
/

ocr-backend

Sleeping

Istvan-Adem committed on Feb 23, 2025

Commit

df4c46f

1 Parent(s): e966906

add pytesseract

Files changed (3) hide show

ocr/api/message/prompts.py CHANGED Viewed

@@ -27,8 +27,8 @@ The report must be structured as follows, with each section containing only rele
 [/INST]"""
     extract_original_text = """## Task
-You must extract all text from the attached images and return it in the **text** field. You must not include the patient's name, contact details, or demographic data.
 ## Important notes
-- You must extract all text but exclude any information related to the name, contact details, and demographic data."""

 [/INST]"""
     extract_original_text = """## Task
+You must return ALL provided text, but not include the patient's name, contact details, or demographic data.
 ## Important notes
+- You must return all text but exclude any information related to the name, contact details, and demographic data."""

ocr/api/message/utils.py CHANGED Viewed

@@ -2,6 +2,8 @@ import base64
 import io
 import re
 from pdf2image import convert_from_bytes
@@ -16,6 +18,16 @@ def divide_images(contents: bytes) -> list[bytes]:
     return image_bytes_list
 def prepare_request_content(images: list[bytes]) -> list:
     content = [
         {"type": "text", "text": "Generate a report on the attached document"},

 import io
 import re
+import pytesseract
+from PIL import Image
 from pdf2image import convert_from_bytes
     return image_bytes_list
+def extract_text_from_images(images: list[bytes]) -> str:
+    extracted_texts = []
+    for image_bytes in images:
+        image = Image.open(io.BytesIO(image_bytes))
+        text = pytesseract.image_to_string(image)
+        extracted_texts.append(text)
+    return '\n'.join(extracted_texts)
 def prepare_request_content(images: list[bytes]) -> list:
     content = [
         {"type": "text", "text": "Generate a report on the attached document"},

ocr/api/message/views.py CHANGED Viewed

@@ -5,7 +5,7 @@ from fastapi import File, UploadFile, HTTPException
 from ocr.api.message import ocr_router
 from ocr.api.message.openai_request import generate_report, extract_original_text
 from ocr.api.message.schemas import OcrResponse
-from ocr.api.message.utils import divide_images, clean_response, prepare_request_content
 from ocr.core.wrappers import OcrResponseWrapper
@@ -21,10 +21,10 @@ async def get_all_chat_messages(
             images = [contents]
         else:
             raise HTTPException(status_code=400, detail='Unsupported file type.')
-        content = prepare_request_content(images)
         original_text, response = await asyncio.gather(
-            extract_original_text(content),
-            generate_report(content)
         )
         return OcrResponseWrapper(data=OcrResponse(text=clean_response(response), originalText=original_text))
     finally:

 from ocr.api.message import ocr_router
 from ocr.api.message.openai_request import generate_report, extract_original_text
 from ocr.api.message.schemas import OcrResponse
+from ocr.api.message.utils import divide_images, clean_response, extract_text_from_images
 from ocr.core.wrappers import OcrResponseWrapper
             images = [contents]
         else:
             raise HTTPException(status_code=400, detail='Unsupported file type.')
+        text_content = extract_text_from_images(images)
         original_text, response = await asyncio.gather(
+            extract_original_text(text_content),
+            generate_report(text_content)
         )
         return OcrResponseWrapper(data=OcrResponse(text=clean_response(response), originalText=original_text))
     finally: