Spaces:
Sleeping
Sleeping
Istvan-Adem committed on
Commit ·
df4c46f
1
Parent(s): e966906
add pytesseract
Browse files
- ocr/api/message/prompts.py +2 -2
- ocr/api/message/utils.py +12 -0
- ocr/api/message/views.py +4 -4
ocr/api/message/prompts.py
CHANGED
|
@@ -27,8 +27,8 @@ The report must be structured as follows, with each section containing only rele
|
|
| 27 |
[/INST]"""
|
| 28 |
extract_original_text = """## Task
|
| 29 |
|
| 30 |
-
You must
|
| 31 |
|
| 32 |
## Important notes
|
| 33 |
|
| 34 |
-
- You must
|
|
|
|
| 27 |
[/INST]"""
|
| 28 |
extract_original_text = """## Task
|
| 29 |
|
| 30 |
+
You must return ALL provided text, but not include the patient's name, contact details, or demographic data.
|
| 31 |
|
| 32 |
## Important notes
|
| 33 |
|
| 34 |
+
- You must return all text but exclude any information related to the name, contact details, and demographic data."""
|
ocr/api/message/utils.py
CHANGED
|
@@ -2,6 +2,8 @@ import base64
|
|
| 2 |
import io
|
| 3 |
import re
|
| 4 |
|
|
|
|
|
|
|
| 5 |
from pdf2image import convert_from_bytes
|
| 6 |
|
| 7 |
|
|
@@ -16,6 +18,16 @@ def divide_images(contents: bytes) -> list[bytes]:
|
|
| 16 |
return image_bytes_list
|
| 17 |
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
def prepare_request_content(images: list[bytes]) -> list:
|
| 20 |
content = [
|
| 21 |
{"type": "text", "text": "Generate a report on the attached document"},
|
|
|
|
| 2 |
import io
|
| 3 |
import re
|
| 4 |
|
| 5 |
+
import pytesseract
|
| 6 |
+
from PIL import Image
|
| 7 |
from pdf2image import convert_from_bytes
|
| 8 |
|
| 9 |
|
|
|
|
| 18 |
return image_bytes_list
|
| 19 |
|
| 20 |
|
| 21 |
+
def extract_text_from_images(images: list[bytes]) -> str:
|
| 22 |
+
extracted_texts = []
|
| 23 |
+
|
| 24 |
+
for image_bytes in images:
|
| 25 |
+
image = Image.open(io.BytesIO(image_bytes))
|
| 26 |
+
text = pytesseract.image_to_string(image)
|
| 27 |
+
extracted_texts.append(text)
|
| 28 |
+
|
| 29 |
+
return '\n'.join(extracted_texts)
|
| 30 |
+
|
| 31 |
def prepare_request_content(images: list[bytes]) -> list:
|
| 32 |
content = [
|
| 33 |
{"type": "text", "text": "Generate a report on the attached document"},
|
ocr/api/message/views.py
CHANGED
|
@@ -5,7 +5,7 @@ from fastapi import File, UploadFile, HTTPException
|
|
| 5 |
from ocr.api.message import ocr_router
|
| 6 |
from ocr.api.message.openai_request import generate_report, extract_original_text
|
| 7 |
from ocr.api.message.schemas import OcrResponse
|
| 8 |
-
from ocr.api.message.utils import divide_images, clean_response,
|
| 9 |
from ocr.core.wrappers import OcrResponseWrapper
|
| 10 |
|
| 11 |
|
|
@@ -21,10 +21,10 @@ async def get_all_chat_messages(
|
|
| 21 |
images = [contents]
|
| 22 |
else:
|
| 23 |
raise HTTPException(status_code=400, detail='Unsupported file type.')
|
| 24 |
-
|
| 25 |
original_text, response = await asyncio.gather(
|
| 26 |
-
extract_original_text(
|
| 27 |
-
generate_report(
|
| 28 |
)
|
| 29 |
return OcrResponseWrapper(data=OcrResponse(text=clean_response(response), originalText=original_text))
|
| 30 |
finally:
|
|
|
|
| 5 |
from ocr.api.message import ocr_router
|
| 6 |
from ocr.api.message.openai_request import generate_report, extract_original_text
|
| 7 |
from ocr.api.message.schemas import OcrResponse
|
| 8 |
+
from ocr.api.message.utils import divide_images, clean_response, extract_text_from_images
|
| 9 |
from ocr.core.wrappers import OcrResponseWrapper
|
| 10 |
|
| 11 |
|
|
|
|
| 21 |
images = [contents]
|
| 22 |
else:
|
| 23 |
raise HTTPException(status_code=400, detail='Unsupported file type.')
|
| 24 |
+
text_content = extract_text_from_images(images)
|
| 25 |
original_text, response = await asyncio.gather(
|
| 26 |
+
extract_original_text(text_content),
|
| 27 |
+
generate_report(text_content)
|
| 28 |
)
|
| 29 |
return OcrResponseWrapper(data=OcrResponse(text=clean_response(response), originalText=original_text))
|
| 30 |
finally:
|