File size: 1,770 Bytes
b28ce62 ed5ec6a 7ae78bd ed5ec6a 7ae78bd ed5ec6a b28ce62 3f47633 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
import base64
import io
import re
import pytesseract
from PIL import Image
from pdf2image import convert_from_bytes
def transform_messages_to_openai(messages: list, user_query: str) -> list[dict]:
    """Build an OpenAI-style chat message list from prior messages and a new query.

    Each element of ``messages`` is read through its ``text`` attribute and its
    ``author.value`` attribute and becomes one ``{"role", "content"}`` dict, in
    input order. ``user_query`` is appended last as a ``"user"`` message.
    """

    def to_openai(entry) -> dict:
        # Read the text first, then the author, mirroring the attribute
        # access order callers have always seen.
        body = entry.text
        return {"role": entry.author.value, "content": body}

    return [*map(to_openai, messages), {"role": "user", "content": user_query}]
def divide_images(contents: bytes, dpi: int = 250) -> list[bytes]:
    """Render a document into a list of PNG-encoded images.

    Args:
        contents: Raw document bytes handed to ``pdf2image.convert_from_bytes``
            (i.e. the bytes of a PDF file).
        dpi: Rendering resolution forwarded to ``convert_from_bytes``.
            Defaults to 250, the value that used to be hard-coded here.

    Returns:
        One PNG-encoded ``bytes`` object per image returned by
        ``convert_from_bytes``, in the same order.
    """
    image_bytes_list = []
    for page in convert_from_bytes(contents, dpi=dpi):
        with io.BytesIO() as buffer:
            page.save(buffer, format='PNG')
            # getvalue() returns the full buffer regardless of the current
            # stream position, so no seek(0)/read() round-trip is needed.
            image_bytes_list.append(buffer.getvalue())
    return image_bytes_list
def extract_text_from_images(images: list[bytes]) -> str:
    """Run OCR over encoded images and return the combined text.

    Args:
        images: Encoded image files as ``bytes``, each in a format that
            ``PIL.Image.open`` can read.

    Returns:
        The strings produced by ``pytesseract.image_to_string`` for each
        image, joined with newlines in input order. An empty input list
        yields an empty string.
    """
    extracted_texts = []
    for image_bytes in images:
        # The context manager closes the image as soon as OCR is done, so
        # decoded pixel data is not kept alive across the whole loop.
        with Image.open(io.BytesIO(image_bytes)) as image:
            extracted_texts.append(pytesseract.image_to_string(image))
    return '\n'.join(extracted_texts)
def clean_response(text: str) -> str:
    """Return the body of the first ```markdown fenced block found in ``text``.

    Whitespace directly inside the fence markers is stripped. If ``text``
    contains no such fenced block it is returned unchanged.

    Args:
        text: Response text that may wrap its payload in a ```markdown fence.

    Returns:
        The fenced content when a fence is present, otherwise ``text`` as
        given. Non-string input is passed through untouched, which keeps the
        function best-effort for callers that hand it a missing response.
    """
    if not isinstance(text, str):
        # Nothing to search; preserve the historical pass-through behaviour
        # instead of raising from re.search.
        return text

    match = re.search(r'```markdown\s*(.*?)\s*```', text, re.DOTALL)
    # re.search returns None when there is no fence; test that explicitly
    # rather than catching the resulting AttributeError with a blanket except.
    if match is None:
        return text
    return match.group(1)
def _sniff_image_mime(image: bytes) -> str:
    """Guess an image MIME type from its leading magic bytes (JPEG if unknown)."""
    if image.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if image.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if image[:4] == b"RIFF" and image[8:12] == b"WEBP":
        return "image/webp"
    # JPEG data, and anything unrecognised, keeps the label this function
    # has always used.
    return "image/jpeg"


def prepare_request_content(images: list[bytes]) -> list[dict]:
    """Build the multi-part request content for a document-report request.

    The result starts with a fixed text instruction, followed by one
    ``image_url`` part per input image. Each image is embedded as a base64
    ``data:`` URL.

    The MIME type in each URL is detected from the image bytes rather than
    hard-coded: ``divide_images`` in this file emits PNG data, which was
    previously mislabelled as ``image/jpeg``.

    Args:
        images: Encoded image files as ``bytes``.

    Returns:
        A list of content-part dicts: the text part first, then the image
        parts in input order. An empty ``images`` list yields only the text
        part.
    """
    content = [
        {"type": "text", "text": "Generate a report on the attached document"},
    ]
    for image in images:
        encoded = base64.b64encode(image).decode('utf-8')
        content.append(
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{_sniff_image_mime(image)};base64,{encoded}",
                },
            }
        )
    return content