File size: 2,293 Bytes
d0c774c
 
 
fff96e7
d4a1668
d0c774c
 
 
 
 
b9e4562
 
 
d0c774c
 
 
 
c71745c
 
fff96e7
 
c71745c
 
d0c774c
d4a1668
fff96e7
ece1514
 
 
 
 
 
 
fff96e7
d0c774c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from PyPDF2 import PdfReader
from PIL import Image
from io import BytesIO
from docling.document_converter import DocumentConverter
import spaces

EXTENSIONS_IMG_FILES = ['jpeg', 'jpg', 'png']
EXTENSIONS_FILES = ['pdf']
EXTENSIONS_ALLOWED = EXTENSIONS_IMG_FILES + EXTENSIONS_FILES

MSG_NENHUM_ARQUIVO_ENVIADO = 'Nenhum arquivo enviado.'
MSG_TEXTO_NAO_EXTRAIDO = "Não foi possível extrair o texto."

# Max dimensions for processing
MAX_IMAGE_SIZE = 2000  # pixels

def fix_type(file_upload):
    type_file = file_upload.split('/')[-1].split('.')[-1]
    if type_file in EXTENSIONS_IMG_FILES:
        return None, type_file
        # return read_file_img(file_upload), type_file
    elif type_file in EXTENSIONS_FILES:
        return read_file_pdf(file_upload), type_file

@spaces.GPU
def doc_converter(file_path):
    try:
        converter = DocumentConverter()
        result = converter.convert(file_path)
        return result.document.export_to_markdown()
    except Exception as ex:
        print(ex)
        return None

# Resize image while maintaining aspect ratio
def resize_image(image, max_size):
    width, height = image.size
    if width <= max_size and height <= max_size:
        return image
    
    if width > height:
        new_width = max_size
        new_height = int(height * (max_size / width))
    else:
        new_height = max_size
        new_width = int(width * (max_size / height))
    
    return image.resize((new_width, new_height), Image.LANCZOS)

def process_image(image_bytes):
    try:
        image = Image.open(BytesIO(image_bytes))
        # Resize large images to prevent memory issues
        # resized = resize_image(image, MAX_IMAGE_SIZE)
        return image
    except Exception as e:
        # st.error(f"Error processing image: {str(e)}")
        return None

def read_file_img(file_img):
    image_bytes = file_img.getvalue()
    img_pil = process_image(image_bytes)
    return img_pil

def read_file_pdf(file_pdf):
    # image_bytes = file_pdf.getvalue()
    reader = PdfReader(file_pdf)
    return reader

def extract_content_in_pdf(reader):
    raw_text = ""
    for page in reader.pages:
        text = page.extract_text()
        if text:
            raw_text += text + "\\n"
    
    return raw_text

if __name__ == '__main__':
    pass