Spaces:

FelipeErmeson
/

projeto-rag

Running on Zero

File size: 2,293 Bytes

d0c774c
 
 
fff96e7
d4a1668
d0c774c
 
 
 
 
b9e4562
 
 
d0c774c
 
 
 
c71745c
 
fff96e7
 
c71745c
 
d0c774c
d4a1668
fff96e7
ece1514
 
 
 
 
 
 
fff96e7
d0c774c

from PyPDF2 import PdfReader
from PIL import Image
from io import BytesIO
from docling.document_converter import DocumentConverter
import spaces

EXTENSIONS_IMG_FILES = ['jpeg', 'jpg', 'png']
EXTENSIONS_FILES = ['pdf']
EXTENSIONS_ALLOWED = EXTENSIONS_IMG_FILES + EXTENSIONS_FILES

MSG_NENHUM_ARQUIVO_ENVIADO = 'Nenhum arquivo enviado.'
MSG_TEXTO_NAO_EXTRAIDO = "Não foi possível extrair o texto."

# Max dimensions for processing
MAX_IMAGE_SIZE = 2000  # pixels

def fix_type(file_upload):
    type_file = file_upload.split('/')[-1].split('.')[-1]
    if type_file in EXTENSIONS_IMG_FILES:
        return None, type_file
        # return read_file_img(file_upload), type_file
    elif type_file in EXTENSIONS_FILES:
        return read_file_pdf(file_upload), type_file

@spaces.GPU
def doc_converter(file_path):
    try:
        converter = DocumentConverter()
        result = converter.convert(file_path)
        return result.document.export_to_markdown()
    except Exception as ex:
        print(ex)
        return None

# Resize image while maintaining aspect ratio
def resize_image(image, max_size):
    width, height = image.size
    if width <= max_size and height <= max_size:
        return image
    
    if width > height:
        new_width = max_size
        new_height = int(height * (max_size / width))
    else:
        new_height = max_size
        new_width = int(width * (max_size / height))
    
    return image.resize((new_width, new_height), Image.LANCZOS)

def process_image(image_bytes):
    try:
        image = Image.open(BytesIO(image_bytes))
        # Resize large images to prevent memory issues
        # resized = resize_image(image, MAX_IMAGE_SIZE)
        return image
    except Exception as e:
        # st.error(f"Error processing image: {str(e)}")
        return None

def read_file_img(file_img):
    image_bytes = file_img.getvalue()
    img_pil = process_image(image_bytes)
    return img_pil

def read_file_pdf(file_pdf):
    # image_bytes = file_pdf.getvalue()
    reader = PdfReader(file_pdf)
    return reader

def extract_content_in_pdf(reader):
    raw_text = ""
    for page in reader.pages:
        text = page.extract_text()
        if text:
            raw_text += text + "\\n"
    
    return raw_text

if __name__ == '__main__':
    pass