projeto-rag / utils.py
Felipe Silva
Ajustando github
b9e4562
from PyPDF2 import PdfReader
from PIL import Image
from io import BytesIO
from docling.document_converter import DocumentConverter
import spaces
# File extensions handled by this module (lowercase, without the leading dot).
EXTENSIONS_IMG_FILES = ["jpeg", "jpg", "png"]
EXTENSIONS_FILES = ["pdf"]
EXTENSIONS_ALLOWED = [*EXTENSIONS_IMG_FILES, *EXTENSIONS_FILES]

# Message strings (Portuguese): "No file uploaded." and
# "Could not extract the text."
MSG_NENHUM_ARQUIVO_ENVIADO = "Nenhum arquivo enviado."
MSG_TEXTO_NAO_EXTRAIDO = "Não foi possível extrair o texto."

# Maximum image dimension for processing, in pixels.
MAX_IMAGE_SIZE = 2_000
def fix_type(file_upload):
    """Identify an uploaded file by its extension and load it when it is a PDF.

    The extension is the text after the last ``.`` of the final
    ``/``-separated path component. It is compared case-insensitively so
    that names such as ``REPORT.PDF`` or ``photo.JPG`` are recognised.

    Args:
        file_upload: Path of the uploaded file, as a string.

    Returns:
        ``(None, extension)`` for image extensions (images are only
        identified here, not opened), ``(reader, extension)`` for PDFs where
        ``reader`` is the result of ``read_file_pdf``, or ``None`` when the
        extension is not supported. ``extension`` is always lowercase.
    """
    type_file = file_upload.split('/')[-1].split('.')[-1].lower()
    if type_file in EXTENSIONS_IMG_FILES:
        # Images are not loaded at this stage; only their type is reported.
        return None, type_file
    if type_file in EXTENSIONS_FILES:
        return read_file_pdf(file_upload), type_file
    # Unsupported extension: same result as before, now stated explicitly.
    return None
@spaces.GPU
def doc_converter(file_path):
    """Convert the document at ``file_path`` to Markdown with docling.

    Args:
        file_path: Source document, passed unchanged to
            ``DocumentConverter.convert``.

    Returns:
        The Markdown export of the converted document, or ``None`` when any
        exception is raised during conversion or export (the exception is
        printed to stdout).
    """
    # NOTE(review): ``spaces.GPU`` is presumably the Hugging Face Spaces GPU
    # decorator; confirm against the deployment setup.
    try:
        conversion = DocumentConverter().convert(file_path)
        markdown = conversion.document.export_to_markdown()
    except Exception as error:
        print(error)
        return None
    return markdown
# Resize image while maintaining aspect ratio
def resize_image(image, max_size):
    """Return ``image`` scaled down so that neither side exceeds ``max_size``.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and a
            ``resize((width, height), resample)`` method (a PIL image).
        max_size: Largest allowed value, in pixels, for width and height.

    Returns:
        The very same ``image`` object when it already fits; otherwise the
        result of ``image.resize`` with the longer side set to ``max_size``
        and the shorter side scaled proportionally (truncated to an integer,
        never below 1 pixel).
    """
    width, height = image.size
    if width <= max_size and height <= max_size:
        return image
    if width > height:
        new_width = max_size
        # Clamp to 1: for very elongated images the truncated value would be
        # 0, which is not a usable target size.
        new_height = max(1, int(height * (max_size / width)))
    else:
        new_height = max_size
        new_width = max(1, int(width * (max_size / height)))
    return image.resize((new_width, new_height), Image.LANCZOS)
def process_image(image_bytes):
    """Open raw image bytes as a PIL image.

    Args:
        image_bytes: Encoded image data, wrapped in a ``BytesIO`` before
            being handed to ``Image.open``.

    Returns:
        The object returned by ``Image.open``, or ``None`` when opening
        fails (the exception is printed to stdout).
    """
    try:
        # NOTE: downscaling through resize_image / MAX_IMAGE_SIZE is
        # currently disabled, so the image keeps its original dimensions.
        return Image.open(BytesIO(image_bytes))
    except Exception as ex:
        # Best-effort: report the failure the same way doc_converter does
        # instead of hiding it, then signal it to the caller with None.
        print(ex)
        return None
def read_file_img(file_img):
    """Load an uploaded image held in an in-memory buffer.

    Args:
        file_img: Object whose ``getvalue()`` returns the image bytes.

    Returns:
        Whatever ``process_image`` returns for those bytes.
    """
    return process_image(file_img.getvalue())
def read_file_pdf(file_pdf):
    """Create a PyPDF2 ``PdfReader`` for the given PDF.

    Args:
        file_pdf: Passed unchanged to ``PdfReader`` (``fix_type`` supplies a
            path string).

    Returns:
        The ``PdfReader`` instance.
    """
    return PdfReader(file_pdf)
def extract_content_in_pdf(reader):
    """Concatenate the extractable text of every page of a PDF reader.

    Args:
        reader: Object exposing ``pages``, an iterable of page objects whose
            ``extract_text()`` returns the page text (for example the value
            returned by ``read_file_pdf``).

    Returns:
        The text of each page that yields text, each followed by a newline
        character; an empty string when no page yields any text.
    """
    page_texts = (page.extract_text() for page in reader.pages)
    # Pages without extractable text (None or "") are skipped. A real
    # newline separates pages; the previous "\\n" literal appended a
    # backslash followed by the letter n instead.
    return "".join(text + "\n" for text in page_texts if text)
# Running this module directly is a no-op.
if __name__ == "__main__":
    pass