Spaces:
Running
on
Zero
Running
on
Zero
| from PyPDF2 import PdfReader | |
| from PIL import Image | |
| from io import BytesIO | |
| from docling.document_converter import DocumentConverter | |
| import spaces | |
| EXTENSIONS_IMG_FILES = ['jpeg', 'jpg', 'png'] | |
| EXTENSIONS_FILES = ['pdf'] | |
| EXTENSIONS_ALLOWED = EXTENSIONS_IMG_FILES + EXTENSIONS_FILES | |
| MSG_NENHUM_ARQUIVO_ENVIADO = 'Nenhum arquivo enviado.' | |
| MSG_TEXTO_NAO_EXTRAIDO = "Não foi possível extrair o texto." | |
| # Max dimensions for processing | |
| MAX_IMAGE_SIZE = 2000 # pixels | |
| def fix_type(file_upload): | |
| type_file = file_upload.split('/')[-1].split('.')[-1] | |
| if type_file in EXTENSIONS_IMG_FILES: | |
| return None, type_file | |
| # return read_file_img(file_upload), type_file | |
| elif type_file in EXTENSIONS_FILES: | |
| return read_file_pdf(file_upload), type_file | |
| def doc_converter(file_path): | |
| try: | |
| converter = DocumentConverter() | |
| result = converter.convert(file_path) | |
| return result.document.export_to_markdown() | |
| except Exception as ex: | |
| print(ex) | |
| return None | |
| # Resize image while maintaining aspect ratio | |
| def resize_image(image, max_size): | |
| width, height = image.size | |
| if width <= max_size and height <= max_size: | |
| return image | |
| if width > height: | |
| new_width = max_size | |
| new_height = int(height * (max_size / width)) | |
| else: | |
| new_height = max_size | |
| new_width = int(width * (max_size / height)) | |
| return image.resize((new_width, new_height), Image.LANCZOS) | |
| def process_image(image_bytes): | |
| try: | |
| image = Image.open(BytesIO(image_bytes)) | |
| # Resize large images to prevent memory issues | |
| # resized = resize_image(image, MAX_IMAGE_SIZE) | |
| return image | |
| except Exception as e: | |
| # st.error(f"Error processing image: {str(e)}") | |
| return None | |
| def read_file_img(file_img): | |
| image_bytes = file_img.getvalue() | |
| img_pil = process_image(image_bytes) | |
| return img_pil | |
| def read_file_pdf(file_pdf): | |
| # image_bytes = file_pdf.getvalue() | |
| reader = PdfReader(file_pdf) | |
| return reader | |
| def extract_content_in_pdf(reader): | |
| raw_text = "" | |
| for page in reader.pages: | |
| text = page.extract_text() | |
| if text: | |
| raw_text += text + "\\n" | |
| return raw_text | |
| if __name__ == '__main__': | |
| pass |