Spaces:
Build error
Build error
| import os | |
| from pdf_extract_kit.utils.data_preprocess import load_pdf | |
class BaseTask:
    """Base class for tasks: stores a model and resolves image / PDF inputs.

    The two loaders share one input-resolution routine, so they agree on
    directory handling, ordering and error messages.
    """

    # File-name suffixes (compared case-insensitively) accepted by each loader.
    _IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')
    _PDF_EXTENSIONS = ('.pdf',)

    def __init__(self, model):
        self.model = model

    @staticmethod
    def _collect_files(input_data, extensions):
        """
        Resolves an input path into a sorted list of matching file paths.

        Args:
            input_data (str | os.PathLike): Path to a single file or to a flat directory.
            extensions (tuple): Lower-case suffixes a file name must end with.

        Returns:
            list: Sorted file paths (str). A directory without matching files gives an empty list.

        Raises:
            ValueError: If the directory contains sub-directories, or if a non-directory
                path does not end with one of the given extensions.
        """
        # Normalise pathlib.Path and other PathLike objects so the string checks below work.
        path = os.fspath(input_data)

        if os.path.isdir(path):
            # Only the first entry produced by os.walk (the top-level directory) is inspected.
            top_level = next(os.walk(path), None)
            if top_level is None:
                # os.walk ignores scandir errors by default and then yields nothing.
                return []
            root, dirs, files = top_level
            if dirs:
                raise ValueError("Input directory should not contain nested directories: {}".format(input_data))
            return sorted(
                os.path.join(root, name)
                for name in files
                if name.lower().endswith(extensions)
            )

        # Not a directory: accept the path purely on its suffix (existence is not checked).
        if path.lower().endswith(extensions):
            return [path]
        raise ValueError("Unsupported input data format: {}".format(input_data))

    def load_images(self, input_data):
        """
        Loads images from a single image path or a directory containing multiple images.

        Args:
            input_data (str | os.PathLike): Path to a single image file or a directory containing image files.

        Returns:
            list: Sorted list of paths to all images to be predicted.

        Raises:
            ValueError: If the directory contains nested directories or the file
                is not a .png / .jpg / .jpeg.
        """
        return self._collect_files(input_data, self._IMAGE_EXTENSIONS)

    def load_pdf_images(self, input_data):
        """
        Loads page images from a single PDF file or a directory containing multiple PDF files.

        Args:
            input_data (str | os.PathLike): Path to a single PDF file or a directory containing PDF files.

        Returns:
            dict: Maps image IDs of the form "<pdf file name without extension>_page_<NNNN>"
                (1-based, zero-padded page number) to the page objects returned by
                `load_pdf` (presumably PIL.Image instances; not verifiable from this file).
                PDFs in a directory are processed in sorted file-name order.

        Raises:
            ValueError: If the directory contains nested directories or the file is not a .pdf.

        Note: Loading multiple PDFs at once is not recommended due to high memory consumption.
        Consider processing one PDF at a time externally using loops or multithreading.
        """
        pdf_images = {}
        for pdf_path in self._collect_files(input_data, self._PDF_EXTENSIONS):
            stem = os.path.splitext(os.path.basename(pdf_path))[0]
            for page_number, img in enumerate(load_pdf(pdf_path), start=1):
                pdf_images[f"{stem}_page_{page_number:04d}"] = img
        return pdf_images