"""Download the raw e-commerce classification dataset from S3 into a local folder."""
import logging
import os

import requests

from check_structure import check_existing_file, check_existing_folder


def import_raw_data(raw_data_relative_path, filenames, bucket_folder_url):
    """Import *filenames* from *bucket_folder_url* into *raw_data_relative_path*.

    Also mirrors the bucket's ``image_train/`` folder, whose listing URL is
    expected to return one object URL per line. Downloads are skipped when the
    local file already exists (gated by ``check_existing_file``); HTTP errors
    are reported on stdout rather than raised.
    """
    # check_existing_folder is truthy when the folder still needs creating.
    if check_existing_folder(raw_data_relative_path):
        os.makedirs(raw_data_relative_path)

    # Download the flat data files (CSVs) one by one.
    for filename in filenames:
        input_file = os.path.join(bucket_folder_url, filename)
        output_file = os.path.join(raw_data_relative_path, filename)
        if check_existing_file(output_file):
            print(f"downloading {input_file} as {os.path.basename(output_file)}")
            response = requests.get(input_file)
            if response.status_code == 200:
                # response.content holds raw bytes, safe for binary files.
                with open(output_file, "wb") as file:
                    file.write(response.content)
            else:
                print(f"Error accessing the object {input_file}:", response.status_code)

    # Download the 'image_train' folder.
    img_train_folder_url = os.path.join(bucket_folder_url, "image_train/")
    img_train_local_path = os.path.join(raw_data_relative_path, "image_train/")
    if check_existing_folder(img_train_local_path):
        os.makedirs(img_train_local_path)
    try:
        response = requests.get(img_train_folder_url)
        if response.status_code == 200:
            # One object URL per line in the listing response.
            for img_url in response.text.splitlines():
                img_filename = os.path.basename(img_url)
                output_file = os.path.join(img_train_local_path, img_filename)
                if check_existing_file(output_file):
                    print(f"downloading {img_url} as {img_filename}")
                    img_response = requests.get(img_url)
                    if img_response.status_code == 200:
                        with open(output_file, "wb") as img_file:
                            img_file.write(img_response.content)
                    else:
                        print(f"Error downloading {img_url}:", img_response.status_code)
        else:
            print(
                f"Error accessing the object list {img_train_folder_url}:",
                response.status_code,
            )
    except Exception as e:
        # Top-level boundary for the image mirror: report and keep going so a
        # network hiccup here does not abort the whole import.
        print(f"An error occurred: {str(e)}")


def main(
    raw_data_relative_path="./data/raw",
    filenames=None,
    bucket_folder_url="https://mlops-project-db.s3.eu-west-1.amazonaws.com/classification_e-commerce/",
):
    """Upload data from AWS s3 in ./data/raw"""
    if filenames is None:
        # Default resolved here to avoid a mutable default argument.
        filenames = ["X_test_update.csv", "X_train_update.csv", "Y_train_CVw08PX.csv"]
    import_raw_data(raw_data_relative_path, filenames, bucket_folder_url)
    logger = logging.getLogger(__name__)
    logger.info("making raw data set")


if __name__ == "__main__":
    log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_fmt)
    main()