Spaces:
Paused
Paused
File size: 3,215 Bytes
eb5ec73 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import requests
import os
import logging
from check_structure import check_existing_file, check_existing_folder
def _download_file(url, output_file, error_prefix, timeout):
    """Fetch ``url`` and write the response body to ``output_file``.

    The file is only written when the server answers with HTTP 200. Any other
    status is reported on stdout as ``"<error_prefix> <url>: <status>"`` and
    nothing is written.

    Args:
        url: Remote address to fetch.
        output_file: Local path the response body is written to.
        error_prefix: Text that starts the message printed on a non-200 status.
        timeout: Timeout in seconds forwarded to ``requests.get``.
    """
    print(f"downloading {url} as {os.path.basename(output_file)}")
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"{error_prefix} {url}:", response.status_code)
        return
    # Use response.content (bytes), not response.text: files may be binary.
    with open(output_file, "wb") as file:
        file.write(response.content)


def import_raw_data(raw_data_relative_path, filenames, bucket_folder_url, timeout=30):
    """Download ``filenames`` and the ``image_train`` folder from a bucket URL.

    Each name in ``filenames`` is fetched from ``bucket_folder_url`` and saved
    under ``raw_data_relative_path``. Then ``<bucket_folder_url>/image_train/``
    is requested; every non-blank line of that response is treated as the URL
    of one image, which is saved under ``<raw_data_relative_path>/image_train/``.

    Args:
        raw_data_relative_path: Local directory that receives the downloads.
        filenames: Iterable of file names located directly under
            ``bucket_folder_url``.
        bucket_folder_url: Base URL of the remote folder (with or without a
            trailing slash).
        timeout: Timeout in seconds passed to every ``requests.get`` call so a
            stalled connection cannot block the script forever.

    Raises:
        requests.RequestException: If a request for one of ``filenames`` fails
            at the network level. Failures during the ``image_train`` phase are
            caught and printed instead.

    NOTE(review): ``check_existing_folder`` / ``check_existing_file`` come from
    ``check_structure`` and are not visible here. This function treats a truthy
    result as "go ahead and create the folder / write the file" - confirm
    against that module.
    """
    if check_existing_folder(raw_data_relative_path):
        # exist_ok avoids a crash if the directory is already there.
        os.makedirs(raw_data_relative_path, exist_ok=True)

    # URLs always use "/" as separator; os.path.join would insert "\" on
    # Windows, so the base URL is normalised and concatenated by hand.
    if bucket_folder_url.endswith("/"):
        base_url = bucket_folder_url
    else:
        base_url = bucket_folder_url + "/"

    # Download all the individual files.
    for filename in filenames:
        input_file = base_url + filename
        output_file = os.path.join(raw_data_relative_path, filename)
        if check_existing_file(output_file):
            _download_file(
                input_file, output_file, "Error accessing the object", timeout
            )

    # Download the 'image_train' folder.
    img_train_folder_url = base_url + "image_train/"
    img_train_local_path = os.path.join(raw_data_relative_path, "image_train/")
    if check_existing_folder(img_train_local_path):
        os.makedirs(img_train_local_path, exist_ok=True)

    try:
        response = requests.get(img_train_folder_url, timeout=timeout)
        if response.status_code == 200:
            for line in response.text.splitlines():
                img_url = line.strip()
                if not img_url:
                    # A blank line would yield an empty file name, i.e. an
                    # attempt to open the target directory itself for writing.
                    continue
                img_filename = os.path.basename(img_url)
                output_file = os.path.join(img_train_local_path, img_filename)
                if check_existing_file(output_file):
                    _download_file(
                        img_url, output_file, "Error downloading", timeout
                    )
        else:
            print(
                f"Error accessing the object list {img_train_folder_url}:",
                response.status_code,
            )
    except Exception as e:
        # Deliberately best-effort: a failure while fetching images is
        # reported but must not abort the caller.
        print(f"An error occurred: {str(e)}")
def main(
    raw_data_relative_path="./data/raw",
    filenames=("X_test_update.csv", "X_train_update.csv", "Y_train_CVw08PX.csv"),
    bucket_folder_url="https://mlops-project-db.s3.eu-west-1.amazonaws.com/classification_e-commerce/",
):
    """Download the raw data set from the AWS S3 bucket into a local folder.

    Args:
        raw_data_relative_path: Local directory that receives the files
            (``./data/raw`` by default).
        filenames: Iterable of file names to fetch from ``bucket_folder_url``.
            The default is an immutable tuple so it cannot be altered between
            calls.
        bucket_folder_url: Base URL of the remote folder holding the data.
    """
    # Announce the step before the (potentially long) download starts, so the
    # log shows what is in progress instead of reporting it after the fact.
    logger = logging.getLogger(__name__)
    logger.info("making raw data set")
    import_raw_data(raw_data_relative_path, filenames, bucket_folder_url)
if __name__ == "__main__":
    # Script entry point: configure root logging at INFO level with a
    # timestamp / logger name / level / message layout, then run the download.
    logging.basicConfig(
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        level=logging.INFO,
    )
    main()
|