import json
import os

import boto3
import pandas as pd

# Name of the S3 bucket every helper in this module reads from / writes to.
BUCKET_NAME = 'gideon-corpus'


def _s3_client(aws_access_key, aws_secret_key):
    """Return a boto3 S3 client authenticated with the given key pair."""
    return boto3.client(
        's3',
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
    )


def _load_casedocs(s3, case_num):
    """Fetch one case's opinions table and metadata using an existing client."""
    case_prefix = 'Cases/' + case_num

    opinions_response = s3.get_object(
        Bucket=BUCKET_NAME, Key=case_prefix + '/opinions.csv')
    opinions_df = pd.read_csv(opinions_response['Body'])

    metadata_response = s3.get_object(
        Bucket=BUCKET_NAME, Key=case_prefix + '/metadata.json')
    metadata = json.loads(metadata_response['Body'].read().decode('utf-8'))

    return opinions_df, metadata


def upload_files(origin_path, destination_path, aws_access_key, aws_secret_key):
    """Upload every file under ``origin_path`` to the bucket.

    Each file is stored under the key
    ``destination_path + <last component of origin_path> + '/' + <path of the
    file relative to origin_path>``.

    Args:
        origin_path: Local directory to walk recursively.
        destination_path: Key prefix inside the bucket. It is concatenated
            verbatim, so include a trailing ``'/'`` if one is wanted.
        aws_access_key: AWS access key id.
        aws_secret_key: AWS secret access key.
    """
    session = boto3.Session(
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
    )
    bucket = session.resource('s3').Bucket(BUCKET_NAME)

    # Normalise first so a trailing separator (``'data/case1/'``) neither
    # blanks out the folder name nor shifts the relative-path computation.
    root = os.path.normpath(origin_path)
    folder_name = os.path.basename(root)

    for subdir, _dirs, files in os.walk(origin_path):
        for filename in files:
            full_path = os.path.join(subdir, filename)
            # S3 keys always use '/', whatever the local separator is.
            relative_key = os.path.relpath(full_path, root).replace(os.sep, '/')
            key = destination_path + folder_name + '/' + relative_key
            with open(full_path, 'rb') as data:
                bucket.put_object(Key=key, Body=data)


def retrieve_logs(aws_access_key, aws_secret_key):
    """Download ``logs/logs.csv`` from the bucket and return it as a DataFrame."""
    s3 = _s3_client(aws_access_key, aws_secret_key)
    logs_response = s3.get_object(Bucket=BUCKET_NAME, Key='logs/logs.csv')
    return pd.read_csv(logs_response['Body'])


def retrieve_casedocs(case_num, aws_access_key, aws_secret_key):
    """Return ``(opinions_df, metadata)`` for a single case.

    Note: this mirrors how the data is stored on AWS, i.e.
    ``Cases/<case_num>/opinions.csv`` and ``Cases/<case_num>/metadata.json``.

    Args:
        case_num: Case identifier (string) used as the folder name.
        aws_access_key: AWS access key id.
        aws_secret_key: AWS secret access key.

    Returns:
        A tuple of the opinions DataFrame and the decoded metadata JSON.
    """
    s3 = _s3_client(aws_access_key, aws_secret_key)
    return _load_casedocs(s3, case_num)


def retrieve_all_casedocs(prefix, aws_access_key, aws_secret_key):
    """Return ``(opinions_df, metadata)`` for every case folder under ``prefix``.

    The case identifier is taken as the second ``'/'``-separated component of
    each common prefix returned by S3, and documents are always read from
    ``Cases/<id>/``, so ``prefix`` is expected to look like ``'Cases/'``.

    Args:
        prefix: Key prefix whose immediate "sub-directories" are listed.
        aws_access_key: AWS access key id.
        aws_secret_key: AWS secret access key.

    Returns:
        A list of ``(opinions_df, metadata)`` tuples, ordered by the listed
        sub-directory prefix so that results are stable between runs.
    """
    s3 = _s3_client(aws_access_key, aws_secret_key)

    subdirectories = set()
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=BUCKET_NAME, Prefix=prefix, Delimiter='/')
    for page in pages:
        # Pages without any sub-directory simply omit 'CommonPrefixes'.
        for common_prefix in page.get('CommonPrefixes') or ():
            subdirectories.add(common_prefix.get('Prefix'))

    # Sort: set iteration order is arbitrary and would otherwise make the
    # order of the returned list differ from one run to the next.
    case_nums = [subdir.split('/')[1] for subdir in sorted(subdirectories)]

    # Reuse the one client instead of building a new one for every case.
    return [_load_casedocs(s3, case_num) for case_num in case_nums]