Upload AWSHandler.py
Browse files- AWSHandler.py +48 -0
AWSHandler.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import boto3
|
| 2 |
+
import os
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import json
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def upload_files(origin_path, destination_path, aws_access_key, aws_secret_key):
    """Recursively upload every file under *origin_path* to the 'gideon-corpus' bucket.

    Each object key is ``destination_path + <origin folder name> + '/' + <relative path>``,
    mirroring the local directory layout.

    Args:
        origin_path: Local directory to walk; a trailing slash is tolerated.
        destination_path: Key prefix inside the bucket (caller supplies any trailing '/').
        aws_access_key: AWS access key id.
        aws_secret_key: AWS secret access key.
    """
    session = boto3.Session(aws_access_key_id=aws_access_key,
                            aws_secret_access_key=aws_secret_key)
    bucket = session.resource('s3').Bucket('gideon-corpus')

    # normpath strips a trailing separator so basename never comes back empty
    # (the original `origin_path.split('/')[-1]` yielded '' for 'dir/').
    root_name = os.path.basename(os.path.normpath(origin_path))

    for subdir, _dirs, files in os.walk(origin_path):
        for filename in files:
            full_path = os.path.join(subdir, filename)
            # relpath + separator normalization keeps keys '/'-delimited on every
            # OS; the original slice embedded os.sep (backslashes on Windows).
            rel_key = os.path.relpath(full_path, origin_path).replace(os.sep, '/')
            with open(full_path, 'rb') as data:
                bucket.put_object(Key=destination_path + root_name + '/' + rel_key,
                                  Body=data)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def retrieve_logs(aws_access_key, aws_secret_key):
    """Download 'logs/logs.csv' from the 'gideon-corpus' bucket as a DataFrame.

    Args:
        aws_access_key: AWS access key id.
        aws_secret_key: AWS secret access key.

    Returns:
        pandas.DataFrame parsed from the CSV object body.
    """
    client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
    )
    response = client.get_object(Bucket='gideon-corpus', Key='logs/logs.csv')
    return pd.read_csv(response['Body'])
|
| 24 |
+
|
| 25 |
+
def retrieve_casedocs(case_num, aws_access_key, aws_secret_key):  # Note: this is how stuff is stored on AWS
    """Fetch one case's documents from the 'gideon-corpus' bucket.

    Reads 'Cases/<case_num>/opinions.csv' and 'Cases/<case_num>/metadata.json'.

    Args:
        case_num: Case folder name under the 'Cases/' prefix.
        aws_access_key: AWS access key id.
        aws_secret_key: AWS secret access key.

    Returns:
        Tuple of (opinions DataFrame, metadata dict).
    """
    client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
    )

    opinions_obj = client.get_object(
        Bucket='gideon-corpus', Key='Cases/' + case_num + '/opinions.csv')
    opinions = pd.read_csv(opinions_obj['Body'])

    meta_obj = client.get_object(
        Bucket='gideon-corpus', Key='Cases/' + case_num + '/metadata.json')
    meta = json.loads(meta_obj['Body'].read().decode('utf-8'))

    return opinions, meta
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def retrieve_all_casedocs(prefix, aws_access_key, aws_secret_key):
    """Download the documents for every case folder found under *prefix*.

    Lists the immediate sub-folders of *prefix* in the 'gideon-corpus' bucket,
    extracts each case number, and fetches its opinions/metadata via
    retrieve_casedocs().

    Args:
        prefix: S3 key prefix to list (e.g. 'Cases/').
        aws_access_key: AWS access key id.
        aws_secret_key: AWS secret access key.

    Returns:
        List of (opinions DataFrame, metadata dict) tuples, one per case,
        in sorted prefix order.
    """
    # BUG FIX: boto3.client() requires the service name as its first argument;
    # the original call omitted 's3' and raised TypeError before any request.
    s3 = boto3.client('s3', aws_access_key_id=aws_access_key,
                      aws_secret_access_key=aws_secret_key)

    # Delimiter='/' makes S3 report each immediate sub-folder as a CommonPrefix.
    subdirectories = set()
    paginator = s3.get_paginator('list_objects_v2')
    for result in paginator.paginate(Bucket="gideon-corpus", Prefix=prefix, Delimiter='/'):
        if result.get('CommonPrefixes'):
            subdirectories.update(subdir.get('Prefix') for subdir in result.get('CommonPrefixes'))

    # Prefixes look like 'Cases/<case_num>/'; element [1] is the case number.
    # sorted() makes the result order deterministic (a set's order is not).
    subs = [s.split('/')[1] for s in sorted(subdirectories)]

    casedocs = []
    for case_num in subs:
        opinions_df, metadata = retrieve_casedocs(case_num, aws_access_key, aws_secret_key)
        casedocs.append((opinions_df, metadata))
    return casedocs
|