cools committed on
Commit
0a55f72
·
1 Parent(s): b0c9dfb

Upload AWSHandler.py

Browse files
Files changed (1) hide show
  1. AWSHandler.py +48 -0
AWSHandler.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import boto3
2
+ import os
3
+ import pandas as pd
4
+ import json
5
+
6
+
7
def upload_files(origin_path, destination_path, aws_access_key, aws_secret_key,
                 bucket_name='gideon-corpus'):
    """Recursively upload every file under ``origin_path`` to an S3 bucket.

    Each file is stored under the key::

        destination_path + <last component of origin_path> + '/' + <path relative to origin_path>

    Args:
        origin_path: Local directory tree to walk. The relative-key slicing
            assumes no trailing slash — TODO confirm callers never pass one.
        destination_path: Key prefix inside the bucket.
        aws_access_key: AWS access key id.
        aws_secret_key: AWS secret access key.
        bucket_name: Target S3 bucket; defaults to the original hard-coded
            'gideon-corpus' so existing callers are unaffected.
    """
    session = boto3.Session(aws_access_key_id=aws_access_key,
                            aws_secret_access_key=aws_secret_key)
    bucket = session.resource('s3').Bucket(bucket_name)

    # Top-level folder name is preserved as the first key component.
    root_name = origin_path.split('/')[-1]

    for subdir, _dirs, files in os.walk(origin_path):
        for filename in files:
            full_path = os.path.join(subdir, filename)
            # Mirror the local layout relative to origin_path in the key.
            key = destination_path + root_name + '/' + full_path[len(origin_path) + 1:]
            with open(full_path, 'rb') as data:
                bucket.put_object(Key=key, Body=data)
17
+
18
+
19
def retrieve_logs(aws_access_key, aws_secret_key):
    """Download the corpus log file from S3 and return it as a DataFrame."""
    client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
    )
    response = client.get_object(Bucket='gideon-corpus', Key='logs/logs.csv')
    return pd.read_csv(response['Body'])
24
+
25
def retrieve_casedocs(case_num, aws_access_key, aws_secret_key):
    """Fetch one case's documents from S3.

    Reads ``Cases/<case_num>/opinions.csv`` and ``Cases/<case_num>/metadata.json``
    (this mirrors how the material is laid out on AWS) and returns a
    ``(opinions_df, metadata)`` tuple.
    """
    client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
    )
    opinions_obj = client.get_object(Bucket='gideon-corpus',
                                     Key='Cases/' + case_num + '/opinions.csv')
    opinions_df = pd.read_csv(opinions_obj['Body'])
    metadata_obj = client.get_object(Bucket='gideon-corpus',
                                     Key='Cases/' + case_num + '/metadata.json')
    metadata = json.loads(metadata_obj['Body'].read().decode('utf-8'))
    return opinions_df, metadata
32
+
33
+
34
def retrieve_all_casedocs(prefix, aws_access_key, aws_secret_key):
    """Retrieve ``(opinions_df, metadata)`` for every case folder under ``prefix``.

    Lists the immediate "subdirectory" prefixes under ``prefix`` in the
    'gideon-corpus' bucket, extracts the case number from each, and fetches
    that case's documents via ``retrieve_casedocs``.

    Bug fix: the original ``boto3.client(...)`` call omitted the required
    service name ('s3'), so this function raised a TypeError at runtime.
    """
    s3 = boto3.client('s3', aws_access_key_id=aws_access_key,
                      aws_secret_access_key=aws_secret_key)

    # Gather the common prefixes (one per case folder) across all pages.
    subdirectories = set()
    paginator = s3.get_paginator('list_objects_v2')
    for result in paginator.paginate(Bucket="gideon-corpus", Prefix=prefix, Delimiter='/'):
        for entry in result.get('CommonPrefixes') or []:
            subdirectories.add(entry.get('Prefix'))

    # A prefix looks like 'Cases/<case_num>/'; component 1 is the case number.
    # NOTE(review): assumes `prefix` contains exactly one leading path
    # component — verify against callers.
    case_nums = [s.split('/')[1] for s in subdirectories]

    casedocs = []
    for case_num in case_nums:
        opinions_df, metadata = retrieve_casedocs(case_num, aws_access_key, aws_secret_key)
        casedocs.append((opinions_df, metadata))
    return casedocs