"""Streamlit viewer for court-case documents stored in the S3 corpus bucket."""

import os
import pickle
import shutil

import boto3
import cv2
import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup

import ImageProcessor
import Manager
import Parser
import Scraper
import Tagger
import TextProcessor

# S3 credentials are read from the environment. If the variables are unset,
# os.getenv returns None; boto3 then falls back to its default credential
# chain (config files, instance roles) — NOTE(review): confirm that fallback
# is the intended behavior rather than a hard failure.
aws_access_key = os.getenv("aws_access_key")
aws_secret_key = os.getenv("aws_secret_key")

# Use the full browser width for the case display.
st.set_page_config(layout="wide")

s3 = boto3.client(
    's3',
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key,
)
def get_subdirectories(prefix, bucket="gideon-corpus"):
    """Return the immediate "subdirectory" names under *prefix* in S3.

    S3 has no real directories; this lists the CommonPrefixes exactly one
    level below *prefix* (using '/' as the delimiter) and returns only the
    final path component of each.

    Args:
        prefix: Key prefix to list under, e.g. "Cases/"; should end with '/'.
        bucket: Bucket to query (defaults to the project corpus bucket).

    Returns:
        Sorted list of subdirectory names without slashes or the parent prefix.
    """
    subdirectories = set()
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/'):
        # 'CommonPrefixes' is absent from pages that contain no sub-prefixes.
        for common in page.get('CommonPrefixes', []):
            subdirectories.add(common['Prefix'])
    # Take the LAST path component so nested prefixes (e.g. "Cases/A/") work
    # too; the previous split('/')[1] only handled single-level prefixes.
    # Sorting makes the result deterministic (a set has arbitrary order).
    return sorted(p.rstrip('/').split('/')[-1] for p in subdirectories)
# Show the case folders currently available in the corpus bucket.
case_names = get_subdirectories("Cases/")
st.text(case_names)
# volume = st.slider('Volume', 500, 550, 525)
# r = requests.get('https://www.loc.gov/collections/united-states-reports/?fa=partof%3Au.s.+reports%3A+volume+'+str(volume) +'&st=list&c=250')
# soup = BeautifulSoup(r.text)
# html_links = soup.findAll('link',attrs={'rel': 'alternate'})
# pdf_links = []
# for hl in html_links:
# if len(hl['href'].split('/')[-1].split('.gif')[0])== 11:
# pdf_links.append(hl['href'].split('.gif')[0]+'.pdf')
# case_num = st.slider('Case Number', 0, len(pdf_links), 1, step=1)
# print(pdf_links)
# run = st.button("Run")
# if run:
# with st.spinner("Downloading"):
# loc_link = pdf_links[case_num]
# if os.path.exists('PDF Cases/Temp'):
# shutil.rmtree('PDF Cases/Temp')
# Scraper.download_loc(loc_link)
# ImageProcessor.process_file('PDF Cases/Temp')
# TextProcessor.process_file('PDF Cases/Temp')
# Tagger.process_file('PDF Cases/Temp', draw=True)
# st.header('Opinions')
# opinions_df = pd.read_csv('PDF Cases/Temp/opinions.csv')
# types = opinions_df['Type'].tolist()
# author_sents = opinions_df['Author Sent'].tolist()
# texts = opinions_df['Text'].tolist()
# for (t, a_s) in zip(types, author_sents):
# st.text(t + ":\t" + a_s)
# tabs = st.tabs(types)
# for (i, tab) in enumerate(tabs):
# with tab:
# paras = texts[i].split('<PARA>')
# t = "</div><br/><div>".join(paras)
# t = "<div>" + t
# st.markdown(t, unsafe_allow_html=True)
# st.divider()
# cols = st.columns(4)
# image_filenames = [f for f in os.listdir('PDF Cases/Temp') if 'processed.png' in f]
# for (i,f) in enumerate(image_filenames):
# image = cv2.imread('PDF Cases/Temp/' + str(i) + '-processed.png')
# with cols[i%4]:
# st.image(image)