# Gideon / app.py
# (Hugging Face Spaces file header — last commit "Update app.py", 08f492d, 2.99 kB)
import streamlit as st
import cv2
import Scraper
import Manager
import ImageProcessor
import TextProcessor
import Tagger
import Parser
import os
import pickle
from bs4 import BeautifulSoup
import requests
import shutil
import pandas as pd
import boto3
# AWS credentials are injected via environment variables (e.g. Space secrets),
# never hard-coded.
aws_access_key = os.getenv("aws_access_key")
aws_secret_key = os.getenv("aws_secret_key")
# st.set_page_config must be the first Streamlit call in the script.
st.set_page_config(layout="wide")
# Module-level S3 client, shared by the helpers below.
# NOTE(review): if either env var is unset this is None and boto3 falls back
# to its default credential chain — confirm that is intended.
s3 = boto3.client('s3', aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_key)
def get_subdirectories(prefix, bucket="gideon-corpus"):
    """Return the immediate sub-"directory" names under *prefix* in an S3 bucket.

    Uses a paginated ``list_objects_v2`` call with ``Delimiter='/'`` so S3
    groups keys into ``CommonPrefixes`` (one per sub-directory), then strips
    the leading prefix component, e.g. ``"Cases/Foo/"`` -> ``"Foo"``.

    Args:
        prefix: Key prefix to list under; must end with ``'/'`` (e.g. ``"Cases/"``).
        bucket: Bucket to query. Defaults to the original hard-coded bucket,
            so existing callers are unaffected.

    Returns:
        Sorted list of sub-directory names (sorted for deterministic display;
        the previous set-based order was arbitrary).
    """
    subdirectories = set()
    paginator = s3.get_paginator('list_objects_v2')
    for result in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/'):
        # CommonPrefixes is absent on pages with no grouped keys; `or []`
        # covers both a missing key and an explicit None/empty value.
        for subdir in result.get('CommonPrefixes') or []:
            subdirectories.add(subdir.get('Prefix'))
    # "Cases/Foo/".split('/')[1] -> "Foo" (assumes a single-component prefix
    # like "Cases/" — TODO confirm for deeper prefixes).
    return [s.split('/')[1] for s in sorted(subdirectories)]
# Fetch the top-level case folder names from the S3 corpus and render them.
subs = get_subdirectories("Cases/")
# st.text shows the raw Python list repr — presumably a debugging aid; verify.
st.text(subs)
# volume = st.slider('Volume', 500, 550, 525)
# r = requests.get('https://www.loc.gov/collections/united-states-reports/?fa=partof%3Au.s.+reports%3A+volume+'+str(volume) +'&st=list&c=250')
# soup = BeautifulSoup(r.text)
# html_links = soup.findAll('link',attrs={'rel': 'alternate'})
# pdf_links = []
# for hl in html_links:
# if len(hl['href'].split('/')[-1].split('.gif')[0])== 11:
# pdf_links.append(hl['href'].split('.gif')[0]+'.pdf')
# case_num = st.slider('Case Number', 0, len(pdf_links), 1, step=1)
# print(pdf_links)
# run = st.button("Run")
# if run:
# with st.spinner("Downloading"):
# loc_link = pdf_links[case_num]
# if os.path.exists('PDF Cases/Temp'):
# shutil.rmtree('PDF Cases/Temp')
# Scraper.download_loc(loc_link)
# ImageProcessor.process_file('PDF Cases/Temp')
# TextProcessor.process_file('PDF Cases/Temp')
# Tagger.process_file('PDF Cases/Temp', draw=True)
# st.header('Opinions')
# opinions_df = pd.read_csv('PDF Cases/Temp/opinions.csv')
# types = opinions_df['Type'].tolist()
# author_sents = opinions_df['Author Sent'].tolist()
# texts = opinions_df['Text'].tolist()
# for (t, a_s) in zip(types, author_sents):
# st.text(t + ":\t" + a_s)
# tabs = st.tabs(types)
# for (i, tab) in enumerate(tabs):
# with tab:
# paras = texts[i].split('<PARA>')
# t = "</div><br/><div>".join(paras)
# t = "<div>" + t
# st.markdown(t, unsafe_allow_html=True)
# st.divider()
# cols = st.columns(4)
# image_filenames = [f for f in os.listdir('PDF Cases/Temp') if 'processed.png' in f]
# for (i,f) in enumerate(image_filenames):
# image = cv2.imread('PDF Cases/Temp/' + str(i) + '-processed.png')
# with cols[i%4]:
# st.image(image)