# NOTE(review): the lines that previously appeared here ("Spaces: Running",
# file size, commit hashes, and a line-number gutter) were HuggingFace Spaces
# file-viewer artifacts, not source code. They have been converted to this
# comment so the module parses.
import torch

# Startup diagnostic: report whether CUDA is usable. Any failure here is
# non-fatal — the app simply continues on CPU.
try:
    cuda_available = torch.cuda.is_available()
    print(f"Is CUDA available: {cuda_available}")
    if not cuda_available:
        print("No CUDA device available - using CPU")
    else:
        try:
            device_name = torch.cuda.get_device_name(torch.cuda.current_device())
            print(f"CUDA device: {device_name}")
        except Exception as err:
            # Device exists but querying its name failed; report and move on.
            print(f"Error getting CUDA device name: {str(err)}")
except Exception as err:
    print(f"Error checking CUDA availability: {str(err)}")
    print("Continuing with CPU...")
# Third-party and project imports. NOTE(review): statement order here is
# load-bearing — setup_logging() runs before `import logging` / getLogger,
# and load_dotenv() must run before any os.environ reads in the functions
# below. Do not reorder without confirming.
import streamlit as st
import os
from huggingface_hub import login
from datetime import datetime
from openai import OpenAI
from src.auth import validate_login
from src.utils import create_excel, setup_logging, getconfig
from src.pipeline import process_data
# Configure logging handlers/levels before creating this module's logger.
setup_logging()
import logging
from io import BytesIO
logger = logging.getLogger(__name__)
# Local
# Load environment variables (Azure/HF credentials) from a local .env file.
from dotenv import load_dotenv
load_dotenv()
# Module-level parsed config; also re-read inside get_azure_deployment().
config = getconfig("config.cfg")
@st.cache_resource
def get_azure_openai_client():
    """Initialize and cache an Azure OpenAI client for the session.

    Reads AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_VERSION and
    AZURE_OPENAI_API_KEY from the environment (populated via .env).

    Returns:
        AzureOpenAI: a configured client, cached by Streamlit per session.

    Raises:
        ValueError: if any required environment variable is missing.
    """
    # Local import keeps this fix self-contained; the module already depends
    # on the `openai` package (see the top-of-file imports).
    from openai import AzureOpenAI

    try:
        endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
        api_version = os.environ.get("AZURE_OPENAI_API_VERSION")
        api_key = os.environ.get("AZURE_OPENAI_API_KEY")
        if not all([endpoint, api_version, api_key]):
            raise ValueError("Missing required Azure OpenAI environment variables. Please check your .env file.")
        # BUG FIX: the original constructed a plain OpenAI client with
        # base_url and silently dropped AZURE_OPENAI_API_VERSION after
        # validating it. Azure endpoints require the api-version parameter,
        # which the AzureOpenAI client supplies on every request.
        client = AzureOpenAI(
            api_key=api_key,
            azure_endpoint=endpoint,
            api_version=api_version,
        )
        logger.info("Azure OpenAI client initialized successfully")
        return client
    except Exception as e:
        logger.error(f"Failed to initialize Azure OpenAI client: {str(e)}")
        raise
def get_azure_deployment():
    """Return the Azure OpenAI deployment name from config.cfg.

    Reads the [deployments] DEPLOYMENT entry; if the config cannot be read
    for any reason, logs the failure and falls back to "gpt-4o-mini".
    """
    try:
        cfg = getconfig("config.cfg")
        name = cfg.get("deployments", "DEPLOYMENT")
        logger.info(f"Using Azure OpenAI deployment: {name}")
        return name
    except Exception as exc:
        # Best-effort fallback: a missing/garbled config should not stop the app.
        logger.error(f"Failed to read deployment from config: {str(exc)}. Using default deployment.")
        return "gpt-4o-mini"
# Main app logic
# NOTE(review): indentation in this function was reconstructed from a
# whitespace-mangled paste — verify the nesting against the original file.
def main():
    """Render the Streamlit pre-filtering page and drive processing.

    Flow is controlled by session-state flags: 'show_button' (Start button
    visible), 'processing' (analysis in flight), 'data_processed' (results
    ready), and 'df' (the processed DataFrame).
    """
    # Temporarily set authentication to True for testing
    if 'authenticated' not in st.session_state:
        st.session_state['authenticated'] = True
    if st.session_state['authenticated']:
        # Remove login success message for testing
        # NOTE(review): os.environ[...] raises KeyError if HF_TOKEN is unset.
        hf_token = os.environ["HF_TOKEN"]
        login(token=hf_token, add_to_git_credential=True)
        # Initialize session state variables
        if 'data_processed' not in st.session_state:
            st.session_state['data_processed'] = False
            st.session_state['df'] = None
        # Main Streamlit app
        st.title('Application Pre-Filtering Tool')
        # Sidebar (filters)
        with st.sidebar:
            with st.expander("ℹ️ - Instructions", expanded=False):
                st.markdown(
                    """
                    1. **Download the Excel Template file (below)**
                    2. **[OPTIONAL]: Select the desired filtering sensitivity level (below)**
                    3. **Copy/paste the requisite application data in the template file. Best practice is to 'paste as values'**
                    4. **Upload the template file in the area to the right (or click browse files)**
                    5. **Click 'Start Analysis'**
                    The tool will start processing the uploaded application data. This can take some time
                    depending on the number of applications and the length of text in each. For example, a file with 1000 applications
                    could be expected to take approximately 5 minutes.
                    ***NOTE** - you can also simply rename the column headers in your own file. The headers must match the column names in the template for the tool to run properly.*
                    """
                )
            # Excel file download
            st.download_button(
                label="Download Excel Template",
                data=create_excel(),
                file_name="upload_template.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
            # get sensitivity level for use in review / reject (ref. process_data function)
            # Maps the user-facing label to the numeric threshold passed to process_data.
            sens_options = {
                "Low": 3,
                "Medium": 4,
                "High": 5,
            }
            # NOTE(review): st.sidebar.radio inside `with st.sidebar:` is
            # redundant (st.radio would suffice) but harmless — kept as-is.
            sens_input = st.sidebar.radio(label = 'Select the Sensitivity Level [OPTIONAL]',
                help = 'Decreasing the level of sensitivity results in less \
                applications filtered out. This also \
                reduces the probability of false negatives (FNs). The rate of \
                FNs at the lowest setting is approximately 6 percent, and \
                approaches 13 percent at the highest setting. \
                NOTE: changing this setting does not affect the raw data in the CSV output file (only the labels)',
                options = list(sens_options.keys()),
                index = list(sens_options.keys()).index("High"),
                horizontal = False)
            sens_level = sens_options[sens_input]
            logger.info(f"Sensitivity level applied: {sens_level}")
            with st.expander("ℹ️ - About this app", expanded=False):
                st.write(
                    """
                    This tool provides an interface for running an automated preliminary assessment of applications for a call for applications.
                    The tool functions by running selected text fields from the application through a series of LLMs fine-tuned for text classification (ref. diagram below).
                    The resulting output classifications are used to compute a score and a suggested pre-filtering action. The tool has been tested against
                    human assessors and exhibits an extremely low false negative rate (<6%) at a Sensitivity Level of 'Low' (i.e. rejection threshold for predicted score < 4).
                    """)
                st.image('assets/pipeline.png')
        uploaded_file = st.file_uploader("Select a file containing application pre-filtering data (see instructions in the sidebar)")
        # Add session state variables if they don't exist
        if 'show_button' not in st.session_state:
            st.session_state['show_button'] = True
        if 'processing' not in st.session_state:
            st.session_state['processing'] = False
        if 'data_processed' not in st.session_state:
            st.session_state['data_processed'] = False
        # Only show the button if show_button is True and file is uploaded and not processing
        if uploaded_file is not None and st.session_state['show_button'] and not st.session_state['processing']:
            if st.button("Start Analysis", key="start_analysis"):
                # Hide the button and flag processing, then rerun so the
                # processing branch below executes on the next script pass.
                st.session_state['show_button'] = False
                st.session_state['processing'] = True
                st.rerun()
        # If we're processing, show the processing logic
        if st.session_state['processing']:
            try:
                logger.info(f"File uploaded: {uploaded_file.name}")
                if not st.session_state['data_processed']:
                    logger.info("Starting data processing")
                    try:
                        # Initialize Azure OpenAI client and get deployment name
                        azure_client = get_azure_openai_client()
                        azure_deployment = get_azure_deployment()
                        st.session_state['df'] = process_data(
                            uploaded_file,
                            sens_level,
                            azure_client,
                            azure_deployment
                        )
                        logger.info("Data processing completed successfully")
                        st.session_state['data_processed'] = True
                    except ValueError as e:
                        # Handle specific validation errors
                        logger.error(f"Validation error: {str(e)}")
                        st.error(str(e))
                        st.session_state['show_button'] = True
                        st.session_state['processing'] = False
                        st.rerun()
                    except Exception as e:
                        # Handle other unexpected errors
                        logger.error(f"Error in process_data: {str(e)}")
                        st.error("An unexpected error occurred. Please check your input file and try again.")
                        st.session_state['show_button'] = True
                        st.session_state['processing'] = False
                        st.rerun()
                df = st.session_state['df']
                # download_button callback: reset all flags so a fresh
                # analysis can be started after the file is downloaded.
                def reset_button_state():
                    st.session_state['show_button'] = True
                    st.session_state['processing'] = False
                    st.session_state['data_processed'] = False
                # Create Excel buffer
                excel_buffer = BytesIO()
                df.to_excel(excel_buffer, index=False, engine='openpyxl')
                excel_buffer.seek(0)
                current_datetime = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
                output_filename = f'processed_applications_{current_datetime}.xlsx'
                st.download_button(
                    label="Download Analysis Data File",
                    data=excel_buffer,
                    file_name=output_filename,
                    mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                    on_click=reset_button_state
                )
            except Exception as e:
                logger.error(f"Error processing file: {str(e)}")
                st.error("Failed to process the file. Please ensure your column names match the template file.")
                st.session_state['show_button'] = True
                st.session_state['processing'] = False
                st.rerun()
    # Comment out for testing
    # else:
    #     username = st.text_input("Username")
    #     password = st.text_input("Password", type="password")
    #     if st.button("Login"):
    #         if validate_login(username, password):
    #             st.session_state['authenticated'] = True
    #             st.rerun()
    #         else:
    #             st.error("Incorrect username or password")


main()