File size: 10,933 Bytes
bc92a1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6506ce9
 
 
bc92a1b
 
 
 
 
 
 
 
4680ed9
 
bc92a1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4680ed9
bc92a1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6506ce9
 
 
bc92a1b
 
 
 
 
 
 
 
 
 
 
 
 
 
a49bcd0
bc92a1b
 
 
 
 
 
 
 
 
 
 
6506ce9
bc92a1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4680ed9
 
 
 
 
 
 
 
 
bc92a1b
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
import torch
# Startup diagnostic: report which compute backend torch can see. Any failure
# while probing is printed and the app simply carries on (on CPU).
try:
    _cuda_ready = torch.cuda.is_available()
    print(f"Is CUDA available: {_cuda_ready}")
    if not _cuda_ready:
        print("No CUDA device available - using CPU")
    else:
        # Looking up the device name can fail independently of the
        # availability check, so it gets its own handler.
        try:
            _device_index = torch.cuda.current_device()
            _device_name = torch.cuda.get_device_name(_device_index)
            print(f"CUDA device: {_device_name}")
        except Exception as device_err:
            print(f"Error getting CUDA device name: {str(device_err)}")
except Exception as probe_err:
    print(f"Error checking CUDA availability: {str(probe_err)}")
    print("Continuing with CPU...")

import streamlit as st
import os
from huggingface_hub import login
from datetime import datetime
from openai import OpenAI
from src.auth import validate_login
from src.utils import create_excel, setup_logging, getconfig
from src.pipeline import process_data

# Run the project's logging setup (from src.utils) before the module logger is
# created below — presumably it installs handlers/levels; confirm in src.utils.
setup_logging()
import logging
from io import BytesIO

# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)

# Local development: load variables from a .env file (if present) into the
# process environment; the code below reads its credentials from os.environ.
from dotenv import load_dotenv
load_dotenv()

# Application settings read from config.cfg at import time.
# NOTE(review): get_azure_deployment() re-reads config.cfg itself rather than
# using this object — confirm whether this module-level `config` is still needed.
config = getconfig("config.cfg")

@st.cache_resource
def get_azure_openai_client():
    """Build the OpenAI client used to talk to the Azure OpenAI endpoint.

    The endpoint, API version and API key are read from the
    ``AZURE_OPENAI_ENDPOINT``, ``AZURE_OPENAI_API_VERSION`` and
    ``AZURE_OPENAI_API_KEY`` environment variables. The function is wrapped
    in ``st.cache_resource``, so Streamlit reuses the returned client instead
    of rebuilding it on every script run.

    Returns:
        An ``OpenAI`` client configured with the key and endpoint.

    Raises:
        ValueError: If any of the three environment variables is missing or
            empty.
        Exception: Any other error raised while constructing the client is
            logged and re-raised unchanged.
    """
    required_vars = (
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_API_VERSION",
        "AZURE_OPENAI_API_KEY",
    )
    try:
        settings = {var_name: os.environ.get(var_name) for var_name in required_vars}

        # NOTE(review): the API version is required here but is not passed to
        # the client below — confirm whether that is intentional.
        if any(not value for value in settings.values()):
            raise ValueError("Missing required Azure OpenAI environment variables. Please check your .env file.")

        azure_client = OpenAI(
            api_key=settings["AZURE_OPENAI_API_KEY"],
            base_url=settings["AZURE_OPENAI_ENDPOINT"],
        )
        logger.info("Azure OpenAI client initialized successfully")
        return azure_client
    except Exception as init_err:
        logger.error(f"Failed to initialize Azure OpenAI client: {str(init_err)}")
        raise


def get_azure_deployment():
    """Return the Azure OpenAI deployment name to use for requests.

    The name is read from the ``DEPLOYMENT`` option of the ``deployments``
    section of ``config.cfg``. If the file or option cannot be read for any
    reason, the failure is logged and ``"gpt-4o-mini"`` is returned instead.

    Returns:
        The configured deployment name, or ``"gpt-4o-mini"`` as a fallback.
    """
    fallback_deployment = "gpt-4o-mini"
    try:
        deployment_name = getconfig("config.cfg").get("deployments", "DEPLOYMENT")
        logger.info(f"Using Azure OpenAI deployment: {deployment_name}")
    except Exception as config_err:
        logger.error(f"Failed to read deployment from config: {str(config_err)}. Using default deployment.")
        return fallback_deployment
    return deployment_name


def _reset_analysis_state(error_message=None):
    """Put the analysis UI back into its idle state.

    Re-enables the "Start Analysis" button, clears the processing flag and
    discards any cached result, so the next run always re-processes the
    uploaded file instead of serving output produced for a previous file.

    Args:
        error_message: Optional text to show the user on the next script run.
            It is stored in session state because ``st.rerun()`` ends the
            current run immediately, which would discard a message rendered
            with ``st.error`` just before it.
    """
    st.session_state['show_button'] = True
    st.session_state['processing'] = False
    st.session_state['data_processed'] = False
    st.session_state['df'] = None
    if error_message is not None:
        st.session_state['error_message'] = error_message


# Main app logic
def main():
    """Render the Application Pre-Filtering Tool page and run the analysis.

    The page is driven by four session-state flags:

    * ``show_button``    - whether "Start Analysis" may be displayed.
    * ``processing``     - an analysis was requested and is running/finished.
    * ``data_processed`` - ``df`` holds the result for the current request.
    * ``df``             - the processed DataFrame offered for download.

    Flow: the user uploads a file and clicks "Start Analysis"; the script
    reruns with ``processing`` set, calls ``process_data`` once, and then
    offers the result as an Excel download. Clicking the download button, or
    any failure, returns the page to its idle state. Error messages are
    carried across the rerun in ``st.session_state['error_message']`` and
    displayed once.
    """
    # Temporarily set authentication to True for testing
    # NOTE(review): this bypasses the login form entirely — restore the
    # commented-out login branch below before exposing the app publicly.
    if 'authenticated' not in st.session_state:
        st.session_state['authenticated'] = True

    if st.session_state['authenticated']:
        # Remove login success message for testing
        hf_token = os.environ.get("HF_TOKEN")
        if not hf_token:
            # Fail with a readable message instead of a bare KeyError traceback.
            logger.error("HF_TOKEN environment variable is not set")
            st.error("Missing required HF_TOKEN environment variable. Please check your .env file.")
            st.stop()
        login(token=hf_token, add_to_git_credential=True)

        # Initialize session state variables (single place for all defaults)
        state_defaults = (
            ('data_processed', False),
            ('df', None),
            ('show_button', True),
            ('processing', False),
        )
        for state_key, default_value in state_defaults:
            if state_key not in st.session_state:
                st.session_state[state_key] = default_value

        # Main Streamlit app
        st.title('Application Pre-Filtering Tool')

        # Sidebar (filters)
        with st.sidebar:
            with st.expander("ℹ️ - Instructions", expanded=False):
                st.markdown(
                    """
                    1. **Download the Excel Template file (below)**
                    2. **[OPTIONAL]: Select the desired filtering sensitivity level (below)**
                    3. **Copy/paste the requisite application data in the template file. Best practice is to 'paste as values'**
                    4. **Upload the template file in the area to the right (or click browse files)**
                    5. **Click 'Start Analysis'**

                    The tool will start processing the uploaded application data. This can take some time
                    depending on the number of applications and the length of text in each. For example, a file with 1000 applications
                    could be expected to take approximately 5 minutes.

                    ***NOTE** -  you can also simply rename the column headers in your own file. The headers must match the column names in the template for the tool to run properly.*

                    """
                )
            # Excel file download
            st.download_button(
                label="Download Excel Template",
                data=create_excel(),
                file_name="upload_template.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )

            # get sensitivity level for use in review / reject (ref. process_data function)
            sens_options = {
                "Low": 3,
                "Medium": 4,
                "High": 5,
            }

            sens_input = st.sidebar.radio(label = 'Select the Sensitivity Level [OPTIONAL]',
                                    help = 'Decreasing the level of sensitivity results in less \
                                    applications filtered out. This also \
                                    reduces the probability of false negatives (FNs). The rate of \
                                    FNs at the lowest setting is approximately 6 percent, and \
                                    approaches 13 percent at the highest setting. \
                                    NOTE: changing this setting does not affect the raw data in the CSV output file (only the labels)', 
                                    options = list(sens_options.keys()),
                                    index = list(sens_options.keys()).index("High"),
                                    horizontal = False)

            sens_level = sens_options[sens_input]
            logger.info(f"Sensitivity level applied: {sens_level}")

        with st.expander("ℹ️ - About this app", expanded=False):
            st.write(
                """
                This tool provides an interface for running an automated preliminary assessment of applications for a call for applications.

                The tool functions by running selected text fields from the application through a series of LLMs fine-tuned for text classification (ref. diagram below).
                The resulting output classifications are used to compute a score and a suggested pre-filtering action. The tool has been tested against 
                human assessors and exhibits an extremely low false negative rate (<6%) at a Sensitivity Level of 'Low' (i.e. rejection threshold for predicted score < 4).
                
                """)
            st.image('assets/pipeline.png')

        uploaded_file = st.file_uploader("Select a file containing application pre-filtering data (see instructions in the sidebar)")

        # Show (once) any error recorded by the previous run. Errors are
        # stored rather than rendered directly because st.rerun() aborts the
        # run that produced them before the user could see anything.
        if 'error_message' in st.session_state:
            pending_error = st.session_state['error_message']
            del st.session_state['error_message']
            if pending_error:
                st.error(pending_error)

        # Only show the button if show_button is True and file is uploaded and not processing
        if uploaded_file is not None and st.session_state['show_button'] and not st.session_state['processing']:
            if st.button("Start Analysis", key="start_analysis"):
                st.session_state['show_button'] = False
                st.session_state['processing'] = True
                st.rerun()

        # If we're processing, show the processing logic
        if st.session_state['processing']:
            if uploaded_file is None:
                # The file was removed while an analysis was active or its
                # result was on screen. Drop the result so it cannot be
                # served for whatever file is uploaded next.
                logger.info("Uploaded file removed; resetting analysis state")
                _reset_analysis_state()
                st.rerun()

            try:
                logger.info(f"File uploaded: {uploaded_file.name}")

                if not st.session_state['data_processed']:
                    logger.info("Starting data processing")
                    try:
                        # Initialize Azure OpenAI client and get deployment name
                        azure_client = get_azure_openai_client()
                        azure_deployment = get_azure_deployment()

                        st.session_state['df'] = process_data(
                            uploaded_file,
                            sens_level,
                            azure_client,
                            azure_deployment
                        )
                        logger.info("Data processing completed successfully")
                        st.session_state['data_processed'] = True
                    except ValueError as e:
                        # Handle specific validation errors: the exception
                        # text is shown to the user as-is on the next run.
                        logger.error(f"Validation error: {str(e)}")
                        _reset_analysis_state(error_message=str(e))
                        st.rerun()
                    except Exception as e:
                        # Handle other unexpected errors (traceback is logged,
                        # the user only sees a generic message).
                        logger.exception(f"Error in process_data: {str(e)}")
                        _reset_analysis_state(
                            error_message="An unexpected error occurred. Please check your input file and try again."
                        )
                        st.rerun()

                df = st.session_state['df']

                # Create Excel buffer
                excel_buffer = BytesIO()
                df.to_excel(excel_buffer, index=False, engine='openpyxl')
                excel_buffer.seek(0)

                current_datetime = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
                output_filename = f'processed_applications_{current_datetime}.xlsx'

                # Downloading the result ends this analysis: the callback
                # returns the page to idle so a new file can be processed.
                st.download_button(
                    label="Download Analysis Data File",
                    data=excel_buffer,
                    file_name=output_filename,
                    mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                    on_click=_reset_analysis_state
                )

            except Exception as e:
                logger.exception(f"Error processing file: {str(e)}")
                _reset_analysis_state(
                    error_message="Failed to process the file. Please ensure your column names match the template file."
                )
                st.rerun()


    # Comment out for testing
    # else:
    #     username = st.text_input("Username")
    #     password = st.text_input("Password", type="password")
    #     if st.button("Login"):
    #         if validate_login(username, password):
    #             st.session_state['authenticated'] = True
    #             st.rerun()
    #         else:
    #             st.error("Incorrect username or password")



# Streamlit executes this file as the ``__main__`` module on every script run,
# so the guard still fires under ``streamlit run`` while keeping a plain
# ``import`` of this module from rendering the whole UI as a side effect.
if __name__ == "__main__":
    main()