import streamlit as st
import pickle
from sentence_transformers import SentenceTransformer
import pandas as pd
from io import StringIO
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import plotly.express as px
from statistics import mode

st.title("Extract job function, department and role for a given job title")


@st.cache_resource
def get_artifacts():
    """Load and cache the sentence encoder, the six KNN classifiers and the
    per-level clustering thresholds.

    Returns:
        tuple: (model, knn1, knn2, knn3, knn4, knn5, knn6, thresholds) where
        knn1-3 predict per-title labels, knn4-6 predict per-cluster-centroid
        labels, and `thresholds` maps 'function'/'department'/'role' to a
        cosine distance threshold for agglomerative clustering.
    """
    def _load(path):
        # Context manager so the pickle file handle is always closed.
        with open(path, 'rb') as f:
            return pickle.load(f)

    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    knn1 = _load("model_function.pkl")
    knn2 = _load("model_department.pkl")
    knn3 = _load("model_role.pkl")
    knn4 = _load("model_function_taxonomy.pkl")
    knn5 = _load("model_department_taxonomy.pkl")
    knn6 = _load("model_role_taxonomy.pkl")
    thresholds = _load("thresholds.pkl")
    return model, knn1, knn2, knn3, knn4, knn5, knn6, thresholds


def get_all_labels(job_title):
    """Predict (function, department, role) for a single job title.

    Each level's prediction is appended to the text before encoding for the
    next level, so later predictions are conditioned on earlier ones.
    Relies on the module-level `model` and `knn1`/`knn2`/`knn3` artifacts.
    """
    x = model.encode([job_title])
    predicted_function = knn1.predict(x)[0]

    x = model.encode([job_title + ' ' + predicted_function])
    predicted_department = knn2.predict(x)[0]

    x = model.encode([job_title + ' ' + predicted_function + ' ' + predicted_department])
    predicted_role = knn3.predict(x)[0]

    return predicted_function, predicted_department, predicted_role


def _cluster_mean_predict(X, threshold, knn):
    """Cluster embeddings, predict one label per cluster centroid, and
    broadcast that label back to every row of the cluster.

    Args:
        X: 2-D array of row embeddings.
        threshold: cosine distance threshold for agglomerative clustering.
        knn: classifier applied to the cluster centroids.

    Returns:
        list[str]: one predicted label per row of X.
    """
    clust = AgglomerativeClustering(n_clusters=None,
                                    distance_threshold=threshold,
                                    metric='cosine', linkage='average')
    clust.fit(X)
    labels = clust.labels_
    uniq = np.unique(labels)
    centroids = [X[np.where(labels == u)[0], :].mean(0) for u in uniq]
    cluster_preds = knn.predict(centroids)
    # Build the cluster-id -> label map once instead of calling
    # list.index() per row (was accidentally O(rows * clusters)).
    lookup = {u: str(p) for u, p in zip(uniq, cluster_preds)}
    return [lookup[l] for l in labels]


def _cluster_mode_smooth(X, row_preds, threshold=0.22):
    """Smooth per-row predictions by clustering the embeddings and assigning
    every row the modal prediction of its cluster.

    Args:
        X: 2-D array of row embeddings.
        row_preds: per-row predicted labels, aligned with X.
        threshold: cosine distance threshold for agglomerative clustering.

    Returns:
        list[str]: one smoothed label per row of X.
    """
    clust = AgglomerativeClustering(n_clusters=None,
                                    distance_threshold=threshold,
                                    metric='cosine', linkage='average')
    clust.fit(X)
    labels = clust.labels_
    lookup = {u: str(mode([row_preds[i] for i in np.where(labels == u)[0]]))
              for u in np.unique(labels)}
    return [lookup[l] for l in labels]


def get_taxonomy_V1(df):
    """Annotate `df` (must have a 'Job Title' column) with 'Function',
    'Department' and 'Role' columns.

    Each level clusters the encoded text (title plus previously predicted
    levels), predicts a label per cluster centroid with the taxonomy models
    knn4-6, and propagates it to the cluster members. Uses the per-level
    thresholds loaded by get_artifacts(). Mutates and returns `df`.
    """
    ################## Predict Function #######################
    X = model.encode(df['Job Title'])
    df['Function'] = _cluster_mean_predict(X, thresholds['function'], knn4)

    ################## Predict Department #######################
    X = model.encode(df.apply(
        lambda r: r['Job Title'] + ' ' + r['Function'], axis=1))
    df['Department'] = _cluster_mean_predict(X, thresholds['department'], knn5)

    ################## Predict Role #######################
    X = model.encode(df.apply(
        lambda r: r['Job Title'] + ' ' + r['Function'] + ' ' + r['Department'],
        axis=1))
    df['Role'] = _cluster_mean_predict(X, thresholds['role'], knn6)
    return df


def get_taxonomy_V2(df):
    """Annotate a single-column DataFrame of job titles with 'Function',
    'Department' and 'Role' columns.

    Unlike V1, each level first predicts a label per row (knn1-3), then
    clusters the rows (fixed 0.22 cosine threshold) and replaces each row's
    label with the modal label of its cluster. The intermediate 'Pred1' and
    'text' scratch columns are left on the returned DataFrame, as before.
    Mutates and returns `df`.
    """
    df.columns = ['Job Title']

    ################## Predict Function #######################
    X = model.encode(df['Job Title'])
    val_pred = knn1.predict(X)
    df['Pred1'] = val_pred
    df['text'] = df.apply(
        lambda r: r['Job Title'] + ' ' + r['Pred1'], axis=1)
    df['Function'] = _cluster_mode_smooth(model.encode(df['text']), val_pred)

    ################## Predict Department #######################
    X = model.encode(df.apply(
        lambda r: r['Job Title'] + ' ' + r['Function'], axis=1))
    val_pred = knn2.predict(X)
    df['Pred1'] = val_pred
    df['text'] = df.apply(
        lambda r: r['Job Title'] + ' ' + r['Function'] + ' ' + r['Pred1'],
        axis=1)
    df['Department'] = _cluster_mode_smooth(model.encode(df['text']), val_pred)

    ################## Predict Role #######################
    X = model.encode(df.apply(
        lambda r: r['Job Title'] + ' ' + r['Function'] + ' ' + r['Department'],
        axis=1))
    val_pred = knn3.predict(X)
    df['Pred1'] = val_pred
    df['text'] = df.apply(
        lambda r: (r['Job Title'] + ' ' + r['Function'] + ' '
                   + r['Department'] + ' ' + r['Pred1']),
        axis=1)
    df['Role'] = _cluster_mode_smooth(model.encode(df['text']), val_pred)
    return df


if __name__ == '__main__':
    model, knn1, knn2, knn3, knn4, knn5, knn6, thresholds = get_artifacts()

    # Single-title mode: type a title, see its three predicted levels.
    job_title = st.text_input('Put the job title here - ', value="")
    if job_title != "":
        predicted_function, predicted_department, predicted_role = get_all_labels(job_title)
        st.markdown("Function: " + predicted_function)
        st.markdown("Department: " + predicted_department)
        st.markdown("Role: " + predicted_role)

    # Batch mode: upload a headerless one-column CSV of job titles.
    uploaded_file = st.file_uploader("Or, choose a csv file to see taxonomy")
    if uploaded_file is not None:
        df = pd.read_csv(uploaded_file, header=None)
        df.columns = ['Job Title']
        df = get_taxonomy_V2(df)
        df = df[['Job Title', 'Function', 'Department', 'Role']]
        st.table(df)
        st.download_button(
            "Press to Download",
            df.to_csv(index=False).encode('utf-8'),
            "job_titles.csv",
            "text/csv",
            key='download-csv'
        )
        fig = px.sunburst(df, path=["Function", 'Department', 'Role', 'Job Title'])
        st.plotly_chart(fig, use_container_width=True)