# Spaces:
#
# IAUAI
# /
#
# drop_prediction
#
# Runtime error
#
# File size: 14,285 Bytes

import gradio as gr
import pandas as pd
import pickle
import os
from docx import Document
from docx.shared import Inches
from docx.dml.color import ColorFormat
import sklearn
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE, BorderlineSMOTE 
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier,GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import docx
from docx.enum.dml import MSO_THEME_COLOR_INDEX

def add_hyperlink(paragraph, text, url):
    """Append a hyperlink showing ``text`` and pointing at ``url`` to ``paragraph``.

    The link is built directly from OOXML elements: an external hyperlink
    relationship is registered on the paragraph's part, a ``w:hyperlink``
    element carrying that relationship id is created, and a ``w:r`` run
    holding ``text`` is placed inside it.

    Returns the ``w:hyperlink`` element that was created.
    """
    shared = docx.oxml.shared

    # Register the target as an external relationship and keep its id.
    rel_id = paragraph.part.relate_to(
        url,
        docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK,
        is_external=True,
    )

    # <w:hyperlink r:id="..."> wraps the run that carries the visible text.
    link_element = shared.OxmlElement('w:hyperlink')
    link_element.set(shared.qn('r:id'), rel_id)

    text_run = shared.OxmlElement('w:r')
    text_run.append(shared.OxmlElement('w:rPr'))
    text_run.text = text
    link_element.append(text_run)

    # The hyperlink element is attached inside a fresh run of the paragraph.
    host_run = paragraph.add_run()
    host_run._r.append(link_element)

    # No hyperlink character style is assumed to exist in the template, so
    # the host run is coloured and underlined by hand instead.
    host_font = host_run.font
    host_font.color.theme_color = MSO_THEME_COLOR_INDEX.HYPERLINK
    host_font.underline = True

    return link_element

def savedoc(document,name):
    """Strip empty paragraphs from ``document`` and save it under ``name``.

    A paragraph is removed only when its text is exactly the empty string;
    spacer paragraphs containing a single space are kept.

    Args:
        document: object exposing ``paragraphs`` and ``save(name)``.
        name: path passed to ``document.save``.
    """
    def delete_paragraph(paragraph):
        # Detach the paragraph's XML element from its parent...
        element = paragraph._element
        element.getparent().remove(element)
        # ...and clear the references on the paragraph proxy itself (the
        # original cleared them on the XML element, leaving the proxy
        # pointing at a detached node).
        paragraph._p = paragraph._element = None

    for para in document.paragraphs:
        if para.text == '':
            delete_paragraph(para)
    document.save(name)
    
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, cohen_kappa_score, f1_score, recall_score, precision_score
def measures(predicted, y_test):
    """Return the accuracy of ``predicted`` against the true labels ``y_test``.

    Note the argument order: predictions first, ground truth second.

    The previous implementation also computed precision, recall, F1 and the
    confusion matrix but discarded all of them; that dead work has been
    removed, so only the accuracy is computed.
    """
    return accuracy_score(y_test, predicted)

# Columns an uploaded CSV may contain. Retraining accepts all but SID;
# prediction accepts all but GRADE.
_ALLOWED_COLUMNS = ['SID', 'TERM', 'CATALOG_NBR', 'INSTRUCTOR_ID', 'GRADE', 'CGPA', 'PROGRAM', 'PROGRAM.1']

# Order of the model's input features. Training and prediction must agree on
# it, so it is fixed here instead of depending on the CSV's column order.
_FEATURE_COLUMNS = ['TERM', 'CATALOG_NBR', 'INSTRUCTOR_ID', 'CGPA', 'PROGRAM', 'PROGRAM.1']


def _column_problem(frame, allowed, purpose):
    """Return an error message if ``frame``'s columns differ from ``allowed``, else None."""
    for col in frame.columns:
        if col not in allowed:
            return str(col)+" is undefined column name, allowed columns for "+purpose+" are "+str(allowed)
    missing = [col for col in allowed if col not in frame.columns]
    if missing:
        return "Missing required column(s) for "+purpose+": "+str(missing)
    return None


def _term_code(term):
    """Encode a term by its last character: '1' -> 0, '2' -> 1, anything else -> 2."""
    last = str(term)[-1]
    if last == "1":
        return 0
    if last == "2":
        return 1
    return 2


def _short_major(name):
    """Map a major's full name to its abbreviation, or None when unrecognised."""
    known = (
        ("Computer Science", "CS"),
        ("Computer Information", "CIS"),
        ("Artificial", "AI"),
        ("Cyber", "CYS"),
    )
    for needle, short in known:
        if needle in name:
            return short
    return None


def _grade_label(grade):
    """Label a grade: -1 for TR/DN/NP/IP (row is dropped), 1 if it contains 'W', else 0."""
    if any(code in grade for code in ('TR', 'DN', 'NP', 'IP')):
        return -1
    return 1 if 'W' in grade else 0


def _first_seen_encoder(categories):
    """Return a function mapping each value to its first-seen position, recording new values in ``categories``."""
    positions = {}

    def encode(value):
        if value not in positions:
            positions[value] = len(categories)
            categories.append(value)
        return positions[value]

    return encode


def _latest_model_file(directory):
    """Return ``(number, file_name)`` of the highest-numbered 'N=...sav' file, or ``(0, "")``."""
    best_number, best_name = 0, ""
    for item in os.listdir(directory):
        if not item.endswith(".sav"):
            continue
        try:
            number = int(item.split("=")[0])
        except ValueError:
            # A .sav file that does not follow the 'N=Name=accuracy.sav'
            # naming scheme is not one of ours; ignore it.
            continue
        if number > best_number:
            best_number, best_name = number, item
    return best_number, best_name


def _parse_saved_list(text):
    """Parse a list stored via ``str(list)`` into strings with quotes and spaces removed."""
    cleaned = str(text).replace("'", '').strip('][')
    return [item.replace(' ', '') for item in cleaned.split(', ')]


def _saved_index(categories, value, column):
    """Return ``value``'s code in ``categories``; raise ValueError naming ``column`` if unseen."""
    try:
        return categories.index(value)
    except ValueError:
        raise ValueError(column+" value '"+value+"' was not seen when the model was trained") from None


def _retrain(coset, document):
    """Train the candidate models on ``coset``, persist the best one and write the retraining report."""
    allowed = _ALLOWED_COLUMNS[1:]  # SID is not accepted for training
    problem = _column_problem(coset, allowed, "training")
    if problem is not None:
        return None, problem

    wanted = coset
    wanted['PROGRAM.1'] = wanted['PROGRAM.1'].apply(_short_major)
    wanted['GRADE'] = wanted['GRADE'].apply(_grade_label)
    wanted['TERM'] = wanted['TERM'].apply(_term_code)
    # Rows labelled -1 carry a grade that is excluded from training.
    wanted.drop(wanted[wanted['GRADE'] == -1].index, inplace=True)

    # Categorical columns become integer codes in order of first appearance;
    # the category lists are saved next to the model so prediction can reuse them.
    majors, catalog, acad_prog, instructor = [], [], [], []
    wanted['PROGRAM.1'] = wanted['PROGRAM.1'].apply(_first_seen_encoder(majors))
    wanted['CATALOG_NBR'] = wanted['CATALOG_NBR'].apply(_first_seen_encoder(catalog))
    wanted['PROGRAM'] = wanted['PROGRAM'].apply(_first_seen_encoder(acad_prog))
    wanted['INSTRUCTOR_ID'] = wanted['INSTRUCTOR_ID'].apply(_first_seen_encoder(instructor))

    document.add_paragraph(' ')
    document.add_heading('Retraining report', 0)
    document.add_paragraph('This report consists of the models retraining information on the new dataset with ('+str(len(wanted))+') records')

    # Select features in a fixed order so it always matches prediction.
    X = wanted[_FEATURE_COLUMNS]
    y = wanted['GRADE']
    smote = BorderlineSMOTE(random_state = 11)
    X_smote, y_smote = smote.fit_resample(X, y)
    kf = StratifiedKFold(n_splits=10)
    candidates = [
        KNeighborsClassifier(leaf_size=10, metric='manhattan'),
        RandomForestClassifier(max_depth=100),
        LGBMClassifier(n_estimators=200, num_leaves=60),
        VotingClassifier(estimators=[
            ('knn', KNeighborsClassifier(leaf_size=10, metric='manhattan')),
            ('rf', RandomForestClassifier(max_depth=100)),
            ('gm', LGBMClassifier(n_estimators=200, num_leaves=60)),
        ]),
    ]
    scored = []
    for model in candidates:
        # The fitted instance is what gets pickled; the cross-validated
        # predictions are only used to score it.
        model.fit(X_smote, y_smote)
        preds = cross_val_predict(model, X_smote.values, y_smote.values, cv=kf, n_jobs=-1)
        scored.append((model, measures(preds, y_smote.values)))
    document.add_paragraph(' ')

    table = document.add_table(rows=1, cols=2)
    header = table.rows[0].cells
    header[0].text = 'Name'
    header[1].text = 'Accuracy'
    for model, accuracy in scored:
        cells = table.add_row().cells
        cells[0].text = type(model).__name__
        cells[1].text = str(accuracy)
    table.style = 'TableGrid'

    # First model with the highest accuracy wins.
    model, acc = max(scored, key=lambda pair: pair[1])
    number = _latest_model_file(os.getcwd())[0] + 1
    filename = str(number)+"="+type(model).__name__+'='+str(acc)+'.sav'

    datavalues = {
        "majors": str(majors),
        'acad_prog': str(acad_prog),
        'catalog': str(catalog),
        'instructor': str(instructor),
    }
    pd.DataFrame(datavalues, index=[0]).to_csv(str(number)+"="+"values.csv")

    # The report and the returned log message now show the same percentage
    # (the report used to print the raw fraction followed by '%').
    summary = type(model).__name__+' has been chosen as the prediction model for achieving an accuracy of '+str(round(acc*100,2))+'%'
    document.add_paragraph(" ")
    document.add_paragraph(summary)
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)
    document.add_paragraph(" ")
    p = document.add_paragraph('For more like this contact us at ')
    # An e-mail address needs the mailto: scheme to be a usable link target.
    add_hyperlink(p, 'contact@mustafasa.com', "mailto:contact@mustafasa.com")
    savedoc(document,'retraining_report.docx')
    return 'retraining_report.docx', summary


def _predict(coset, document):
    """Predict drops for every row of ``coset`` with the newest saved model and write the report."""
    allowed = [col for col in _ALLOWED_COLUMNS if col != 'GRADE']
    problem = _column_problem(coset, allowed, "prediction")
    if problem is not None:
        return None, problem

    number, model_file_name = _latest_model_file(os.getcwd())
    if number == 0:
        return None,"No model found, please use retrain operation to build one"

    # Category lists written by the retrain operation alongside the model.
    saved = pd.read_csv(str(number)+"=values.csv")
    majors = _parse_saved_list(saved['majors'][0])
    acad_prog = _parse_saved_list(saved['acad_prog'][0])
    catalog = _parse_saved_list(saved['catalog'][0])
    instructor = _parse_saved_list(saved['instructor'][0])

    # NOTE(security): pickle executes arbitrary code on load; only model
    # files produced by the retrain operation should ever be in this directory.
    with open(model_file_name, 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    document.add_paragraph(' ')
    document.add_heading('Subjects drop prediction report', 0)
    document.add_paragraph('This report consists of students who might potentially drop courses they currently are studying based on the supplied information')

    features = []
    shown = []
    for _, row in coset.iterrows():
        catalog_nbr = str(row['CATALOG_NBR']).replace(' ', '')
        term = str(row['TERM'])
        # Every feature is encoded exactly as in training: same term codes,
        # same saved category lists (which are stored with spaces removed).
        features.append([
            _term_code(term),
            _saved_index(catalog, catalog_nbr, 'CATALOG_NBR'),
            _saved_index(instructor, str(row['INSTRUCTOR_ID']).replace(' ', ''), 'INSTRUCTOR_ID'),
            row['CGPA'],
            _saved_index(acad_prog, str(row['PROGRAM']).replace(' ', ''), 'PROGRAM'),
            _saved_index(majors, str(_short_major(row['PROGRAM.1'])), 'PROGRAM.1'),
        ])
        shown.append((str(row['SID']), term, catalog_nbr, str(row['INSTRUCTOR_ID']),
                      str(row['CGPA']), str(row['PROGRAM']), str(row['PROGRAM.1'])))

    # One batched call instead of one model call per row.
    predictions = loaded_model.predict(features) if features else []
    document.add_paragraph(' ')

    table = document.add_table(rows=1, cols=9)
    header = table.rows[0].cells
    titles = ('Index', 'Student ID', 'Term', 'Catalog ID', 'Instructor ID',
              'Cummulative GPA', 'Academic Program', 'Major', 'Possible Drop Prediction')
    for cell, title in zip(header, titles):
        cell.text = title

    total = 0
    droppers = 0
    for values, prediction in zip(shown, predictions):
        total = total + 1
        will_drop = prediction == 1
        if will_drop:
            droppers = droppers + 1
        cells = table.add_row().cells
        cells[0].text = str(total)
        for offset, value in enumerate(values, start=1):
            cells[offset].text = value
        cells[8].text = "Yes" if will_drop else "No"
    table.style = 'TableGrid'

    document.add_paragraph(" ")
    # File name layout: '<number>=<model name>=<accuracy>.sav'. Strip the
    # extension before parsing so accuracies such as '1.0' parse correctly.
    name_parts = model_file_name.split("=")
    accuracy_pct = round(float(os.path.splitext(name_parts[2])[0]) * 100, 2)
    lastpara = 'Out of '+str(total)+' records, it is predicted that '+str(droppers)+' courses might be withdrawn from (Prediction model name:'+name_parts[1]+'/Accuracy: '+str(accuracy_pct)+'%)'
    document.add_paragraph(lastpara)
    savedoc(document,'drop_prediction_report.docx')
    return 'drop_prediction_report.docx', lastpara+" (Model no."+name_parts[0]+")"


def greet(operation,filer):
    """Run the requested operation on an uploaded CSV and produce a .docx report.

    Args:
        operation: ``"retrain"`` trains and saves a new model; any other
            value runs a prediction with the newest saved model.
        filer: uploaded file object whose ``name`` attribute is the CSV
            path, or None when nothing was uploaded.

    Returns:
        ``(report_path, message)``. ``report_path`` is None when the request
        could not be served; ``message`` then explains why. Any exception
        raised while processing is reported through ``message`` rather than
        propagated, because this function is the UI callback.
    """
    try:
        if filer is None:
            return None,"Invalid file submitted"
        coset = pd.read_csv(filer.name)
        coset = coset.dropna(how='any')
        document = Document('temp.docx')
        if operation == "retrain":
            return _retrain(coset, document)
        return _predict(coset, document)
    except Exception as e:
        return None,str(e)

# Build the UI: an operation selector plus a CSV upload in, the generated
# report file plus a log message out.
iface = gr.Interface(
    fn=greet,
    inputs=[gr.Radio(["predict", 'retrain'], value="predict"), "file"],
    outputs=[gr.File(label='Report generated'), gr.Text(label='Log')],
)
# `debug` is an option of launch(), not of the Interface constructor.
iface.launch(debug=True)