File size: 14,285 Bytes
5ea5f23
 
 
96a6dc2
5ea5f23
 
 
ff82210
10a3892
ff82210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83a4df6
49fb311
d30b88c
49fb311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9441c29
 
 
 
 
 
 
 
 
 
49fb311
ff82210
 
 
 
 
 
 
 
5ea5f23
ff82210
a56ae9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ff82210
a56ae9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ff82210
a56ae9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c55b82
ee57959
a56ae9d
 
 
ff82210
 
 
 
a56ae9d
 
 
 
 
 
 
 
 
 
 
ff82210
a56ae9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9441c29
a56ae9d
 
ff82210
a56ae9d
 
 
 
2aaef92
a56ae9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9441c29
ff82210
 
a56ae9d
ff82210
a56ae9d
 
 
 
 
 
 
 
 
 
ff82210
 
a56ae9d
ff82210
a56ae9d
 
 
 
 
 
 
 
 
 
 
 
ff82210
a56ae9d
9441c29
a56ae9d
 
 
 
 
 
 
 
fda11d1
303406a
fda11d1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
import gradio as gr
import pandas as pd
import pickle
import os
from docx import Document
from docx.shared import Inches
from docx.dml.color import ColorFormat
import sklearn
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE, BorderlineSMOTE 
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier,GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import docx
from docx.enum.dml import MSO_THEME_COLOR_INDEX

def add_hyperlink(paragraph, text, url):
    """Append a clickable hyperlink to ``paragraph`` and return its XML element.

    Args:
        paragraph: python-docx paragraph that receives the link.
        text: Visible text of the link.
        url: Target registered as an external hyperlink relationship.

    Returns:
        The ``w:hyperlink`` element that was appended.
    """
    make_element = docx.oxml.shared.OxmlElement

    # Register the target in the part's relationships; the returned id is what
    # the w:hyperlink element points at.
    rel_id = paragraph.part.relate_to(
        url,
        docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK,
        is_external=True,
    )

    link = make_element('w:hyperlink')
    link.set(docx.oxml.shared.qn('r:id'), rel_id)

    # The link wraps a single run (with an empty w:rPr) carrying the text.
    inner_run = make_element('w:r')
    inner_run.append(make_element('w:rPr'))
    inner_run.text = text
    link.append(inner_run)

    # Hang the hyperlink off a freshly added run of the paragraph.
    host_run = paragraph.add_run()
    host_run._r.append(link)

    # Workaround for templates without a hyperlink style: colour and underline
    # the host run directly (the link will not turn purple once visited).
    # Drop these two lines when the template defines the hyperlink style.
    font = host_run.font
    font.color.theme_color = MSO_THEME_COLOR_INDEX.HYPERLINK
    font.underline = True

    return link

def savedoc(document,name):
    """Strip empty paragraphs from ``document`` and save it as ``name``.

    Only paragraphs whose text is exactly the empty string are removed;
    paragraphs holding a single space (used as spacers by the report
    builders in this file) are kept.

    Args:
        document: Document object exposing ``paragraphs`` and ``save``.
        name: Destination path passed to ``document.save``.
    """
    def delete_paragraph(paragraph):
        element = paragraph._element
        element.getparent().remove(element)
        # Invalidate the paragraph proxy itself (not the XML element) so it
        # cannot be used after its element has been detached from the tree.
        paragraph._p = paragraph._element = None

    for para in document.paragraphs:
        if para.text == '':
            delete_paragraph(para)
    document.save(name)
    
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, cohen_kappa_score, f1_score, recall_score, precision_score
def measures(predicted, y_test):
    """Return the accuracy of ``predicted`` measured against ``y_test``.

    Only accuracy is computed. Precision, recall, F1 and the confusion
    matrix used to be calculated here and thrown away, which cost time and
    made the function raise on non-binary labels even though the caller
    only ever received the accuracy.

    Args:
        predicted: Predicted labels.
        y_test: True labels, in the same order as ``predicted``.

    Returns:
        The value of ``accuracy_score(y_test, predicted)``.
    """
    return accuracy_score(y_test, predicted)

# Every column a submitted CSV may contain.
_ALLOWED_COLUMNS = ['SID', 'TERM', 'CATALOG_NBR', 'INSTRUCTOR_ID', 'GRADE', 'CGPA', 'PROGRAM', 'PROGRAM.1']
# Model inputs, in the fixed order used for both training and prediction.
_FEATURE_COLUMNS = ['TERM', 'CATALOG_NBR', 'INSTRUCTOR_ID', 'CGPA', 'PROGRAM', 'PROGRAM.1']


def _encode_term(term):
    """Encode a TERM value by its last character: "1" -> 0, "2" -> 1, else 2."""
    last = str(term)[-1]
    if last == "1":
        return 0
    if last == "2":
        return 1
    return 2


def _shorten_major(name):
    """Abbreviate a PROGRAM.1 description; returns None when nothing matches."""
    if "Computer Science" in name:
        return "CS"
    if "Computer Information" in name:
        return "CIS"
    if "Artificial" in name:
        return "AI"
    if "Cyber" in name:
        return "CYS"
    return None


def _binarize_grade(grade):
    """Return -1 for grades to discard, 1 for grades containing 'W', else 0."""
    if any(code in grade for code in ('TR', 'DN', 'NP', 'IP')):
        return -1
    return 1 if 'W' in grade else 0


def _encode_by_first_seen(series, seen):
    """Replace each value by its position in ``seen``, appending new values."""
    def encode(value):
        if value not in seen:
            seen.append(value)
        return seen.index(value)
    return series.apply(encode)


def _normalize_key(value):
    """Return ``value`` in the form category names take once reloaded from CSV."""
    return str(value).replace("'", '').replace(' ', '')


def _lookup_index(known, value, column, raw=None):
    """Return the training-time code of ``value``; raise ValueError if unseen."""
    key = _normalize_key(value)
    if key not in known:
        shown = value if raw is None else raw
        raise ValueError("Unknown "+column+" value '"+str(shown)+"': the current model was not trained on it, please retrain")
    return known.index(key)


def _model_number(filename):
    """Return the leading sequence number of a saved ``.sav`` model, or None."""
    if not filename.endswith(".sav"):
        return None
    try:
        return int(filename.split("=")[0])
    except ValueError:
        # Unrelated .sav file that does not follow "<n>=<model>=<acc>.sav".
        return None


def _check_columns(coset, allowed, purpose):
    """Return an error message when the CSV columns are unusable, else None."""
    for col in coset.columns:
        if col not in allowed:
            return str(col)+" is undefined column name, allowed columns for "+purpose+" are "+str(allowed)
    missing = [col for col in allowed if col not in coset.columns]
    if missing:
        return "Missing required column(s) for "+purpose+": "+str(missing)
    return None


def _retrain(coset, document):
    """Train the candidate models on ``coset``, save the best one, write the report."""
    training_cols = _ALLOWED_COLUMNS[1:]  # everything except SID
    problem = _check_columns(coset, training_cols, "training")
    if problem:
        return None, problem

    coset['PROGRAM.1'] = coset['PROGRAM.1'].apply(_shorten_major)
    coset['GRADE'] = coset['GRADE'].apply(_binarize_grade)
    coset['TERM'] = coset['TERM'].apply(_encode_term)
    coset.drop(coset[coset['GRADE'] == -1].index, inplace=True)

    # Category -> integer code mappings; a value's code is its list position.
    majors, catalog, acad_prog, instructor = [], [], [], []
    coset['PROGRAM.1'] = _encode_by_first_seen(coset['PROGRAM.1'], majors)
    coset['CATALOG_NBR'] = _encode_by_first_seen(coset['CATALOG_NBR'], catalog)
    coset['PROGRAM'] = _encode_by_first_seen(coset['PROGRAM'], acad_prog)
    coset['INSTRUCTOR_ID'] = _encode_by_first_seen(coset['INSTRUCTOR_ID'], instructor)

    document.add_paragraph(' ')
    document.add_heading('Retraining report', 0)
    document.add_paragraph('This report consists of the models retraining information on the new dataset with ('+str(len(coset))+') records')

    # Select features by name so the column order of the uploaded CSV cannot
    # disagree with the fixed order used to build prediction vectors.
    X = coset[_FEATURE_COLUMNS]
    y = coset['GRADE']
    X_smote, y_smote = BorderlineSMOTE(random_state = 11).fit_resample(X, y)
    kf = StratifiedKFold(n_splits=10)
    models = [
        KNeighborsClassifier(leaf_size=10, metric='manhattan'),
        RandomForestClassifier(max_depth=100),
        LGBMClassifier(n_estimators=200, num_leaves=60),
        VotingClassifier(estimators=[
            ('knn', KNeighborsClassifier(leaf_size=10, metric='manhattan')),
            ('rf', RandomForestClassifier(max_depth=100)),
            ('gm', LGBMClassifier(n_estimators=200, num_leaves=60)),
        ]),
    ]
    metrics = {}
    records = []
    for model in models:
        # The fitted instance is the one that may be pickled below;
        # cross_val_predict is used only to score it.
        model.fit(X_smote, y_smote)
        preds = cross_val_predict(model, X_smote.values, y_smote.values, cv=kf, n_jobs=-1)
        metrics[model] = measures(preds, y_smote.values)
        records.append((type(model).__name__, str(metrics[model])))
    document.add_paragraph(' ')

    table = document.add_table(rows=1, cols=2)
    hdr_cells = table.rows[0].cells
    hdr_cells[0].text = 'Name'
    hdr_cells[1].text = 'Accuracy'
    for model_name, accuracy in records:
        row_cells = table.add_row().cells
        row_cells[0].text = model_name
        row_cells[1].text = accuracy
    table.style = 'TableGrid'

    # Next sequence number = highest one among the saved models + 1.
    numbers = (_model_number(item) for item in os.listdir(os.getcwd()))
    number = max((n for n in numbers if n is not None), default=0) + 1

    model = max(metrics, key=metrics.get)
    acc = metrics[model]
    filename = str(number)+"="+type(model).__name__+'='+str(acc)+'.sav'

    # Persist the category mappings next to the model so that prediction can
    # reproduce the exact integer codes used for training.
    datavalues = {"majors":str(majors),
                  'acad_prog':str(acad_prog),
                  'catalog':str(catalog),
                  'instructor':str(instructor)}
    pd.DataFrame(datavalues, index=[0]).to_csv(str(number)+"="+"values.csv")

    # acc is a fraction; show it as a percentage wherever a '%' sign follows.
    summary = type(model).__name__+' has been chosen as the prediction model for achieving an accuracy of '+str(round(acc*100,2))+'%'
    document.add_paragraph(" ")
    document.add_paragraph(summary)
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)
    document.add_paragraph(" ")
    p = document.add_paragraph('For more like this contact us at ')
    # A bare e-mail address is not a valid link target; it needs the scheme.
    add_hyperlink(p, 'contact@mustafasa.com', "mailto:contact@mustafasa.com")
    savedoc(document,'retraining_report.docx')
    return 'retraining_report.docx', summary


def _predict(coset, document):
    """Run the newest saved model over ``coset`` and write the prediction report."""
    prediction_cols = [col for col in _ALLOWED_COLUMNS if col != 'GRADE']
    problem = _check_columns(coset, prediction_cols, "prediction")
    if problem:
        return None, problem

    # Pick the saved model with the highest sequence number.
    modelname = ""
    maxnum = 0
    for item in os.listdir(os.getcwd()):
        n = _model_number(item)
        if n is not None and n > maxnum:
            maxnum = n
            modelname = item
    if maxnum == 0:
        return None,"No model found, please use retrain operation to build one"

    # Reload the category mappings written at training time. They are stored
    # as stringified lists; list-valued cells are taken in file order:
    # majors, acad_prog, catalog, instructor.
    dfv = pd.read_csv(str(maxnum)+"=values.csv")
    majors, acad_prog, catalog, instructor = [], [], [], []
    targets = [majors, acad_prog, catalog, instructor]
    filled = 0
    for column in dfv.columns:
        cell = str(dfv[column][0])
        if "[" in cell:
            targets[filled][:] = [_normalize_key(e) for e in cell.strip('][').split(', ')]
            filled += 1

    # NOTE(security): unpickling executes arbitrary code. Only .sav files
    # written by the retrain operation should ever be in the working directory.
    with open(modelname, 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    document.add_paragraph(' ')
    document.add_heading('Subjects drop prediction report', 0)
    document.add_paragraph('This report consists of students who might potentially drop courses they currently are studying based on the supplied information')

    features = []
    shown_rows = []
    for _, row in coset.iterrows():
        term_text = str(row['TERM'])
        features.append([
            # Same encoders and stored mappings as training, so the codes agree.
            _encode_term(term_text),
            _lookup_index(catalog, row['CATALOG_NBR'], 'CATALOG_NBR'),
            _lookup_index(instructor, row['INSTRUCTOR_ID'], 'INSTRUCTOR_ID'),
            row['CGPA'],
            _lookup_index(acad_prog, row['PROGRAM'], 'PROGRAM'),
            _lookup_index(majors, _shorten_major(row['PROGRAM.1']), 'PROGRAM.1', raw=row['PROGRAM.1']),
        ])
        shown_rows.append((str(row['SID']), term_text,
                           str(row['CATALOG_NBR']).replace(' ', ''),
                           str(row['INSTRUCTOR_ID']), str(row['CGPA']),
                           str(row['PROGRAM']), str(row['PROGRAM.1'])))
    # One batched call instead of one model invocation per row.
    predictions = loaded_model.predict(features) if features else []
    document.add_paragraph(' ')

    table = document.add_table(rows=1, cols=9)
    headers = ('Index', 'Student ID', 'Term', 'Catalog ID', 'Instructor ID',
               'Cummulative GPA', 'Academic Program', 'Major',
               'Possible Drop Prediction')
    for cell, title in zip(table.rows[0].cells, headers):
        cell.text = title

    total = 0
    droppers = 0
    for shown, prediction in zip(shown_rows, predictions):
        total += 1
        is_drop = prediction == 1
        if is_drop:
            droppers += 1
        values = (str(total),) + shown + ("Yes" if is_drop else "No",)
        for cell, value in zip(table.add_row().cells, values):
            cell.text = value
    table.style = 'TableGrid'

    document.add_paragraph(" ")
    # File name layout: "<number>=<model class>=<accuracy>.sav". Strip the
    # extension before parsing so short accuracies such as "1.0" stay valid.
    parts = os.path.splitext(modelname)[0].split("=")
    accuracy_pct = round(float(parts[2])*100, 2)
    lastpara = 'Out of '+str(total)+' records, it is predicted that '+str(droppers)+' courses might be withdrawn from (Prediction model name:'+parts[1]+'/Accuracy: '+str(accuracy_pct)+'%)'
    document.add_paragraph(lastpara)
    savedoc(document,'drop_prediction_report.docx')
    return 'drop_prediction_report.docx', lastpara+" (Model no."+parts[0]+")"


def greet(operation,filer):
    """Handle one request from the UI: retrain the models or predict drops.

    Args:
        operation: "retrain" trains and stores a new model; any other value
            runs a prediction with the newest stored model.
        filer: Uploaded file object whose ``name`` attribute is the path of a
            CSV file, or None when nothing was uploaded.

    Returns:
        A ``(report_path, message)`` tuple. ``report_path`` is the generated
        .docx report, or None when the request failed, in which case
        ``message`` describes the problem.
    """
    try:
        if filer is None:
            return None,"Invalid file submitted"
        coset = pd.read_csv(filer.name).dropna(how='any')
        document = Document('temp.docx')
        if operation == "retrain":
            return _retrain(coset, document)
        return _predict(coset, document)
    except Exception as e:
        # UI boundary: report any failure in the log box instead of crashing.
        return None,str(e)

# Web front end: the user picks an operation and uploads a CSV file; greet()
# returns the generated report file together with a log message.
iface = gr.Interface(
    fn=greet,
    inputs=[
        gr.Radio(["predict", 'retrain'], value="predict"),
        "file",
    ],
    outputs=[
        gr.File(label='Report generated'),
        gr.Text(label='Log'),
    ],
    debug=True,
)
iface.launch()