Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import pandas as pd | |
| import pickle | |
| import os | |
| from docx import Document | |
| from docx.shared import Inches | |
| from docx.dml.color import ColorFormat | |
| import sklearn | |
| from lightgbm import LGBMClassifier | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold | |
| from imblearn.under_sampling import RandomUnderSampler | |
| from sklearn.preprocessing import MinMaxScaler | |
| from imblearn.over_sampling import SMOTE, BorderlineSMOTE | |
| from imblearn.pipeline import Pipeline as imbpipeline | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.model_selection import cross_val_score, cross_val_predict | |
| from sklearn.neighbors import KNeighborsClassifier | |
| from sklearn import model_selection | |
| from sklearn.neural_network import MLPClassifier | |
| from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier,GradientBoostingClassifier, VotingClassifier | |
| from sklearn.tree import DecisionTreeClassifier | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.svm import SVC | |
| from sklearn.metrics import confusion_matrix | |
| from sklearn.feature_selection import SequentialFeatureSelector | |
| from sklearn.model_selection import GridSearchCV, StratifiedKFold | |
| import docx | |
| from docx.enum.dml import MSO_THEME_COLOR_INDEX | |
def add_hyperlink(paragraph, text, url):
    """Append a clickable external link to *paragraph*.

    paragraph: python-docx paragraph that receives the link.
    text: visible link text.
    url: link target, registered as an external relationship.

    Returns the newly created ``w:hyperlink`` XML element.
    """
    make_element = docx.oxml.shared.OxmlElement

    # Register the target in the owning part's relationships
    # (document.xml.rels) and keep the relationship id it hands back.
    owner = paragraph.part
    rel_id = owner.relate_to(
        url,
        docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK,
        is_external=True,
    )

    # <w:hyperlink r:id="..."> wrapping a single <w:r> that carries the text.
    link = make_element('w:hyperlink')
    link.set(docx.oxml.shared.qn('r:id'), rel_id)

    inner_run = make_element('w:r')
    inner_run.append(make_element('w:rPr'))
    inner_run.text = text
    link.append(inner_run)

    # Host the hyperlink element inside a fresh run of the paragraph.
    host_run = paragraph.add_run()
    host_run._r.append(link)

    # Workaround for templates without a hyperlink style: colour and
    # underline the host run directly (the link will not turn purple after
    # being visited). Drop this when the template defines the style.
    host_font = host_run.font
    host_font.color.theme_color = MSO_THEME_COLOR_INDEX.HYPERLINK
    host_font.underline = True

    return link
def savedoc(document, name):
    """Strip empty paragraphs from *document* and save it to *name*.

    document: object exposing ``paragraphs`` (items with ``text`` and
        ``_element``) and ``save(name)`` - in this file a python-docx
        Document.
    name: destination path handed to ``document.save``.

    Only paragraphs whose text is exactly the empty string are removed;
    spacer paragraphs containing a single space are kept.
    """
    for para in document.paragraphs:
        if para.text != '':
            continue
        element = para._element
        element.getparent().remove(element)
        # Clear the references on the paragraph wrapper (not on the XML
        # element, as the previous code did) so the detached paragraph
        # cannot be used by accident afterwards.
        para._p = para._element = None
    document.save(name)
| from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, cohen_kappa_score, f1_score, recall_score, precision_score | |
def measures(predicted, y_test):
    """Return the accuracy of *predicted* against the true labels *y_test*.

    Note the argument order: predictions first, ground truth second.

    The previous version also computed precision, recall, F1 and the
    confusion matrix and then discarded them; that dead work has been
    removed, so only the value that is actually returned is computed.
    """
    return accuracy_score(y_test, predicted)
# Every column the app understands. SID is only used for prediction and
# GRADE only for retraining.
_ALL_COLUMNS = ['SID', 'TERM', 'CATALOG_NBR', 'INSTRUCTOR_ID', 'GRADE', 'CGPA', 'PROGRAM', 'PROGRAM.1']
# Order of the feature vector given to the model. Training and prediction
# must agree on it, so both paths use this one list.
_FEATURE_COLUMNS = ['TERM', 'CATALOG_NBR', 'INSTRUCTOR_ID', 'CGPA', 'PROGRAM', 'PROGRAM.1']
# Columns of the "<n>=values.csv" side file that stores the category lists.
_CATEGORY_COLUMNS = ('majors', 'acad_prog', 'catalog', 'instructor')
_MODEL_SUFFIX = '.sav'


def _encode_term(term):
    """Map a term code to 0/1/2 based on its last character ("1", "2", other)."""
    last = str(term)[-1]
    if last == "1":
        return 0
    if last == "2":
        return 1
    return 2


def _shorten_major(major):
    """Return the short code for a major name, or None when it is not recognised."""
    if "Computer Science" in major:
        return "CS"
    if "Computer Information" in major:
        return "CIS"
    if "Artificial" in major:
        return "AI"
    if "Cyber" in major:
        return "CYS"
    return None


def _binarize_grade(grade):
    """Return 1 for a grade containing 'W', 0 otherwise, -1 for grades to discard."""
    for marker in ('TR', 'DN', 'NP', 'IP'):
        if marker in grade:
            return -1
    return 1 if 'W' in grade else 0


def _check_columns(coset, allowedcols, purpose):
    """Return an error message if the CSV columns are wrong, else None."""
    present = list(coset.columns)
    for col in present:
        if col not in allowedcols:
            return str(col)+" is undefined column name, allowed columns for "+purpose+" are "+str(allowedcols)
    missing = [col for col in allowedcols if col not in present]
    if missing:
        return "Missing required column(s) "+str(missing)+", allowed columns for "+purpose+" are "+str(allowedcols)
    return None


def _label_encode(values):
    """Number *values* by order of first appearance; return (codes, labels)."""
    lookup = {}
    labels = []
    codes = []
    for value in values:
        if value not in lookup:
            lookup[value] = len(labels)
            labels.append(value)
        codes.append(lookup[value])
    return codes, labels


def _normalize_category(value):
    """Apply the same normalisation the stored category lists go through."""
    return str(value).replace("'", '').replace(' ', '')


def _load_categories(path):
    """Read a values CSV and return {column: {normalised value: index}}."""
    dfv = pd.read_csv(path)
    categories = {}
    for column in _CATEGORY_COLUMNS:
        # Each cell holds str(list); undo that by hand so files written by
        # earlier versions of this app stay readable.
        raw = str(dfv[column][0]).replace("'", '')
        lookup = {}
        for position, entry in enumerate(raw.strip('][').split(', ')):
            # setdefault keeps the first index, matching list.index().
            lookup.setdefault(entry.replace(' ', ''), position)
        categories[column] = lookup
    return categories


def _category_index(lookup, value, column):
    """Return the training-time code of *value*, with a readable error if unseen."""
    key = _normalize_category(value)
    if key not in lookup:
        raise ValueError(key+" in column "+column+" was not present when the model was trained; retrain with data that includes it")
    return lookup[key]


def _latest_model(dir_name):
    """Return (number, filename) of the highest-numbered '<n>=...sav' file, or (0, '')."""
    best_number, best_name = 0, ""
    for item in os.listdir(dir_name):
        if not item.endswith(_MODEL_SUFFIX):
            continue
        try:
            number = int(item.split("=")[0])
        except ValueError:
            # Unrelated .sav file that does not follow the naming scheme.
            continue
        if number > best_number:
            best_number, best_name = number, item
    return best_number, best_name


def _add_table(document, headers, rows):
    """Append a grid table with a header row followed by *rows*."""
    table = document.add_table(rows=1, cols=len(headers))
    for cell, title in zip(table.rows[0].cells, headers):
        cell.text = title
    for row in rows:
        for cell, value in zip(table.add_row().cells, row):
            cell.text = str(value)
    table.style = 'TableGrid'
    return table


def _retrain(coset):
    """Train the candidate models on *coset*, keep the best one and write the report."""
    allowedcols = _ALL_COLUMNS[1:]  # SID is not a training feature
    problem = _check_columns(coset, allowedcols, "training")
    if problem is not None:
        return None, problem
    document = Document('temp.docx')

    wanted = coset
    wanted['PROGRAM.1'] = wanted['PROGRAM.1'].apply(_shorten_major)
    wanted['GRADE'] = wanted['GRADE'].apply(_binarize_grade)
    wanted['TERM'] = wanted['TERM'].apply(_encode_term)
    # Rows graded TR/DN/NP/IP carry no withdraw/complete signal.
    wanted = wanted[wanted['GRADE'] != -1].copy()

    wanted['PROGRAM.1'], majors = _label_encode(wanted['PROGRAM.1'])
    wanted['CATALOG_NBR'], catalog = _label_encode(wanted['CATALOG_NBR'])
    wanted['PROGRAM'], acad_prog = _label_encode(wanted['PROGRAM'])
    wanted['INSTRUCTOR_ID'], instructor = _label_encode(wanted['INSTRUCTOR_ID'])

    document.add_paragraph(' ')
    document.add_heading('Retraining report', 0)
    document.add_paragraph('This report consists of the models retraining information on the new dataset with ('+str(len(wanted))+') records')

    # Fix the column order so it cannot depend on the uploaded CSV layout.
    X = wanted[_FEATURE_COLUMNS]
    y = wanted['GRADE']
    X_smote, y_smote = BorderlineSMOTE(random_state=11).fit_resample(X, y)
    kf = StratifiedKFold(n_splits=10)
    models1 = [
        KNeighborsClassifier(leaf_size=10, metric='manhattan'),
        RandomForestClassifier(max_depth=100),
        LGBMClassifier(n_estimators=200, num_leaves=60),
        VotingClassifier(estimators=[
            ('knn', KNeighborsClassifier(leaf_size=10, metric='manhattan')),
            ('rf', RandomForestClassifier(max_depth=100)),
            ('gm', LGBMClassifier(n_estimators=200, num_leaves=60)),
        ]),
    ]
    metrics = dict()
    records = []
    for model in models1:
        # The fitted instance is what gets pickled; cross_val_predict works
        # on clones and only provides the accuracy estimate.
        model.fit(X_smote, y_smote)
        preds = cross_val_predict(model, X_smote.values, y_smote.values, cv=kf, n_jobs=-1)
        metrics[model] = measures(preds, y_smote.values)
        records.append((type(model).__name__, str(metrics[model])))
    document.add_paragraph(' ')
    _add_table(document, ['Name', 'Accuracy'], records)

    model = max(metrics, key=metrics.get)
    acc = metrics[model]
    number = _latest_model(str(os.getcwd()))[0] + 1
    filename = str(number)+"="+type(model).__name__+'='+str(acc)+_MODEL_SUFFIX
    datavalues = {"majors": str(majors),
                  'acad_prog': str(acad_prog),
                  'catalog': str(catalog),
                  'instructor': str(instructor)}
    pd.DataFrame(datavalues, index=[0]).to_csv(str(number)+"="+"values.csv")

    summary = type(model).__name__+' has been chosen as the prediction model for achieving an accuracy of '+str(round(acc*100, 2))+'%'
    document.add_paragraph(" ")
    document.add_paragraph(summary)
    with open(filename, 'wb') as handle:
        pickle.dump(model, handle)
    document.add_paragraph(" ")
    p = document.add_paragraph('For more like this contact us at ')
    # An e-mail link needs the mailto: scheme to be clickable.
    add_hyperlink(p, 'contact@mustafasa.com', "mailto:contact@mustafasa.com")
    savedoc(document, 'retraining_report.docx')
    return 'retraining_report.docx', str(summary)


def _predict(coset):
    """Score every row of *coset* with the newest saved model and write the report."""
    allowedcols = [col for col in _ALL_COLUMNS if col != 'GRADE']
    problem = _check_columns(coset, allowedcols, "prediction")
    if problem is not None:
        return None, problem

    maxnum, modelname = _latest_model(str(os.getcwd()))
    if maxnum == 0:
        return None, "No model found, please use retrain operation to build one"
    categories = _load_categories(str(maxnum)+"=values.csv")
    # NOTE(security): pickle.load runs arbitrary code from the file. This is
    # only acceptable because the .sav files are written by this app itself;
    # never point it at a model file from an untrusted source.
    with open(modelname, 'rb') as handle:
        loaded_model = pickle.load(handle)

    document = Document('temp.docx')
    document.add_paragraph(' ')
    document.add_heading('Subjects drop prediction report', 0)
    document.add_paragraph('This report consists of students who might potentially drop courses they currently are studying based on the supplied information')

    features = []
    described = []
    for _, row in coset.iterrows():
        catalog_nbr = str(row['CATALOG_NBR']).replace(' ', '')
        term = str(row['TERM'])
        # Encode exactly as in training: same term coding, same category
        # numbering (from the saved lists), same column order.
        features.append([
            _encode_term(term),
            _category_index(categories['catalog'], catalog_nbr, 'CATALOG_NBR'),
            _category_index(categories['instructor'], row['INSTRUCTOR_ID'], 'INSTRUCTOR_ID'),
            row['CGPA'],
            _category_index(categories['acad_prog'], row['PROGRAM'], 'PROGRAM'),
            _category_index(categories['majors'], _shorten_major(row['PROGRAM.1']), 'PROGRAM.1'),
        ])
        described.append([str(row['SID']), term, catalog_nbr, str(row['INSTRUCTOR_ID']),
                          str(row['CGPA']), str(row['PROGRAM']), str(row['PROGRAM.1'])])

    # One batched call instead of one predict() per row.
    predictions = loaded_model.predict(features) if features else []
    total = len(described)
    droppers = 0
    records = []
    for position, (fields, prediction) in enumerate(zip(described, predictions), start=1):
        will_drop = prediction == 1
        if will_drop:
            droppers = droppers + 1
        records.append([str(position)] + fields + ["Yes" if will_drop else "No"])

    document.add_paragraph(' ')
    _add_table(document,
               ['Index', 'Student ID', 'Term', 'Catalog ID', 'Instructor ID',
                'Cummulative GPA', 'Academic Program', 'Major', 'Possible Drop Prediction'],
               records)
    document.add_paragraph(" ")

    parts = modelname.split("=")
    # The accuracy is the third "="-separated field minus the ".sav" suffix;
    # slicing a fixed six characters broke on short values such as "0.95".
    accuracy = round(float(parts[2][:-len(_MODEL_SUFFIX)])*100, 2)
    lastpara = 'Out of '+str(total)+' records, it is predicted that '+str(droppers)+' courses might be withdrawn from (Prediction model name:'+parts[1]+'/Accuracy: '+str(accuracy)+'%)'
    document.add_paragraph(lastpara)
    savedoc(document, 'drop_prediction_report.docx')
    return 'drop_prediction_report.docx', lastpara+" (Model no."+parts[0]+")"


def greet(operation, filer):
    """Gradio callback: retrain the drop-prediction model or run a prediction.

    operation: "retrain" trains on the uploaded CSV; any other value
        (the UI offers "predict") scores the CSV with the newest saved model.
    filer: uploaded file object exposing a ``name`` path, or None.

    Returns ``(report_path, log_message)``. On any failure the report path
    is None and the message describes the problem; nothing is raised.
    """
    try:
        if filer is None:
            return None, "Invalid file submitted"
        coset = pd.read_csv(filer.name).dropna(how='any')
        if operation == "retrain":
            return _retrain(coset)
        return _predict(coset)
    except Exception as e:
        # UI boundary: show the error in the log box instead of crashing.
        return None, str(e)
# Build the UI: an operation selector plus a CSV upload in, the generated
# report file plus a log message out.
iface = gr.Interface(
    fn=greet,
    inputs=[gr.Radio(["predict", 'retrain'], value="predict"), "file"],
    outputs=[gr.File(label='Report generated'), gr.Text(label='Log')],
)
# `debug` is an option of launch(), not of the Interface constructor.
iface.launch(debug=True)