Spaces:

IAUAI
/

drop_prediction

Runtime error

App Files Files Community

drop_prediction / app.py

XPMaster

Update app.py

6c55b82 over 2 years ago

raw

history blame contribute delete

14.3 kB

	import gradio as gr
	import pandas as pd
	import pickle
	import os
	from docx import Document
	from docx.shared import Inches
	from docx.dml.color import ColorFormat
	import sklearn
	from lightgbm import LGBMClassifier
	import numpy as np
	import pandas as pd
	from sklearn.linear_model import LogisticRegression
	from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
	from imblearn.under_sampling import RandomUnderSampler
	from sklearn.preprocessing import MinMaxScaler
	from imblearn.over_sampling import SMOTE, BorderlineSMOTE
	from imblearn.pipeline import Pipeline as imbpipeline
	from sklearn.pipeline import Pipeline
	from sklearn.model_selection import cross_val_score, cross_val_predict
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn import model_selection
	from sklearn.neural_network import MLPClassifier
	from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier,GradientBoostingClassifier, VotingClassifier
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.linear_model import LogisticRegression
	from sklearn.svm import SVC
	from sklearn.metrics import confusion_matrix
	from sklearn.feature_selection import SequentialFeatureSelector
	from sklearn.model_selection import GridSearchCV, StratifiedKFold
	import docx
	from docx.enum.dml import MSO_THEME_COLOR_INDEX

	def add_hyperlink(paragraph, text, url):
	    """Append a hyperlink that displays `text` and targets `url` to `paragraph`.

	    The link is built directly from OOXML elements and attached to a new run
	    of `paragraph`.

	    Returns the created ``w:hyperlink`` element.
	    """
	    make_element = docx.oxml.shared.OxmlElement

	    # Register the target as an external relationship on the part that owns
	    # the paragraph; the returned id is what the hyperlink element refers to.
	    owner_part = paragraph.part
	    relationship_id = owner_part.relate_to(
	        url,
	        docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK,
	        is_external=True,
	    )

	    # <w:hyperlink r:id="..."> wrapping a single <w:r> that holds the text.
	    link = make_element('w:hyperlink')
	    link.set(docx.oxml.shared.qn('r:id'), relationship_id)

	    text_run = make_element('w:r')
	    text_run.append(make_element('w:rPr'))
	    text_run.text = text
	    link.append(text_run)

	    # Attach the hyperlink element to a fresh run of the paragraph.
	    host_run = paragraph.add_run()
	    host_run._r.append(link)

	    # Workaround for templates without a hyperlink style: colour and
	    # underline the run by hand (it will not turn purple once visited).
	    # Drop these two lines when the template defines the hyperlink style.
	    host_run.font.color.theme_color = MSO_THEME_COLOR_INDEX.HYPERLINK
	    host_run.font.underline = True

	    return link

	def savedoc(document, name):
	    """Strip empty paragraphs from `document`, then save it as `name`.

	    Only paragraphs whose text is exactly the empty string are removed;
	    spacer paragraphs containing a single blank (' ') are kept.

	    Args:
	        document: object exposing `paragraphs` and `save(name)`.
	        name: destination path handed to `document.save`.
	    """
	    def delete_paragraph(paragraph):
	        """Detach the paragraph's XML element from its parent."""
	        element = paragraph._element
	        element.getparent().remove(element)
	        # Clear the references on the paragraph wrapper (not on the XML
	        # element) so the detached node cannot be reused through it.
	        paragraph._p = paragraph._element = None

	    for para in document.paragraphs:
	        if para.text == '':
	            delete_paragraph(para)
	    document.save(name)

	from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, cohen_kappa_score, f1_score, recall_score, precision_score
	def measures(predicted, y_test):
	    """Return the accuracy of `predicted` measured against the labels `y_test`.

	    Only accuracy is reported. Precision, recall, F1 and the confusion
	    matrix used to be computed here and discarded, so they are no longer
	    calculated.

	    Args:
	        predicted: predicted labels.
	        y_test: true labels, in the same order as `predicted`.

	    Returns:
	        The value of `accuracy_score(y_test, predicted)`.
	    """
	    return accuracy_score(y_test, predicted)

	# Columns a submitted CSV may contain. GRADE is the training label; SID is
	# rejected for training and only echoed into the prediction report.
	_ALL_COLUMNS = ['SID', 'TERM', 'CATALOG_NBR', 'INSTRUCTOR_ID', 'GRADE', 'CGPA', 'PROGRAM', 'PROGRAM.1']
	# Order of the features handed to the model; training and prediction must agree.
	_FEATURE_COLUMNS = ['TERM', 'CATALOG_NBR', 'INSTRUCTOR_ID', 'CGPA', 'PROGRAM', 'PROGRAM.1']
	# Keys stored in "<n>=values.csv", paired with the CSV column each one encodes.
	_ENCODED_COLUMNS = (
	    ('majors', 'PROGRAM.1'),
	    ('acad_prog', 'PROGRAM'),
	    ('catalog', 'CATALOG_NBR'),
	    ('instructor', 'INSTRUCTOR_ID'),
	)


	def _term_code(term):
	    """Map a TERM value to 0, 1 or 2 depending on whether it ends in "1", "2" or anything else."""
	    last = str(term)[-1:]
	    if last == "1":
	        return 0
	    if last == "2":
	        return 1
	    return 2


	def _short_major(name):
	    """Return the short code for a major name, or None when no known keyword matches."""
	    name = str(name)
	    for keyword, code in (("Computer Science", "CS"),
	                          ("Computer Information", "CIS"),
	                          ("Artificial", "AI"),
	                          ("Cyber", "CYS")):
	        if keyword in name:
	            return code
	    return None


	def _grade_label(grade):
	    """Return -1 for grades to discard (TR/DN/NP/IP), 1 for a grade containing 'W', else 0."""
	    grade = str(grade)
	    if any(marker in grade for marker in ('TR', 'DN', 'NP', 'IP')):
	        return -1
	    return 1 if 'W' in grade else 0


	def _label_encode(values):
	    """Encode `values` as integers in order of first appearance; return (codes, categories)."""
	    categories = []
	    position_of = {}
	    codes = []
	    for value in values:
	        if value not in position_of:
	            position_of[value] = len(categories)
	            categories.append(value)
	        codes.append(position_of[value])
	    return codes, categories


	def _column_problem(frame, allowed, purpose):
	    """Return a message describing an unexpected or missing column, or None when the columns are fine."""
	    for col in frame.columns:
	        if col not in allowed:
	            return str(col)+" is undefined column name, allowed columns for "+purpose+" are "+str(allowed)
	    for col in allowed:
	        if col not in frame.columns:
	            return str(col)+" column is missing, required columns for "+purpose+" are "+str(allowed)
	    return None


	def _latest_model(directory):
	    """Return (number, filename) of the highest-numbered "<n>=...sav" file, or (0, "") if there is none."""
	    best_number, best_name = 0, ""
	    for item in os.listdir(directory):
	        if not item.endswith(".sav"):
	            continue
	        try:
	            number = int(item.split("=")[0])
	        except ValueError:
	            # A .sav file that does not follow the "<n>=<model>=<acc>.sav" pattern.
	            continue
	        if number > best_number:
	            best_number, best_name = number, item
	    return best_number, best_name


	def _saved_categories(text):
	    """Parse a list stored as its `str()` form into a {normalised value: position} table."""
	    text = str(text)
	    table = {}
	    if "[" not in text:
	        return table
	    items = text.replace("'", '').strip('][').split(', ')
	    for position, item in enumerate(items):
	        # Spaces are stripped so lookups can use the same normalisation.
	        table.setdefault(item.replace(' ', ''), position)
	    return table


	def _lookup(table, value, column):
	    """Return the training-time code of `value`, raising a readable error for unseen values."""
	    key = str(value).replace(' ', '')
	    try:
	        return table[key]
	    except KeyError:
	        raise ValueError(
	            f"{column} value {value!r} was not present in the training data; "
	            "retrain the model with data that includes it"
	        ) from None


	def _retrain(coset, document):
	    """Train the candidate models on `coset`, persist the best one and write the report.

	    Writes "<n>=values.csv" (category lists), "<n>=<Model>=<accuracy>.sav"
	    (pickled model) and 'retraining_report.docx' in the working directory.

	    Returns:
	        ('retraining_report.docx', summary) on success, or (None, message)
	        when the columns of `coset` are not the expected ones.
	    """
	    allowed = _ALL_COLUMNS[1:]
	    problem = _column_problem(coset, allowed, "training")
	    if problem is not None:
	        return None, problem

	    wanted = coset.copy()
	    wanted['PROGRAM.1'] = wanted['PROGRAM.1'].apply(_short_major)
	    wanted['GRADE'] = wanted['GRADE'].apply(_grade_label)
	    wanted['TERM'] = wanted['TERM'].apply(_term_code)
	    wanted = wanted[wanted['GRADE'] != -1].copy()

	    categories = {}
	    for key, column in _ENCODED_COLUMNS:
	        codes, categories[key] = _label_encode(wanted[column])
	        wanted[column] = codes

	    document.add_paragraph(' ')
	    document.add_heading('Retraining report', 0)
	    document.add_paragraph('This report consists of the models retraining information on the new dataset with ('+str(len(wanted))+') records')

	    # Select the features in a fixed order so that prediction, which builds
	    # its rows in this same order, does not depend on the CSV column order.
	    X = wanted[_FEATURE_COLUMNS]
	    y = wanted['GRADE']
	    X_smote, y_smote = BorderlineSMOTE(random_state=11).fit_resample(X, y)
	    kf = StratifiedKFold(n_splits=10)
	    candidates = [
	        KNeighborsClassifier(leaf_size=10, metric='manhattan'),
	        RandomForestClassifier(max_depth=100),
	        LGBMClassifier(n_estimators=200, num_leaves=60),
	        VotingClassifier(estimators=[
	            ('knn', KNeighborsClassifier(leaf_size=10, metric='manhattan')),
	            ('rf', RandomForestClassifier(max_depth=100)),
	            ('gm', LGBMClassifier(n_estimators=200, num_leaves=60)),
	        ]),
	    ]
	    scores = []
	    for model in candidates:
	        # The fitted instance is the one that may be pickled below; the
	        # cross-validated predictions are only used to score it.
	        model.fit(X_smote, y_smote)
	        preds = cross_val_predict(model, X_smote.values, y_smote.values, cv=kf, n_jobs=-1)
	        scores.append((model, measures(preds, y_smote.values)))
	    document.add_paragraph(' ')

	    table = document.add_table(rows=1, cols=2)
	    header = table.rows[0].cells
	    header[0].text = 'Name'
	    header[1].text = 'Accuracy'
	    for model, score in scores:
	        cells = table.add_row().cells
	        cells[0].text = type(model).__name__
	        cells[1].text = str(score)
	    table.style = 'TableGrid'

	    best_model, acc = max(scores, key=lambda pair: pair[1])
	    number = _latest_model(os.getcwd())[0] + 1
	    filename = str(number)+"="+type(best_model).__name__+'='+str(acc)+'.sav'

	    # The category lists are stored as their str() form, one column per list.
	    datavalues = {key: str(categories[key]) for key, _ in _ENCODED_COLUMNS}
	    pd.DataFrame(datavalues, index=[0]).to_csv(str(number)+"="+"values.csv")

	    # acc is a fraction, so scale it before printing it next to a '%' sign.
	    summary = type(best_model).__name__+' has been chosen as the prediction model for achieving an accuracy of '+str(round(acc*100, 2))+'%'
	    document.add_paragraph(" ")
	    document.add_paragraph(summary)
	    with open(filename, 'wb') as handle:
	        pickle.dump(best_model, handle)
	    document.add_paragraph(" ")
	    p = document.add_paragraph('For more like this contact us at ')
	    # An e-mail address needs the mailto: scheme to be a usable link target.
	    add_hyperlink(p, 'contact@mustafasa.com', "mailto:contact@mustafasa.com")
	    savedoc(document, 'retraining_report.docx')
	    return 'retraining_report.docx', summary


	def _predict(coset, document):
	    """Predict course drops for every row of `coset` with the newest saved model.

	    Reads the highest-numbered "<n>=...sav" model and its "<n>=values.csv"
	    from the working directory and writes 'drop_prediction_report.docx'.

	    Returns:
	        ('drop_prediction_report.docx', summary) on success, or
	        (None, message) when the columns are wrong or no model exists.

	    Raises:
	        ValueError: a categorical value was not present in the training data.
	    """
	    allowed = [col for col in _ALL_COLUMNS if col != 'GRADE']
	    problem = _column_problem(coset, allowed, "prediction")
	    if problem is not None:
	        return None, problem

	    number, modelname = _latest_model(os.getcwd())
	    if number == 0:
	        return None, "No model found, please use retrain operation to build one"

	    saved = pd.read_csv(str(number)+"=values.csv")
	    tables = {key: _saved_categories(saved[key][0]) for key, _ in _ENCODED_COLUMNS}

	    # NOTE(security): pickle can execute arbitrary code while loading; only
	    # .sav files written by the retrain operation should be in this directory.
	    with open(modelname, 'rb') as handle:
	        loaded_model = pickle.load(handle)

	    document.add_paragraph(' ')
	    document.add_heading('Subjects drop prediction report', 0)
	    document.add_paragraph('This report consists of students who might potentially drop courses they currently are studying based on the supplied information')

	    records = []
	    features = []
	    for total, (_, row) in enumerate(coset.iterrows(), start=1):
	        catalog_nbr = str(row['CATALOG_NBR']).replace(' ', '')
	        # Every feature is encoded exactly as it was for training.
	        features.append([
	            _term_code(row['TERM']),
	            _lookup(tables['catalog'], catalog_nbr, 'CATALOG_NBR'),
	            _lookup(tables['instructor'], row['INSTRUCTOR_ID'], 'INSTRUCTOR_ID'),
	            row['CGPA'],
	            _lookup(tables['acad_prog'], row['PROGRAM'], 'PROGRAM'),
	            _lookup(tables['majors'], _short_major(row['PROGRAM.1']), 'PROGRAM.1'),
	        ])
	        records.append([str(total), str(row['SID']), str(row['TERM']), catalog_nbr,
	                        str(row['INSTRUCTOR_ID']), str(row['CGPA']), str(row['PROGRAM']),
	                        str(row['PROGRAM.1'])])
	    # One batched call instead of one predict() per row.
	    predictions = loaded_model.predict(features) if features else []
	    document.add_paragraph(' ')

	    table = document.add_table(rows=1, cols=9)
	    titles = ['Index', 'Student ID', 'Term', 'Catalog ID', 'Instructor ID',
	              'Cummulative GPA', 'Academic Program', 'Major', 'Possible Drop Prediction']
	    for cell, title in zip(table.rows[0].cells, titles):
	        cell.text = title
	    droppers = 0
	    for record, prediction in zip(records, predictions):
	        will_drop = bool(prediction == 1)
	        if will_drop:
	            droppers += 1
	        for cell, text in zip(table.add_row().cells, record + ["Yes" if will_drop else "No"]):
	            cell.text = text
	    table.style = 'TableGrid'

	    document.add_paragraph(" ")
	    # File name layout: "<n>=<model name>=<accuracy>.sav".
	    model_no, model_type, accuracy_text = modelname[:-len(".sav")].split("=")[:3]
	    accuracy = round(float(accuracy_text)*100, 2)
	    lastpara = 'Out of '+str(len(records))+' records, it is predicted that '+str(droppers)+' courses might be withdrawn from (Prediction model name:'+model_type+'/Accuracy: '+str(accuracy)+'%)'
	    document.add_paragraph(lastpara)
	    savedoc(document, 'drop_prediction_report.docx')
	    return 'drop_prediction_report.docx', lastpara+" (Model no."+model_no+")"


	def greet(operation, filer):
	    """Handle one submission: retrain the models or predict course drops.

	    Args:
	        operation: "retrain" to train and store a new model; any other
	            value runs a prediction with the newest stored model.
	        filer: uploaded file object whose `name` is the path of a CSV
	            file, or None when nothing was uploaded.

	    Returns:
	        (report path, log message) on success, or (None, message) when the
	        input is rejected or any step fails.
	    """
	    try:
	        if filer is None:
	            return None, "Invalid file submitted"
	        coset = pd.read_csv(filer.name).dropna(how='any')
	        document = Document('temp.docx')
	        if operation == "retrain":
	            return _retrain(coset, document)
	        return _predict(coset, document)
	    except Exception as e:
	        # Outermost boundary of the handler: report the failure in the log
	        # output instead of letting the exception propagate.
	        return None, str(e)

	# Web UI: a radio button selecting the operation plus a file upload go in;
	# the generated report file and a log message come out.
	# NOTE(review): `debug` is normally a `launch()` option - confirm that the
	# installed gradio version accepts it as an Interface argument.
	iface = gr.Interface(
	    fn=greet,
	    inputs=[
	        gr.Radio(["predict", 'retrain'], value="predict"),
	        "file",
	    ],
	    outputs=[
	        gr.File(label='Report generated'),
	        gr.Text(label='Log'),
	    ],
	    debug=True,
	)
	iface.launch()