import io
from collections import Counter

import numpy as np
import pandas as pd
import gradio as gd
from sklearn.model_selection import train_test_split


class ClassID3Decisiontree:
    """ID3 decision-tree classifier for categorical features.

    The fitted tree is a nested dict of the form
    ``{feature_name: {feature_value: subtree_or_leaf, ...}}``;
    a leaf is a raw class label.
    """

    def __init__(self):
        # Empty until fit() is called.
        self.tree = {}

    def fit(self, X, y):
        """Build the tree from DataFrame ``X`` and label Series ``y``.

        Args:
            X: pandas DataFrame of categorical feature columns.
            y: pandas Series of class labels aligned with ``X``.
        """
        self.features = list(X.columns)
        self.tree = self.id3(X, y, self.features)

    def id3(self, X, y, features):
        """Recursively construct an ID3 subtree; returns a dict or a leaf label."""
        # All remaining samples share one label -> pure leaf.
        if len(set(y)) == 1:
            return y.iloc[0]
        # No features left to split on -> majority-class leaf.
        if len(features) == 0:
            return Counter(y).most_common(1)[0][0]
        best_feature = self.choose_best_features(X, y, features)
        # FIX: when no feature yields positive information gain,
        # choose_best_features returns None; the original code then built
        # a bogus {None: {...}} subtree. Return the majority class instead.
        if best_feature is None:
            return Counter(y).most_common(1)[0][0]
        tree = {best_feature: {}}
        remaining = [f for f in features if f != best_feature]
        # Branch on every observed value of the chosen feature.
        for value in X[best_feature].unique():
            mask = X[best_feature] == value
            sub_X = X[mask].drop([best_feature], axis=1)
            sub_y = y[mask]
            tree[best_feature][value] = self.id3(sub_X, sub_y, remaining)
        return tree

    def choose_best_features(self, X, y, features):
        """Return the feature with the largest information gain.

        Returns None when no feature achieves a strictly positive gain.
        """
        best_gain = 0
        best_feature = None
        all_entropy = self.calc_all_entropy(y)
        for feature in features:
            info_gain = all_entropy - self.calc_conditions_entropy(X, y, feature)
            if info_gain > best_gain:
                best_gain = info_gain
                best_feature = feature
        return best_feature

    def calc_all_entropy(self, y):
        """Shannon entropy (base 2) of the label series ``y``.

        FIX: the original hard-coded the two labels '否'/'是', producing
        wrong entropies for any other label set; counting every observed
        label generalizes it while giving identical results on the
        original two-label data.
        """
        counts = np.array(list(Counter(y).values()))
        probabilities = counts / len(y)
        # Skip zero probabilities: lim p->0 of p*log2(p) is 0.
        return -np.sum([p * np.log2(p) for p in probabilities if p > 0])

    def calc_conditions_entropy(self, X, y, feature):
        """Conditional entropy H(y | feature): weighted entropy of each branch."""
        cond_entropy = 0
        for value in X[feature].unique():
            sub_y = y[X[feature] == value]
            weight = len(sub_y) / len(y)
            cond_entropy += weight * self.calc_all_entropy(sub_y)
        return cond_entropy

    def predict(self, X):
        """Predict one label per row of DataFrame ``X``; unknown paths give None."""
        return np.array(
            [self._predict(self.tree, sample) for _, sample in X.iterrows()]
        )

    def _predict(self, tree, sample):
        """Walk the tree for one sample (a row Series); None if a value is unseen."""
        # Reached a leaf label.
        if not isinstance(tree, dict):
            return tree
        root = next(iter(tree))
        feature_value = sample[root]
        # Descend only if this feature value was seen during training.
        if feature_value in tree[root]:
            return self._predict(tree[root][feature_value], sample)
        return None


def run(SeZe, GenDi, QiaoSheng, WengLi, QiBu, ChuGan):
    """Gradio callback: train on ./1.txt and classify one user-described melon.

    Args are the six categorical feature values chosen in the UI
    (may be None while the live form is incomplete).
    Returns a result string.
    """
    # FIX: normalize the separator in memory instead of destructively
    # rewriting ./1.txt on every invocation as the original did.
    with open('./1.txt', mode='r', encoding='utf-8') as f:
        data = f.read().replace(',', ' ')
    text = pd.read_table(io.StringIO(data), sep=' ')
    X = text.drop(['编号', '好瓜'], axis=1)
    y = text['好瓜']
    # Train a fresh model on the full dataset (no held-out split here).
    model = ClassID3Decisiontree()
    model.fit(X, y)
    # FIX: renamed from `dict`, which shadowed the builtin.
    sample_dict = {
        "色泽": [SeZe],
        "根蒂": [GenDi],
        "敲声": [QiaoSheng],
        "纹理": [WengLi],
        "脐部": [QiBu],
        "触感": [ChuGan],
    }
    user_input = pd.DataFrame(sample_dict)
    predictions = model.predict(user_input)
    # FIX: explicit element check instead of truth-testing a NumPy array.
    if predictions[0] is not None:
        return f'你的瓜是好瓜? {predictions}'
    return "条件不足以判断好瓜坏瓜"


# Option lists for the six Gradio radio inputs.
SeZe = ["青绿", "乌黑", "浅白"]
GenDi = ["硬挺", "稍蜷", "蜷缩"]
QiaoSheng = ["浊响", "沉闷", "清脆"]
WengLi = ["清晰", "稍糊", "模糊"]
QiBu = ["凹陷", "平坦", "稍凹"]
ChuGan = ["硬滑", "软粘"]

# FIX: guard the UI launch so importing this module no longer starts a server.
if __name__ == "__main__":
    gd_watermelon = gd.Interface(
        fn=run,
        inputs=[
            gd.Radio(SeZe),
            gd.Radio(GenDi),
            gd.Radio(QiaoSheng),
            gd.Radio(WengLi),
            gd.Radio(QiBu),
            gd.Radio(ChuGan),
        ],
        outputs='text',
        live=True,
    )
    gd_watermelon.launch()