cassiomo committed on
Commit
ac58d6a
·
verified ·
1 Parent(s): 0999259

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -439
app.py CHANGED
@@ -563,445 +563,6 @@ st.plotly_chart(fig_bar)
563
  # # In[ ]:
564
  #
565
 
566
- final_df = pd.read_csv('./data/training.csv')
567
- final_df.tail()
568
-
569
-
570
- # # GROUP STAGE MODELING
571
-
572
- # ### Choosing a model
573
-
574
- # In[4]:
575
-
576
-
577
- # I save the original data frame in a flag to then train the final pipeline
578
- pipe_DF = final_df
579
- # Dummies for categorical columns
580
- final_df = pd.get_dummies(final_df)
581
-
582
-
583
- # I split the dataset into training, testing and validation.
584
-
585
- # In[5]:
586
-
587
-
588
- X = final_df.drop('Team1_Result',axis=1)
589
- y = final_df['Team1_Result']
590
- from sklearn.model_selection import train_test_split
591
- X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
592
- X_hold_test, X_test, y_hold_test, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=42)
593
-
594
-
595
- # Scaling
596
-
597
- # In[6]:
598
-
599
-
600
- from sklearn.preprocessing import StandardScaler
601
- scaler = StandardScaler()
602
- X_train = scaler.fit_transform(X_train)
603
- X_test = scaler.transform(X_test)
604
- X_hold_test = scaler.transform(X_hold_test)
605
-
606
-
607
- # Defining function to display the confusion matrix quickly.
608
-
609
- # In[7]:
610
-
611
-
612
- from sklearn.metrics import classification_report,ConfusionMatrixDisplay
613
- def metrics_display(model):
614
- model.fit(X_train,y_train)
615
- y_pred = model.predict(X_test)
616
- print(classification_report(y_test,y_pred))
617
- ConfusionMatrixDisplay.from_predictions(y_test,y_pred);
618
-
619
-
620
- # * **Random Forest**
621
-
622
- # In[8]:
623
-
624
-
625
- from sklearn.ensemble import RandomForestClassifier
626
- metrics_display(RandomForestClassifier())
627
-
628
-
629
- # * **Ada Boost Classifier**
630
-
631
- # In[9]:
632
-
633
-
634
- from sklearn.ensemble import AdaBoostClassifier
635
- metrics_display(AdaBoostClassifier())
636
-
637
-
638
- # * **XGB Boost**
639
-
640
- # In[10]:
641
-
642
-
643
- from xgboost import XGBClassifier
644
- metrics_display(XGBClassifier(use_label_encoder=False))
645
-
646
-
647
- # * **Neural network**
648
- #
649
- #
650
-
651
- # In[11]:
652
-
653
-
654
- import keras
655
- from keras import Sequential
656
- from keras.layers import Dense,Dropout
657
- from keras import Input
658
-
659
- X_train.shape
660
-
661
-
662
- # In[12]:
663
-
664
-
665
- model = Sequential()
666
- model.add(Input(shape=(404,)))
667
- model.add(Dense(300,activation='relu'))
668
- model.add(Dropout(0.3))
669
- model.add(Dense(200,activation='relu'))
670
- model.add(Dropout(0.3))
671
- model.add(Dense(100,activation='relu'))
672
- model.add(Dropout(0.3))
673
- model.add(Dense(3,activation='softmax'))
674
- model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
675
- model.fit(X_train,y_train,epochs=10,validation_split=0.2)
676
-
677
- y_pred1 = model.predict(X_test)
678
- y_pred1 = np.argmax(y_pred1,axis=1)
679
- print(classification_report(y_test,y_pred1))
680
- ConfusionMatrixDisplay.from_predictions(y_test,y_pred1)
681
-
682
-
683
- # The XGBoost model performs better than the others, so I will tune its hyperparameters and evaluate the performance based on the validation dataset.
684
-
685
- # ### XGB Boost - Tuning & Hold-out Validation
686
-
687
- # In[13]:
688
-
689
-
690
- from sklearn.model_selection import GridSearchCV
691
- from sklearn.metrics import accuracy_score
692
-
693
- # Make a dictionary of hyperparameter values to search
694
- search_space = {
695
- "n_estimators" : [200,250,300,350,400,450,500],
696
- "max_depth" : [3,4,5,6,7,8,9],
697
- "gamma" : [0.001,0.01,0.1],
698
- "learning_rate" : [0.001,0.01,0.1]
699
- }
700
-
701
-
702
- # In[14]:
703
-
704
-
705
- # make a GridSearchCV object
706
- GS = GridSearchCV(estimator = XGBClassifier(use_label_encoder=False),
707
- param_grid = search_space,
708
- scoring = 'accuracy',
709
- cv = 5,
710
- verbose = 4)
711
-
712
-
713
- # Uncomment the following line to enable the tuning. The best result I found was: gamma = 0.01, learning_rate = 0.01, n_estimators = 300, max_depth = 4
714
-
715
- # In[15]:
716
-
717
-
718
- #GS.fit(X_train,y_train)
719
-
720
-
721
- # To get only the best hyperparameter values
722
-
723
- # In[16]:
724
-
725
-
726
- #print(GS.best_params_)
727
-
728
-
729
- # Initially, I validate the model with its default parameters, and then I will validate it with its tuned parameters.
730
-
731
- # * **Default Hyperparameters**
732
-
733
- # In[17]:
734
-
735
-
736
- model = XGBClassifier()
737
- model.fit(X_train,y_train)
738
- y_pred = model.predict(X_hold_test)
739
- print(classification_report(y_hold_test,y_pred))
740
- ConfusionMatrixDisplay.from_predictions(y_hold_test,y_pred);
741
-
742
-
743
- # * **Tuned Hyperparameters**
744
-
745
- # In[18]:
746
-
747
-
748
- model = XGBClassifier(use_label_encoder = False, gamma = 0.01, learning_rate = 0.01, n_estimators = 300, max_depth = 4)
749
- model.fit(X_train,y_train)
750
- y_pred = model.predict(X_hold_test)
751
- print(classification_report(y_hold_test,y_pred))
752
- ConfusionMatrixDisplay.from_predictions(y_hold_test,y_pred);
753
-
754
-
755
- # The model improves a bit, so I will create a pipe to use the model later easily.
756
-
757
- # ### Creating a pipeline for the XGB model
758
-
759
- # In[19]:
760
-
761
-
762
- from sklearn.preprocessing import OneHotEncoder
763
- from sklearn.compose import make_column_transformer
764
- column_trans = make_column_transformer(
765
- (OneHotEncoder(),['Team1', 'Team2']),remainder='passthrough')
766
-
767
- pipe_X = pipe_DF.drop('Team1_Result',axis=1)
768
- pipe_y = pipe_DF['Team1_Result']
769
-
770
- from sklearn.pipeline import make_pipeline
771
- pipe_League = make_pipeline(column_trans,StandardScaler(with_mean=False),XGBClassifier(use_label_encoder=False, gamma= 0.01, learning_rate= 0.01, n_estimators= 300, max_depth= 4))
772
- pipe_League.fit(pipe_X,pipe_y)
773
-
774
-
775
- # In[20]:
776
-
777
-
778
- import joblib
779
- joblib.dump(pipe_League,"./groups_stage_prediction.pkl")
780
-
781
-
782
- # # KNOCKOUT STAGE MODELING
783
-
784
- # ### Choosing the model
785
- #
786
- # Removing Draw status.
787
-
788
- # In[21]:
789
-
790
-
791
- knock_df = pipe_DF[pipe_DF['Team1_Result'] != 2]
792
-
793
-
794
- # In[22]:
795
-
796
-
797
- pipe_knock_df = knock_df
798
- knock_df = pd.get_dummies(knock_df)
799
- X = knock_df.drop('Team1_Result',axis=1)
800
- y = knock_df['Team1_Result']
801
-
802
- X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
803
- X_hold_test, X_test, y_hold_test, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=42)
804
-
805
-
806
- # * **Ada Boost Classifier**
807
-
808
- # In[23]:
809
-
810
-
811
- metrics_display(AdaBoostClassifier())
812
-
813
-
814
- # * **Random Forest**
815
- #
816
- #
817
- #
818
-
819
- # In[26]:
820
-
821
-
822
- metrics_display(RandomForestClassifier())
823
-
824
-
825
- # * **XGB Boost**
826
-
827
- # In[27]:
828
-
829
-
830
- metrics_display(XGBClassifier(use_label_encoder=False))
831
-
832
-
833
- # * **Neural network**
834
-
835
- # In[28]:
836
-
837
-
838
- X_train.shape
839
-
840
-
841
- # In[30]:
842
-
843
-
844
- model = Sequential()
845
- model.add(Input(shape=(399,)))
846
- model.add(Dense(300,activation='relu'))
847
- model.add(Dropout(0.3))
848
- model.add(Dense(200,activation='relu'))
849
- model.add(Dropout(0.3))
850
- model.add(Dense(100,activation='relu'))
851
- model.add(Dropout(0.3))
852
- model.add(Dense(2,activation='softmax'))
853
- model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
854
- model.fit(X_train,y_train,epochs=10,validation_split=0.2)
855
-
856
- y_pred1 = model.predict(X_test)
857
- y_pred1 = np.argmax(y_pred1,axis=1)
858
- print(classification_report(y_test,y_pred1))
859
- ConfusionMatrixDisplay.from_predictions(y_test,y_pred1)
860
-
861
-
862
- # All models have very similar performance. Therefore I will tune the Random Forest model and the XGB Boost.
863
-
864
- # ### Random Forest - Tuning & Hold-out Validation
865
-
866
- # In[31]:
867
-
868
-
869
- search_space = {
870
- "max_depth" : [11,12,13,14,15,16],
871
- "max_leaf_nodes" : [170,180,190,200,210,220,230],
872
- "min_samples_leaf" : [3,4,5,6,7,8],
873
- "n_estimators" : [310,320,330,340,350]
874
- }
875
-
876
-
877
- # In[32]:
878
-
879
-
880
- GS = GridSearchCV(estimator = RandomForestClassifier(),
881
- param_grid = search_space,
882
- scoring = 'accuracy',
883
- cv = 5,
884
- verbose = 4)
885
-
886
-
887
- # Uncomment the following lines to enable the tuning. The best result I found was: max_depth = 16, n_estimators = 320, max_leaf_nodes = 190, min_samples_leaf = 5
888
-
889
- # In[33]:
890
-
891
-
892
- #GS.fit(X_train,y_train)
893
-
894
-
895
- # In[34]:
896
-
897
-
898
- #print(GS.best_params_)
899
-
900
-
901
- # * **Default Hyperparameters**
902
-
903
- # In[35]:
904
-
905
-
906
- model = RandomForestClassifier()
907
- model.fit(X_train,y_train)
908
- y_pred = model.predict(X_hold_test)
909
- print(classification_report(y_hold_test,y_pred))
910
- ConfusionMatrixDisplay.from_predictions(y_hold_test,y_pred);
911
-
912
-
913
- # * **Tuned Hyperparameters**
914
-
915
- # In[36]:
916
-
917
-
918
- model = RandomForestClassifier(max_depth= 16, n_estimators=320, max_leaf_nodes= 190, min_samples_leaf= 5)
919
- model.fit(X_train,y_train)
920
- y_pred = model.predict(X_hold_test)
921
- print(classification_report(y_hold_test,y_pred))
922
- ConfusionMatrixDisplay.from_predictions(y_hold_test,y_pred);
923
-
924
-
925
- # The Random Forest greatly improves performance with the tuned hyperparameters; let's see the XGB Boost model.
926
-
927
- # ### XGB Boost - Tuning & Hold-out Validation
928
-
929
- # In[37]:
930
-
931
-
932
- search_space = {
933
- "n_estimators" : [300,350,400,450,500,550,600],
934
- "max_depth" : [3,4,5,6,7,8,9],
935
- "gamma" : [0.001,0.01,0.1],
936
- "learning_rate" : [0.001,0.01]
937
- }
938
-
939
-
940
- # In[38]:
941
-
942
-
943
- GS = GridSearchCV(estimator = XGBClassifier(use_label_encoder=False),
944
- param_grid = search_space,
945
- scoring = 'accuracy',
946
- cv = 5,
947
- verbose = 4)
948
-
949
-
950
- # In[39]:
951
-
952
-
953
- #GS.fit(X_train,y_train)
954
-
955
-
956
- # In[40]:
957
-
958
-
959
- #print(GS.best_params_) # to get only the best hyperparameter values that we searched for
960
-
961
-
962
- # Uncomment the following lines to enable the tuning. The best result I found was: gamma = 0.01, learning_rate = 0.01, max_depth = 5, n_estimators = 500
963
-
964
- # * **Default Hyperparameters**
965
-
966
- # In[41]:
967
-
968
-
969
- model = XGBClassifier()
970
- model.fit(X_train,y_train)
971
- y_pred = model.predict(X_hold_test)
972
- print(classification_report(y_hold_test,y_pred))
973
- ConfusionMatrixDisplay.from_predictions(y_hold_test,y_pred);
974
-
975
-
976
- # * **Tuned Hyperparameters**
977
-
978
- # In[42]:
979
-
980
-
981
- model = XGBClassifier(gamma=0.01,learning_rate=0.01, max_depth=5, n_estimators=500)
982
- model.fit(X_train,y_train)
983
- y_pred = model.predict(X_hold_test)
984
- print(classification_report(y_hold_test,y_pred))
985
- ConfusionMatrixDisplay.from_predictions(y_hold_test,y_pred);
986
-
987
-
988
- # The model does not improve notably. However, it does improve compared to the Random Forest.
989
-
990
- # ### Creating a pipeline for the XGB Boost model
991
-
992
- # In[43]:
993
-
994
-
995
- pipe_X = pipe_knock_df.drop('Team1_Result',axis=1)
996
- pipe_y = pipe_knock_df['Team1_Result']
997
- pipe_knock = make_pipeline(column_trans,StandardScaler(with_mean=False),XGBClassifier(gamma=0.01,learning_rate=0.01, max_depth=5, n_estimators=500))
998
- pipe_knock.fit(pipe_X,pipe_y)
999
-
1000
-
1001
- # In[44]:
1002
-
1003
-
1004
- joblib.dump(pipe_knock,"./knockout_stage_prediction.pkl")
1005
 
1006
  st.title("FIFA winner predication")
1007
  st.write('This app predict 2022 FIFA winner')
 
563
  # # In[ ]:
564
  #
565
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
566
 
567
  st.title("FIFA winner predication")
568
  st.write('This app predict 2022 FIFA winner')