"""Fine-tune DistilBERT as a topic classifier and report test-set metrics.

Workflow: load the per-category CSV feature sets, split each one into
train/test, fine-tune ``distilbert-base-uncased`` with the Hugging Face
``Trainer``, save the model, then reload it and print/plot evaluation
metrics for the held-out split.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from datasets import Dataset
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import AutoTokenizer
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification

# Shared tokenizer for the base checkpoint; used by tokenize_data() and by
# the Trainer (which also saves it next to the fine-tuned model).
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def load_training(basepath='training_data/', num_files=17):
    """Load the feature-set CSVs and split each one into train and test parts.

    Reads ``FeatureSet_1.csv`` .. ``FeatureSet_<num_files>.csv``, whose names
    are appended directly to *basepath*. Every file is split 80/20 on its own
    (fixed ``random_state=42``) so each file contributes to both splits.

    Args:
        basepath: Prefix prepended to each file name (include the trailing
            separator when it is a directory).
        num_files: Number of consecutive ``FeatureSet_<i>.csv`` files to read.

    Returns:
        A ``(train_df, test_df)`` tuple of concatenated DataFrames. The
        ``label`` column is shifted down by one relative to the CSV contents
        (presumably converting 1-based labels to 0-based class ids). The
        original per-file row indices are kept, so index values may repeat.
    """
    train_parts = []
    test_parts = []
    for file_no in range(1, num_files + 1):
        df = pd.read_csv(f"{basepath}FeatureSet_{file_no}.csv")
        # Shift labels on the freshly loaded frame *before* splitting, so we
        # never assign into the slices returned by train_test_split.
        df['label'] = df['label'] - 1
        # Split per file so every file is represented in both splits.
        train_part, test_part = train_test_split(df, test_size=0.2, random_state=42)
        train_parts.append(train_part)
        test_parts.append(test_part)
    return pd.concat(train_parts), pd.concat(test_parts)


def tokenize_data(examples):
    """Tokenize the ``text_data`` field of a batch with the shared tokenizer.

    Intended for ``Dataset.map(..., batched=True)``; sequences are truncated
    and padded by the tokenizer.
    """
    return tokenizer(examples["text_data"], truncation=True, padding=True)


def _label_maps(df):
    """Return ``(id2label, label2id)`` from the distinct label/category rows of *df*.

    The mapping is taken from the rows themselves rather than from the order
    in which unique values happen to appear, so each class id is paired with
    the category it actually occurs with.
    """
    pairs = df[["label", "category"]].drop_duplicates().sort_values("label")
    id2label = {
        int(label): str(category)
        for label, category in zip(pairs["label"], pairs["category"])
    }
    label2id = {category: label for label, category in id2label.items()}
    return id2label, label2id


def _balanced_accuracy_metrics(eval_pred):
    """Compute balanced accuracy from a Trainer ``EvalPrediction``."""
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    return {"balanced_accuracy": balanced_accuracy_score(eval_pred.label_ids, predicted)}


def buildtraining(train_df, test_df, save_directory='topic_classifier_model'):
    """Fine-tune DistilBERT on *train_df* and save the result.

    Args:
        train_df: Training rows with ``text_data``, ``label`` and ``category``
            columns.
        test_df: Held-out rows with the same columns, registered as the
            Trainer's evaluation dataset.
        save_directory: Directory the fine-tuned model is saved to.
    """
    # Convert to Arrow-backed datasets and tokenize in batches.
    tokenized_train = Dataset.from_pandas(train_df).map(tokenize_data, batched=True)
    tokenized_test = Dataset.from_pandas(test_df).map(tokenize_data, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir="./distilbert_results",
        learning_rate=2e-5,  # Small learning rate
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=15,
        warmup_steps=5,
        weight_decay=0.2,  # Bigger means more regularization for over-fitting
        logging_strategy="epoch",
        # Device placement is a TrainingArguments option; Trainer itself has
        # no `device_map` parameter.
        use_cpu=True,
    )

    # Label-to-id and id-to-label mappings stored in the model config.
    id2label, label2id = _label_maps(train_df)

    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=len(id2label),
        label2id=label2id,
        id2label=id2label,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        # Must be a callable taking an EvalPrediction, not a metric name.
        compute_metrics=_balanced_accuracy_metrics,
    )

    # Train the model
    trainer.train()
    # Save the trained model
    trainer.save_model(save_directory)


def prediction_metrics(test_df, save_directory='topic_classifier_model', batch_size=32):
    """Evaluate the saved model on *test_df* and print/plot the results.

    Prints the accuracy and a classification report, then shows a confusion
    matrix labelled with category names.

    Args:
        test_df: Rows with ``text_data``, ``label`` and ``category`` columns.
        save_directory: Directory the model and tokenizer are loaded from.
        batch_size: Number of texts per forward pass; bounds memory use.

    Raises:
        ValueError: If *test_df* has no rows or *batch_size* is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    test_text = test_df['text_data'].to_list()
    if not test_text:
        raise ValueError("test_df contains no rows to evaluate")
    true_labels = test_df['label'].to_numpy()

    loaded_tokenizer = DistilBertTokenizerFast.from_pretrained(save_directory)
    loaded_model = TFDistilBertForSequenceClassification.from_pretrained(save_directory)

    # Run inference in batches instead of one pass over the whole test set,
    # so memory does not grow with the number of test rows.
    batch_predictions = []
    for start in range(0, len(test_text), batch_size):
        predict_input = loaded_tokenizer(
            text=test_text[start:start + batch_size],
            truncation=True,
            padding=True,
            return_tensors="tf",
        )
        logits = loaded_model(predict_input)[0]
        batch_predictions.append(tf.argmax(logits, axis=1).numpy())
    prediction_value = np.concatenate(batch_predictions)

    accuracy = np.mean(prediction_value == true_labels)
    print(f"\nAccuracy: {accuracy:.4f}")
    print(classification_report(true_labels, prediction_value))

    # Pass the class ids explicitly and name them from the label/category
    # rows, so tick labels line up with the matrix rows/columns regardless of
    # the order in which categories appear in test_df.
    id2label, _ = _label_maps(test_df)
    class_ids = sorted({int(v) for v in true_labels} | {int(v) for v in prediction_value})
    class_names = [id2label.get(class_id, str(class_id)) for class_id in class_ids]
    ConfusionMatrixDisplay.from_predictions(
        y_true=true_labels,
        y_pred=prediction_value,
        labels=class_ids,
        display_labels=class_names,
    )
    plt.show()


if __name__ == '__main__':
    train_df, test_df = load_training(basepath='training_data/')
    buildtraining(train_df, test_df)
    prediction_metrics(test_df)