In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import classification_report, confusion_matrix

def load_and_prepare_data(train_path):
    """Read a training CSV and convert it to a Hugging Face ``Dataset``.

    A column named ``Label`` is renamed to ``label`` before the conversion;
    all other columns are passed through unchanged.

    Args:
        train_path: Location of the CSV file, forwarded to ``pd.read_csv``.

    Returns:
        The ``Dataset`` built from the renamed DataFrame.
    """
    frame = pd.read_csv(train_path).rename(columns={"Label": "label"})
    return Dataset.from_pandas(frame)

def load_and_prepare_test_data(test_path):
    """Read a test CSV and return it both as a ``Dataset`` and a DataFrame.

    A column named ``Label`` is renamed to ``label`` first, so both returned
    objects expose the lowercase name.

    Args:
        test_path: Location of the CSV file, forwarded to ``pd.read_csv``.

    Returns:
        A ``(dataset, frame)`` tuple: the ``Dataset`` built from the renamed
        DataFrame, and that DataFrame itself.
    """
    frame = pd.read_csv(test_path).rename(columns={"Label": "label"})
    dataset = Dataset.from_pandas(frame)
    return dataset, frame

def tokenize_dataset(dataset, tokenizer):
    """Tokenize the ``Sentence`` column of ``dataset`` and expose torch tensors.

    Every sentence is padded/truncated to exactly 128 tokens. After mapping,
    the dataset format is set to ``torch`` and restricted to the columns
    ``input_ids``, ``attention_mask`` and ``label``.

    Args:
        dataset: Object providing ``map`` and ``set_format``, with ``Sentence``
            and ``label`` columns.
        tokenizer: Callable applied to each batch of sentences.

    Returns:
        The mapped dataset with the torch format applied.
    """
    tokenizer_options = {'padding': 'max_length', 'truncation': True, 'max_length': 128}

    def _encode_batch(batch):
        # ``batch`` is a dict of column name -> list of values (batched=True).
        return tokenizer(batch['Sentence'], **tokenizer_options)

    encoded = dataset.map(_encode_batch, batched=True)
    encoded.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return encoded

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            row is the argmax of ``logits`` along axis 1.

    Returns:
        A dict with the keys ``accuracy``, ``f1_macro``, ``precision_macro``
        and ``recall_macro``, taken from scikit-learn's classification report.
    """
    logits, labels = eval_pred
    predicted = torch.tensor(logits).argmax(dim=1).numpy()

    report = classification_report(labels, predicted, output_dict=True)
    macro_avg = report['macro avg']

    return {
        'accuracy': report['accuracy'],
        'f1_macro': macro_avg['f1-score'],
        'precision_macro': macro_avg['precision'],
        'recall_macro': macro_avg['recall'],
    }

def train_and_evaluate(model_name, train_dataset, test_datasets, raw_test_dfs, output_base_dir):
    """Fine-tune a sequence classifier and evaluate it on each test set.

    The model and tokenizer are loaded from ``model_name``, trained for three
    epochs on ``train_dataset`` and saved under ``{output_base_dir}/model``.
    For every test set the function prints a confusion matrix and a
    classification report, and writes the raw test rows plus the columns
    ``predicted_label`` and ``correct`` to
    ``{output_base_dir}/predictions_test_{i}.csv``. Finally a per-test-set
    metric table, with an extra ``Average`` row, is printed and written to
    ``{output_base_dir}/summary_metrics_with_average.csv``.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        train_dataset: Untokenized training dataset (tokenized here via
            ``tokenize_dataset``).
        test_datasets: Sequence of untokenized test datasets.
        raw_test_dfs: DataFrames matching ``test_datasets`` one-to-one; each
            must contain a ``label`` column.
        output_base_dir: Directory prefix for the model, logs and CSV outputs.

    Returns:
        None. Results are printed and written to disk.

    Raises:
        ValueError: If ``test_datasets`` and ``raw_test_dfs`` differ in length.
    """
    # Fail before the expensive training step instead of letting ``zip``
    # silently drop the unmatched test sets.
    if len(test_datasets) != len(raw_test_dfs):
        raise ValueError(
            f"test_datasets ({len(test_datasets)}) and raw_test_dfs "
            f"({len(raw_test_dfs)}) must have the same length"
        )

    # Single source of truth for the class names and the number of classes.
    # NOTE(review): assumes the integer label ids sort in this order - confirm
    # against the data files.
    label_names = ['negative', 'neutral', 'positive']

    print(f"\n--- Fine-tuning model: {model_name} ---")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_names))

    tokenized_train = tokenize_dataset(train_dataset, tokenizer)

    training_args = TrainingArguments(
        output_dir=f"{output_base_dir}/model",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=False,
        logging_dir=f"{output_base_dir}/logs",
        logging_steps=50,
        save_total_limit=2,
        seed=42,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    trainer.save_model()
    # The tokenizer is not handed to the Trainer, so ``save_model`` does not
    # store it; save it next to the weights so the directory is reloadable.
    tokenizer.save_pretrained(training_args.output_dir)

    results_list = []

    for i, (test_dataset, raw_test_df) in enumerate(zip(test_datasets, raw_test_dfs), start=1):
        print(f"\nEvaluacija na test skupu test-{i}")
        tokenized_test = tokenize_dataset(test_dataset, tokenizer)
        predictions_output = trainer.predict(tokenized_test)

        # ``predictions`` holds the logits as a NumPy array; the predicted
        # class is the index of the largest logit in each row.
        preds = predictions_output.predictions.argmax(axis=1)
        labels = predictions_output.label_ids

        report = classification_report(labels, preds, target_names=label_names, output_dict=True)
        macro_avg = report['macro avg']

        results_list.append({
            'Test Set': f'test-{i}',
            'Accuracy': report['accuracy'],
            'F1 Macro': macro_avg['f1-score'],
            'Precision Macro': macro_avg['precision'],
            'Recall Macro': macro_avg['recall']
        })

        print("Confusion Matrix:")
        print(confusion_matrix(labels, preds))
        print("\nClassification Report:")
        print(classification_report(labels, preds, target_names=label_names))

        # Work on a copy so the caller's DataFrame can be reused across runs.
        output_df = raw_test_df.copy()
        output_df['predicted_label'] = preds
        output_df['correct'] = output_df['label'] == output_df['predicted_label']
        output_csv = f"{output_base_dir}/predictions_test_{i}.csv"
        output_df.to_csv(output_csv, index=False)
        print(f"Predikcije spremljene u {output_csv}")

    # Compute the average of every metric across the test sets.
    df_results = pd.DataFrame(results_list)
    df_results.loc['Average'] = df_results.mean(numeric_only=True)
    # ``mean(numeric_only=True)`` skips the text column, which would leave a
    # NaN in 'Test Set' for the summary row; label it explicitly instead.
    df_results.loc['Average', 'Test Set'] = 'Average'

    print("\nSažetak metrika po test skupovima s prosjekom:")
    print(df_results)

    summary_csv = f"{output_base_dir}/summary_metrics_with_average.csv"
    df_results.to_csv(summary_csv, index=True)
    print(f"Sažetak metrika spremljen u {summary_csv}")

if __name__ == "__main__":
    # Checkpoint that is fine-tuned once per training set.
    model_name = "EMBEDDIA/crosloengual-bert"

    # Training CSVs keyed by the short name used in the output directory.
    train_files = {
        "train_combined": "TRAIN.csv",
        "train_2": "train-2.csv"
    }

    # Test sets are loaded once and shared by every training run.
    test_files = ["test-1.csv", "test-2.csv", "test-3.csv"]
    loaded_tests = [load_and_prepare_test_data(csv_path) for csv_path in test_files]
    test_datasets = [dataset for dataset, _ in loaded_tests]
    raw_test_dfs = [frame for _, frame in loaded_tests]

    for train_name, train_path in train_files.items():
        print(f"\n\n=== Treniranje i evaluacija za trening skup: {train_name} ===")
        output_dir = f"results_{train_name}_croslo"
        train_dataset = load_and_prepare_data(train_path)
        train_and_evaluate(model_name, train_dataset, test_datasets, raw_test_dfs, output_dir)




=== Treniranje i evaluacija za trening skup: train_combined ===

--- Fine-tuning model: EMBEDDIA/crosloengual-bert ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/crosloengual-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7577 [00:00<?, ? examples/s]



Step,Training Loss
50,0.8555
100,0.7487
150,0.6196
200,0.6183
250,0.6308
300,0.6394
350,0.6365
400,0.5959
450,0.5985
500,0.4642





Evaluacija na test skupu test-1


Map:   0%|          | 0/653 [00:00<?, ? examples/s]



Confusion Matrix:
[[111  47   7]
 [ 77 328  25]
 [  3  28  27]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.58      0.67      0.62       165
     neutral       0.81      0.76      0.79       430
    positive       0.46      0.47      0.46        58

    accuracy                           0.71       653
   macro avg       0.62      0.63      0.62       653
weighted avg       0.72      0.71      0.72       653

Predikcije spremljene u results_train_combined_croslo/predictions_test_1.csv

Evaluacija na test skupu test-2


Map:   0%|          | 0/741 [00:00<?, ? examples/s]



Confusion Matrix:
[[198  15   3]
 [ 16 411   4]
 [  5  11  78]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.92      0.91       216
     neutral       0.94      0.95      0.95       431
    positive       0.92      0.83      0.87        94

    accuracy                           0.93       741
   macro avg       0.92      0.90      0.91       741
weighted avg       0.93      0.93      0.93       741

Predikcije spremljene u results_train_combined_croslo/predictions_test_2.csv

Evaluacija na test skupu test-3


Map:   0%|          | 0/793 [00:00<?, ? examples/s]



Confusion Matrix:
[[204  56   7]
 [  7 254   2]
 [  9 116 138]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.93      0.76      0.84       267
     neutral       0.60      0.97      0.74       263
    positive       0.94      0.52      0.67       263

    accuracy                           0.75       793
   macro avg       0.82      0.75      0.75       793
weighted avg       0.82      0.75      0.75       793

Predikcije spremljene u results_train_combined_croslo/predictions_test_3.csv

Sažetak metrika po test skupovima s prosjekom:
        Test Set  Accuracy  F1 Macro  Precision Macro  Recall Macro
0         test-1  0.713629  0.624216         0.617558      0.633678
1         test-2  0.927126  0.909619         0.920753      0.900017
2         test-3  0.751576  0.749418         0.820764      0.751513
Average      NaN  0.797444  0.761084         0.786359      0.761736
Sažetak metrika spremljen u results_train_combined_croslo/summary_

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/crosloengual-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2221 [00:00<?, ? examples/s]



Step,Training Loss
50,0.8488
100,0.6109
150,0.5496
200,0.3818
250,0.4017
300,0.3261
350,0.2331
400,0.2182



Evaluacija na test skupu test-1


Map:   0%|          | 0/653 [00:00<?, ? examples/s]



Confusion Matrix:
[[114  36  15]
 [ 85 302  43]
 [  7  22  29]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.55      0.69      0.61       165
     neutral       0.84      0.70      0.76       430
    positive       0.33      0.50      0.40        58

    accuracy                           0.68       653
   macro avg       0.58      0.63      0.59       653
weighted avg       0.72      0.68      0.69       653

Predikcije spremljene u results_train_2_croslo/predictions_test_1.csv

Evaluacija na test skupu test-2


Map:   0%|          | 0/741 [00:00<?, ? examples/s]



Confusion Matrix:
[[170  36  10]
 [ 45 366  20]
 [ 15  24  55]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.74      0.79      0.76       216
     neutral       0.86      0.85      0.85       431
    positive       0.65      0.59      0.61        94

    accuracy                           0.80       741
   macro avg       0.75      0.74      0.74       741
weighted avg       0.80      0.80      0.80       741

Predikcije spremljene u results_train_2_croslo/predictions_test_2.csv

Evaluacija na test skupu test-3


Map:   0%|          | 0/793 [00:00<?, ? examples/s]



Confusion Matrix:
[[193  59  15]
 [ 20 234   9]
 [ 19 116 128]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.83      0.72      0.77       267
     neutral       0.57      0.89      0.70       263
    positive       0.84      0.49      0.62       263

    accuracy                           0.70       793
   macro avg       0.75      0.70      0.70       793
weighted avg       0.75      0.70      0.70       793

Predikcije spremljene u results_train_2_croslo/predictions_test_3.csv

Sažetak metrika po test skupovima s prosjekom:
        Test Set  Accuracy  F1 Macro  Precision Macro  Recall Macro
0         test-1  0.681470  0.593037         0.575207      0.631078
1         test-2  0.797571  0.743666         0.748448      0.740444
2         test-3  0.699874  0.695614         0.748710      0.699757
Average      NaN  0.726305  0.677439         0.690788      0.690426
Sažetak metrika spremljen u results_train_2_croslo/summary_metrics_with_a