Update app.py
app.py CHANGED

@@ -1,6 +1,7 @@
 import os
 import json
 import logging
+import numpy as np
 from datasets import load_dataset
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
 from sklearn.metrics import accuracy_score, precision_recall_fscore_support

@@ -58,7 +59,9 @@ def setup_training():
     # Tokenize the dataset
     logging.info("Tokenizing the dataset")
     def tokenize_function(examples):
-
+        # Concatenate all feature columns into a single input
+        features = np.stack([examples[col] for col in config['text_columns']], axis=1)
+        return tokenizer(features.tolist(), padding="max_length", truncation=True)

     tokenized_datasets = dataset.map(tokenize_function, batched=True)
     logging.info("Dataset tokenization completed")
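For context, here is a minimal, self-contained sketch of how the new tokenize_function behaves in isolation. It assumes config['text_columns'] names exactly two string-valued columns and that the tokenizer and config objects (defined elsewhere in app.py) look roughly like the hypothetical stand-ins below; the column names, checkpoint, and example batch are illustrative, not taken from the commit.

import numpy as np
from transformers import AutoTokenizer

config = {"text_columns": ["title", "body"]}  # hypothetical config
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # hypothetical checkpoint

def tokenize_function(examples):
    # With batched=True, examples[col] is a list of column values;
    # stacking on axis=1 gives one [col1, col2] row per example.
    features = np.stack([examples[col] for col in config["text_columns"]], axis=1)
    # The tokenizer treats each inner two-element list as a
    # (text, text_pair) pair; with more than two text columns this call
    # would fail, and the columns would need to be joined into one string.
    return tokenizer(features.tolist(), padding="max_length", truncation=True)

# Example batch in the shape datasets passes to a batched map function.
batch = {"title": ["hello", "foo"], "body": ["world", "bar"]}
print(tokenize_function(batch).keys())

Note that this approach relies on the tokenizer's text-pair handling rather than concatenating the columns itself, so it is tied to the two-column assumption above.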