Oranblock committed on
Commit
d6bb7e9
·
verified ·
1 Parent(s): 163907f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -1
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import json
3
  import logging
 
4
  from datasets import load_dataset
5
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
6
  from sklearn.metrics import accuracy_score, precision_recall_fscore_support
@@ -58,7 +59,9 @@ def setup_training():
58
  # Tokenize the dataset
59
  logging.info("Tokenizing the dataset")
60
  def tokenize_function(examples):
61
- return tokenizer(examples[config['text_column']], padding="max_length", truncation=True)
 
 
62
 
63
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
64
  logging.info("Dataset tokenization completed")
 
1
  import os
2
  import json
3
  import logging
4
+ import numpy as np
5
  from datasets import load_dataset
6
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
7
  from sklearn.metrics import accuracy_score, precision_recall_fscore_support
 
59
  # Tokenize the dataset
60
  logging.info("Tokenizing the dataset")
61
  def tokenize_function(examples):
62
+ # Concatenate all feature columns into a single input
63
+ features = np.stack([examples[col] for col in config['text_columns']], axis=1)
64
+ return tokenizer(features.tolist(), padding="max_length", truncation=True)
65
 
66
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
67
  logging.info("Dataset tokenization completed")