File size: 799 Bytes
7f3db57
 
 
2eb5a40
7f3db57
 
2eb5a40
7f3db57
 
2eb5a40
7f3db57
 
2eb5a40
 
7f3db57
2eb5a40
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# src/feature_engineering.py
from transformers import DistilBertTokenizer
import logging
from src.config import MODEL_NAME, MAX_LENGTH, LOG_FILE

def setup_logging():
    """Configure the root logger to write INFO-and-above records to LOG_FILE.

    Each record is formatted as ``<timestamp> - <level> - <message>``.
    Per the ``logging`` documentation, ``basicConfig`` does nothing when the
    root logger already has handlers, so repeated calls are harmless.
    """
    record_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(
        format=record_format,
        level=logging.INFO,
        filename=LOG_FILE,
    )

def tokenize_texts(dataset, tokenizer=None):
    """Tokenize the ``"text"`` field of ``dataset`` in batches.

    Args:
        dataset: Object exposing ``map(function, batched=True)`` whose batches
            can be indexed with ``"text"`` (presumably a Hugging Face
            ``datasets.Dataset`` -- confirm against callers).
        tokenizer: Callable tokenizer to apply. When ``None``, a
            ``DistilBertTokenizer`` is loaded from ``MODEL_NAME``.

    Returns:
        A ``(tokenized_dataset, tokenizer)`` tuple: the result of
        ``dataset.map`` and the tokenizer that was actually used.
    """
    setup_logging()
    # Only load the pretrained tokenizer when the caller did not supply one.
    tokenizer = (
        DistilBertTokenizer.from_pretrained(MODEL_NAME)
        if tokenizer is None
        else tokenizer
    )
    logging.info("Tokenizing texts")

    def _encode_batch(batch):
        # NOTE(review): padding=True pads to the longest sequence of each
        # call, and map(batched=True) invokes this once per batch, so padded
        # lengths may differ between batches -- confirm downstream collation
        # handles that.
        return tokenizer(
            batch["text"],
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
        )

    return dataset.map(_encode_batch, batched=True), tokenizer