# src/feature_engineering.py
"""Tokenization helpers that turn raw text records into DistilBERT inputs."""

import logging

from transformers import DistilBertTokenizer

from src.config import MODEL_NAME, MAX_LENGTH, LOG_FILE

logger = logging.getLogger(__name__)


def setup_logging() -> None:
    """Configure root logging to write INFO-level records to ``LOG_FILE``.

    ``logging.basicConfig`` does nothing when the root logger already has
    handlers, so calling this function more than once is harmless.
    """
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )


def tokenize_texts(dataset, tokenizer=None):
    """Tokenize texts using DistilBERT tokenizer.

    Args:
        dataset: Object exposing ``map(function, batched=True)`` whose batches
            provide a ``"text"`` entry (presumably a Hugging Face
            ``datasets.Dataset`` -- confirm against the caller).
        tokenizer: Callable tokenizer to apply. When ``None``, a
            ``DistilBertTokenizer`` is loaded from ``MODEL_NAME``.

    Returns:
        A ``(tokenized_dataset, tokenizer)`` tuple: the result of mapping the
        tokenizer over ``dataset`` and the tokenizer that was actually used,
        so callers can reuse it for other splits.
    """
    setup_logging()

    if tokenizer is None:
        tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

    logger.info("Tokenizing texts")

    def tokenize_function(examples):
        # NOTE(review): with batched=True, padding=True pads each mapped batch
        # to that batch's longest sequence, so row lengths may differ between
        # batches. If downstream code needs uniform lengths, consider
        # padding="max_length" -- confirm with the training pipeline first.
        return tokenizer(
            examples["text"],
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
        )

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    return tokenized_dataset, tokenizer