Ubuntu-Customer-Centre-Inquiries / src /feature_engineering.py
ayush2917's picture
Update src/feature_engineering.py
2eb5a40 verified
# src/feature_engineering.py
from transformers import DistilBertTokenizer
import logging
from src.config import MODEL_NAME, MAX_LENGTH, LOG_FILE
def setup_logging():
    """Point the root logger at LOG_FILE with INFO level and a timestamped format.

    Delegates to ``logging.basicConfig``, which leaves the root logger
    untouched when it already has handlers, so repeated calls are harmless.
    """
    record_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format=record_format,
    )
def tokenize_texts(dataset, tokenizer=None):
    """Tokenize the ``"text"`` column of *dataset* in batches.

    Args:
        dataset: Object exposing ``map(function, batched=True)`` whose batches
            can be indexed with ``"text"`` (presumably a Hugging Face
            ``datasets.Dataset`` -- confirm against the caller).
        tokenizer: Callable tokenizer to apply. When ``None``, a
            ``DistilBertTokenizer`` is loaded from ``MODEL_NAME``.

    Returns:
        A ``(tokenized_dataset, tokenizer)`` tuple: the result of
        ``dataset.map`` and the tokenizer that was actually used.
    """
    setup_logging()

    # Fall back to the configured pretrained tokenizer only when none was supplied.
    active_tokenizer = (
        DistilBertTokenizer.from_pretrained(MODEL_NAME)
        if tokenizer is None
        else tokenizer
    )
    logging.info("Tokenizing texts")

    # NOTE(review): padding=True pads to the longest sequence of each tokenizer
    # call, so with a batched map the padded length may differ between map
    # batches -- confirm that downstream collation copes with that.
    encode_options = {
        "truncation": True,
        "padding": True,
        "max_length": MAX_LENGTH,
    }

    def _encode_batch(batch):
        return active_tokenizer(batch["text"], **encode_options)

    return dataset.map(_encode_batch, batched=True), active_tokenizer