Spaces:
No application file
No application file
| # src/feature_engineering.py | |
| from transformers import DistilBertTokenizer | |
| import logging | |
| from src.config import MODEL_NAME, MAX_LENGTH, LOG_FILE | |
def setup_logging():
    """Point the root logger at ``LOG_FILE`` with INFO-level output.

    Records are written as ``<timestamp> - <level> - <message>``.
    ``logging.basicConfig`` leaves an already-configured root logger
    untouched, so repeated calls are harmless.
    """
    record_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format=record_format,
    )
def tokenize_texts(dataset, tokenizer=None):
    """Tokenize the ``"text"`` field of *dataset* in batches.

    Args:
        dataset: Object exposing ``map(fn, batched=True)``; every batch handed
            to ``fn`` must be indexable by the key ``"text"``.
        tokenizer: Callable tokenizer to apply to each batch of texts. When
            ``None``, a ``DistilBertTokenizer`` is loaded via
            ``from_pretrained(MODEL_NAME)``.

    Returns:
        A ``(tokenized_dataset, tokenizer)`` tuple: whatever ``dataset.map``
        returned, plus the tokenizer that was actually used (so callers can
        reuse the one loaded here).
    """
    setup_logging()

    if tokenizer is None:
        tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

    logging.info("Tokenizing texts")

    def _encode_batch(batch):
        # Truncate to at most MAX_LENGTH tokens and pad within this call.
        # NOTE(review): padding=True presumably pads to the longest item of
        # each mapped batch, so lengths may differ between batches -- confirm
        # that downstream collation copes with that.
        texts = batch["text"]
        return tokenizer(texts, truncation=True, padding=True, max_length=MAX_LENGTH)

    return dataset.map(_encode_batch, batched=True), tokenizer