ayush2917 committed on
Commit
7f3db57
·
verified ·
1 Parent(s): d5c5d64

Create feature_engineering.py

Browse files
Files changed (1) hide show
  1. src/feature_engineering.py +19 -0
src/feature_engineering.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/feature_engineering.py
2
+ from transformers import DistilBertTokenizer
3
+ import torch
4
+ from src.config import MAX_LENGTH
5
+ import logging
6
+
7
def setup_logging():
    """Configure the root logger to write to ``logs/app.log``.

    Records at INFO level and above are written using the layout
    ``<asctime> - <levelname> - <message>``.  The ``logs`` directory is
    created first if it is missing.

    ``logging.basicConfig`` does nothing when the root logger already has
    handlers, so calling this function more than once is harmless.
    """
    import os  # function-scope import keeps this block self-contained

    log_path = "logs/app.log"
    # The file handler does not create missing parent directories; without
    # this, a fresh checkout fails with FileNotFoundError on the first call.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    logging.basicConfig(
        filename=log_path,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )
10
+
11
# Loaded tokenizers keyed by checkpoint name, so repeated calls to
# tokenize_texts() do not re-read the vocabulary every time.
_TOKENIZER_CACHE = {}


def _get_tokenizer(name="distilbert-base-uncased"):
    """Return the DistilBERT tokenizer for *name*, loading it at most once."""
    if name not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[name] = DistilBertTokenizer.from_pretrained(name)
    return _TOKENIZER_CACHE[name]


def tokenize_texts(texts):
    """Tokenize texts using the ``distilbert-base-uncased`` tokenizer.

    Args:
        texts: The strings to encode.  Any object exposing ``.tolist()``
            is accepted as before; plain iterables of strings (list, tuple,
            generator) are accepted as well, and a single ``str`` is treated
            as a one-item batch.

    Returns:
        The tokenizer's output for the batch, requested with
        ``return_tensors="pt"``, ``truncation=True``, ``padding=True`` and
        ``max_length=MAX_LENGTH``.
    """
    setup_logging()

    if isinstance(texts, str):
        # list("abc") would split the string into characters.
        batch = [texts]
    elif hasattr(texts, "tolist"):
        batch = texts.tolist()
    else:
        batch = list(texts)

    tokenizer = _get_tokenizer()
    logging.info("Tokenizing texts")
    return tokenizer(
        batch,
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    )