from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import numpy as np
from scipy.sparse import hstack


def create_features(df, tfidf=None, max_features=1000):
    """Build a sparse feature matrix from the ``clean_text`` column of *df*.

    The matrix is the horizontal concatenation of:

    1. TF-IDF term weights of ``df['clean_text']``,
    2. the whitespace-delimited token count of each text,
    3. the TextBlob sentiment polarity of each text.

    Side effect: two columns, ``ticket_length`` and ``sentiment``, are
    added to (or overwritten on) *df* in place.

    Args:
        df: DataFrame with a ``clean_text`` column. Every value must be a
            string, because each one is passed to ``str.split`` and
            ``TextBlob``.
        tfidf: Optional already-fitted ``TfidfVectorizer``. When given, it
            is only used to ``transform`` the text, so validation or
            inference data is mapped into the same column space as the
            training data instead of refitting the vocabulary. When
            ``None`` (the default), a new vectorizer is created and fitted
            on ``df['clean_text']``.
        max_features: Vocabulary size cap for a newly created vectorizer.
            Ignored when *tfidf* is supplied.

    Returns:
        A ``(X_features, tfidf)`` tuple: the stacked sparse feature matrix
        and the vectorizer that produced its TF-IDF columns.
    """
    texts = df['clean_text']

    if tfidf is None:
        # Training path: learn the vocabulary and IDF weights from this data.
        tfidf = TfidfVectorizer(max_features=max_features)
        X_tfidf = tfidf.fit_transform(texts)
    else:
        # Inference path: reuse the fitted vocabulary so the columns line up
        # with the matrix the model was trained on.
        X_tfidf = tfidf.transform(texts)

    # Hand-crafted scalar features; also stored on df (see docstring).
    df['ticket_length'] = texts.apply(lambda text: len(text.split()))
    df['sentiment'] = texts.apply(
        lambda text: TextBlob(text).sentiment.polarity
    )

    # Each scalar feature becomes a single (n_rows, 1) column so it can be
    # appended to the right of the TF-IDF block.
    X_features = hstack([
        X_tfidf,
        np.array(df['ticket_length']).reshape(-1, 1),
        np.array(df['sentiment']).reshape(-1, 1),
    ])

    return X_features, tfidf