"""Train and save a movie-review sentiment classifier on BERT CLS embeddings.

Pipeline, executed top to bottom when the script runs:

1. Download the NLTK stop-word list and load the ``bert-base-uncased``
   tokenizer and model.
2. Read the IMDB review CSV, clean the text, and map the ``sentiment`` column
   to a binary ``Category`` label (1 for ``'positive'``, 0 otherwise).
3. Split into train/test sets and compute one embedding per review, taken
   from the first token position of BERT's last hidden state.
4. Fit a small dense classifier on the embeddings, report test metrics, and
   save the model to ``movie_sentiment_model.h5``.
"""

import re

# NOTE(review): gradio is not used in the visible part of this script; the
# import is kept in case code outside this view depends on it.
import gradio as gr
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from transformers import BertTokenizer, TFBertModel

# Download stopwords
nltk.download('stopwords')

# Built once here instead of on every remove_stop_words() call: the function
# is applied to every review, and the stop-word list never changes.
_STOP_WORDS = frozenset(stopwords.words('english'))

# Patterns used by remove_tags(), compiled once at import time.
_TAG_RE = re.compile(r'<[^>]+>')
_URL_RE = re.compile(r'https?://\S+')
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

# Load dataset
file_path = "https://raw.githubusercontent.com/alexvatti/full-stack-data-science/main/NLP-Exercises/Movie-Review/IMDB%20Dataset.csv"
movies_df = pd.read_csv(file_path)


# Clean text
def remove_tags(txt: str) -> str:
    """Return *txt* lower-cased with markup, URLs and punctuation removed.

    Angle-bracket tags and http(s) URLs are deleted; every remaining
    character that is not an ASCII letter, a digit or whitespace is replaced
    by a single space.
    """
    result = _TAG_RE.sub('', txt)
    result = _URL_RE.sub('', result)
    result = _NON_ALNUM_RE.sub(' ', result)
    return result.lower()


def remove_stop_words(txt: str) -> str:
    """Return *txt* with English stop words dropped.

    The text is split on whitespace and the surviving words are re-joined
    with single spaces. Matching is case-sensitive against the NLTK list.
    """
    return ' '.join(word for word in txt.split() if word not in _STOP_WORDS)


movies_df['review'] = movies_df['review'].apply(remove_tags)
movies_df['review'] = movies_df['review'].apply(remove_stop_words)
movies_df['Category'] = movies_df['sentiment'].apply(
    lambda x: 1 if x == 'positive' else 0
)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    movies_df['review'],
    movies_df['Category'],
    test_size=0.2,
    random_state=42,
)

# Convert labels to TensorFlow format
y_train = tf.convert_to_tensor(y_train.values, dtype=tf.float32)
y_test = tf.convert_to_tensor(y_test.values, dtype=tf.float32)


# Compute BERT embeddings
def bert_embeddings_batch(texts, batch_size: int = 32, max_length: int = 64) -> np.ndarray:
    """Embed *texts* with BERT and return one row per input text.

    Args:
        texts: Iterable of strings (a pandas Series, NumPy array or list).
        batch_size: Number of texts sent through the model per forward pass.
        max_length: Token limit passed to the tokenizer; longer inputs are
            truncated.

    Returns:
        A 2-D array with the rows stacked in input order. Each row is the
        last-hidden-state vector at the first token position. An empty input
        yields an array of shape ``(0, hidden_size)``.

    Raises:
        ValueError: If *batch_size* is less than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Materialise once so batching is plain positional list slicing and does
    # not depend on the input's index labels or on it having .tolist().
    texts = list(texts)
    if not texts:
        # np.vstack([]) would raise; return an empty, correctly shaped result.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(
            batch_texts,
            return_tensors="tf",
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        outputs = bert_model(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )
        # Position 0 of the sequence axis is used as the per-text embedding.
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        embeddings.append(cls_embeddings.numpy())
    return np.vstack(embeddings)


# Compute embeddings
X_train_embeddings = bert_embeddings_batch(X_train)
X_test_embeddings = bert_embeddings_batch(X_test)

# Define classifier
classifier = Sequential([
    Dense(128, activation='relu', input_shape=(768,)),
    Dense(1, activation='sigmoid'),
])
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train classifier
classifier.fit(
    X_train_embeddings,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.1,
)

# Evaluate
test_loss, test_accuracy = classifier.evaluate(X_test_embeddings, y_test)
print(f"Test Accuracy: {test_accuracy}")

# Predictions and confusion matrix
y_pred = (classifier.predict(X_test_embeddings) > 0.5).astype("int32")
y_true = y_test.numpy()
# The classifier ends in Dense(1), so predictions come back as one column;
# flatten them for the metric functions so both label arrays are 1-D.
conf_matrix = confusion_matrix(y_true, y_pred.ravel())
class_report = classification_report(y_true, y_pred.ravel())

print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

# Save the trained model to a file
classifier.save("movie_sentiment_model.h5")