# Spaces: Sleeping (Hugging Face Spaces status banner captured with the source)
# Standard library
import re

# Third-party (gradio is unused below but kept for the Spaces runtime)
import gradio as gr
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from transformers import BertTokenizer, TFBertModel
# --- One-time setup (all of these hit the network on first run) ---

# NLTK stop-word corpus, needed by remove_stop_words below.
nltk.download('stopwords')

# Pretrained BERT tokenizer and TF encoder used to embed reviews.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

# IMDB movie-review dataset, loaded straight from GitHub.
file_path = "https://raw.githubusercontent.com/alexvatti/full-stack-data-science/main/NLP-Exercises/Movie-Review/IMDB%20Dataset.csv"
movies_df = pd.read_csv(file_path)
| # Clean text | |
def remove_tags(txt):
    """Normalise a raw review string.

    Strips HTML tags and URLs, replaces every remaining non-alphanumeric,
    non-whitespace character with a space, and folds to lower case.
    """
    substitutions = (
        (r'<[^>]+>', ''),          # HTML tags -> removed
        (r'https?://\S+', ''),     # bare URLs -> removed
        (r'[^a-zA-Z0-9\s]', ' '),  # punctuation/symbols -> space
    )
    cleaned = txt
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.lower()
# Lazily-built cache of the English stop-word set. The original code called
# set(stopwords.words('english')) on EVERY invocation, re-reading the corpus
# file once per review (~50k times over the dataset); build it once instead.
_STOP_WORDS = set()


def remove_stop_words(txt):
    """Drop English stop words from a whitespace-tokenised string.

    Parameters
    ----------
    txt : str
        Pre-cleaned review text (already lowercased by remove_tags, which
        matters because the NLTK stop-word list is lower case).

    Returns
    -------
    str
        The surviving words re-joined with single spaces.
    """
    if not _STOP_WORDS:
        # First call: populate the module-level cache from the NLTK corpus.
        _STOP_WORDS.update(stopwords.words('english'))
    return ' '.join(word for word in txt.split() if word not in _STOP_WORDS)
# Clean every review in two passes, then derive a binary target column
# (1 = positive review, 0 = anything else).
for cleaner in (remove_tags, remove_stop_words):
    movies_df['review'] = movies_df['review'].apply(cleaner)
movies_df['Category'] = movies_df['sentiment'].apply(
    lambda sentiment: 1 if sentiment == 'positive' else 0
)

# Hold out 20% of the reviews for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    movies_df['review'],
    movies_df['Category'],
    test_size=0.2,
    random_state=42,
)

# Binary cross-entropy in Keras expects float targets.
y_train = tf.convert_to_tensor(y_train.values, dtype=tf.float32)
y_test = tf.convert_to_tensor(y_test.values, dtype=tf.float32)
| # Compute BERT embeddings | |
def bert_embeddings_batch(texts, batch_size=32, max_length=64):
    """Encode texts into fixed-size BERT [CLS] embeddings, batch by batch.

    Parameters
    ----------
    texts : sequence of str (pandas Series or list)
        Reviews to embed; consumed in positional slices of ``batch_size``.
    batch_size : int
        Number of texts tokenised and pushed through BERT per forward pass.
    max_length : int
        Tokeniser truncation/padding length in tokens.

    Returns
    -------
    numpy.ndarray of shape ``(len(texts), hidden_size)``
        One [CLS] vector per input text (hidden_size is 768 for
        bert-base-uncased). Empty input yields a ``(0, hidden_size)`` array.
    """
    embeddings = []
    for start in range(0, len(texts), batch_size):
        # list(...) works for both pandas Series and plain lists; the
        # original Series-only .tolist() crashed on a list argument.
        batch = list(texts[start:start + batch_size])
        inputs = tokenizer(
            batch,
            return_tensors="tf",
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        outputs = bert_model(
            inputs['input_ids'], attention_mask=inputs['attention_mask']
        )
        # The [CLS] token (position 0) serves as the whole-sequence summary.
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        embeddings.append(cls_embeddings.numpy())
    if not embeddings:
        # np.vstack([]) raises ValueError; return a well-shaped empty array
        # instead so downstream code handles an empty split gracefully.
        return np.zeros((0, bert_model.config.hidden_size), dtype=np.float32)
    return np.vstack(embeddings)
# Embed both splits once up front; the classifier then trains on fixed vectors.
X_train_embeddings = bert_embeddings_batch(X_train)
X_test_embeddings = bert_embeddings_batch(X_test)

# Small feed-forward head over the 768-dim [CLS] embeddings.
classifier = Sequential([
    Dense(128, activation='relu', input_shape=(768,)),
    Dense(1, activation='sigmoid'),
])
classifier.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train with 10% of the training embeddings held out for validation.
classifier.fit(
    X_train_embeddings,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.1,
)

# Held-out evaluation.
test_loss, test_accuracy = classifier.evaluate(X_test_embeddings, y_test)
print(f"Test Accuracy: {test_accuracy}")

# Hard predictions: threshold the sigmoid output at 0.5.
y_pred = (classifier.predict(X_test_embeddings) > 0.5).astype("int32")
conf_matrix = confusion_matrix(y_test.numpy(), y_pred)
class_report = classification_report(y_test.numpy(), y_pred)
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

# Persist the trained head (legacy HDF5 format).
classifier.save("movie_sentiment_model.h5")