# Author: MaroueneA
# Commit c144351 (verified): "Update same as GradioCompararion 2 and set tmp dir if not existent"
import gradio as gr
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import torch
from sentence_transformers import SentenceTransformer
import umap
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tempfile
from collections import Counter
import os
temp_dir = '/tmp/gradio_tmp'
os.makedirs(temp_dir, exist_ok=True) # Creates the directory if it does not exist
os.environ['GRADIO_TEMP_DIR'] = temp_dir
# Load the models and their tokenizers
model_paths = {
"roberta-base-offensive": "./models/roberta-base-offensive",
"distilbert-base-uncased-offensive": "./models/distilbert-base-uncased-offensive",
"bert-offensive":"./models/bert-offensive",
"deberta-offensive":"./models/deberta-offensive"
}
models = {name: AutoModelForSequenceClassification.from_pretrained(path) for name, path in model_paths.items()}
tokenizers = {name: AutoTokenizer.from_pretrained(path) for name, path in model_paths.items()}
# Load the dataset
dataset = load_dataset("tweet_eval", "offensive")
# Initialize Sentence Transformer for embedding generation
model_embedding = SentenceTransformer('all-MiniLM-L6-v2')
def encode(texts, tokenizer):
return tokenizer(texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
def predict(model, inputs):
model.eval()
with torch.no_grad():
outputs = model(**inputs)
preds = outputs.logits.argmax(-1).cpu().numpy()
return preds
def calculate_metrics(labels, preds):
accuracy = accuracy_score(labels, preds)
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
conf_matrix = confusion_matrix(labels, preds)
return accuracy, precision, recall, f1, conf_matrix
def generate_confusion_matrix(conf_matrix, model_name):
plt.figure(figsize=(5, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d")
plt.title(f'Confusion Matrix: {model_name}')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.tight_layout()
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.png')
plt.savefig(temp_file.name)
plt.close()
return temp_file.name
def generate_embeddings_and_plot(categories):
all_texts = sum(categories.values(), [])
embeddings = model_embedding.encode(all_texts)
# UMAP reduction
umap_reducer = umap.UMAP(n_neighbors=15, n_components=2, metric='cosine')
umap_embeddings = umap_reducer.fit_transform(embeddings)
# t-SNE reduction
tsne_embeddings = TSNE(n_components=2, perplexity=30).fit_transform(embeddings)
# Plotting helper function to avoid repetition
def plot_embeddings(embeddings, title, file_suffix):
plt.figure(figsize=(10, 8))
colors = {"correct_both": "green", "incorrect_both": "red", "correct_model1_only": "blue", "correct_model2_only": "orange"}
for category, color in colors.items():
indices = [i for i, text in enumerate(all_texts) if text in categories[category]]
plt.scatter(embeddings[indices, 0], embeddings[indices, 1], label=category, color=color, alpha=0.6)
plt.legend()
plt.title(title)
plt.xlabel('Component 1')
plt.ylabel('Component 2')
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=f'_{file_suffix}.png')
plt.savefig(temp_file.name)
plt.close()
return temp_file.name
# Generate and save plots
umap_plot_path = plot_embeddings(umap_embeddings, "UMAP Projection of Text Categories", "umap")
tsne_plot_path = plot_embeddings(tsne_embeddings, "t-SNE Projection of Text Categories", "tsne")
return umap_plot_path, tsne_plot_path
def compare_models(model1, model2):
# Assuming dataset['test']['text'] returns a list of strings:
test_texts = dataset['test']['text'] # This is directly usable if it's a list
# Directly use the labels as a list, without calling .tolist()
labels = dataset['test']['label']
inputs1 = encode(test_texts, tokenizers[model1])
inputs2 = encode(test_texts, tokenizers[model2])
preds1 = predict(models[model1], inputs1)
preds2 = predict(models[model2], inputs2)
metrics1 = calculate_metrics(labels, preds1)
metrics2 = calculate_metrics(labels, preds2)
categories = {
"correct_both": [],
"incorrect_both": [],
"correct_model1_only": [],
"correct_model2_only": []
}
for i, label in enumerate(labels):
text = test_texts[i]
if preds1[i] == label and preds2[i] == label:
categories["correct_both"].append(text)
elif preds1[i] != label and preds2[i] != label:
categories["incorrect_both"].append(text)
elif preds1[i] == label and preds2[i] != label:
categories["correct_model1_only"].append(text)
elif preds1[i] != label and preds2[i] == label:
categories["correct_model2_only"].append(text)
# Generate metrics DataFrame
metrics_df = pd.DataFrame({
"Metric": ["Accuracy", "Precision", "Recall", "F1 Score"],
model1: metrics1[:-1],
model2: metrics2[:-1],
})
metrics_df["% Difference"] = ((metrics_df[model1] - metrics_df[model2]) / metrics_df[model2] * 100).apply(lambda x: f"{x:.2f}%")
# Confusion matrices and visualizations
conf_matrix_path1 = generate_confusion_matrix(metrics1[-1], model1)
conf_matrix_path2 = generate_confusion_matrix(metrics2[-1], model2)
umap_plot_path, tsne_plot_path = generate_embeddings_and_plot(categories)
return metrics_df, conf_matrix_path1, conf_matrix_path2, umap_plot_path, tsne_plot_path, categories
from sklearn.cluster import KMeans
def generate_embeddings_and_cluster(categories):
all_texts = sum(categories.values(), [])
embeddings = model_embedding.encode(all_texts)
# Category labels for all texts
category_labels = [cat for cat, texts in categories.items() for _ in range(len(texts))]
# Calculate overall category distribution
overall_distribution = Counter(category_labels)
overall_distribution_percent = {k: v / len(category_labels) * 100 for k, v in overall_distribution.items()}
# K-means clustering
kmeans = KMeans(n_clusters=3, random_state=42).fit(embeddings)
labels = kmeans.labels_
# Map each text to its cluster and category
cluster_categories = [[] for _ in range(3)] # Assuming 3 clusters
for label, category in zip(labels, category_labels):
cluster_categories[label].append(category)
# Calculate category distribution within each cluster
cluster_distributions = []
for i, cluster in enumerate(cluster_categories):
distribution = Counter(cluster)
distribution_percent = {k: v / len(cluster) * 100 for k, v in distribution.items()}
cluster_distributions.append(distribution_percent)
# Perform UMAP dimensionality reduction for visualization
umap_reducer = umap.UMAP(n_neighbors=15, n_components=2, metric='cosine')
reduced_embeddings = umap_reducer.fit_transform(embeddings)
# Visualization
plt.figure(figsize=(10, 8))
scatter = plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=labels, cmap='viridis', alpha=0.6)
plt.legend(*scatter.legend_elements(), title="Clusters")
plt.title("K-means Clustering of Text Embeddings")
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
# Save the plot
cluster_plot_path = tempfile.NamedTemporaryFile(delete=False, suffix='_cluster.png').name
plt.savefig(cluster_plot_path)
plt.close()
return cluster_plot_path, overall_distribution_percent, cluster_distributions
def setup_gradio_interface():
with gr.Blocks() as demo:
gr.Markdown("## Model Comparison and Text Analysis")
with gr.Row():
model1_input = gr.Dropdown(list(model_paths.keys()), label="Model 1")
model2_input = gr.Dropdown(list(model_paths.keys()), label="Model 2")
submit_button = gr.Button("Compare")
metrics_output = gr.Dataframe()
with gr.Row():
model1_cm_output = gr.Image(label="Confusion Matrix for Model 1")
model2_cm_output = gr.Image(label="Confusion Matrix for Model 2")
with gr.Row():
umap_visualization_output = gr.Image(label="UMAP Text Categorization Visualization")
tsne_visualization_output = gr.Image(label="t-SNE Text Categorization Visualization")
clustering_visualization_output = gr.Image(label="K-means Clustering Visualization")
category_distribution_output = gr.Dataframe(label="Category Distribution Comparison")
def update_interface(model1, model2):
metrics_df, cm_path1, cm_path2, umap_viz_path, tsne_viz_path, categories = compare_models(model1, model2)
cluster_viz_path, overall_distribution_percent, cluster_distributions = generate_embeddings_and_cluster(categories)
# Prepare DataFrame for category distribution comparison
distribution_data = []
for cluster_index, cluster_distribution in enumerate(cluster_distributions, start=1):
for category, percent in cluster_distribution.items():
distribution_data.append({
"Cluster": f"Cluster {cluster_index}",
"Category": category,
"Percentage": f"{percent:.2f}%",
"Difference from Overall": f"{percent - overall_distribution_percent.get(category, 0):.2f}%"
})
distribution_df = pd.DataFrame(distribution_data)
return metrics_df, cm_path1, cm_path2, umap_viz_path, tsne_viz_path, cluster_viz_path, distribution_df
submit_button.click(
update_interface,
inputs=[model1_input, model2_input],
outputs=[metrics_output, model1_cm_output, model2_cm_output, umap_visualization_output, tsne_visualization_output, clustering_visualization_output, category_distribution_output]
)
return demo
demo = setup_gradio_interface()
demo.launch(share=True)