"""Analyze the language distribution and toxicity patterns of a multilingual
toxic comment dataset and save the resulting plots to an ``images`` directory.

The input CSV (dataset/split/train.csv) is expected to provide a `lang`
column and six toxicity label columns (`toxic`, `severe_toxic`, `obscene`,
`threat`, `insult`, `identity_hate`) whose values are treated as binary:
any value greater than zero counts as toxic.
"""

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats


def set_style():
    """Set the global matplotlib style used by all plots."""
    # Start from matplotlib's default style so the settings below apply
    # to a clean slate.
    plt.style.use('default')

    plt.rcParams['figure.figsize'] = (12, 6)
    plt.rcParams['font.size'] = 10
    plt.rcParams['axes.titlesize'] = 14
    plt.rcParams['axes.labelsize'] = 12
    plt.rcParams['axes.grid'] = True
    plt.rcParams['grid.alpha'] = 0.3

    # Pastel palette: seven colors, enough for the six toxicity labels
    # plus one spare.
    colors = ['#FF9999', '#66B2FF', '#99FF99', '#FFCC99', '#FF99CC', '#99FFCC', '#FFB366']
    return colors


def create_language_distribution_plot(df, lang_dist, lang_percent, colors, image_dir):
    """Create and save the language distribution bar plot."""
    plt.figure(figsize=(14, 8))

    # One bar per language.
    x = np.arange(len(lang_dist))
    bars = plt.bar(x, lang_dist.values, color=colors)

    plt.title('Language Distribution in Multilingual Toxic Comment Dataset', pad=20)
    plt.xlabel('Language', labelpad=10)
    plt.ylabel('Number of Comments', labelpad=10)
    plt.xticks(x, lang_dist.index, rotation=45)

    # Annotate each bar with its absolute count and share of the dataset.
    for i, bar in enumerate(bars):
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2., height + (max(lang_dist.values) * 0.01),
                 f'{int(height):,}\n({lang_percent.values[i]:.1f}%)',
                 ha='center', va='bottom', fontsize=10)

    # Leave headroom above the tallest bar for its annotation.
    plt.margins(y=0.2)

    plt.tight_layout()
    plt.savefig(os.path.join(image_dir, 'language_distribution.png'), dpi=300, bbox_inches='tight')
    plt.close()


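# Note on the correlation matrix below: for binary 0/1 labels, the Pearson
# correlation returned by DataFrame.corr() is the phi coefficient, a standard
# measure of association between two binary variables.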
def create_toxicity_heatmap(df, toxicity_cols, image_dir):
    """Create and save the toxicity correlation heatmap."""
    plt.figure(figsize=(12, 10))

    correlation = df[toxicity_cols].corr()

    # Order the labels by their average correlation with all labels so the
    # most strongly co-occurring toxicity types come first.
    mean_corr = correlation.mean()
    sorted_cols = mean_corr.sort_values(ascending=False).index
    correlation = correlation.loc[sorted_cols, sorted_cols]

    # Plain imshow heatmap. vmin=0 clips any negative coefficients; the
    # toxicity labels are expected to correlate non-negatively.
    im = plt.imshow(correlation, cmap='RdYlBu_r', aspect='equal', vmin=0, vmax=1)
    plt.colorbar(im, label='Correlation Coefficient')

    # Write the coefficient into each cell, switching to white text on dark
    # (high-correlation) cells and bolding the diagonal.
    for i in range(len(correlation)):
        for j in range(len(correlation)):
            corr_value = correlation.iloc[i, j]
            text_color = 'white' if abs(corr_value) > 0.7 else 'black'
            fontweight = 'bold' if i == j else 'normal'
            plt.text(j, i, f'{corr_value:.2f}',
                     ha='center', va='center',
                     color=text_color,
                     fontweight=fontweight,
                     fontsize=10)

    plt.title('Correlation between Different Types of Toxicity\n(Sorted by Average Correlation)',
              pad=20, fontsize=14)

    formatted_labels = [col.replace('_', ' ').title() for col in correlation.columns]
    plt.xticks(range(len(formatted_labels)), formatted_labels, rotation=45, ha='right')
    plt.yticks(range(len(formatted_labels)), formatted_labels)

    # The global grid from set_style would draw lines across the cells.
    plt.grid(False)

    plt.tight_layout()
    plt.savefig(os.path.join(image_dir, 'toxicity_correlation.png'), dpi=300, bbox_inches='tight')
    plt.close()


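# Layout note for the grouped bars below: with six labels drawn at width
# 0.15, each group spans 6 * 0.15 = 0.9 x units, so the language ticks are
# placed at the group center, x + width * (n_labels - 1) / 2 = x + 0.375.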
def create_toxicity_by_language_plot(df, lang_dist, toxicity_cols, colors, image_dir):
    """Create and save the grouped bar plot of toxicity types by language."""
    plt.figure(figsize=(15, 8))

    x = np.arange(len(lang_dist.index))
    width = 0.15
    multiplier = 0

    for attribute, color in zip(toxicity_cols, colors):
        # Percentage of comments in each language flagged for this label.
        attribute_means = [(df[df['lang'] == lang][attribute] > 0).mean() * 100
                           for lang in lang_dist.index]

        offset = width * multiplier
        rects = plt.bar(x + offset, attribute_means, width,
                        label=attribute.replace('_', ' ').title(),
                        color=color, alpha=0.8)

        # Label each bar with its percentage.
        for rect in rects:
            height = rect.get_height()
            plt.text(rect.get_x() + rect.get_width() / 2., height,
                     f'{height:.1f}%', ha='center', va='bottom', fontsize=8)

        multiplier += 1

    plt.xlabel('Language')
    plt.ylabel('Percentage of Toxic Comments (%)')
    plt.title('Distribution of Toxicity Types by Language')
    # Center the language labels under each group of bars.
    plt.xticks(x + width * (len(toxicity_cols) - 1) / 2, lang_dist.index, rotation=45)
    plt.legend(loc='upper right', bbox_to_anchor=(1, 1))
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(os.path.join(image_dir, 'toxicity_by_language.png'), dpi=300, bbox_inches='tight')
    plt.close()


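# Note for the stacked plot below: the toxicity labels are not mutually
# exclusive (a comment can carry several labels at once), so the stacked
# segments are independent per-label percentages and a bar's total can
# exceed 100%.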
def create_class_distribution_plot(df, lang_dist, image_dir):
    """Create and save the stacked class distribution across languages."""
    plt.figure(figsize=(16, 10))

    toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    display_names = [col.replace('_', ' ').title() for col in toxicity_cols]

    # Per-language percentages: one entry per toxicity label, plus the share
    # of comments carrying no toxic label at all.
    class_dist = {}
    non_toxic_dist = {}
    for lang in lang_dist.index:
        lang_df = df[df['lang'] == lang]
        total = len(lang_df)

        toxic_matrix = lang_df[toxicity_cols].astype(bool)

        # A comment is non-toxic if none of its labels is set.
        non_toxic_mask = ~toxic_matrix.any(axis=1)
        non_toxic_percent = (non_toxic_mask.sum() / total) * 100
        non_toxic_dist[lang] = non_toxic_percent

        class_dist[lang] = [(toxic_matrix[col].sum() / total) * 100 for col in toxicity_cols]

    x = np.arange(len(lang_dist.index))

    # One color for the non-toxic segment plus one per toxicity label.
    colors = plt.cm.Set3(np.linspace(0, 1, len(toxicity_cols) + 1))

    # Bottom layer: non-toxic comments.
    non_toxic_values = [non_toxic_dist[lang] for lang in lang_dist.index]
    plt.bar(x, non_toxic_values, label='Non-Toxic', color=colors[0], alpha=0.9)

    # Label only the segments large enough to hold text.
    for j, v in enumerate(non_toxic_values):
        if v > 1:
            plt.text(x[j], v / 2, f'{v:.1f}%',
                     ha='center', va='center',
                     color='black',
                     fontweight='bold',
                     fontsize=9)

    bottom = np.array(non_toxic_values)

    # Stack one segment per toxicity label on top of the non-toxic layer.
    for i, (col, display_name) in enumerate(zip(toxicity_cols, display_names)):
        values = [class_dist[lang][i] for lang in lang_dist.index]
        plt.bar(x, values, bottom=bottom, label=display_name, color=colors[i + 1], alpha=0.9)

        for j, v in enumerate(values):
            if v > 1:
                center = bottom[j] + v / 2
                plt.text(x[j], center, f'{v:.1f}%',
                         ha='center', va='center',
                         color='black',
                         fontweight='bold',
                         fontsize=9)
        bottom = bottom + np.array(values)

    plt.xlabel('Language', labelpad=10, fontsize=12)
    plt.ylabel('Percentage of Comments', labelpad=10, fontsize=12)
    plt.title('Distribution of Non-Toxic and Toxic Comments by Language', pad=20, fontsize=14)
    plt.xticks(x, lang_dist.index, rotation=45, fontsize=10)

    plt.legend(title='Comment Types',
               bbox_to_anchor=(1.15, 1),
               loc='upper left',
               fontsize=10,
               title_fontsize=12)

    plt.grid(True, axis='y', alpha=0.3)

    plt.margins(y=0.1)
    plt.tight_layout()
    plt.savefig(os.path.join(image_dir, 'class_distribution.png'), dpi=300, bbox_inches='tight')
    plt.close()


def analyze_language_distribution():
    """Analyze language distribution and toxicity patterns in the dataset."""
    image_dir = 'images'
    os.makedirs(image_dir, exist_ok=True)

    colors = set_style()

    print("Reading dataset...")
    input_file = 'dataset/split/train.csv'
    df = pd.read_csv(input_file)

    lang_dist = df['lang'].value_counts()
    lang_percent = df['lang'].value_counts(normalize=True) * 100

    print("\nDataset Overview:")
    print("-" * 50)
    print(f"Input file: {input_file}")
    print(f"Total number of comments: {len(df):,}")
    print(f"Number of languages: {df['lang'].nunique()}")

    print("\nLanguage Distribution:")
    print("-" * 50)
    for lang, count in lang_dist.items():
        print(f"{lang}: {count:,} comments ({lang_percent[lang]:.2f}%)")

    create_language_distribution_plot(df, lang_dist, lang_percent, colors, image_dir)

    toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    create_toxicity_heatmap(df, toxicity_cols, image_dir)
    create_toxicity_by_language_plot(df, lang_dist, toxicity_cols, colors, image_dir)
    create_class_distribution_plot(df, lang_dist, image_dir)

    print("\nClass Distribution by Language:")
    print("-" * 50)
    for lang in lang_dist.index:
        lang_df = df[df['lang'] == lang]
        total = len(lang_df)

        print(f"\n{lang.upper()} (Total: {total:,} comments)")

        # Number of toxicity labels assigned to each comment (0 to 6).
        toxic_counts = lang_df[toxicity_cols].astype(bool).sum(axis=1)
        class_dist = toxic_counts.value_counts().sort_index()

        for n_classes, count in class_dist.items():
            percentage = (count / total) * 100
            print(f"{n_classes} toxic classes: {count:,} ({percentage:.2f}%)")

    print("\nDetailed Toxicity Analysis by Language:")
    print("-" * 50)
    for lang in lang_dist.index:
        lang_df = df[df['lang'] == lang]
        print(f"\n{lang.upper()} (Total: {len(lang_df):,} comments)")

        for col in toxicity_cols:
            toxic_count = (lang_df[col] > 0).sum()
            toxic_percent = (toxic_count / len(lang_df)) * 100

            # 95% normal-approximation (Wald) confidence interval for the
            # toxic proportion; note it becomes unreliable when the count
            # is very small or zero.
            ci = stats.norm.interval(0.95,
                                     loc=toxic_percent / 100,
                                     scale=np.sqrt((toxic_percent / 100 * (1 - toxic_percent / 100)) / len(lang_df)))
            ci_lower, ci_upper = ci[0] * 100, ci[1] * 100

            print(f"- {col.replace('_', ' ').title()}:")
            print(f"  Count: {toxic_count:,} ({toxic_percent:.2f}%)")
            print(f"  95% CI: [{ci_lower:.2f}%, {ci_upper:.2f}%]")

    print("\nStatistical Analysis:")
    print("-" * 50)

    # Chi-square test: does the number of toxic labels per comment depend
    # on the language?
    toxic_class_counts = pd.crosstab(df['lang'], df[toxicity_cols].astype(bool).sum(axis=1))
    chi2, p_value, _, _ = stats.chi2_contingency(toxic_class_counts)
    print("\nChi-square test for number of toxic classes by language:")
    print(f"Chi-square statistic: {chi2:.2f}")
    print(f"p-value: {p_value:.10f}")
    print(f"Significant at α=0.05: {'Yes' if p_value < 0.05 else 'No'}")

    # Per-label chi-square tests of toxicity rate versus language.
    for col in toxicity_cols:
        binary_col = (df[col] > 0).astype(int)
        contingency_table = pd.crosstab(df['lang'], binary_col)
        chi2, p_value, _, _ = stats.chi2_contingency(contingency_table)
        print(f"\nChi-square test for {col.replace('_', ' ').title()}:")
        print(f"Chi-square statistic: {chi2:.2f}")
        print(f"p-value: {p_value:.10f}")
        print(f"Significant at α=0.05: {'Yes' if p_value < 0.05 else 'No'}")


if __name__ == "__main__":
    analyze_language_distribution()
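
# To run this analysis (the module filename is not specified here; assuming
# it is saved as, e.g., analyze_language_distribution.py):
#
#     python analyze_language_distribution.py
#
# Launch it from the repository root so that the relative paths
# dataset/split/train.csv and images/ resolve correctly.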