DouDou
Upload data2/step22/ppt.py with huggingface_hub
1f571a3 verified
import jsonlines
import matplotlib.pyplot as plt
from collections import Counter
import os
# Count how often each file-name suffix occurs in the alignment records.
def file_suffix(file_name: str) -> str:
    """Return the suffix of *file_name*, or ``"no_suffix"`` if it has none.

    The suffix is the text after the last "." of the final path component.
    Directory parts are dropped first (both "/" and "\\" are treated as
    separators) so that a dotted directory such as ``src/v1.2/Makefile``
    is not mistaken for a file with suffix ``2/Makefile``. A name that ends
    in a bare "." is reported as ``"no_suffix"`` rather than as an empty
    label.
    """
    base_name = file_name.replace("\\", "/").rsplit("/", 1)[-1]
    _, dot, suffix = base_name.rpartition(".")
    return suffix if dot and suffix else "no_suffix"


suffix_counter = Counter()
file_path = "./output/alignment.jsonl"

if not os.path.exists(file_path):
    # Keep going with an empty counter; the later steps cope with no data.
    print(f"Error: {file_path} not found.")
else:
    with jsonlines.open(file_path) as reader:
        for obj in reader:
            # Each record is indexed by its 'file' key.
            suffix_counter[file_suffix(obj['file'])] += 1

# (suffix, count) pairs, most frequent first.
sorted_suffixes = suffix_counter.most_common()

# Number of records counted; the denominator for every percentage below.
total_files = sum(suffix_counter.values())
# Print the distribution as a text table: a rich table when the `rich`
# package can be imported, a plain fixed-width listing otherwise.
# Each row is (suffix, count, formatted percentage of total_files).
table_rows = [
    (name, n, f"{(n / total_files) * 100:.1f}%")
    for name, n in sorted_suffixes
]

try:
    from rich.console import Console
    from rich.table import Table
    from rich import box

    console = Console()
    table = Table(title="Language Distribution", box=box.ROUNDED)
    for heading, colour, side in (
        ("Language (Suffix)", "cyan", "left"),
        ("Count", "magenta", "right"),
        ("Percentage", "green", "right"),
    ):
        table.add_column(heading, style=colour, justify=side)
    for name, n, share in table_rows:
        table.add_row(name, str(n), share)
    console.print(table)
except ImportError:
    # rich is unavailable: fall back to the built-in print.
    print(f"\n{'Language':<15} | {'Count':<10} | {'Percentage':<10}")
    print("-" * 45)
    for name, n, share in table_rows:
        print(f"{name:<15} | {n:<10} | {share}")
# Save the distribution as two PNG images: a bar chart and a rendered table.
# Skipped entirely when nothing was counted, since zip(*[]) yields nothing
# to unpack into labels/values.
if sorted_suffixes:
    labels, values = zip(*sorted_suffixes)

    # 1. Bar chart
    bar_fig = plt.figure(figsize=(12, 6))
    bars = plt.bar(labels, values, color='skyblue')
    plt.title('Language Distribution', fontsize=16)
    plt.xlabel('Language', fontsize=12)
    plt.ylabel('Count', fontsize=12)
    plt.xticks(rotation=45)

    # Write each bar's count just above its top edge, centred horizontally.
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2., height,
                 f'{int(height)}',
                 ha='center', va='bottom')

    plt.tight_layout()
    plt.savefig("suffix.png")
    # pyplot keeps every figure registered until it is closed explicitly;
    # release it once the image is on disk.
    plt.close(bar_fig)
    print("\nBar chart saved to suffix.png")

    # 2. Table image
    # Figure height grows with the number of rows.
    table_fig = plt.figure(figsize=(10, len(sorted_suffixes) * 0.5 + 2))
    plt.axis('off')

    cell_text = [
        [suffix, str(count), f"{(count / total_files) * 100:.1f}%"]
        for suffix, count in sorted_suffixes
    ]
    col_labels = ["Language", "Count", "Percentage"]
    col_colors = ["#CCCCFF", "#CCCCFF", "#CCCCFF"]

    table_plot = plt.table(cellText=cell_text,
                           colLabels=col_labels,
                           colColours=col_colors,
                           loc='center',
                           cellLoc='center')
    # Use a fixed font size and enlarge the cells instead of auto-fitting.
    table_plot.auto_set_font_size(False)
    table_plot.set_fontsize(12)
    table_plot.scale(1.2, 1.5)

    plt.title('Language Distribution Table', fontsize=16, y=1.0)

    # tight_layout is not used here because it can clip parts of the table;
    # bbox_inches='tight' at save time crops around the content instead.
    plt.savefig("suffix_table.png", bbox_inches='tight', dpi=300)
    plt.close(table_fig)
    print("Table saved to suffix_table.png")

# NOTE: plt.show() is intentionally not called so the script does not block
# when run without a display.