"""Report the distribution of file-name suffixes in an alignment JSONL file.

The script reads ``./output/alignment.jsonl`` (one JSON object per line, each
read through its ``file`` key), counts how often each file-name suffix occurs,
prints the result as a table (using ``rich`` when it is installed, plain text
otherwise) and saves two images: a bar chart (``suffix.png``) and a rendered
table (``suffix_table.png``).
"""
import os
from collections import Counter
from typing import Iterable, Iterator, List, Tuple

FILE_PATH = "./output/alignment.jsonl"
BAR_CHART_PATH = "suffix.png"
TABLE_IMAGE_PATH = "suffix_table.png"
NO_SUFFIX = "no_suffix"

# One table row: (suffix, count, percentage formatted as e.g. "12.5%").
Row = Tuple[str, int, str]


def extract_suffix(file_name: str) -> str:
    """Return the text after the last dot of the final path component.

    Only the last path component is inspected, so a dot inside a directory
    name (``docs.v2/README``) is not mistaken for a suffix separator. Names
    without a dot, or ending in a dot, yield ``"no_suffix"``.

    Args:
        file_name: File name or path; ``/`` and ``\\`` both count as
            separators.

    Returns:
        The suffix without the leading dot, or ``"no_suffix"``.
    """
    base_name = file_name.replace("\\", "/").rsplit("/", 1)[-1]
    _, dot, suffix = base_name.rpartition(".")
    if not dot or not suffix:
        return NO_SUFFIX
    return suffix


def iter_file_names(path: str) -> Iterator[str]:
    """Yield the ``file`` value of every record in the JSONL file at *path*.

    Raises:
        KeyError: If a record has no ``file`` key.
    """
    # Imported lazily (like ``rich`` below) so the pure helpers in this module
    # remain importable when the third-party reader is not installed.
    import jsonlines

    with jsonlines.open(path) as reader:
        for record in reader:
            yield record["file"]


def tally_suffixes(file_names: Iterable[str]) -> Counter:
    """Count how many of *file_names* carry each suffix."""
    return Counter(extract_suffix(name) for name in file_names)


def build_rows(suffix_counts: Counter) -> List[Row]:
    """Turn suffix counts into table rows, most frequent suffix first.

    Args:
        suffix_counts: Mapping of suffix to number of occurrences.

    Returns:
        ``(suffix, count, percentage)`` tuples where the percentage is the
        share of the total formatted with one decimal place. An empty counter
        gives an empty list (no division takes place).
    """
    total_files = sum(suffix_counts.values())
    return [
        (suffix, count, f"{(count / total_files) * 100:.1f}%")
        for suffix, count in suffix_counts.most_common()
    ]


def print_table(rows: List[Row]) -> None:
    """Print *rows* to stdout, as a ``rich`` table when ``rich`` is available."""
    # Only the optional imports are guarded, so an ImportError raised while
    # rendering is not mistaken for "rich is not installed".
    try:
        from rich import box
        from rich.console import Console
        from rich.table import Table
    except ImportError:
        # Fallback to standard print.
        print(f"\n{'Language':<15} | {'Count':<10} | {'Percentage':<10}")
        print("-" * 45)
        for suffix, count, share in rows:
            print(f"{suffix:<15} | {count:<10} | {share}")
        return

    table = Table(title="Language Distribution", box=box.ROUNDED)
    table.add_column("Language (Suffix)", style="cyan", justify="left")
    table.add_column("Count", style="magenta", justify="right")
    table.add_column("Percentage", style="green", justify="right")
    for suffix, count, share in rows:
        table.add_row(suffix, str(count), share)
    Console().print(table)


def save_bar_chart(rows: List[Row], output_path: str = BAR_CHART_PATH) -> None:
    """Save a bar chart of the per-suffix counts to *output_path*."""
    import matplotlib.pyplot as plt

    labels = [suffix for suffix, _, _ in rows]
    values = [count for _, count, _ in rows]

    fig, ax = plt.subplots(figsize=(12, 6))
    bars = ax.bar(labels, values, color="skyblue")
    ax.set_title("Language Distribution", fontsize=16)
    ax.set_xlabel("Language", fontsize=12)
    ax.set_ylabel("Count", fontsize=12)
    ax.tick_params(axis="x", labelrotation=45)

    # Add value labels on top of bars.
    for bar, count in zip(bars, values):
        ax.text(
            bar.get_x() + bar.get_width() / 2.0,
            bar.get_height(),
            str(count),
            ha="center",
            va="bottom",
        )

    fig.tight_layout()
    fig.savefig(output_path)
    # Release the figure; pyplot otherwise keeps it alive until exit.
    plt.close(fig)


def save_table_image(rows: List[Row], output_path: str = TABLE_IMAGE_PATH) -> None:
    """Save *rows* rendered as a table image to *output_path*."""
    import matplotlib.pyplot as plt

    # Figure height grows with the number of rows.
    fig, ax = plt.subplots(figsize=(10, len(rows) * 0.5 + 2))
    ax.axis("off")

    table_plot = ax.table(
        cellText=[[suffix, str(count), share] for suffix, count, share in rows],
        colLabels=["Language", "Count", "Percentage"],
        colColours=["#CCCCFF", "#CCCCFF", "#CCCCFF"],
        loc="center",
        cellLoc="center",
    )
    table_plot.auto_set_font_size(False)
    table_plot.set_fontsize(12)
    table_plot.scale(1.2, 1.5)
    ax.set_title("Language Distribution Table", fontsize=16, y=1.0)

    # tight_layout is not used here as it might cut off parts of the table;
    # bbox_inches='tight' in savefig usually works best for simple tables.
    fig.savefig(output_path, bbox_inches="tight", dpi=300)
    plt.close(fig)


def main() -> None:
    """Count suffixes in ``FILE_PATH``, print the table and save both images."""
    if not os.path.exists(FILE_PATH):
        print(f"Error: {FILE_PATH} not found.")
        return

    rows = build_rows(tally_suffixes(iter_file_names(FILE_PATH)))
    print_table(rows)

    # Nothing to plot for an empty input file.
    if not rows:
        return

    save_bar_chart(rows)
    print(f"\nBar chart saved to {BAR_CHART_PATH}")
    save_table_image(rows)
    print(f"Table saved to {TABLE_IMAGE_PATH}")
    # plt.show() is intentionally not called, to avoid blocking when headless.


if __name__ == "__main__":
    main()