import os
from collections import Counter

import jsonlines
import matplotlib.pyplot as plt

# Summarise the file-suffix ("language") distribution of the records in a
# JSON Lines file: print a table to the terminal, then save a bar chart and a
# rendered table image.

INPUT_PATH = "./output/alignment.jsonl"
BAR_CHART_PATH = "suffix.png"
TABLE_IMAGE_PATH = "suffix_table.png"
NO_SUFFIX = "no_suffix"


def extract_suffix(file_name: str) -> str:
    """Return the extension of *file_name* without the dot.

    Only the final path component is inspected, so dots in directory names
    (``pkg.v2/README``) or a leading ``./`` are not mistaken for an extension.
    Names with no dot, or with nothing after the last dot, yield
    ``"no_suffix"``.
    """
    base_name = os.path.basename(file_name)
    _, dot, suffix = base_name.rpartition(".")
    return suffix if dot and suffix else NO_SUFFIX


def count_suffixes(records) -> Counter:
    """Count suffixes over an iterable of mappings that have a ``'file'`` key.

    Raises:
        KeyError: if a record has no ``'file'`` entry.
    """
    return Counter(extract_suffix(record["file"]) for record in records)


def load_suffix_counts(path: str) -> Counter:
    """Read the JSON Lines file at *path* and count the suffix of each record.

    If *path* does not exist, an error message is printed and an empty
    counter is returned so the caller can carry on without special-casing.
    """
    if not os.path.exists(path):
        print(f"Error: {path} not found.")
        return Counter()
    with jsonlines.open(path) as reader:
        return count_suffixes(reader)


def build_rows(suffix_counter: Counter) -> list:
    """Return ``(suffix, count, percentage)`` string tuples, most common first.

    Percentages are relative to the total number of counted files and are
    formatted with one decimal place. An empty counter gives an empty list
    (no division takes place).
    """
    total_files = sum(suffix_counter.values())
    return [
        (suffix, str(count), f"{(count / total_files) * 100:.1f}%")
        for suffix, count in suffix_counter.most_common()
    ]


def print_table(rows) -> None:
    """Print *rows* as a table, using ``rich`` when it is installed."""
    # Keep the try body to the imports only, so an ImportError raised later
    # during rendering is not silently turned into the plain-text fallback.
    try:
        from rich import box
        from rich.console import Console
        from rich.table import Table
    except ImportError:
        print(f"\n{'Language':<15} | {'Count':<10} | {'Percentage':<10}")
        print("-" * 45)
        for suffix, count, percentage in rows:
            print(f"{suffix:<15} | {count:<10} | {percentage}")
        return

    table = Table(title="Language Distribution", box=box.ROUNDED)
    table.add_column("Language (Suffix)", style="cyan", justify="left")
    table.add_column("Count", style="magenta", justify="right")
    table.add_column("Percentage", style="green", justify="right")
    for row in rows:
        table.add_row(*row)
    Console().print(table)


def save_bar_chart(sorted_suffixes, output_path: str) -> None:
    """Save a bar chart of ``(suffix, count)`` pairs to *output_path*.

    *sorted_suffixes* must be non-empty.
    """
    labels, values = zip(*sorted_suffixes)

    plt.figure(figsize=(12, 6))
    try:
        bars = plt.bar(labels, values, color='skyblue')
        plt.title('Language Distribution', fontsize=16)
        plt.xlabel('Language', fontsize=12)
        plt.ylabel('Count', fontsize=12)
        plt.xticks(rotation=45)

        # Annotate each bar with its count, centred just above the bar.
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width() / 2., height,
                     f'{int(height)}',
                     ha='center', va='bottom')

        plt.tight_layout()
        plt.savefig(output_path)
    finally:
        # Release the figure; pyplot otherwise keeps it alive.
        plt.close()


def save_table_image(rows, output_path: str) -> None:
    """Render *rows* as a table image and save it to *output_path*."""
    # Figure height grows with the number of rows.
    plt.figure(figsize=(10, len(rows) * 0.5 + 2))
    try:
        plt.axis('off')

        col_labels = ["Language", "Count", "Percentage"]
        col_colors = ["#CCCCFF", "#CCCCFF", "#CCCCFF"]

        table_plot = plt.table(cellText=[list(row) for row in rows],
                               colLabels=col_labels,
                               colColours=col_colors,
                               loc='center',
                               cellLoc='center')

        table_plot.auto_set_font_size(False)
        table_plot.set_fontsize(12)
        table_plot.scale(1.2, 1.5)

        plt.title('Language Distribution Table', fontsize=16, y=1.0)

        plt.savefig(output_path, bbox_inches='tight', dpi=300)
    finally:
        plt.close()


def main() -> None:
    """Load the records, print the distribution, and write both images."""
    suffix_counter = load_suffix_counts(INPUT_PATH)
    rows = build_rows(suffix_counter)

    print_table(rows)

    # Nothing to plot when the input was missing or empty.
    if not rows:
        return

    save_bar_chart(suffix_counter.most_common(), BAR_CHART_PATH)
    print(f"\nBar chart saved to {BAR_CHART_PATH}")

    save_table_image(rows, TABLE_IMAGE_PATH)
    print(f"Table saved to {TABLE_IMAGE_PATH}")


if __name__ == "__main__":
    main()