File size: 3,814 Bytes
1f571a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import jsonlines
import matplotlib.pyplot as plt
from collections import Counter
import os

# Count how often each file-name suffix occurs in the alignment output,
# print the distribution as a table, and save a bar chart and a table image.
suffix_counter = Counter()

file_path = "./output/alignment.jsonl"


def extract_suffix(file_name):
    """Return the suffix of *file_name*, or ``"no_suffix"`` if it has none.

    The suffix is the text after the last dot of the final path component.
    Only the basename is inspected so that a dot in a directory name
    (``pkg.v2/Makefile``) is not mistaken for an extension separator.  A
    name that ends with a dot is reported as ``"no_suffix"`` rather than as
    an empty label.

    Args:
        file_name: Path string taken from a record's ``'file'`` field.

    Returns:
        The suffix without the leading dot, or ``"no_suffix"``.
    """
    base_name = os.path.basename(file_name)
    _, dot, suffix = base_name.rpartition(".")
    return suffix if dot and suffix else "no_suffix"


def build_rows(counter):
    """Return ``(suffix, count, percentage_text)`` rows, most common first.

    Args:
        counter: A ``Counter`` mapping suffix to number of occurrences.

    Returns:
        A list of tuples; ``percentage_text`` is the share of the total
        formatted with one decimal and a trailing ``%``.  An empty counter
        yields an empty list (no division takes place).
    """
    total = sum(counter.values())
    return [
        (suffix, count, f"{count / total * 100:.1f}%")
        for suffix, count in counter.most_common()
    ]


if not os.path.exists(file_path):
    print(f"Error: {file_path} not found.")
else:
    # Each record is read as a mapping with a 'file' key holding a path.
    with jsonlines.open(file_path) as reader:
        for obj in reader:
            suffix_counter[extract_suffix(obj['file'])] += 1

    # Sorted by descending count for presentation.
    sorted_suffixes = suffix_counter.most_common()

    # Total count
    total_files = sum(suffix_counter.values())

    # Rows shared by the console table and the table image, so the
    # percentage text is computed once.
    rows = build_rows(suffix_counter)

    # rich is optional: only the import is guarded, so an error while
    # rendering is not silently turned into the plain-text fallback.
    try:
        from rich.console import Console
        from rich.table import Table
        from rich import box
    except ImportError:
        # Fallback to standard print
        print(f"\n{'Language':<15} | {'Count':<10} | {'Percentage':<10}")
        print("-" * 45)
        for suffix, count, percentage_text in rows:
            print(f"{suffix:<15} | {count:<10} | {percentage_text}")
    else:
        console = Console()
        table = Table(title="Language Distribution", box=box.ROUNDED)

        table.add_column("Language (Suffix)", style="cyan", justify="left")
        table.add_column("Count", style="magenta", justify="right")
        table.add_column("Percentage", style="green", justify="right")

        for suffix, count, percentage_text in rows:
            table.add_row(suffix, str(count), percentage_text)

        console.print(table)

    # Nothing to plot when the input file contained no records.
    if sorted_suffixes:
        labels, values = zip(*sorted_suffixes)

        # 1. Bar Chart
        bar_figure = plt.figure(figsize=(12, 6))
        bars = plt.bar(labels, values, color='skyblue')
        plt.title('Language Distribution', fontsize=16)
        plt.xlabel('Language', fontsize=12)
        plt.ylabel('Count', fontsize=12)
        plt.xticks(rotation=45)

        # Add value labels on top of bars
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width() / 2., height,
                     f'{int(height)}',
                     ha='center', va='bottom')

        plt.tight_layout()
        plt.savefig("suffix.png")
        # pyplot keeps a figure alive until it is closed explicitly.
        plt.close(bar_figure)
        print("\nBar chart saved to suffix.png")

        # 2. Table Image
        # Figure height grows with the number of rows.
        table_figure = plt.figure(figsize=(10, len(sorted_suffixes) * 0.5 + 2))
        plt.axis('off')

        cell_text = [
            [suffix, str(count), percentage_text]
            for suffix, count, percentage_text in rows
        ]

        col_labels = ["Language", "Count", "Percentage"]
        col_colors = ["#CCCCFF", "#CCCCFF", "#CCCCFF"]

        table_plot = plt.table(cellText=cell_text,
                               colLabels=col_labels,
                               colColours=col_colors,
                               loc='center',
                               cellLoc='center')

        table_plot.auto_set_font_size(False)
        table_plot.set_fontsize(12)
        table_plot.scale(1.2, 1.5)

        plt.title('Language Distribution Table', fontsize=16, y=1.0)
        # tight_layout is not used here because it can clip the table;
        # bbox_inches='tight' at save time is relied on instead.
        plt.savefig("suffix_table.png", bbox_inches='tight', dpi=300)
        plt.close(table_figure)
        print("Table saved to suffix_table.png")
        # NOTE: plt.show() is intentionally not called, to avoid blocking
        # when running headless.