File size: 2,759 Bytes
b144cb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import pandas as pd

# Location of the CSV file that main() loads with pd.read_csv.
DATA_PATH = "dataset/processed/dataset_step2_cleaned.csv"

def main(data_path=None):
    """Print a sanity-check report for the cleaned dataset CSV.

    The report covers: sample/column totals, null counts in the critical
    columns, empty and very short code rows, the set of labels, line-count
    statistics and a per-language breakdown. Everything is written to
    stdout; nothing is returned.

    Sections whose required column is absent are reported as missing
    instead of aborting the whole report.

    Args:
        data_path: Optional path of the CSV to inspect. When omitted, the
            module-level ``DATA_PATH`` is used.
    """
    df = pd.read_csv(DATA_PATH if data_path is None else data_path)

    # -------------------------------
    # Handle label column safely
    # -------------------------------
    if "Label" not in df.columns:
        if "Label (0- HUMAN, 1-AI)" in df.columns:
            df = df.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"})
        else:
            print("⚠️ Label column not found (labels may be added later).")

    print("\n🧪 SANITY CHECK REPORT\n")

    print(f"Total samples: {len(df)}")
    print(f"Total columns: {len(df.columns)}\n")

    # -------------------------------
    # Critical column null checks
    # -------------------------------
    print("🔍 Null value check (critical columns):")
    critical_cols = ["normalized_code", "Language"]
    if "Label" in df.columns:
        critical_cols.append("Label")

    for col in critical_cols:
        if col in df.columns:
            null_count = df[col].isnull().sum()
            print(f"  {col}: {null_count} null values")
        else:
            print(f"  {col}: COLUMN NOT FOUND")

    # -------------------------------
    # Code quality checks
    # -------------------------------
    has_code = "normalized_code" in df.columns
    line_counts = None

    print("\n🧾 Code quality check:")
    if has_code:
        # Null cells become "" (not the literal "nan" that astype(str) would
        # produce), so they count as empty, one-line snippets everywhere.
        codes = ["" if pd.isna(v) else str(v) for v in df["normalized_code"]]
        empty_code = sum(1 for code in codes if not code.strip())
        # Computed once and indexed like df so the per-language section
        # below can reuse it through boolean masks.
        line_counts = pd.Series(
            [len(code.split("\n")) for code in codes],
            index=df.index,
            dtype="int64",
        )
        print(f"  Empty normalized_code rows: {empty_code}")
        print(f"  Very short code (<3 lines): {int((line_counts < 3).sum())}")
    else:
        print("  normalized_code column not found.")

    # -------------------------------
    # Label sanity
    # -------------------------------
    if "Label" in df.columns:
        # Nulls are already reported by the null check above; dropping them
        # keeps the ordering well-defined.
        labels = df["Label"].dropna().unique().tolist()
        try:
            unique_labels = sorted(labels)
        except TypeError:
            # Mixed label types (e.g. int and str) are not mutually orderable.
            unique_labels = sorted(labels, key=str)
        print("\n🏷️ Label check:")
        print(f"  Unique labels found: {unique_labels}")

    # -------------------------------
    # Line count statistics
    # -------------------------------
    print("\n📏 Line count statistics:")
    if "original_line_count" in df.columns and "normalized_line_count" in df.columns:
        print(df[["original_line_count", "normalized_line_count"]].describe())
    else:
        print("  Line count columns not found.")

    # -------------------------------
    # Language-wise sanity summary
    # -------------------------------
    print("\n🌐 Language-wise summary:")
    if "Language" in df.columns:
        languages = df["Language"]
        for lang in languages.unique():
            # NaN never compares equal to itself, so rows with a missing
            # language have to be selected with isna().
            mask = languages.isna() if pd.isna(lang) else languages == lang
            print(f"\nLanguage: {lang}")
            print(f"  Samples: {int(mask.sum())}")
            if line_counts is not None:
                short = int((line_counts[mask] < 3).sum())
                print(f"  Very short code (<3 lines): {short}")
    else:
        print("  Language column not found.")

    print("\nSanity check completed successfully ✅")

# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()