File size: 3,844 Bytes
b89e6d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""Phase 1c: exploratory data analysis.

Produces (a) a stats dict you can dump to JSON for the report, and
(b) PNG plots saved to the eda dir. Keep these in your capstone appendix.
"""
from __future__ import annotations

import json
import re
import sys
from pathlib import Path

import matplotlib

matplotlib.use("Agg")  # headless backend for servers/CI
import matplotlib.pyplot as plt  # noqa: E402
import pandas as pd  # noqa: E402
import seaborn as sns  # noqa: E402

# Add the directory two levels above this file's own directory to sys.path so
# the absolute `src.*` imports used in this module can be resolved.
# NOTE(review): presumably that directory is the repository root — confirm.
sys.path.append(str(Path(__file__).resolve().parents[2]))
from src.config import load_config  # noqa: E402

# Apply seaborn's "whitegrid" theme once, at import time, for all later plots.
sns.set_theme(style="whitegrid")
_WORD_RE = re.compile(r"\b\w+\b")


def _doc_words(s: str) -> int:
    return len(_WORD_RE.findall(s))


def _length_summary(values: pd.Series, *, with_tail: bool = True) -> dict:
    """Summarise a numeric length series as JSON-friendly Python scalars.

    Args:
        values: Per-row lengths; missing entries are ignored.
        with_tail: When true, also report the 95th percentile and the maximum.

    Returns:
        A dict with ``mean`` (rounded to 2 decimals) and ``median`` (truncated
        to int), plus ``p95`` and ``max`` when *with_tail* is set. Every value
        is ``None`` when there is nothing to summarise.
    """
    keys = ("mean", "median", "p95", "max") if with_tail else ("mean", "median")
    observed = values.dropna()
    if observed.empty:
        # median()/quantile() of an empty series are NaN and int(NaN) raises,
        # so report explicit nulls instead of crashing the whole EDA run.
        return dict.fromkeys(keys)

    summary = {
        "mean": round(float(observed.mean()), 2),
        "median": int(observed.median()),
    }
    if with_tail:
        summary["p95"] = int(observed.quantile(0.95))
        summary["max"] = int(observed.max())
    return summary


def compute_stats(df: pd.DataFrame) -> dict:
    """Compute summary statistics for a docstring/code dataset.

    Args:
        df: Frame with ``docstring``, ``code`` and ``language`` columns.
            Docstrings are passed to ``_doc_words`` one by one.
            # NOTE(review): assumes every docstring is a ``str`` — confirm
            # that missing values are removed before this step.

    Returns:
        A JSON-serialisable dict with:

        * ``n_rows``: number of rows in *df*.
        * ``languages``: row count per distinct ``language`` value.
        * ``docstring_words``: mean/median/p95/max of words per docstring.
        * ``code_lines``: mean/median/p95/max of lines per snippet, where a
          snippet's line count is its number of newlines plus one.
        * ``code_chars``: mean/median of characters per snippet.

        Summary values are ``None`` when *df* has no rows to summarise.
    """
    doc_words = df["docstring"].map(_doc_words)
    code_lines = df["code"].str.count("\n") + 1
    code_chars = df["code"].str.len()
    return {
        "n_rows": int(len(df)),
        "languages": df["language"].value_counts().to_dict(),
        "docstring_words": _length_summary(doc_words),
        "code_lines": _length_summary(code_lines),
        "code_chars": _length_summary(code_chars, with_tail=False),
    }


def _save_and_close(fig, path: Path) -> str:
    """Write *fig* to *path* at 120 dpi, close it, and return the path as a string.

    The figure is closed even when layout or saving fails, so a failed plot
    does not stay registered with pyplot.
    """
    try:
        fig.tight_layout()
        fig.savefig(path, dpi=120)
    finally:
        plt.close(fig)
    return str(path)


def make_plots(df: pd.DataFrame, out_dir: str, funnel: pd.DataFrame | None = None) -> list[str]:
    """Render the EDA plots as PNG files and return where they were saved.

    Args:
        df: Frame with ``docstring``, ``code`` and ``language`` columns.
        out_dir: Output directory; created (with parents) if it does not exist.
        funnel: Optional frame with ``step`` and ``rows_remaining`` columns.
            When given, a horizontal bar chart of the cleaning funnel is
            produced as well.

    Returns:
        Paths of the written files, as strings, in the order they were
        created: ``docstring_length.png``, ``code_length.png``,
        ``language_distribution.png`` and, if *funnel* was given,
        ``cleaning_funnel.png``.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    saved = []

    # Docstring length distribution.
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.histplot(df["docstring"].map(_doc_words), bins=40, ax=ax)
    ax.set(title="Docstring length (words)", xlabel="words", ylabel="count")
    saved.append(_save_and_close(fig, out / "docstring_length.png"))

    # Code length distribution (lines). Counts are clipped at 80 so a few very
    # long snippets all land in the last bin instead of stretching the x-axis.
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.histplot((df["code"].str.count("\n") + 1).clip(upper=80), bins=40, ax=ax)
    ax.set(title="Code length (lines, clipped at 80)", xlabel="lines", ylabel="count")
    saved.append(_save_and_close(fig, out / "code_length.png"))

    # Language distribution.
    fig, ax = plt.subplots(figsize=(7, 4))
    df["language"].value_counts().plot(kind="bar", ax=ax)
    ax.set(title="Rows per language", xlabel="language", ylabel="count")
    saved.append(_save_and_close(fig, out / "language_distribution.png"))

    # Cleaning funnel (if provided).
    if funnel is not None:
        fig, ax = plt.subplots(figsize=(7, 4))
        ax.barh(funnel["step"], funnel["rows_remaining"])
        # barh draws the first row at the bottom; flip so steps read top-down.
        ax.invert_yaxis()
        ax.set(title="Cleaning funnel (rows remaining)", xlabel="rows")
        saved.append(_save_and_close(fig, out / "cleaning_funnel.png"))

    return saved


def run_eda(df: pd.DataFrame, cfg=None, funnel: pd.DataFrame | None = None) -> dict:
    """Run the full EDA step: compute stats, render plots, persist a JSON report.

    Args:
        df: Dataset to analyse; forwarded to ``compute_stats`` and ``make_plots``.
        cfg: Configuration object exposing ``paths.eda_dir``. When falsy, the
            result of ``load_config()`` is used instead.
        funnel: Optional cleaning-funnel frame forwarded to ``make_plots``.

    Returns:
        The stats dict from ``compute_stats`` with an extra ``plots`` entry
        listing the saved plot paths. The same dict is written to
        ``eda_stats.json`` inside the EDA directory.
    """
    if not cfg:
        cfg = load_config()

    report = compute_stats(df)
    eda_dir = cfg.paths.eda_dir
    report["plots"] = make_plots(df, eda_dir, funnel)

    # make_plots has already created eda_dir, so the report can go next to the PNGs.
    report_path = Path(eda_dir) / "eda_stats.json"
    with open(report_path, "w") as handle:
        json.dump(report, handle, indent=2)
    return report


if __name__ == "__main__":
    # The load/clean stages are imported here because only the script entry
    # point needs them; importing this module for its functions does not.
    from src.data.clean import clean
    from src.data.load import load_raw

    config = load_config()
    raw_frame = load_raw(config)
    clean_frame, funnel_frame = clean(raw_frame, config)

    # Run the EDA on the cleaned data and echo the resulting report to stdout.
    eda_report = run_eda(clean_frame, config, funnel_frame)
    print(json.dumps(eda_report, indent=2))