| """ |
| S3Shastra Research Paper — Publication-Quality Figure Generator |
| =============================================================== |
| Generates 6 figures for the Metadata Profiling / ML Pipeline research paper. |
| |
| Figures: |
| 1. Conceptual Overview of Metadata Profiling |
| 2. Dataset Balancing Strategy (Bar Chart) |
| 3. Character N-Gram & TF-IDF Vectorization Process |
| 4. 3D SVM Decision Hyperplane Visualization |
| 5. Confusion Matrix & Key Evaluation Metrics (Recall Optimization) |
| 6. Full End-to-End System Architecture Diagram |
| |
| Usage: |
| python generate_figures.py |
| -> Saves all figures as high-res PNGs to ./figures/ |
| """ |
|
|
| import os |
| import numpy as np |
| import matplotlib |
| matplotlib.use("Agg") |
| import matplotlib.pyplot as plt |
| import matplotlib.patches as mpatches |
| import matplotlib.gridspec as gridspec |
| from matplotlib.patches import FancyBboxPatch, FancyArrowPatch, ArrowStyle |
| from matplotlib.colors import LinearSegmentedColormap |
| from mpl_toolkits.mplot3d import Axes3D |
| from mpl_toolkits.mplot3d.art3d import Poly3DCollection |
| import matplotlib.patheffects as pe |
|
|
# Directory that receives every generated PNG: a "figures" folder next to this
# script. The path is made absolute so the "Saved ..." messages are unambiguous
# and the location does not depend on the current working directory.
OUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "figures")
# Created at import time so each figure function can save without checking.
os.makedirs(OUT_DIR, exist_ok=True)
|
|
| |
# Shared colour palette (hex RGB strings) used by every figure in this module.
C_PRIMARY = "#1a1a2e"   # titles and primary text/arrows
C_ACCENT = "#e94560"
C_SAFE = "#0f9b8e"      # benign class
C_DANGER = C_ACCENT     # sensitive class; intentionally the same red as C_ACCENT
C_BLUE = "#4361ee"
C_GOLD = "#f9c74f"
C_PURPLE = "#7209b7"
C_BG = "#f8f9fa"        # axes background
C_GRID = "#dee2e6"      # axes edges and light borders
C_TEXT = "#212529"      # default text, label and tick colour
C_LIGHT = "#e9ecef"     # neutral fill for bar/gauge backgrounds
|
|
# Global matplotlib defaults applied to every figure produced by this module.
plt.rcParams.update({
    "font.family": "sans-serif",
    # Candidate fonts in order of preference.
    "font.sans-serif": ["Segoe UI", "Helvetica Neue", "Arial", "DejaVu Sans"],
    "axes.facecolor": C_BG,
    "figure.facecolor": "white",
    "axes.edgecolor": C_GRID,
    # Grids are off by default; individual axes enable them where wanted.
    "axes.grid": False,
    "text.color": C_TEXT,
    "axes.labelcolor": C_TEXT,
    "xtick.color": C_TEXT,
    "ytick.color": C_TEXT,
})
|
|
|
|
| |
| |
| |
def figure1_conceptual_overview():
    """Render Figure 1: conceptual overview of metadata profiling.

    Draws a left-to-right schematic on a blank 16 x 7 canvas: a column of
    example filenames, a box listing the four pipeline stages, and the two
    output classes (BENIGN / SENSITIVE), followed by a caption. The figure is
    written to ``figure1_conceptual_overview.png`` inside ``OUT_DIR`` at
    300 dpi and the saved path is printed.
    """
    fig, ax = plt.subplots(figsize=(16, 7))
    ax.set_xlim(0, 16)
    ax.set_ylim(0, 7)
    ax.axis("off")
    fig.patch.set_facecolor("white")

    # --- Title and subtitle ---
    ax.text(8, 6.6, "Conceptual Overview of Metadata Profiling",
            fontsize=20, fontweight="bold", ha="center", va="top",
            color=C_PRIMARY)
    ax.text(8, 6.15,
            "Binary classification of cloud-object filenames via character-level ML",
            fontsize=11, ha="center", va="top", color="#6c757d", style="italic")

    # --- Left column: example input filenames, coloured by their class ---
    input_files = [
        ("credentials.bak", C_DANGER),
        ("api_secret.env", C_DANGER),
        ("readme.md", C_SAFE),
        ("passport_scan.pdf", C_DANGER),
        ("logo.png", C_SAFE),
        ("ssh_private.key", C_DANGER),
        ("index.html", C_SAFE),
    ]
    box_x, box_w, box_h = 0.3, 3.0, 0.55
    y_start = 5.3
    ax.text(1.8, 5.75, "Incoming Filenames", fontsize=12, fontweight="bold",
            ha="center", va="center", color=C_PRIMARY)
    ax.text(1.8, 5.48, "(unpredictable, mixed)", fontsize=8.5, ha="center",
            va="center", color="#6c757d")
    for i, (name, col) in enumerate(input_files):
        y = y_start - i * 0.65  # stack the boxes downwards
        ax.add_patch(FancyBboxPatch((box_x, y - box_h / 2), box_w, box_h,
                                    boxstyle="round,pad=0.08", facecolor="white",
                                    edgecolor=col, linewidth=1.6))
        ax.text(box_x + box_w / 2, y, name, fontsize=9.5, ha="center",
                va="center", fontfamily="monospace", color=col,
                fontweight="bold")

    # --- Arrow from the filename column into the pipeline box ---
    ax.annotate("", xy=(5.0, 3.3), xytext=(3.6, 3.3),
                arrowprops=dict(arrowstyle="-|>", color=C_PRIMARY, lw=2.5))

    # --- Centre: ML pipeline container and its four stages ---
    ax.add_patch(FancyBboxPatch((5.0, 1.6), 6.0, 3.8,
                                boxstyle="round,pad=0.2", facecolor="#eef1ff",
                                edgecolor=C_BLUE, linewidth=2.5))
    ax.text(8.0, 5.05, "ML Pipeline — Metadata Profiling",
            fontsize=13, fontweight="bold", ha="center", va="center",
            color=C_BLUE)

    steps = [
        ("[1] Char N-Gram\n Tokenization", 4.35),
        ("[2] TF-IDF\n Vectorization", 3.55),
        ("[3] LinearSVC\n Classification", 2.75),
        ("[4] Regex Fallback\n Explainability", 1.95),
    ]
    for label, y in steps:
        ax.add_patch(FancyBboxPatch((5.55, y - 0.28), 4.9, 0.6,
                                    boxstyle="round,pad=0.06", facecolor="white",
                                    edgecolor=C_BLUE, linewidth=1.2, alpha=0.9))
        ax.text(8.0, y + 0.02, label, fontsize=9.5, ha="center", va="center",
                color=C_PRIMARY, fontweight="bold")

    # --- Arrows from the pipeline to the two output classes ---
    ax.annotate("", xy=(12.8, 4.3), xytext=(11.1, 4.0),
                arrowprops=dict(arrowstyle="-|>", color=C_SAFE, lw=2.5))
    ax.annotate("", xy=(12.8, 2.3), xytext=(11.1, 2.8),
                arrowprops=dict(arrowstyle="-|>", color=C_DANGER, lw=2.5))

    # --- Right column: output class boxes ---
    # Each entry: (box_y, title, title_y, flag, flag_y, examples, examples_y,
    #              face colour, edge/text colour)
    outputs = [
        (3.7, "BENIGN", 4.6, "Flag = 0", 4.18,
         "readme.md, logo.png …", 3.88, "#d4edda", C_SAFE),
        (1.6, "SENSITIVE", 2.5, "Flag = 1", 2.08,
         "credentials.bak, ssh_key …", 1.78, "#f8d7da", C_DANGER),
    ]
    for (box_y, title, title_y, flag, flag_y,
         examples, examples_y, face, edge) in outputs:
        ax.add_patch(FancyBboxPatch((12.8, box_y), 2.8, 1.2,
                                    boxstyle="round,pad=0.15", facecolor=face,
                                    edgecolor=edge, linewidth=2.2))
        ax.text(14.2, title_y, title, fontsize=14, fontweight="bold",
                ha="center", va="center", color=edge)
        ax.text(14.2, flag_y, flag, fontsize=12, ha="center", va="center",
                color=edge, fontfamily="monospace")
        ax.text(14.2, examples_y, examples, fontsize=8, ha="center",
                va="center", color="#495057", style="italic")

    # --- Caption ---
    ax.text(8, 0.8,
            "The pipeline ingests arbitrary cloud-object filenames, extracts character-level features,\n"
            "and outputs a binary sensitivity flag with an explainable trigger keyword.",
            fontsize=9.5, ha="center", va="center", color="#6c757d",
            bbox=dict(boxstyle="round,pad=0.3", facecolor="white",
                      edgecolor=C_GRID, linewidth=1))

    fig.tight_layout(pad=0.5)
    path = os.path.join(OUT_DIR, "figure1_conceptual_overview.png")
    fig.savefig(path, dpi=300, bbox_inches="tight")
    plt.close(fig)  # release the figure's memory once it is on disk
    print(f" ✓ Saved {path}")
|
|
|
|
| |
| |
| |
def figure2_dataset_balancing():
    """Render Figure 2: dataset balancing strategy.

    Left panel: grouped bar chart comparing the original and balanced sample
    counts for the sensitive and benign classes. Right panel: a numbered list
    of the six augmentation strategies. The figure is written to
    ``figure2_dataset_balancing.png`` inside ``OUT_DIR`` at 300 dpi and the
    saved path is printed.
    """
    fig = plt.figure(figsize=(15, 7))
    gs = gridspec.GridSpec(1, 2, width_ratios=[1.1, 1], wspace=0.35)

    # --- Left panel: original vs. balanced class counts ---
    ax1 = fig.add_subplot(gs[0])
    categories = ["Sensitive\n(Label=1)", "Benign\n(Label=0)"]

    # Sample counts, ordered (sensitive, benign) to match `categories`.
    original_counts = [1232, 2838]
    balanced_counts = [2100, 2100]

    x = np.arange(len(categories))
    w = 0.32  # bar width; each pair is centred on its tick
    bars_original = ax1.bar(x - w / 2, original_counts, w,
                            label="Original Distribution",
                            color=[C_DANGER, C_SAFE],
                            edgecolor="white", linewidth=1.5, alpha=0.5)
    bars_balanced = ax1.bar(x + w / 2, balanced_counts, w,
                            label="After Balancing",
                            color=[C_DANGER, C_SAFE],
                            edgecolor="white", linewidth=1.5, alpha=1.0,
                            hatch="//")

    # Value labels just above each bar (muted for original, bold for balanced).
    for bars, values, text_colour in (
        (bars_original, original_counts, "#6c757d"),
        (bars_balanced, balanced_counts, C_PRIMARY),
    ):
        for bar, val in zip(bars, values):
            ax1.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 40,
                     f"{val:,}", ha="center", va="bottom", fontsize=10,
                     fontweight="bold", color=text_colour)

    ax1.set_ylabel("Number of Samples", fontsize=12, fontweight="bold")
    ax1.set_title("Original vs. Balanced Distribution", fontsize=14,
                  fontweight="bold", color=C_PRIMARY, pad=12)
    ax1.set_xticks(x)
    ax1.set_xticklabels(categories, fontsize=12)
    ax1.legend(fontsize=10, frameon=True, fancybox=True, shadow=True)
    ax1.set_ylim(0, 3400)  # headroom above the tallest bar for its label
    ax1.spines["top"].set_visible(False)
    ax1.spines["right"].set_visible(False)
    ax1.yaxis.grid(True, alpha=0.3, linestyle="--")

    # --- Right panel: list of augmentation strategies ---
    ax2 = fig.add_subplot(gs[1])
    ax2.axis("off")
    ax2.set_xlim(0, 10)
    ax2.set_ylim(0, 10)

    ax2.text(5, 9.5, "Synthetic Augmentation Strategies", fontsize=14,
             fontweight="bold", ha="center", va="top", color=C_PRIMARY)

    # Each entry: (badge number, heading, description, badge colour)
    strategies = [
        ("1", "Base Keywords",
         "91 sensitive keywords added directly\nas standalone samples",
         C_DANGER),
        ("2", "Extension Variants",
         "Each keyword × 6 common extensions\n(.txt, .json, .csv, .xml, .yaml, .log)",
         C_PURPLE),
        ("3", "Suffix Mutations",
         "keyword_temp{ext} variations to\nsimulate real naming conventions",
         C_BLUE),
        ("4", "Explicit Examples",
         "49 hand-crafted sensitive +\n48 benign real-world filenames",
         C_GOLD),
        ("5", "Benign Dilution",
         "90 benign words × 15 extensions × 2\nvariants to prevent extension bias",
         C_SAFE),
        ("6", "class_weight",
         "LinearSVC class_weight='balanced'\nautomatically adjusts decision boundary",
         "#6c757d"),
    ]

    for i, (num, title, desc, color) in enumerate(strategies):
        y = 8.5 - i * 1.4
        ax2.add_patch(plt.Circle((1.0, y), 0.38, facecolor=color,
                                 edgecolor="white", linewidth=2, zorder=5))
        ax2.text(1.0, y, num, fontsize=13, fontweight="bold", ha="center",
                 va="center", color="white", zorder=6)
        ax2.text(2.0, y + 0.18, title, fontsize=11, fontweight="bold",
                 va="center", color=C_PRIMARY)
        ax2.text(2.0, y - 0.25, desc, fontsize=8.5, va="center",
                 color="#6c757d")

    fig.suptitle("Figure 2: Dataset Balancing Strategy", fontsize=16,
                 fontweight="bold", color=C_PRIMARY, y=1.01)
    fig.tight_layout()
    path = os.path.join(OUT_DIR, "figure2_dataset_balancing.png")
    fig.savefig(path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f" ✓ Saved {path}")
|
|
|
|
| |
| |
| |
def figure3_ngram_tfidf():
    """Render Figure 3: character n-gram extraction and TF-IDF vectorization.

    Three panels on a 2 x 2 grid:

    * top-left  - the example filename split into characters with its
      3-grams and a selection of its 4- and 5-grams;
    * top-right - a horizontal bar chart of illustrative n-gram frequencies;
    * bottom    - a heatmap of illustrative TF-IDF weights per filename.

    The frequencies and weights are fixed illustrative values defined in this
    function, not computed from a corpus. The figure is written to
    ``figure3_ngram_tfidf.png`` inside ``OUT_DIR`` at 300 dpi and the saved
    path is printed.
    """
    fig = plt.figure(figsize=(16, 9))
    gs = gridspec.GridSpec(2, 2, height_ratios=[1, 1.2], hspace=0.4, wspace=0.35)

    # ------------------------------------------------------------------
    # Panel 1: sliding-window n-gram extraction
    # ------------------------------------------------------------------
    ax_slide = fig.add_subplot(gs[0, 0])
    ax_slide.axis("off")
    ax_slide.set_xlim(0, 12)
    ax_slide.set_ylim(0, 6)
    ax_slide.set_title("Step 1: Character N-Gram Extraction", fontsize=13,
                       fontweight="bold", color=C_PRIMARY, pad=10)

    word = "credentials.bak"

    # One small box per character of the example filename.
    char_w = 0.65
    x_start = 0.5
    y_word = 4.8
    for i, ch in enumerate(word):
        ax_slide.add_patch(FancyBboxPatch(
            (x_start + i * char_w, y_word - 0.3), char_w - 0.05, 0.6,
            boxstyle="round,pad=0.03", facecolor="#e8eaf6",
            edgecolor=C_BLUE, linewidth=1.2))
        ax_slide.text(x_start + i * char_w + char_w / 2 - 0.025, y_word,
                      ch, fontsize=11, ha="center", va="center",
                      fontfamily="monospace", fontweight="bold",
                      color=C_PRIMARY)

    ax_slide.text(x_start + len(word) * char_w / 2, 5.5,
                  '"credentials.bak"', fontsize=11, ha="center", va="center",
                  fontfamily="monospace", color=C_BLUE, fontweight="bold")

    # All 3-grams of `word`, derived with a sliding window so the list always
    # matches the filename. The grams covering the ".bak" tail are highlighted.
    trigrams = [word[i:i + 3] for i in range(len(word) - 2)]
    tail_trigrams = {"als", "ls.", "s.b", ".ba", "bak"}
    ax_slide.text(0.5, 3.8, "3-grams (sliding window):", fontsize=9.5,
                  fontweight="bold", color=C_PRIMARY)
    for i, tg in enumerate(trigrams):
        gram_col = C_DANGER if tg in tail_trigrams else C_BLUE
        row, col_idx = divmod(i, 7)  # wrap onto a new row every 7 boxes
        bx = 0.5 + col_idx * 1.5
        by = 3.2 - row * 0.7
        ax_slide.add_patch(FancyBboxPatch(
            (bx, by - 0.22), 1.3, 0.44,
            boxstyle="round,pad=0.04", facecolor="white",
            edgecolor=gram_col, linewidth=1.3))
        ax_slide.text(bx + 0.65, by, tg, fontsize=9.5, ha="center",
                      va="center", fontfamily="monospace", fontweight="bold",
                      color=gram_col)

    # A selection of 4- and 5-grams (first and last few only).
    # Each entry: (row label, label y, grams, x step, box width, box y,
    #              edge colour, text colour)
    longer_gram_rows = [
        ("4-grams: ", 1.6, ["cred", "rede", "eden", "s.ba", ".bak"],
         1.7, 1.5, 1.38, C_PURPLE, C_PURPLE),
        ("5-grams: ", 0.7, ["crede", "reden", "ls.ba", "s.bak"],
         1.9, 1.7, 0.48, C_GOLD, "#b8860b"),
    ]
    for (row_label, label_y, grams, x_step, gram_w, box_y,
         edge_col, text_col) in longer_gram_rows:
        ax_slide.text(0.5, label_y, row_label, fontsize=9, fontweight="bold",
                      color=C_PRIMARY)
        for i, gram in enumerate(grams):
            bx = 2.5 + i * x_step
            ax_slide.add_patch(FancyBboxPatch(
                (bx, box_y), gram_w, 0.44,
                boxstyle="round,pad=0.04", facecolor="white",
                edgecolor=edge_col, linewidth=1.2))
            ax_slide.text(bx + gram_w / 2, label_y, gram, fontsize=9,
                          ha="center", va="center", fontfamily="monospace",
                          fontweight="bold", color=text_col)

    # ------------------------------------------------------------------
    # Panel 2: n-gram frequency bar chart (illustrative values)
    # ------------------------------------------------------------------
    ax_hist = fig.add_subplot(gs[0, 1])
    top_grams = ["bak", ".ba", "s.b", "cre", "ent", "den", "ial", "tia", "als", "red"]
    freqs = [3, 3, 2, 2, 2, 2, 1, 1, 1, 1]
    extension_grams = {"bak", ".ba", "s.b"}
    gram_colors = [C_DANGER if g in extension_grams else C_BLUE
                   for g in top_grams]
    bars = ax_hist.barh(range(len(top_grams)), freqs, color=gram_colors,
                        edgecolor="white", linewidth=1.2, height=0.65)
    ax_hist.set_yticks(range(len(top_grams)))
    ax_hist.set_yticklabels([f'"{g}"' for g in top_grams],
                            fontfamily="monospace", fontsize=10)
    ax_hist.set_xlabel("Frequency (across corpus)", fontsize=11,
                       fontweight="bold")
    ax_hist.set_title("Step 2: N-Gram Frequency Distribution", fontsize=13,
                      fontweight="bold", color=C_PRIMARY, pad=10)
    ax_hist.invert_yaxis()  # first gram at the top
    ax_hist.spines["top"].set_visible(False)
    ax_hist.spines["right"].set_visible(False)
    for bar, freq in zip(bars, freqs):
        ax_hist.text(bar.get_width() + 0.05,
                     bar.get_y() + bar.get_height() / 2,
                     str(freq), va="center", fontsize=10, fontweight="bold",
                     color="#6c757d")

    # ------------------------------------------------------------------
    # Panel 3: TF-IDF weight heatmap (illustrative values)
    # ------------------------------------------------------------------
    ax_heat = fig.add_subplot(gs[1, :])
    filenames_demo = ["credentials.bak", "api_secret.env", "readme.md",
                      "logo.png", "ssh_key.pem"]
    features_demo = ["cre", "bak", ".ba", "api", "sec", "rea", "adm", "log",
                     "ssh", "key", ".pe", "png"]
    # One row per filename, one column per feature, in the orders above.
    weights = np.array([
        [0.42, 0.61, 0.55, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],   # credentials.bak
        [0.0, 0.0, 0.0, 0.48, 0.53, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],    # api_secret.env
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.39, 0.31, 0.0, 0.0, 0.0, 0.0, 0.0],    # readme.md
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.35, 0.0, 0.0, 0.0, 0.28],    # logo.png
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.57, 0.49, 0.44, 0.0],   # ssh_key.pem
    ])

    cmap = LinearSegmentedColormap.from_list(
        "custom", ["#ffffff", "#c5cae9", C_BLUE, C_PRIMARY])
    im = ax_heat.imshow(weights, cmap=cmap, aspect="auto", vmin=0, vmax=0.65)
    ax_heat.set_xticks(range(len(features_demo)))
    ax_heat.set_xticklabels([f'"{f}"' for f in features_demo],
                            fontfamily="monospace", fontsize=10,
                            rotation=35, ha="right")
    ax_heat.set_yticks(range(len(filenames_demo)))
    ax_heat.set_yticklabels(filenames_demo, fontfamily="monospace",
                            fontsize=10.5)
    ax_heat.set_title(
        "Step 3: TF-IDF Weight Matrix (Character N-Gram Features → Numerical Vectors)",
        fontsize=13, fontweight="bold", color=C_PRIMARY, pad=12)

    # Print each non-zero weight inside its cell; switch to white text on the
    # darker cells so the number stays readable.
    for i in range(len(filenames_demo)):
        for j in range(len(features_demo)):
            val = weights[i, j]
            if val > 0.01:
                text_col = "white" if val > 0.35 else C_PRIMARY
                ax_heat.text(j, i, f"{val:.2f}", ha="center", va="center",
                             fontsize=9.5, fontweight="bold", color=text_col)

    cbar = fig.colorbar(im, ax=ax_heat, fraction=0.02, pad=0.02)
    cbar.set_label("TF-IDF Weight", fontsize=11, fontweight="bold")

    # Class tag to the right of each heatmap row.
    # NOTE(review): the arrow glyph below replaces a garbled character in the
    # source; confirm it is the intended symbol.
    sensitive_names = {"credentials.bak", "api_secret.env", "ssh_key.pem"}
    for i, fn in enumerate(filenames_demo):
        is_sens = fn in sensitive_names
        marker_col = C_DANGER if is_sens else C_SAFE
        label = "SENS" if is_sens else "SAFE"
        ax_heat.text(len(features_demo) + 0.3, i, f" ← {label}",
                     fontsize=9, fontweight="bold", color=marker_col,
                     va="center")

    fig.suptitle("Figure 3: Character N-Gram & TF-IDF Vectorization Process",
                 fontsize=16, fontweight="bold", color=C_PRIMARY, y=1.01)
    fig.tight_layout()
    path = os.path.join(OUT_DIR, "figure3_ngram_tfidf.png")
    fig.savefig(path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f" ✓ Saved {path}")
|
|
|
|
| |
| |
| |
def figure4_svm_hyperplane():
    """Render Figure 4: 3D illustration of an SVM decision hyperplane.

    Scatters two synthetic Gaussian clusters (benign / sensitive) in a 3D
    feature space, draws a translucent plane with two offset wireframes
    for the margin, and marks six hand-placed "support vector" points as
    diamonds. All data are synthetic and seeded, so the output is
    reproducible. The figure is written to ``figure4_svm_hyperplane.png``
    inside ``OUT_DIR`` at 300 dpi and the saved path is printed.
    """
    fig = plt.figure(figsize=(14, 9))
    ax = fig.add_subplot(111, projection="3d")

    # Fixed seed; the draw order below determines the exact point positions,
    # so keep it unchanged.
    np.random.seed(42)

    # Benign cluster
    n_b = 60
    b_x = np.random.normal(2.5, 0.9, n_b)
    b_y = np.random.normal(2.0, 0.8, n_b)
    b_z = np.random.normal(1.5, 0.7, n_b)

    # Sensitive cluster
    n_s = 60
    s_x = np.random.normal(6.0, 0.9, n_s)
    s_y = np.random.normal(5.5, 0.8, n_s)
    s_z = np.random.normal(5.0, 0.7, n_s)

    ax.scatter(b_x, b_y, b_z, c=C_BLUE, s=50, alpha=0.7, edgecolors="white",
               linewidth=0.5, label="Benign (Flag=0)", depthshade=True)
    ax.scatter(s_x, s_y, s_z, c=C_DANGER, s=50, alpha=0.7, edgecolors="white",
               linewidth=0.5, label="Sensitive (Flag=1)", depthshade=True)

    # Decision plane z = 0.5*x + 0.3*y - 0.5 sampled on a 20 x 20 grid.
    # NOTE(review): at the benign centre (2.5, 2.0) this plane is at z = 1.35
    # and at the sensitive centre (6.0, 5.5) at z = 4.15, i.e. below both
    # cluster means (1.5 and 5.0), so the sheet does not pass between the two
    # clusters - confirm this is the intended illustration.
    xx, yy = np.meshgrid(np.linspace(0, 8, 20), np.linspace(0, 8, 20))
    zz = 0.5 * xx + 0.3 * yy - 0.5

    ax.plot_surface(xx, yy, zz, alpha=0.18, color="#2ecc71",
                    edgecolor="#27ae60", linewidth=0.3, shade=True)

    # Margin boundaries: the same plane shifted by +/- 1.0 along z.
    for offset in (1.0, -1.0):
        ax.plot_wireframe(xx, yy, zz + offset, alpha=0.08, color="#27ae60",
                          linewidth=0.3, rstride=5, cstride=5)

    # Hand-placed support-vector markers (diamonds with a black outline).
    sv_b = [(3.5, 3.0, 2.5), (3.8, 2.8, 2.2), (4.0, 3.2, 2.8)]
    sv_s = [(5.0, 4.5, 4.0), (4.8, 4.2, 3.8), (5.2, 4.8, 4.5)]
    for points, sv_colour in ((sv_b, C_BLUE), (sv_s, C_DANGER)):
        for pt in points:
            ax.scatter(*pt, c=sv_colour, s=150, edgecolors="black",
                       linewidth=1.8, zorder=10, marker="D")

    ax.set_xlabel("\nTF-IDF Feature Dim 1\n(credential-related n-grams)",
                  fontsize=10, labelpad=8)
    ax.set_ylabel("\nTF-IDF Feature Dim 2\n(extension-related n-grams)",
                  fontsize=10, labelpad=8)
    ax.set_zlabel("\nTF-IDF Feature Dim 3\n(config/key n-grams)",
                  fontsize=10, labelpad=8)

    ax.set_title("Figure 4: SVM Decision Hyperplane in Feature Space\n"
                 "LinearSVC with C=0.5 (increased regularization), class_weight='balanced'",
                 fontsize=14, fontweight="bold", color=C_PRIMARY, pad=20)

    # Custom legend handles (these replace the scatter labels above). The
    # support-vector handle uses the diamond marker so it matches the plot.
    legend_elements = [
        plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=C_BLUE,
                   markersize=10, label="Benign (Flag=0)"),
        plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=C_DANGER,
                   markersize=10, label="Sensitive (Flag=1)"),
        mpatches.Patch(facecolor="#2ecc71", alpha=0.3,
                       label="Decision Hyperplane"),
        plt.Line2D([0], [0], marker="D", color="w", markerfacecolor="gray",
                   markeredgecolor="black", markersize=10,
                   label="Support Vectors"),
    ]
    ax.legend(handles=legend_elements, loc="upper left", fontsize=10,
              frameon=True, fancybox=True, shadow=True)

    ax.view_init(elev=22, azim=135)
    ax.set_xlim(0, 8)
    ax.set_ylim(0, 8)
    ax.set_zlim(0, 8)

    # Explanatory call-out in the lower-right corner (axes coordinates).
    ax.text2D(0.73, 0.08,
              "Low C = 0.5 → wider margin\n"
              "→ better generalization\n"
              "◆ = Support Vectors on margin",
              transform=ax.transAxes, fontsize=9.5,
              bbox=dict(boxstyle="round,pad=0.4", facecolor="#fff9c4",
                        edgecolor=C_GOLD, linewidth=1.5),
              color=C_PRIMARY, fontweight="bold")

    path = os.path.join(OUT_DIR, "figure4_svm_hyperplane.png")
    fig.savefig(path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f" ✓ Saved {path}")
|
|
|
|
| |
| |
| |
def figure5_confusion_matrix():
    """Render Figure 5: confusion matrix and evaluation metrics.

    Three panels: a hand-drawn 2 x 2 confusion matrix (false negatives are
    emphasised), horizontal bars for recall / precision / accuracy / F1, and
    a recall gauge with explanatory bullets. Every number shown is derived
    from the single ``cm`` array defined below, so the panels cannot
    disagree with one another. The figure is written to
    ``figure5_confusion_matrix.png`` inside ``OUT_DIR`` at 300 dpi and the
    saved path is printed.
    """
    fig = plt.figure(figsize=(16, 8))
    gs = gridspec.GridSpec(1, 3, width_ratios=[1.2, 0.8, 1], wspace=0.35)

    # Rows = actual class, columns = predicted class, order (benign, sensitive).
    cm = np.array([[420, 80],
                   [50, 450]])
    tn, fp, fn, tp = (int(v) for v in cm.ravel())
    total = tn + fp + fn + tp
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    accuracy = (tp + tn) / total
    f1_score = 2 * precision * recall / (precision + recall)

    # ------------------------------------------------------------------
    # Panel 1: confusion matrix
    # ------------------------------------------------------------------
    ax_cm = fig.add_subplot(gs[0])

    labels_pred = ["Predicted\nBenign (0)", "Predicted\nSensitive (1)"]
    labels_actual = ["Actual\nBenign (0)", "Actual\nSensitive (1)"]
    cell_names = [["TN", "FP"], ["FN", "TP"]]
    cell_colors = [
        ["#d4edda", "#f8d7da"],
        ["#ffe0b2", "#c8e6c9"],
    ]

    for i in range(2):
        for j in range(2):
            # Row 0 (actual benign) is drawn on top, hence the `1 - i`.
            ax_cm.add_patch(FancyBboxPatch(
                (j, 1 - i), 0.95, 0.9,
                boxstyle="round,pad=0.04",
                facecolor=cell_colors[i][j],
                edgecolor="#adb5bd", linewidth=1.5))
            val = cm[i, j]
            cell_label = cell_names[i][j]

            if cell_label == "FN":
                # False negatives get a larger, outlined number and a
                # warning tag.
                ax_cm.text(j + 0.475, 1.68 - i, f"{val}",
                           fontsize=28, fontweight="bold", ha="center",
                           va="center", color=C_DANGER,
                           path_effects=[pe.withStroke(linewidth=3,
                                                       foreground="white")])
                ax_cm.text(j + 0.475, 1.28 - i, f"[!] {cell_label}",
                           fontsize=14, fontweight="bold", ha="center",
                           va="center", color=C_DANGER)
            else:
                fc = C_SAFE if cell_label in ("TN", "TP") else "#e65100"
                ax_cm.text(j + 0.475, 1.68 - i, f"{val}",
                           fontsize=26, fontweight="bold", ha="center",
                           va="center", color=fc)
                ax_cm.text(j + 0.475, 1.28 - i, cell_label,
                           fontsize=13, fontweight="bold", ha="center",
                           va="center", color=fc, alpha=0.7)

    ax_cm.set_xlim(-0.3, 2.2)
    ax_cm.set_ylim(-0.1, 2.3)
    ax_cm.set_xticks([0.475, 1.475])
    ax_cm.set_xticklabels(labels_pred, fontsize=10, fontweight="bold")
    ax_cm.set_yticks([0.45, 1.45])
    # Reversed because the y axis grows upwards while row 0 is drawn on top.
    ax_cm.set_yticklabels(labels_actual[::-1], fontsize=10, fontweight="bold")
    ax_cm.set_title("Confusion Matrix", fontsize=14, fontweight="bold",
                    color=C_PRIMARY, pad=12)
    for side in ("top", "right", "bottom", "left"):
        ax_cm.spines[side].set_visible(False)
    ax_cm.tick_params(length=0)

    # ------------------------------------------------------------------
    # Panel 2: metric bars
    # ------------------------------------------------------------------
    ax_g = fig.add_subplot(gs[1])
    ax_g.axis("off")
    ax_g.set_xlim(0, 10)
    ax_g.set_ylim(0, 10)

    # Each entry: (name, value in [0, 1], bar colour, formula caption)
    metrics = [
        ("Recall", recall, C_SAFE,
         f"TP / (TP+FN) = {tp}/{tp + fn}"),
        ("Precision", precision, C_BLUE,
         f"TP / (TP+FP) = {tp}/{tp + fp}"),
        ("Accuracy", accuracy, C_PURPLE,
         f"(TP+TN) / Total = {tp + tn}/{total}"),
        ("F1-Score", f1_score, "#e65100", "2·P·R / (P+R)"),
    ]

    ax_g.text(5, 9.7, "Key Metrics", fontsize=14, fontweight="bold",
              ha="center", va="top", color=C_PRIMARY)

    for i, (name, value, color, formula) in enumerate(metrics):
        y = 8.5 - i * 2.2
        # Full-width background track ...
        ax_g.add_patch(FancyBboxPatch((1.0, y - 0.35), 8.0, 0.7,
                                      boxstyle="round,pad=0.06",
                                      facecolor=C_LIGHT, edgecolor=C_GRID,
                                      linewidth=1))
        # ... overlaid by a fill whose width is proportional to the value.
        ax_g.add_patch(FancyBboxPatch((1.0, y - 0.35), 8.0 * value, 0.7,
                                      boxstyle="round,pad=0.06",
                                      facecolor=color, edgecolor="white",
                                      linewidth=1, alpha=0.85))

        ax_g.text(5.0, y, f"{value:.1%}", fontsize=16, fontweight="bold",
                  ha="center", va="center", color="white",
                  path_effects=[pe.withStroke(linewidth=3, foreground=color)])
        ax_g.text(5.0, y + 0.6, name, fontsize=12, fontweight="bold",
                  ha="center", va="center", color=C_PRIMARY)
        ax_g.text(5.0, y - 0.65, formula, fontsize=7.5, ha="center",
                  va="center", color="#6c757d", fontfamily="monospace")

    # ------------------------------------------------------------------
    # Panel 3: recall gauge and rationale
    # ------------------------------------------------------------------
    ax_exp = fig.add_subplot(gs[2])
    ax_exp.axis("off")
    ax_exp.set_xlim(0, 10)
    ax_exp.set_ylim(0, 10)

    ax_exp.text(5, 9.5, "Why Recall is Prioritized", fontsize=14,
                fontweight="bold", ha="center", va="top", color=C_DANGER)

    # Semicircular gauge: grey background arc over [0, pi] ...
    gauge_r = 2.5
    theta = np.linspace(0, np.pi, 100)
    ax_exp.plot(5 + gauge_r * np.cos(theta), 6.5 + gauge_r * np.sin(theta),
                color=C_LIGHT, linewidth=12, solid_capstyle="round")

    # ... and a coloured arc covering the recall fraction of that range.
    theta_fill = np.linspace(0, np.pi * recall, 100)
    ax_exp.plot(5 + gauge_r * np.cos(theta_fill),
                6.5 + gauge_r * np.sin(theta_fill),
                color=C_SAFE, linewidth=12, solid_capstyle="round")

    ax_exp.text(5, 7.0, f"{recall:.1%}", fontsize=28, fontweight="bold",
                ha="center", va="center", color=C_SAFE)
    ax_exp.text(5, 6.2, "RECALL", fontsize=12, fontweight="bold",
                ha="center", va="center", color=C_PRIMARY)

    bullets = [
        "• Missing a sensitive file (FN) is far\n costlier than a false alarm (FP)",
        f"• FN = {fn} — actively minimized by\n class_weight='balanced' + low C",
        "• Security-first: flag everything\n suspicious, triage later",
        "• Regex fallback catches edge cases\n the ML model may miss",
    ]
    for i, txt in enumerate(bullets):
        y = 4.5 - i * 1.1
        # Only the first bullet is bold.
        ax_exp.text(0.5, y, txt, fontsize=9, color=C_PRIMARY, va="top",
                    fontweight="bold" if i == 0 else "normal")

    fig.suptitle("Figure 5: Confusion Matrix & Evaluation Metrics (Recall Optimization)",
                 fontsize=16, fontweight="bold", color=C_PRIMARY, y=1.01)
    fig.tight_layout()
    path = os.path.join(OUT_DIR, "figure5_confusion_matrix.png")
    fig.savefig(path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f" ✓ Saved {path}")
|
|
|
|
| |
| |
| |
def figure6_architecture():
    """Render Figure 6: end-to-end system architecture diagram.

    Draws two rows of labelled boxes on a blank 20 x 11 canvas: the training
    path (top, dashed arrows) and the inference path (bottom, solid arrows),
    connected by a "load .joblib" arrow, plus a line-style key and a colour
    legend. The figure is written to ``figure6_architecture.png`` inside
    ``OUT_DIR`` at 300 dpi and the saved path is printed.
    """
    fig, ax = plt.subplots(figsize=(20, 11))
    ax.set_xlim(0, 20)
    ax.set_ylim(0, 11)
    ax.axis("off")
    fig.patch.set_facecolor("white")

    ax.text(10, 10.7, "Figure 6: End-to-End System Architecture — S3Shastra",
            fontsize=18, fontweight="bold", ha="center", va="top",
            color=C_PRIMARY)
    ax.text(10, 10.25, "Training Path (dotted) · Inference Path (solid)",
            fontsize=11, ha="center", va="top", color="#6c757d",
            style="italic")

    def draw_box(ax, x, y, w, h, label, sublabel="", color=C_BLUE,
                 bg="#eef1ff", fontsize=10, sublabel_size=7.5):
        """Draw a rounded box at (x, y) of size w x h with a bold label.

        When ``sublabel`` is non-empty the label is shifted up and the
        sublabel is drawn in italics beneath it; otherwise the label is
        centred in the box.
        """
        ax.add_patch(FancyBboxPatch((x, y), w, h,
                                    boxstyle="round,pad=0.12", facecolor=bg,
                                    edgecolor=color, linewidth=2))
        if sublabel:
            ax.text(x + w / 2, y + h / 2 + 0.15, label, fontsize=fontsize,
                    fontweight="bold", ha="center", va="center", color=color)
            ax.text(x + w / 2, y + h / 2 - 0.22, sublabel,
                    fontsize=sublabel_size, ha="center", va="center",
                    color="#6c757d", style="italic")
        else:
            ax.text(x + w / 2, y + h / 2, label, fontsize=fontsize,
                    fontweight="bold", ha="center", va="center", color=color)

    def arrow(ax, x1, y1, x2, y2, color=C_PRIMARY, style="-", lw=2):
        """Draw an arrow from (x1, y1) to (x2, y2).

        ``style="dotted"`` selects matplotlib's dashed line style ("--");
        any other value gives a solid line.
        """
        # NOTE(review): the figure text calls the training path "dotted" but
        # the line style used here is dashed - confirm which is intended.
        ls = "--" if style == "dotted" else "-"
        ax.annotate("", xy=(x2, y2), xytext=(x1, y1),
                    arrowprops=dict(arrowstyle="-|>", color=color, lw=lw,
                                    linestyle=ls))

    # ------------------------------------------------------------------
    # Training path (top row)
    # ------------------------------------------------------------------
    ax.text(1.0, 9.5, "TRAINING PATH", fontsize=12, fontweight="bold",
            color=C_PURPLE, rotation=0,
            bbox=dict(boxstyle="round,pad=0.2", facecolor="#f3e5f5",
                      edgecolor=C_PURPLE, linewidth=1.5))

    # Data sources feeding the synthetic generator
    draw_box(ax, 0.3, 7.8, 2.8, 1.2, "Sensitive\nKeywords",
             "91 keywords + variations", color=C_DANGER, bg="#fce4ec")
    draw_box(ax, 4.0, 7.8, 3.0, 1.2, "Synthetic Data\nGenerator",
             "build_dataset()", color=C_PURPLE, bg="#f3e5f5")
    draw_box(ax, 0.3, 6.2, 2.8, 1.1, "Benign Words\nPool",
             "90 words + extensions", color=C_SAFE, bg="#e8f5e9")

    arrow(ax, 3.1, 8.4, 4.0, 8.4, color=C_DANGER, style="dotted")
    arrow(ax, 3.1, 6.75, 4.5, 7.8, color=C_SAFE, style="dotted")

    draw_box(ax, 0.3, 4.8, 2.8, 1.0, "Explicit Examples",
             "49 sens + 48 benign", color=C_GOLD, bg="#fff8e1")
    arrow(ax, 3.1, 5.3, 4.5, 7.8, color=C_GOLD, style="dotted")

    # Dataset -> vectorizer -> training -> saved model
    draw_box(ax, 8.0, 7.8, 2.5, 1.2, "Balanced\nDataset", "~4200 samples",
             color=C_PRIMARY, bg="#e3f2fd")
    arrow(ax, 7.0, 8.4, 8.0, 8.4, color=C_PURPLE, style="dotted")

    draw_box(ax, 11.3, 7.8, 2.8, 1.2, "TF-IDF\nVectorizer",
             "char_wb, ngram(3,5)", color=C_BLUE, bg="#e8eaf6")
    arrow(ax, 10.5, 8.4, 11.3, 8.4, color=C_PRIMARY, style="dotted")

    draw_box(ax, 14.8, 7.8, 2.8, 1.2, "LinearSVC\nTraining",
             "C=0.5, balanced", color=C_DANGER, bg="#fce4ec")
    arrow(ax, 14.1, 8.4, 14.8, 8.4, color=C_PRIMARY, style="dotted")

    draw_box(ax, 18.0, 7.8, 1.7, 1.2, "Saved\nModel", ".joblib",
             color="#6c757d", bg="#f5f5f5")
    arrow(ax, 17.6, 8.4, 18.0, 8.4, color=C_PRIMARY, style="dotted")

    # ------------------------------------------------------------------
    # Inference path (bottom rows)
    # ------------------------------------------------------------------
    # NOTE(review): this label is anchored at (1.0, 5.4), which lies inside
    # the "Explicit Examples" box drawn above (x 0.3-3.1, y 4.8-5.8) - check
    # the rendered figure for overlap.
    ax.text(1.0, 5.4, "INFERENCE PATH", fontsize=12, fontweight="bold",
            color=C_BLUE, rotation=0,
            bbox=dict(boxstyle="round,pad=0.2", facecolor="#e3f2fd",
                      edgecolor=C_BLUE, linewidth=1.5))

    draw_box(ax, 0.3, 3.2, 2.8, 1.2, "Cloud Storage\nBuckets",
             "S3 / GCS / Azure / …", color=C_PRIMARY, bg="#e3f2fd",
             fontsize=10)

    draw_box(ax, 3.8, 3.2, 2.5, 1.2, "Object\nEnumerator",
             "async HTTP scanner", color=C_BLUE, bg="#e8eaf6")
    arrow(ax, 3.1, 3.8, 3.8, 3.8, color=C_PRIMARY)

    draw_box(ax, 7.0, 3.2, 2.3, 1.2, "Extension\nFilter",
             "skip images/media", color="#6c757d", bg="#f5f5f5")
    arrow(ax, 6.3, 3.8, 7.0, 3.8, color=C_PRIMARY)

    draw_box(ax, 10.0, 3.2, 2.8, 1.2, "ML Classifier\n(Pipeline)",
             "TF-IDF → LinearSVC", color=C_BLUE, bg="#e8eaf6")
    arrow(ax, 9.3, 3.8, 10.0, 3.8, color=C_PRIMARY)

    # Link from the saved model (training row) down to the classifier.
    arrow(ax, 18.5, 7.8, 11.4, 4.4, color="#6c757d", style="dotted", lw=1.5)
    ax.text(15.5, 6.2, "load .joblib", fontsize=8, color="#6c757d",
            style="italic", rotation=-15)

    draw_box(ax, 13.5, 3.2, 2.8, 1.2, "Regex Fallback\nEngine",
             "keyword matching", color=C_GOLD, bg="#fff8e1")
    arrow(ax, 12.8, 3.8, 13.5, 3.8, color=C_PRIMARY)

    draw_box(ax, 13.5, 1.4, 2.8, 1.2, "Trigger\nExplainer",
             "get_trigger_explanation()", color="#e65100", bg="#fff3e0")
    arrow(ax, 14.9, 3.2, 14.9, 2.6, color="#e65100")

    draw_box(ax, 17.0, 3.2, 2.7, 1.2, "Classification\nOutput",
             "flag + trigger word", color=C_DANGER, bg="#fce4ec", fontsize=10)
    arrow(ax, 16.3, 3.8, 17.0, 3.8, color=C_PRIMARY)
    arrow(ax, 16.3, 1.9, 17.3, 3.2, color="#e65100")

    draw_box(ax, 17.0, 1.4, 2.7, 1.2, "Dashboard\n& WebSocket",
             "real-time results", color=C_SAFE, bg="#e8f5e9", fontsize=10)
    arrow(ax, 18.35, 3.2, 18.35, 2.6, color=C_SAFE)

    # ------------------------------------------------------------------
    # Line-style key and colour legend
    # ------------------------------------------------------------------
    legend_y = 0.5
    ax.plot([0.5, 1.5], [legend_y, legend_y], color=C_PRIMARY, linewidth=2,
            linestyle="-")
    ax.text(1.7, legend_y, "Inference (Solid)", fontsize=9, va="center",
            color=C_PRIMARY)
    ax.plot([4.0, 5.0], [legend_y, legend_y], color=C_PURPLE, linewidth=2,
            linestyle="--")
    ax.text(5.2, legend_y, "Training (Dotted)", fontsize=9, va="center",
            color=C_PURPLE)

    legend_patches = [
        mpatches.Patch(facecolor="#fce4ec", edgecolor=C_DANGER,
                       label="Sensitive/Classifier"),
        mpatches.Patch(facecolor="#e8f5e9", edgecolor=C_SAFE,
                       label="Benign/Output"),
        mpatches.Patch(facecolor="#e8eaf6", edgecolor=C_BLUE,
                       label="Feature Extraction"),
        mpatches.Patch(facecolor="#fff8e1", edgecolor=C_GOLD,
                       label="Fallback/Explainability"),
    ]
    ax.legend(handles=legend_patches, loc="lower right", fontsize=9,
              frameon=True, fancybox=True, shadow=True, ncol=4,
              bbox_to_anchor=(0.98, -0.02))

    fig.tight_layout(pad=0.5)
    path = os.path.join(OUT_DIR, "figure6_architecture.png")
    fig.savefig(path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f" ✓ Saved {path}")
|
|
|
|
| |
| |
| |
# Script entry point: generate all six figures in order and report where they
# were written.
if __name__ == "__main__":
    print("=" * 60)
    print(" S3Shastra — Research Paper Figure Generator")
    print("=" * 60)
    print(f"\n Output directory: {OUT_DIR}\n")

    figure1_conceptual_overview()
    figure2_dataset_balancing()
    figure3_ngram_tfidf()
    figure4_svm_hyperplane()
    figure5_confusion_matrix()
    figure6_architecture()

    print(f"\n{'=' * 60}")
    print(f" All 6 figures saved to: {OUT_DIR}")
    print(f"{'=' * 60}")
|
|