# s3shastra / generate_figures.py
# Author: Atharv834 β€” commit 6a4dcb6 ("Deploy S3Shastra backend - FastAPI + scanners + ML models")
"""
S3Shastra Research Paper β€” Publication-Quality Figure Generator
===============================================================
Generates 6 figures for the Metadata Profiling / ML Pipeline research paper.
Figures:
1. Conceptual Overview of Metadata Profiling
2. Dataset Balancing Strategy (Bar Chart)
3. Character N-Gram & TF-IDF Vectorization Process
4. 3D SVM Decision Hyperplane Visualization
5. Confusion Matrix & Key Evaluation Metrics (Recall Optimization)
6. Full End-to-End System Architecture Diagram
Usage:
python generate_figures.py
-> Saves all figures as high-res PNGs to ./figures/
"""
import os
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.gridspec as gridspec
from matplotlib.patches import FancyBboxPatch, FancyArrowPatch, ArrowStyle
from matplotlib.colors import LinearSegmentedColormap
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
import matplotlib.patheffects as pe
# Output directory for all generated figures, resolved relative to this script.
OUT_DIR = os.path.join(os.path.dirname(__file__), "figures")
os.makedirs(OUT_DIR, exist_ok=True)

# ── Colour Palette (consistent across all figures) ──────────────────
C_PRIMARY = "#1a1a2e"  # dark navy
C_ACCENT = "#e94560"   # vivid red/pink
C_SAFE = "#0f9b8e"     # teal / benign-green
C_DANGER = "#e94560"   # red / sensitive
C_BLUE = "#4361ee"     # bright blue
C_GOLD = "#f9c74f"     # gold accent
C_PURPLE = "#7209b7"   # purple
C_BG = "#f8f9fa"       # light background
C_GRID = "#dee2e6"     # grid / neutral border colour
C_TEXT = "#212529"     # default text colour
C_LIGHT = "#e9ecef"    # light fill (e.g. gauge backgrounds)

# Global matplotlib defaults shared by every figure in this module.
plt.rcParams.update({
    "font.family": "sans-serif",
    "font.sans-serif": ["Segoe UI", "Helvetica Neue", "Arial", "DejaVu Sans"],
    "axes.facecolor": C_BG,
    "figure.facecolor": "white",
    "axes.edgecolor": C_GRID,
    "axes.grid": False,
    "text.color": C_TEXT,
    "axes.labelcolor": C_TEXT,
    "xtick.color": C_TEXT,
    "ytick.color": C_TEXT,
})
# ════════════════════════════════════════════════════════════════════
# FIGURE 1 β€” Conceptual Overview of Metadata Profiling
# ════════════════════════════════════════════════════════════════════
def figure1_conceptual_overview():
    """Render Figure 1: conceptual overview of metadata profiling.

    Layout (data coordinates on a 16x7 canvas):
      left   β€” a column of example filenames, outlined by sensitivity colour,
      centre β€” the 4-step ML pipeline box,
      right  β€” the two classification outcomes (BENIGN / SENSITIVE).
    Saves a 300-dpi PNG into OUT_DIR.
    """
    fig, ax = plt.subplots(figsize=(16, 7))
    ax.set_xlim(0, 16)
    ax.set_ylim(0, 7)
    ax.axis("off")
    fig.patch.set_facecolor("white")
    # ── Title ──
    ax.text(8, 6.6, "Conceptual Overview of Metadata Profiling",
            fontsize=20, fontweight="bold", ha="center", va="top", color=C_PRIMARY)
    ax.text(8, 6.15, "Binary classification of cloud-object filenames via character-level ML",
            fontsize=11, ha="center", va="top", color="#6c757d", style="italic")
    # ── Input filenames (left side) ──
    # (filename, outline colour) β€” red marks sensitive, teal marks benign.
    input_files = [
        ("credentials.bak", C_DANGER),
        ("api_secret.env", C_DANGER),
        ("readme.md", C_SAFE),
        ("passport_scan.pdf", C_DANGER),
        ("logo.png", C_SAFE),
        ("ssh_private.key", C_DANGER),
        ("index.html", C_SAFE),
    ]
    box_x, box_w, box_h = 0.3, 3.0, 0.55
    y_start = 5.3
    ax.text(1.8, 5.75, "Incoming Filenames", fontsize=12, fontweight="bold",
            ha="center", va="center", color=C_PRIMARY)
    ax.text(1.8, 5.48, "(unpredictable, mixed)", fontsize=8.5, ha="center",
            va="center", color="#6c757d")
    for i, (name, col) in enumerate(input_files):
        y = y_start - i * 0.65  # stack file boxes downwards, one per row
        rect = FancyBboxPatch((box_x, y - box_h/2), box_w, box_h,
                              boxstyle="round,pad=0.08", facecolor="white",
                              edgecolor=col, linewidth=1.6)
        ax.add_patch(rect)
        ax.text(box_x + box_w/2, y, name, fontsize=9.5, ha="center", va="center",
                fontfamily="monospace", color=col, fontweight="bold")
    # ── Big arrow ── (input column β†’ pipeline box)
    ax.annotate("", xy=(5.0, 3.3), xytext=(3.6, 3.3),
                arrowprops=dict(arrowstyle="-|>", color=C_PRIMARY, lw=2.5))
    # ── ML Pipeline box (center) ──
    ml_box = FancyBboxPatch((5.0, 1.6), 6.0, 3.8,
                            boxstyle="round,pad=0.2", facecolor="#eef1ff",
                            edgecolor=C_BLUE, linewidth=2.5)
    ax.add_patch(ml_box)
    ax.text(8.0, 5.05, "ML Pipeline β€” Metadata Profiling",
            fontsize=13, fontweight="bold", ha="center", va="center", color=C_BLUE)
    # (stage label, y-position) for the four pipeline stages, top to bottom.
    steps = [
        ("[1] Char N-Gram\n Tokenization", 4.35),
        ("[2] TF-IDF\n Vectorization", 3.55),
        ("[3] LinearSVC\n Classification", 2.75),
        ("[4] Regex Fallback\n Explainability", 1.95),
    ]
    for label, y in steps:
        inner = FancyBboxPatch((5.55, y - 0.28), 4.9, 0.6,
                               boxstyle="round,pad=0.06", facecolor="white",
                               edgecolor=C_BLUE, linewidth=1.2, alpha=0.9)
        ax.add_patch(inner)
        ax.text(8.0, y + 0.02, label, fontsize=9.5, ha="center", va="center",
                color=C_PRIMARY, fontweight="bold")
    # ── Output arrows ── (pipeline box β†’ the two outcome boxes)
    ax.annotate("", xy=(12.8, 4.3), xytext=(11.1, 4.0),
                arrowprops=dict(arrowstyle="-|>", color=C_SAFE, lw=2.5))
    ax.annotate("", xy=(12.8, 2.3), xytext=(11.1, 2.8),
                arrowprops=dict(arrowstyle="-|>", color=C_DANGER, lw=2.5))
    # ── Benign output ──
    benign_box = FancyBboxPatch((12.8, 3.7), 2.8, 1.2,
                                boxstyle="round,pad=0.15", facecolor="#d4edda",
                                edgecolor=C_SAFE, linewidth=2.2)
    ax.add_patch(benign_box)
    ax.text(14.2, 4.6, "BENIGN", fontsize=14, fontweight="bold",
            ha="center", va="center", color=C_SAFE)
    ax.text(14.2, 4.18, "Flag = 0", fontsize=12, ha="center", va="center",
            color=C_SAFE, fontfamily="monospace")
    ax.text(14.2, 3.88, "readme.md, logo.png …", fontsize=8, ha="center",
            va="center", color="#495057", style="italic")
    # ── Sensitive output ──
    sens_box = FancyBboxPatch((12.8, 1.6), 2.8, 1.2,
                              boxstyle="round,pad=0.15", facecolor="#f8d7da",
                              edgecolor=C_DANGER, linewidth=2.2)
    ax.add_patch(sens_box)
    ax.text(14.2, 2.5, "SENSITIVE", fontsize=14, fontweight="bold",
            ha="center", va="center", color=C_DANGER)
    ax.text(14.2, 2.08, "Flag = 1", fontsize=12, ha="center", va="center",
            color=C_DANGER, fontfamily="monospace")
    ax.text(14.2, 1.78, "credentials.bak, ssh_key …", fontsize=8, ha="center",
            va="center", color="#495057", style="italic")
    # ── Bottom caption ──
    ax.text(8, 0.8, "The pipeline ingests arbitrary cloud-object filenames, extracts character-level features,\n"
            "and outputs a binary sensitivity flag with an explainable trigger keyword.",
            fontsize=9.5, ha="center", va="center", color="#6c757d",
            bbox=dict(boxstyle="round,pad=0.3", facecolor="white", edgecolor=C_GRID, linewidth=1))
    fig.tight_layout(pad=0.5)
    path = os.path.join(OUT_DIR, "figure1_conceptual_overview.png")
    fig.savefig(path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f" βœ“ Saved {path}")
# ════════════════════════════════════════════════════════════════════
# FIGURE 2 β€” Dataset Balancing Strategy (Bar Chart)
# ════════════════════════════════════════════════════════════════════
def figure2_dataset_balancing():
    """Render Figure 2: dataset balancing strategy.

    Left panel: grouped bars comparing the original class distribution with
    the post-balancing distribution. Right panel: numbered breakdown of the
    six augmentation/balancing strategies. Saves a 300-dpi PNG into OUT_DIR.
    """
    fig = plt.figure(figsize=(15, 7))
    gs = gridspec.GridSpec(1, 2, width_ratios=[1.1, 1], wspace=0.35)
    # ── Left: Bar chart ──
    ax1 = fig.add_subplot(gs[0])
    categories = ["Sensitive\n(Label=1)", "Benign\n(Label=0)"]
    # Simulated counts matching actual dataset generation logic
    # SENSITIVE: 49 examples + ~91 keywords * (1 base + 6 ext * 2 variations) = 49 + 91*13 = ~1232
    # BENIGN: 48 examples + ~90 benign_words * (1 base + 15 ext * 2 variations) = 48 + 90*31 = ~2838
    original_sens = 1232
    original_benign = 2838
    # After balancing via class_weight='balanced' + synthetic augmentation
    balanced_sens = 2100
    balanced_benign = 2100
    x = np.arange(len(categories))
    w = 0.32  # bar width (two bars per category)
    bars1 = ax1.bar(x - w/2, [original_sens, original_benign], w,
                    label="Original Distribution", color=[C_DANGER, C_SAFE],
                    edgecolor="white", linewidth=1.5, alpha=0.5)
    bars2 = ax1.bar(x + w/2, [balanced_sens, balanced_benign], w,
                    label="After Balancing", color=[C_DANGER, C_SAFE],
                    edgecolor="white", linewidth=1.5, alpha=1.0,
                    hatch="//")
    # Sample-count labels above each bar.
    for bar, val in zip(bars1, [original_sens, original_benign]):
        ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 40,
                 f"{val:,}", ha="center", va="bottom", fontsize=10, fontweight="bold",
                 color="#6c757d")
    for bar, val in zip(bars2, [balanced_sens, balanced_benign]):
        ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 40,
                 f"{val:,}", ha="center", va="bottom", fontsize=10, fontweight="bold",
                 color=C_PRIMARY)
    ax1.set_ylabel("Number of Samples", fontsize=12, fontweight="bold")
    ax1.set_title("Original vs. Balanced Distribution", fontsize=14, fontweight="bold",
                  color=C_PRIMARY, pad=12)
    ax1.set_xticks(x)
    ax1.set_xticklabels(categories, fontsize=12)
    ax1.legend(fontsize=10, frameon=True, fancybox=True, shadow=True)
    ax1.set_ylim(0, 3400)
    ax1.spines["top"].set_visible(False)
    ax1.spines["right"].set_visible(False)
    ax1.yaxis.grid(True, alpha=0.3, linestyle="--")
    # ── Right: Augmentation strategy breakdown ──
    ax2 = fig.add_subplot(gs[1])
    ax2.axis("off")
    ax2.set_xlim(0, 10)
    ax2.set_ylim(0, 10)
    ax2.text(5, 9.5, "Synthetic Augmentation Strategies", fontsize=14,
             fontweight="bold", ha="center", va="top", color=C_PRIMARY)
    # (badge number, title, description, badge colour) per strategy row.
    strategies = [
        ("1", "Base Keywords", "91 sensitive keywords added directly\nas standalone samples", C_DANGER),
        ("2", "Extension Variants", "Each keyword Γ— 6 common extensions\n(.txt, .json, .csv, .xml, .yaml, .log)", C_PURPLE),
        ("3", "Suffix Mutations", "keyword_temp{ext} variations to\nsimulate real naming conventions", C_BLUE),
        ("4", "Explicit Examples", "49 hand-crafted sensitive +\n48 benign real-world filenames", C_GOLD),
        ("5", "Benign Dilution", "90 benign words Γ— 15 extensions Γ— 2\nvariants to prevent extension bias", C_SAFE),
        ("6", "class_weight", "LinearSVC class_weight='balanced'\nautomatically adjusts decision boundary", "#6c757d"),
    ]
    for i, (num, title, desc, color) in enumerate(strategies):
        y = 8.5 - i * 1.4  # vertical spacing between strategy rows
        circle = plt.Circle((1.0, y), 0.38, facecolor=color, edgecolor="white",
                            linewidth=2, zorder=5)
        ax2.add_patch(circle)
        ax2.text(1.0, y, num, fontsize=13, fontweight="bold", ha="center",
                 va="center", color="white", zorder=6)
        ax2.text(2.0, y + 0.18, title, fontsize=11, fontweight="bold",
                 va="center", color=C_PRIMARY)
        ax2.text(2.0, y - 0.25, desc, fontsize=8.5, va="center", color="#6c757d")
    fig.suptitle("Figure 2: Dataset Balancing Strategy", fontsize=16, fontweight="bold",
                 color=C_PRIMARY, y=1.01)
    fig.tight_layout()
    path = os.path.join(OUT_DIR, "figure2_dataset_balancing.png")
    fig.savefig(path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f" βœ“ Saved {path}")
# ════════════════════════════════════════════════════════════════════
# FIGURE 3 β€” Character N-Gram & TF-IDF Vectorization Process
# ════════════════════════════════════════════════════════════════════
def figure3_ngram_tfidf():
    """Render Figure 3: character n-gram extraction and TF-IDF vectorization.

    Three panels: (1) sliding-window n-gram extraction over an example
    filename, (2) an illustrative n-gram frequency histogram, (3) a small
    hand-filled TF-IDF weight matrix with per-row sensitivity markers.
    All numbers are illustrative demo values. Saves a 300-dpi PNG.
    """
    fig = plt.figure(figsize=(16, 9))
    gs = gridspec.GridSpec(2, 2, height_ratios=[1, 1.2], hspace=0.4, wspace=0.35)
    # ── Top-Left: Sliding window illustration ──
    ax_slide = fig.add_subplot(gs[0, 0])
    ax_slide.axis("off")
    ax_slide.set_xlim(0, 12)
    ax_slide.set_ylim(0, 6)
    ax_slide.set_title("Step 1: Character N-Gram Extraction", fontsize=13,
                       fontweight="bold", color=C_PRIMARY, pad=10)
    word = "credentials.bak"  # example filename driving all panels
    # Draw each character in a box
    char_w = 0.65
    x_start = 0.5
    y_word = 4.8
    for i, ch in enumerate(word):
        rect = FancyBboxPatch((x_start + i * char_w, y_word - 0.3), char_w - 0.05, 0.6,
                              boxstyle="round,pad=0.03", facecolor="#e8eaf6",
                              edgecolor=C_BLUE, linewidth=1.2)
        ax_slide.add_patch(rect)
        ax_slide.text(x_start + i * char_w + char_w/2 - 0.025, y_word,
                      ch, fontsize=11, ha="center", va="center",
                      fontfamily="monospace", fontweight="bold", color=C_PRIMARY)
    ax_slide.text(x_start + len(word) * char_w / 2, 5.5,
                  '"credentials.bak"', fontsize=11, ha="center", va="center",
                  fontfamily="monospace", color=C_BLUE, fontweight="bold")
    # Show sliding windows (3-grams); extension-related grams highlighted red.
    trigrams = ["cre", "red", "ede", "den", "ent", "nti", "tia", "ial", "als", "ls.", "s.b", ".ba", "bak"]
    colors_3 = [C_DANGER if g in ["als", "ls.", "s.b", ".ba", "bak"] else C_BLUE for g in trigrams]
    ax_slide.text(0.5, 3.8, "3-grams (sliding window):", fontsize=9.5,
                  fontweight="bold", color=C_PRIMARY)
    for i, (tg, col) in enumerate(zip(trigrams, colors_3)):
        row, col_idx = divmod(i, 7)  # wrap after 7 grams per row
        bx = 0.5 + col_idx * 1.5
        by = 3.2 - row * 0.7
        rect = FancyBboxPatch((bx, by - 0.22), 1.3, 0.44,
                              boxstyle="round,pad=0.04", facecolor="white",
                              edgecolor=col, linewidth=1.3)
        ax_slide.add_patch(rect)
        ax_slide.text(bx + 0.65, by, tg, fontsize=9.5, ha="center", va="center",
                      fontfamily="monospace", fontweight="bold", color=col)
    # Show 4-grams and 5-grams label (illustrative subsets, not exhaustive)
    ax_slide.text(0.5, 1.6, "4-grams: ", fontsize=9, fontweight="bold", color=C_PRIMARY)
    fourgrams = ["cred", "rede", "eden", "s.ba", ".bak"]
    for i, fg in enumerate(fourgrams):
        bx = 2.5 + i * 1.7
        rect = FancyBboxPatch((bx, 1.38), 1.5, 0.44,
                              boxstyle="round,pad=0.04", facecolor="white",
                              edgecolor=C_PURPLE, linewidth=1.2)
        ax_slide.add_patch(rect)
        ax_slide.text(bx + 0.75, 1.6, fg, fontsize=9, ha="center", va="center",
                      fontfamily="monospace", fontweight="bold", color=C_PURPLE)
    ax_slide.text(0.5, 0.7, "5-grams: ", fontsize=9, fontweight="bold", color=C_PRIMARY)
    fivegrams = ["crede", "reden", "ls.ba", "s.bak"]
    for i, fg in enumerate(fivegrams):
        bx = 2.5 + i * 1.9
        rect = FancyBboxPatch((bx, 0.48), 1.7, 0.44,
                              boxstyle="round,pad=0.04", facecolor="white",
                              edgecolor=C_GOLD, linewidth=1.2)
        ax_slide.add_patch(rect)
        ax_slide.text(bx + 0.85, 0.7, fg, fontsize=9, ha="center", va="center",
                      fontfamily="monospace", fontweight="bold", color="#b8860b")
    # ── Top-Right: N-gram frequency histogram ──
    ax_hist = fig.add_subplot(gs[0, 1])
    top_grams = ["bak", ".ba", "s.b", "cre", "ent", "den", "ial", "tia", "als", "red"]
    freqs = [3, 3, 2, 2, 2, 2, 1, 1, 1, 1]  # illustrative corpus counts
    gram_colors = [C_DANGER if g in ["bak", ".ba", "s.b"] else C_BLUE for g in top_grams]
    bars = ax_hist.barh(range(len(top_grams)), freqs, color=gram_colors,
                        edgecolor="white", linewidth=1.2, height=0.65)
    ax_hist.set_yticks(range(len(top_grams)))
    ax_hist.set_yticklabels([f'"{g}"' for g in top_grams], fontfamily="monospace", fontsize=10)
    ax_hist.set_xlabel("Frequency (across corpus)", fontsize=11, fontweight="bold")
    ax_hist.set_title("Step 2: N-Gram Frequency Distribution", fontsize=13,
                      fontweight="bold", color=C_PRIMARY, pad=10)
    ax_hist.invert_yaxis()  # most frequent gram on top
    ax_hist.spines["top"].set_visible(False)
    ax_hist.spines["right"].set_visible(False)
    for bar, f in zip(bars, freqs):
        ax_hist.text(bar.get_width() + 0.05, bar.get_y() + bar.get_height()/2,
                     str(f), va="center", fontsize=10, fontweight="bold", color="#6c757d")
    # ── Bottom: TF-IDF weight heatmap ──
    ax_heat = fig.add_subplot(gs[1, :])
    filenames_demo = ["credentials.bak", "api_secret.env", "readme.md", "logo.png", "ssh_key.pem"]
    features_demo = ["cre", "bak", ".ba", "api", "sec", "rea", "adm", "log", "ssh", "key", ".pe", "png"]
    np.random.seed(42)
    # One row per demo filename; weights are hand-filled illustrative values.
    weights = np.zeros((len(filenames_demo), len(features_demo)))
    # credentials.bak
    weights[0] = [0.42, 0.61, 0.55, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    # api_secret.env
    weights[1] = [0.0, 0.0, 0.0, 0.48, 0.53, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    # readme.md
    weights[2] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.39, 0.31, 0.0, 0.0, 0.0, 0.0, 0.0]
    # logo.png
    weights[3] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.35, 0.0, 0.0, 0.0, 0.28]
    # ssh_key.pem
    weights[4] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.57, 0.49, 0.44, 0.0]
    cmap = LinearSegmentedColormap.from_list("custom", ["#ffffff", "#c5cae9", C_BLUE, C_PRIMARY])
    im = ax_heat.imshow(weights, cmap=cmap, aspect="auto", vmin=0, vmax=0.65)
    ax_heat.set_xticks(range(len(features_demo)))
    ax_heat.set_xticklabels([f'"{f}"' for f in features_demo], fontfamily="monospace",
                            fontsize=10, rotation=35, ha="right")
    ax_heat.set_yticks(range(len(filenames_demo)))
    ax_heat.set_yticklabels(filenames_demo, fontfamily="monospace", fontsize=10.5)
    ax_heat.set_title("Step 3: TF-IDF Weight Matrix (Character N-Gram Features β†’ Numerical Vectors)",
                      fontsize=13, fontweight="bold", color=C_PRIMARY, pad=12)
    # Annotate cells (skip zero cells; white text on dark cells for contrast)
    for i in range(len(filenames_demo)):
        for j in range(len(features_demo)):
            val = weights[i, j]
            if val > 0.01:
                text_col = "white" if val > 0.35 else C_PRIMARY
                ax_heat.text(j, i, f"{val:.2f}", ha="center", va="center",
                             fontsize=9.5, fontweight="bold", color=text_col)
    cbar = fig.colorbar(im, ax=ax_heat, fraction=0.02, pad=0.02)
    cbar.set_label("TF-IDF Weight", fontsize=11, fontweight="bold")
    # Row labels for sensitivity
    for i, fn in enumerate(filenames_demo):
        is_sens = fn in ["credentials.bak", "api_secret.env", "ssh_key.pem"]
        marker_col = C_DANGER if is_sens else C_SAFE
        label = "SENS" if is_sens else "SAFE"
        ax_heat.text(len(features_demo) + 0.3, i, f" ← {label}",
                     fontsize=9, fontweight="bold", color=marker_col, va="center")
    fig.suptitle("Figure 3: Character N-Gram & TF-IDF Vectorization Process",
                 fontsize=16, fontweight="bold", color=C_PRIMARY, y=1.01)
    fig.tight_layout()
    path = os.path.join(OUT_DIR, "figure3_ngram_tfidf.png")
    fig.savefig(path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f" βœ“ Saved {path}")
# ════════════════════════════════════════════════════════════════════
# FIGURE 4 β€” 3D SVM Decision Hyperplane Visualization
# ════════════════════════════════════════════════════════════════════
def figure4_svm_hyperplane():
    """Render Figure 4: illustrative 3D view of the SVM decision hyperplane.

    Plots two synthetic Gaussian clusters (benign vs sensitive), a separating
    plane with wireframe margin planes, and diamond-marked support vectors.
    Fix vs. original: the legend proxy for "Support Vectors" now uses the same
    diamond marker ("D") as the plotted points and the "β—†" annotation text;
    it previously showed a square ("s"). Saves a 300-dpi PNG into OUT_DIR.
    """
    fig = plt.figure(figsize=(14, 9))
    ax = fig.add_subplot(111, projection="3d")
    np.random.seed(42)  # reproducible synthetic clusters
    # Benign cluster
    n_b = 60
    b_x = np.random.normal(2.5, 0.9, n_b)
    b_y = np.random.normal(2.0, 0.8, n_b)
    b_z = np.random.normal(1.5, 0.7, n_b)
    # Sensitive cluster
    n_s = 60
    s_x = np.random.normal(6.0, 0.9, n_s)
    s_y = np.random.normal(5.5, 0.8, n_s)
    s_z = np.random.normal(5.0, 0.7, n_s)
    # Plot benign
    ax.scatter(b_x, b_y, b_z, c=C_BLUE, s=50, alpha=0.7, edgecolors="white",
               linewidth=0.5, label="Benign (Flag=0)", depthshade=True)
    # Plot sensitive
    ax.scatter(s_x, s_y, s_z, c=C_DANGER, s=50, alpha=0.7, edgecolors="white",
               linewidth=0.5, label="Sensitive (Flag=1)", depthshade=True)
    # Decision hyperplane (tilted plane between clusters)
    xx, yy = np.meshgrid(np.linspace(0, 8, 20), np.linspace(0, 8, 20))
    # z = a*x + b*y + c defining a separating plane
    zz = 0.5 * xx + 0.3 * yy - 0.5
    ax.plot_surface(xx, yy, zz, alpha=0.18, color="#2ecc71", edgecolor="#27ae60",
                    linewidth=0.3, shade=True)
    # Margin planes (dashed effect via wireframe), offset Β±1 from the plane
    zz_upper = zz + 1.0
    zz_lower = zz - 1.0
    ax.plot_wireframe(xx, yy, zz_upper, alpha=0.08, color="#27ae60", linewidth=0.3,
                      rstride=5, cstride=5)
    ax.plot_wireframe(xx, yy, zz_lower, alpha=0.08, color="#27ae60", linewidth=0.3,
                      rstride=5, cstride=5)
    # Support vectors (highlighted points on margin), drawn as diamonds ("D")
    sv_b = [(3.5, 3.0, 2.5), (3.8, 2.8, 2.2), (4.0, 3.2, 2.8)]
    sv_s = [(5.0, 4.5, 4.0), (4.8, 4.2, 3.8), (5.2, 4.8, 4.5)]
    for pt in sv_b:
        ax.scatter(*pt, c=C_BLUE, s=150, edgecolors="black", linewidth=1.8,
                   zorder=10, marker="D")
    for pt in sv_s:
        ax.scatter(*pt, c=C_DANGER, s=150, edgecolors="black", linewidth=1.8,
                   zorder=10, marker="D")
    ax.set_xlabel("\nTF-IDF Feature Dim 1\n(credential-related n-grams)", fontsize=10,
                  labelpad=8)
    ax.set_ylabel("\nTF-IDF Feature Dim 2\n(extension-related n-grams)", fontsize=10,
                  labelpad=8)
    ax.set_zlabel("\nTF-IDF Feature Dim 3\n(config/key n-grams)", fontsize=10,
                  labelpad=8)
    ax.set_title("Figure 4: SVM Decision Hyperplane in Feature Space\n"
                 "LinearSVC with C=0.5 (increased regularization), class_weight='balanced'",
                 fontsize=14, fontweight="bold", color=C_PRIMARY, pad=20)
    # Legend via proxy artists; the support-vector proxy uses marker "D"
    # to match the diamonds actually plotted above (was "s" β€” a square).
    legend_elements = [
        plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=C_BLUE,
                   markersize=10, label="Benign (Flag=0)"),
        plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=C_DANGER,
                   markersize=10, label="Sensitive (Flag=1)"),
        mpatches.Patch(facecolor="#2ecc71", alpha=0.3, label="Decision Hyperplane"),
        plt.Line2D([0], [0], marker="D", color="w", markerfacecolor="gray",
                   markeredgecolor="black", markersize=10, label="Support Vectors"),
    ]
    ax.legend(handles=legend_elements, loc="upper left", fontsize=10,
              frameon=True, fancybox=True, shadow=True)
    ax.view_init(elev=22, azim=135)
    ax.set_xlim(0, 8)
    ax.set_ylim(0, 8)
    ax.set_zlim(0, 8)
    # Annotation box
    ax.text2D(0.73, 0.08,
              "Low C = 0.5 β†’ wider margin\n"
              "β†’ better generalization\n"
              "β—† = Support Vectors on margin",
              transform=ax.transAxes, fontsize=9.5,
              bbox=dict(boxstyle="round,pad=0.4", facecolor="#fff9c4",
                        edgecolor=C_GOLD, linewidth=1.5),
              color=C_PRIMARY, fontweight="bold")
    path = os.path.join(OUT_DIR, "figure4_svm_hyperplane.png")
    fig.savefig(path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f" βœ“ Saved {path}")
# ════════════════════════════════════════════════════════════════════
# FIGURE 5 β€” Confusion Matrix & Key Evaluation Metrics
# ════════════════════════════════════════════════════════════════════
def figure5_confusion_matrix():
    """Render Figure 5: confusion matrix, metric gauges, and recall rationale.

    Improvement vs. original: every displayed metric (recall, precision,
    accuracy, F1, the gauge fill and its percentage label, and the formula
    captions) is derived from the single confusion matrix `cm` instead of
    being re-typed as rounded literals, so the panels cannot drift out of
    sync if `cm` changes. Rendered values are unchanged (recall is exactly
    0.9). Saves a 300-dpi PNG into OUT_DIR.
    """
    fig = plt.figure(figsize=(16, 8))
    gs = gridspec.GridSpec(1, 3, width_ratios=[1.2, 0.8, 1], wspace=0.35)
    # ── Left: Confusion Matrix ──
    ax_cm = fig.add_subplot(gs[0])
    cm = np.array([[420, 80],   # TN, FP
                   [50, 450]])  # FN, TP
    # Single source of truth: derive all reported metrics from the matrix.
    tn, fp = int(cm[0, 0]), int(cm[0, 1])
    fn, tp = int(cm[1, 0]), int(cm[1, 1])
    total = tn + fp + fn + tp
    recall = tp / (tp + fn)       # 450/500 = 0.90
    precision = tp / (tp + fp)    # 450/530 β‰ˆ 0.849
    accuracy = (tp + tn) / total  # 870/1000 = 0.87
    f1 = 2 * precision * recall / (precision + recall)
    labels_pred = ["Predicted\nBenign (0)", "Predicted\nSensitive (1)"]
    labels_actual = ["Actual\nBenign (0)", "Actual\nSensitive (1)"]
    cell_colors = [
        ["#d4edda", "#f8d7da"],  # TN=green, FP=red-ish
        ["#ffe0b2", "#c8e6c9"],  # FN=orange warn, TP=green
    ]
    # Draw grid (row i=0 is "Actual Benign" and is rendered at the top)
    for i in range(2):
        for j in range(2):
            rect = FancyBboxPatch((j, 1-i), 0.95, 0.9,
                                  boxstyle="round,pad=0.04",
                                  facecolor=cell_colors[i][j],
                                  edgecolor="#adb5bd", linewidth=1.5)
            ax_cm.add_patch(rect)
            val = cm[i, j]
            cell_label = [["TN", "FP"], ["FN", "TP"]][i][j]
            # Highlight FN cell β€” the costly error this system minimizes
            if cell_label == "FN":
                ax_cm.text(j + 0.475, 1.68 - i, f"{val}",
                           fontsize=28, fontweight="bold", ha="center", va="center",
                           color=C_DANGER,
                           path_effects=[pe.withStroke(linewidth=3, foreground="white")])
                ax_cm.text(j + 0.475, 1.28 - i, f"[!] {cell_label}",
                           fontsize=14, fontweight="bold", ha="center", va="center",
                           color=C_DANGER)
            else:
                fc = C_SAFE if cell_label in ["TN", "TP"] else "#e65100"
                ax_cm.text(j + 0.475, 1.68 - i, f"{val}",
                           fontsize=26, fontweight="bold", ha="center", va="center",
                           color=fc)
                ax_cm.text(j + 0.475, 1.28 - i, cell_label,
                           fontsize=13, fontweight="bold", ha="center", va="center",
                           color=fc, alpha=0.7)
    ax_cm.set_xlim(-0.3, 2.2)
    ax_cm.set_ylim(-0.1, 2.3)
    ax_cm.set_xticks([0.475, 1.475])
    ax_cm.set_xticklabels(labels_pred, fontsize=10, fontweight="bold")
    ax_cm.set_yticks([0.45, 1.45])
    # Reversed: row 0 (Actual Benign) is drawn at the top (y β‰ˆ 1.45).
    ax_cm.set_yticklabels(labels_actual[::-1], fontsize=10, fontweight="bold")
    ax_cm.set_title("Confusion Matrix", fontsize=14, fontweight="bold",
                    color=C_PRIMARY, pad=12)
    ax_cm.spines["top"].set_visible(False)
    ax_cm.spines["right"].set_visible(False)
    ax_cm.spines["bottom"].set_visible(False)
    ax_cm.spines["left"].set_visible(False)
    ax_cm.tick_params(length=0)
    # ── Center: Metric Gauges ──
    ax_g = fig.add_subplot(gs[1])
    ax_g.axis("off")
    ax_g.set_xlim(0, 10)
    ax_g.set_ylim(0, 10)
    # (name, value, bar colour, formula caption) β€” all computed above.
    metrics = [
        ("Recall", recall, C_SAFE, f"TP / (TP+FN) = {tp}/{tp + fn}"),
        ("Precision", precision, C_BLUE, f"TP / (TP+FP) = {tp}/{tp + fp}"),
        ("Accuracy", accuracy, C_PURPLE, f"(TP+TN) / Total = {tp + tn}/{total}"),
        ("F1-Score", f1, "#e65100", "2Β·PΒ·R / (P+R)"),
    ]
    ax_g.text(5, 9.7, "Key Metrics", fontsize=14, fontweight="bold",
              ha="center", va="top", color=C_PRIMARY)
    for i, (name, value, color, formula) in enumerate(metrics):
        y = 8.5 - i * 2.2
        # Bar background
        bar_bg = FancyBboxPatch((1.0, y - 0.35), 8.0, 0.7,
                                boxstyle="round,pad=0.06", facecolor=C_LIGHT,
                                edgecolor=C_GRID, linewidth=1)
        ax_g.add_patch(bar_bg)
        # Bar fill β€” width proportional to the metric value
        bar_fill = FancyBboxPatch((1.0, y - 0.35), 8.0 * value, 0.7,
                                  boxstyle="round,pad=0.06", facecolor=color,
                                  edgecolor="white", linewidth=1, alpha=0.85)
        ax_g.add_patch(bar_fill)
        ax_g.text(5.0, y, f"{value:.1%}", fontsize=16, fontweight="bold",
                  ha="center", va="center", color="white",
                  path_effects=[pe.withStroke(linewidth=3, foreground=color)])
        ax_g.text(5.0, y + 0.6, name, fontsize=12, fontweight="bold",
                  ha="center", va="center", color=C_PRIMARY)
        ax_g.text(5.0, y - 0.65, formula, fontsize=7.5, ha="center",
                  va="center", color="#6c757d", fontfamily="monospace")
    # ── Right: Recall Optimization explanation ──
    ax_exp = fig.add_subplot(gs[2])
    ax_exp.axis("off")
    ax_exp.set_xlim(0, 10)
    ax_exp.set_ylim(0, 10)
    ax_exp.text(5, 9.5, "Why Recall is Prioritized", fontsize=14,
                fontweight="bold", ha="center", va="top", color=C_DANGER)
    # Big recall gauge (semicircle track; filled arc length = recall)
    theta = np.linspace(0, np.pi, 100)
    gauge_r = 2.5
    gx = 5 + gauge_r * np.cos(theta)
    gy = 6.5 + gauge_r * np.sin(theta)
    ax_exp.plot(gx, gy, color=C_LIGHT, linewidth=12, solid_capstyle="round")
    # Fill portion proportional to the computed recall (0.90)
    theta_fill = np.linspace(0, np.pi * recall, 100)
    gx_f = 5 + gauge_r * np.cos(theta_fill)
    gy_f = 6.5 + gauge_r * np.sin(theta_fill)
    ax_exp.plot(gx_f, gy_f, color=C_SAFE, linewidth=12, solid_capstyle="round")
    ax_exp.text(5, 7.0, f"{recall:.1%}", fontsize=28, fontweight="bold",
                ha="center", va="center", color=C_SAFE)
    ax_exp.text(5, 6.2, "RECALL", fontsize=12, fontweight="bold",
                ha="center", va="center", color=C_PRIMARY)
    # Explanation bullets
    bullets = [
        "β€’ Missing a sensitive file (FN) is far\n costlier than a false alarm (FP)",
        "β€’ FN = 50 β†’ actively minimized by\n class_weight='balanced' + low C",
        "β€’ Security-first: flag everything\n suspicious, triage later",
        "β€’ Regex fallback catches edge cases\n the ML model may miss",
    ]
    for i, txt in enumerate(bullets):
        y = 4.5 - i * 1.1
        ax_exp.text(0.5, y, txt, fontsize=9, color=C_PRIMARY, va="top",
                    fontweight="bold" if i == 0 else "normal")
    fig.suptitle("Figure 5: Confusion Matrix & Evaluation Metrics (Recall Optimization)",
                 fontsize=16, fontweight="bold", color=C_PRIMARY, y=1.01)
    fig.tight_layout()
    path = os.path.join(OUT_DIR, "figure5_confusion_matrix.png")
    fig.savefig(path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f" βœ“ Saved {path}")
# ════════════════════════════════════════════════════════════════════
# FIGURE 6 β€” Full End-to-End System Architecture Diagram
# ════════════════════════════════════════════════════════════════════
def figure6_architecture():
    """Render Figure 6: end-to-end S3Shastra system architecture.

    Top row: training path (dotted arrows) from keyword pools through the
    synthetic dataset generator, TF-IDF vectorizer and LinearSVC trainer to
    the saved .joblib model. Bottom row: inference path (solid arrows) from
    cloud buckets through the classifier, regex fallback and trigger
    explainer to the dashboard. Saves a 300-dpi PNG into OUT_DIR.
    """
    fig, ax = plt.subplots(figsize=(20, 11))
    ax.set_xlim(0, 20)
    ax.set_ylim(0, 11)
    ax.axis("off")
    fig.patch.set_facecolor("white")
    ax.text(10, 10.7, "Figure 6: End-to-End System Architecture β€” S3Shastra",
            fontsize=18, fontweight="bold", ha="center", va="top", color=C_PRIMARY)
    ax.text(10, 10.25, "Training Path (dotted) Β· Inference Path (solid)",
            fontsize=11, ha="center", va="top", color="#6c757d", style="italic")

    def draw_box(ax, x, y, w, h, label, sublabel="", color=C_BLUE, bg="#eef1ff",
                 fontsize=10, sublabel_size=7.5):
        """Draw a rounded component box with a bold label and optional italic sublabel."""
        rect = FancyBboxPatch((x, y), w, h,
                              boxstyle="round,pad=0.12", facecolor=bg,
                              edgecolor=color, linewidth=2)
        ax.add_patch(rect)
        if sublabel:
            # Two-line layout: label above centre, sublabel below
            ax.text(x + w/2, y + h/2 + 0.15, label, fontsize=fontsize,
                    fontweight="bold", ha="center", va="center", color=color)
            ax.text(x + w/2, y + h/2 - 0.22, sublabel, fontsize=sublabel_size,
                    ha="center", va="center", color="#6c757d", style="italic")
        else:
            ax.text(x + w/2, y + h/2, label, fontsize=fontsize,
                    fontweight="bold", ha="center", va="center", color=color)

    def arrow(ax, x1, y1, x2, y2, color=C_PRIMARY, style="-", lw=2):
        """Draw an arrow from (x1, y1) to (x2, y2); style="dotted" renders dashed."""
        ls = "--" if style == "dotted" else "-"
        ax.annotate("", xy=(x2, y2), xytext=(x1, y1),
                    arrowprops=dict(arrowstyle="-|>", color=color, lw=lw,
                                    linestyle=ls))
    # ═══ TRAINING PATH (top) ═══
    ax.text(1.0, 9.5, "TRAINING PATH", fontsize=12, fontweight="bold",
            color=C_PURPLE, rotation=0,
            bbox=dict(boxstyle="round,pad=0.2", facecolor="#f3e5f5",
                      edgecolor=C_PURPLE, linewidth=1.5))
    # Sensitive Keywords
    draw_box(ax, 0.3, 7.8, 2.8, 1.2, "Sensitive\nKeywords", "91 keywords + variations",
             color=C_DANGER, bg="#fce4ec")
    # Synthetic Generator
    draw_box(ax, 4.0, 7.8, 3.0, 1.2, "Synthetic Data\nGenerator", "build_dataset()",
             color=C_PURPLE, bg="#f3e5f5")
    # Benign Words
    draw_box(ax, 0.3, 6.2, 2.8, 1.1, "Benign Words\nPool", "90 words + extensions",
             color=C_SAFE, bg="#e8f5e9")
    # Arrows to synthetic gen
    arrow(ax, 3.1, 8.4, 4.0, 8.4, color=C_DANGER, style="dotted")
    arrow(ax, 3.1, 6.75, 4.5, 7.8, color=C_SAFE, style="dotted")
    # Explicit Examples
    draw_box(ax, 0.3, 4.8, 2.8, 1.0, "Explicit Examples", "49 sens + 48 benign",
             color=C_GOLD, bg="#fff8e1")
    arrow(ax, 3.1, 5.3, 4.5, 7.8, color=C_GOLD, style="dotted")
    # Dataset
    draw_box(ax, 8.0, 7.8, 2.5, 1.2, "Balanced\nDataset", "~4200 samples",
             color=C_PRIMARY, bg="#e3f2fd")
    arrow(ax, 7.0, 8.4, 8.0, 8.4, color=C_PURPLE, style="dotted")
    # TF-IDF Vectorizer (Training)
    draw_box(ax, 11.3, 7.8, 2.8, 1.2, "TF-IDF\nVectorizer", "char_wb, ngram(3,5)",
             color=C_BLUE, bg="#e8eaf6")
    arrow(ax, 10.5, 8.4, 11.3, 8.4, color=C_PRIMARY, style="dotted")
    # LinearSVC Training
    draw_box(ax, 14.8, 7.8, 2.8, 1.2, "LinearSVC\nTraining", "C=0.5, balanced",
             color=C_DANGER, bg="#fce4ec")
    arrow(ax, 14.1, 8.4, 14.8, 8.4, color=C_PRIMARY, style="dotted")
    # Saved Model
    draw_box(ax, 18.0, 7.8, 1.7, 1.2, "Saved\nModel", ".joblib",
             color="#6c757d", bg="#f5f5f5")
    arrow(ax, 17.6, 8.4, 18.0, 8.4, color=C_PRIMARY, style="dotted")
    # ═══ INFERENCE PATH (bottom) ═══
    ax.text(1.0, 5.4, "INFERENCE PATH", fontsize=12, fontweight="bold",
            color=C_BLUE, rotation=0,
            bbox=dict(boxstyle="round,pad=0.2", facecolor="#e3f2fd",
                      edgecolor=C_BLUE, linewidth=1.5))
    # Cloud Storage Input
    draw_box(ax, 0.3, 3.2, 2.8, 1.2, "Cloud Storage\nBuckets", "S3 / GCS / Azure / …",
             color=C_PRIMARY, bg="#e3f2fd", fontsize=10)
    # Object Lister
    draw_box(ax, 3.8, 3.2, 2.5, 1.2, "Object\nEnumerator", "async HTTP scanner",
             color=C_BLUE, bg="#e8eaf6")
    arrow(ax, 3.1, 3.8, 3.8, 3.8, color=C_PRIMARY)
    # Image Filter
    draw_box(ax, 7.0, 3.2, 2.3, 1.2, "Extension\nFilter", "skip images/media",
             color="#6c757d", bg="#f5f5f5")
    arrow(ax, 6.3, 3.8, 7.0, 3.8, color=C_PRIMARY)
    # ML Classifier
    draw_box(ax, 10.0, 3.2, 2.8, 1.2, "ML Classifier\n(Pipeline)", "TF-IDF β†’ LinearSVC",
             color=C_BLUE, bg="#e8eaf6")
    arrow(ax, 9.3, 3.8, 10.0, 3.8, color=C_PRIMARY)
    # Model load arrow from saved model (training output feeds inference)
    arrow(ax, 18.5, 7.8, 11.4, 4.4, color="#6c757d", style="dotted", lw=1.5)
    ax.text(15.5, 6.2, "load .joblib", fontsize=8, color="#6c757d", style="italic",
            rotation=-15)
    # Regex Fallback
    draw_box(ax, 13.5, 3.2, 2.8, 1.2, "Regex Fallback\nEngine", "keyword matching",
             color=C_GOLD, bg="#fff8e1")
    arrow(ax, 12.8, 3.8, 13.5, 3.8, color=C_PRIMARY)
    # Decision logic
    draw_box(ax, 13.5, 1.4, 2.8, 1.2, "Trigger\nExplainer", "get_trigger_explanation()",
             color="#e65100", bg="#fff3e0")
    arrow(ax, 14.9, 3.2, 14.9, 2.6, color="#e65100")
    # Output
    draw_box(ax, 17.0, 3.2, 2.7, 1.2, "Classification\nOutput", "flag + trigger word",
             color=C_DANGER, bg="#fce4ec", fontsize=10)
    arrow(ax, 16.3, 3.8, 17.0, 3.8, color=C_PRIMARY)
    arrow(ax, 16.3, 1.9, 17.3, 3.2, color="#e65100")
    # WebSocket / Dashboard
    draw_box(ax, 17.0, 1.4, 2.7, 1.2, "Dashboard\n& WebSocket", "real-time results",
             color=C_SAFE, bg="#e8f5e9", fontsize=10)
    arrow(ax, 18.35, 3.2, 18.35, 2.6, color=C_SAFE)
    # ═══ LEGEND ═══
    legend_y = 0.5
    ax.plot([0.5, 1.5], [legend_y, legend_y], color=C_PRIMARY, linewidth=2, linestyle="-")
    ax.text(1.7, legend_y, "Inference (Solid)", fontsize=9, va="center", color=C_PRIMARY)
    ax.plot([4.0, 5.0], [legend_y, legend_y], color=C_PURPLE, linewidth=2, linestyle="--")
    ax.text(5.2, legend_y, "Training (Dotted)", fontsize=9, va="center", color=C_PURPLE)
    # Colour-key patches matching the box background/edge colours used above.
    legend_patches = [
        mpatches.Patch(facecolor="#fce4ec", edgecolor=C_DANGER, label="Sensitive/Classifier"),
        mpatches.Patch(facecolor="#e8f5e9", edgecolor=C_SAFE, label="Benign/Output"),
        mpatches.Patch(facecolor="#e8eaf6", edgecolor=C_BLUE, label="Feature Extraction"),
        mpatches.Patch(facecolor="#fff8e1", edgecolor=C_GOLD, label="Fallback/Explainability"),
    ]
    ax.legend(handles=legend_patches, loc="lower right", fontsize=9,
              frameon=True, fancybox=True, shadow=True, ncol=4,
              bbox_to_anchor=(0.98, -0.02))
    fig.tight_layout(pad=0.5)
    path = os.path.join(OUT_DIR, "figure6_architecture.png")
    fig.savefig(path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f" βœ“ Saved {path}")
# ════════════════════════════════════════════════════════════════════
# MAIN β€” Generate all figures
# ════════════════════════════════════════════════════════════════════
if __name__ == "__main__":
    # Banner line reused for the header and footer framing.
    banner = "=" * 60
    print(banner)
    print(" S3Shastra β€” Research Paper Figure Generator")
    print(banner)
    print(f"\n Output directory: {OUT_DIR}\n")
    # Generate every figure in publication order.
    for make_figure in (
        figure1_conceptual_overview,
        figure2_dataset_balancing,
        figure3_ngram_tfidf,
        figure4_svm_hyperplane,
        figure5_confusion_matrix,
        figure6_architecture,
    ):
        make_figure()
    print(f"\n{banner}")
    print(f" All 6 figures saved to: {OUT_DIR}")
    print(banner)