# s3shastra / generate_figures.py
# Author: Atharv834 β€” commit 6a4dcb6 ("Deploy S3Shastra backend - FastAPI + scanners + ML models")
"""
S3Shastra Research Paper β€” Publication-Quality Figure Generator
===============================================================
Generates 6 figures for the Metadata Profiling / ML Pipeline research paper.
Figures:
1. Conceptual Overview of Metadata Profiling
2. Dataset Balancing Strategy (Bar Chart)
3. Character N-Gram & TF-IDF Vectorization Process
4. 3D SVM Decision Hyperplane Visualization
5. Confusion Matrix & Key Evaluation Metrics (Recall Optimization)
6. Full End-to-End System Architecture Diagram
Usage:
python generate_figures.py
-> Saves all figures as high-res PNGs to ./figures/
"""
import os
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.gridspec as gridspec
from matplotlib.patches import FancyBboxPatch, FancyArrowPatch, ArrowStyle
from matplotlib.colors import LinearSegmentedColormap
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
import matplotlib.patheffects as pe
# Output directory for all generated figures, resolved relative to this script.
OUT_DIR = os.path.join(os.path.dirname(__file__), "figures")
os.makedirs(OUT_DIR, exist_ok=True)

# ── Colour Palette (consistent across all figures) ──────────────────
C_PRIMARY = "#1a1a2e"  # dark navy
C_ACCENT = "#e94560"   # vivid red/pink
C_SAFE = "#0f9b8e"     # teal / benign-green
C_DANGER = "#e94560"   # red / sensitive
C_BLUE = "#4361ee"     # bright blue
C_GOLD = "#f9c74f"     # gold accent
C_PURPLE = "#7209b7"   # purple
C_BG = "#f8f9fa"       # light background
C_GRID = "#dee2e6"     # grid / neutral border colour
C_TEXT = "#212529"     # default text colour
C_LIGHT = "#e9ecef"    # light fill (e.g. gauge backgrounds)

# Global matplotlib defaults shared by every figure in this module.
plt.rcParams.update({
    "font.family": "sans-serif",
    "font.sans-serif": ["Segoe UI", "Helvetica Neue", "Arial", "DejaVu Sans"],
    "axes.facecolor": C_BG,
    "figure.facecolor": "white",
    "axes.edgecolor": C_GRID,
    "axes.grid": False,
    "text.color": C_TEXT,
    "axes.labelcolor": C_TEXT,
    "xtick.color": C_TEXT,
    "ytick.color": C_TEXT,
})
# ════════════════════════════════════════════════════════════════════
# FIGURE 1 β€” Conceptual Overview of Metadata Profiling
# ════════════════════════════════════════════════════════════════════
def figure1_conceptual_overview():
    """Render Figure 1: conceptual overview of metadata profiling.

    Layout (data coordinates on a 16x7 canvas):
      left   β€” a column of example filenames, outlined by sensitivity colour,
      centre β€” the 4-step ML pipeline box,
      right  β€” the two classification outcomes (BENIGN / SENSITIVE).
    Saves a 300-dpi PNG into OUT_DIR.
    """
    fig, ax = plt.subplots(figsize=(16, 7))
    ax.set_xlim(0, 16)
    ax.set_ylim(0, 7)
    ax.axis("off")
    fig.patch.set_facecolor("white")
    # ── Title ──
    ax.text(8, 6.6, "Conceptual Overview of Metadata Profiling",
            fontsize=20, fontweight="bold", ha="center", va="top", color=C_PRIMARY)
    ax.text(8, 6.15, "Binary classification of cloud-object filenames via character-level ML",
            fontsize=11, ha="center", va="top", color="#6c757d", style="italic")
    # ── Input filenames (left side) ──
    # (filename, outline colour) β€” red marks sensitive, teal marks benign.
    input_files = [
        ("credentials.bak", C_DANGER),
        ("api_secret.env", C_DANGER),
        ("readme.md", C_SAFE),
        ("passport_scan.pdf", C_DANGER),
        ("logo.png", C_SAFE),
        ("ssh_private.key", C_DANGER),
        ("index.html", C_SAFE),
    ]
    box_x, box_w, box_h = 0.3, 3.0, 0.55
    y_start = 5.3
    ax.text(1.8, 5.75, "Incoming Filenames", fontsize=12, fontweight="bold",
            ha="center", va="center", color=C_PRIMARY)
    ax.text(1.8, 5.48, "(unpredictable, mixed)", fontsize=8.5, ha="center",
            va="center", color="#6c757d")
    for i, (name, col) in enumerate(input_files):
        y = y_start - i * 0.65  # stack file boxes downwards, one per row
        rect = FancyBboxPatch((box_x, y - box_h/2), box_w, box_h,
                              boxstyle="round,pad=0.08", facecolor="white",
                              edgecolor=col, linewidth=1.6)
        ax.add_patch(rect)
        ax.text(box_x + box_w/2, y, name, fontsize=9.5, ha="center", va="center",
                fontfamily="monospace", color=col, fontweight="bold")
    # ── Big arrow ── (input column β†’ pipeline box)
    ax.annotate("", xy=(5.0, 3.3), xytext=(3.6, 3.3),
                arrowprops=dict(arrowstyle="-|>", color=C_PRIMARY, lw=2.5))
    # ── ML Pipeline box (center) ──
    ml_box = FancyBboxPatch((5.0, 1.6), 6.0, 3.8,
                            boxstyle="round,pad=0.2", facecolor="#eef1ff",
                            edgecolor=C_BLUE, linewidth=2.5)
    ax.add_patch(ml_box)
    ax.text(8.0, 5.05, "ML Pipeline β€” Metadata Profiling",
            fontsize=13, fontweight="bold", ha="center", va="center", color=C_BLUE)
    # (stage label, y-position) for the four pipeline stages, top to bottom.
    steps = [
        ("[1] Char N-Gram\n Tokenization", 4.35),
        ("[2] TF-IDF\n Vectorization", 3.55),
        ("[3] LinearSVC\n Classification", 2.75),
        ("[4] Regex Fallback\n Explainability", 1.95),
    ]
    for label, y in steps:
        inner = FancyBboxPatch((5.55, y - 0.28), 4.9, 0.6,
                               boxstyle="round,pad=0.06", facecolor="white",
                               edgecolor=C_BLUE, linewidth=1.2, alpha=0.9)
        ax.add_patch(inner)
        ax.text(8.0, y + 0.02, label, fontsize=9.5, ha="center", va="center",
                color=C_PRIMARY, fontweight="bold")
    # ── Output arrows ── (pipeline box β†’ the two outcome boxes)
    ax.annotate("", xy=(12.8, 4.3), xytext=(11.1, 4.0),
                arrowprops=dict(arrowstyle="-|>", color=C_SAFE, lw=2.5))
    ax.annotate("", xy=(12.8, 2.3), xytext=(11.1, 2.8),
                arrowprops=dict(arrowstyle="-|>", color=C_DANGER, lw=2.5))
    # ── Benign output ──
    benign_box = FancyBboxPatch((12.8, 3.7), 2.8, 1.2,
                                boxstyle="round,pad=0.15", facecolor="#d4edda",
                                edgecolor=C_SAFE, linewidth=2.2)
    ax.add_patch(benign_box)
    ax.text(14.2, 4.6, "BENIGN", fontsize=14, fontweight="bold",
            ha="center", va="center", color=C_SAFE)
    ax.text(14.2, 4.18, "Flag = 0", fontsize=12, ha="center", va="center",
            color=C_SAFE, fontfamily="monospace")
    ax.text(14.2, 3.88, "readme.md, logo.png …", fontsize=8, ha="center",
            va="center", color="#495057", style="italic")
    # ── Sensitive output ──
    sens_box = FancyBboxPatch((12.8, 1.6), 2.8, 1.2,
                              boxstyle="round,pad=0.15", facecolor="#f8d7da",
                              edgecolor=C_DANGER, linewidth=2.2)
    ax.add_patch(sens_box)
    ax.text(14.2, 2.5, "SENSITIVE", fontsize=14, fontweight="bold",
            ha="center", va="center", color=C_DANGER)
    ax.text(14.2, 2.08, "Flag = 1", fontsize=12, ha="center", va="center",
            color=C_DANGER, fontfamily="monospace")
    ax.text(14.2, 1.78, "credentials.bak, ssh_key …", fontsize=8, ha="center",
            va="center", color="#495057", style="italic")
    # ── Bottom caption ──
    ax.text(8, 0.8, "The pipeline ingests arbitrary cloud-object filenames, extracts character-level features,\n"
            "and outputs a binary sensitivity flag with an explainable trigger keyword.",
            fontsize=9.5, ha="center", va="center", color="#6c757d",
            bbox=dict(boxstyle="round,pad=0.3", facecolor="white", edgecolor=C_GRID, linewidth=1))
    fig.tight_layout(pad=0.5)
    path = os.path.join(OUT_DIR, "figure1_conceptual_overview.png")
    fig.savefig(path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f" βœ“ Saved {path}")
# ════════════════════════════════════════════════════════════════════
# FIGURE 2 β€” Dataset Balancing Strategy (Bar Chart)
# ════════════════════════════════════════════════════════════════════
def figure2_dataset_balancing():
    """Render Figure 2: dataset balancing strategy.

    Left panel: grouped bars comparing the original class distribution with
    the post-balancing distribution. Right panel: numbered breakdown of the
    six augmentation/balancing strategies. Saves a 300-dpi PNG into OUT_DIR.
    """
    fig = plt.figure(figsize=(15, 7))
    gs = gridspec.GridSpec(1, 2, width_ratios=[1.1, 1], wspace=0.35)
    # ── Left: Bar chart ──
    ax1 = fig.add_subplot(gs[0])
    categories = ["Sensitive\n(Label=1)", "Benign\n(Label=0)"]
    # Simulated counts matching actual dataset generation logic
    # SENSITIVE: 49 examples + ~91 keywords * (1 base + 6 ext * 2 variations) = 49 + 91*13 = ~1232
    # BENIGN: 48 examples + ~90 benign_words * (1 base + 15 ext * 2 variations) = 48 + 90*31 = ~2838
    original_sens = 1232
    original_benign = 2838
    # After balancing via class_weight='balanced' + synthetic augmentation
    balanced_sens = 2100
    balanced_benign = 2100
    x = np.arange(len(categories))
    w = 0.32  # bar width (two bars per category)
    bars1 = ax1.bar(x - w/2, [original_sens, original_benign], w,
                    label="Original Distribution", color=[C_DANGER, C_SAFE],
                    edgecolor="white", linewidth=1.5, alpha=0.5)
    bars2 = ax1.bar(x + w/2, [balanced_sens, balanced_benign], w,
                    label="After Balancing", color=[C_DANGER, C_SAFE],
                    edgecolor="white", linewidth=1.5, alpha=1.0,
                    hatch="//")
    # Sample-count labels above each bar.
    for bar, val in zip(bars1, [original_sens, original_benign]):
        ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 40,
                 f"{val:,}", ha="center", va="bottom", fontsize=10, fontweight="bold",
                 color="#6c757d")
    for bar, val in zip(bars2, [balanced_sens, balanced_benign]):
        ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 40,
                 f"{val:,}", ha="center", va="bottom", fontsize=10, fontweight="bold",
                 color=C_PRIMARY)
    ax1.set_ylabel("Number of Samples", fontsize=12, fontweight="bold")
    ax1.set_title("Original vs. Balanced Distribution", fontsize=14, fontweight="bold",
                  color=C_PRIMARY, pad=12)
    ax1.set_xticks(x)
    ax1.set_xticklabels(categories, fontsize=12)
    ax1.legend(fontsize=10, frameon=True, fancybox=True, shadow=True)
    ax1.set_ylim(0, 3400)
    ax1.spines["top"].set_visible(False)
    ax1.spines["right"].set_visible(False)
    ax1.yaxis.grid(True, alpha=0.3, linestyle="--")
    # ── Right: Augmentation strategy breakdown ──
    ax2 = fig.add_subplot(gs[1])
    ax2.axis("off")
    ax2.set_xlim(0, 10)
    ax2.set_ylim(0, 10)
    ax2.text(5, 9.5, "Synthetic Augmentation Strategies", fontsize=14,
             fontweight="bold", ha="center", va="top", color=C_PRIMARY)
    # (badge number, title, description, badge colour) per strategy row.
    strategies = [
        ("1", "Base Keywords", "91 sensitive keywords added directly\nas standalone samples", C_DANGER),
        ("2", "Extension Variants", "Each keyword Γ— 6 common extensions\n(.txt, .json, .csv, .xml, .yaml, .log)", C_PURPLE),
        ("3", "Suffix Mutations", "keyword_temp{ext} variations to\nsimulate real naming conventions", C_BLUE),
        ("4", "Explicit Examples", "49 hand-crafted sensitive +\n48 benign real-world filenames", C_GOLD),
        ("5", "Benign Dilution", "90 benign words Γ— 15 extensions Γ— 2\nvariants to prevent extension bias", C_SAFE),
        ("6", "class_weight", "LinearSVC class_weight='balanced'\nautomatically adjusts decision boundary", "#6c757d"),
    ]
    for i, (num, title, desc, color) in enumerate(strategies):
        y = 8.5 - i * 1.4  # vertical spacing between strategy rows
        circle = plt.Circle((1.0, y), 0.38, facecolor=color, edgecolor="white",
                            linewidth=2, zorder=5)
        ax2.add_patch(circle)
        ax2.text(1.0, y, num, fontsize=13, fontweight="bold", ha="center",
                 va="center", color="white", zorder=6)
        ax2.text(2.0, y + 0.18, title, fontsize=11, fontweight="bold",
                 va="center", color=C_PRIMARY)
        ax2.text(2.0, y - 0.25, desc, fontsize=8.5, va="center", color="#6c757d")
    fig.suptitle("Figure 2: Dataset Balancing Strategy", fontsize=16, fontweight="bold",
                 color=C_PRIMARY, y=1.01)
    fig.tight_layout()
    path = os.path.join(OUT_DIR, "figure2_dataset_balancing.png")
    fig.savefig(path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f" βœ“ Saved {path}")
# ════════════════════════════════════════════════════════════════════
# FIGURE 3 β€” Character N-Gram & TF-IDF Vectorization Process
# ════════════════════════════════════════════════════════════════════
def figure3_ngram_tfidf():
    """Render Figure 3: character n-gram extraction and TF-IDF vectorization.

    Three panels: (1) sliding-window n-gram extraction over an example
    filename, (2) an illustrative n-gram frequency histogram, (3) a small
    hand-filled TF-IDF weight matrix with per-row sensitivity markers.
    All numbers are illustrative demo values. Saves a 300-dpi PNG.
    """
    fig = plt.figure(figsize=(16, 9))
    gs = gridspec.GridSpec(2, 2, height_ratios=[1, 1.2], hspace=0.4, wspace=0.35)
    # ── Top-Left: Sliding window illustration ──
    ax_slide = fig.add_subplot(gs[0, 0])
    ax_slide.axis("off")
    ax_slide.set_xlim(0, 12)
    ax_slide.set_ylim(0, 6)
    ax_slide.set_title("Step 1: Character N-Gram Extraction", fontsize=13,
                       fontweight="bold", color=C_PRIMARY, pad=10)
    word = "credentials.bak"  # example filename driving all panels
    # Draw each character in a box
    char_w = 0.65
    x_start = 0.5
    y_word = 4.8
    for i, ch in enumerate(word):
        rect = FancyBboxPatch((x_start + i * char_w, y_word - 0.3), char_w - 0.05, 0.6,
                              boxstyle="round,pad=0.03", facecolor="#e8eaf6",
                              edgecolor=C_BLUE, linewidth=1.2)
        ax_slide.add_patch(rect)
        ax_slide.text(x_start + i * char_w + char_w/2 - 0.025, y_word,
                      ch, fontsize=11, ha="center", va="center",
                      fontfamily="monospace", fontweight="bold", color=C_PRIMARY)
    ax_slide.text(x_start + len(word) * char_w / 2, 5.5,
                  '"credentials.bak"', fontsize=11, ha="center", va="center",
                  fontfamily="monospace", color=C_BLUE, fontweight="bold")
    # Show sliding windows (3-grams); extension-related grams highlighted red.
    trigrams = ["cre", "red", "ede", "den", "ent", "nti", "tia", "ial", "als", "ls.", "s.b", ".ba", "bak"]
    colors_3 = [C_DANGER if g in ["als", "ls.", "s.b", ".ba", "bak"] else C_BLUE for g in trigrams]
    ax_slide.text(0.5, 3.8, "3-grams (sliding window):", fontsize=9.5,
                  fontweight="bold", color=C_PRIMARY)
    for i, (tg, col) in enumerate(zip(trigrams, colors_3)):
        row, col_idx = divmod(i, 7)  # wrap after 7 grams per row
        bx = 0.5 + col_idx * 1.5
        by = 3.2 - row * 0.7
        rect = FancyBboxPatch((bx, by - 0.22), 1.3, 0.44,
                              boxstyle="round,pad=0.04", facecolor="white",
                              edgecolor=col, linewidth=1.3)
        ax_slide.add_patch(rect)
        ax_slide.text(bx + 0.65, by, tg, fontsize=9.5, ha="center", va="center",
                      fontfamily="monospace", fontweight="bold", color=col)
    # Show 4-grams and 5-grams label (illustrative subsets, not exhaustive)
    ax_slide.text(0.5, 1.6, "4-grams: ", fontsize=9, fontweight="bold", color=C_PRIMARY)
    fourgrams = ["cred", "rede", "eden", "s.ba", ".bak"]
    for i, fg in enumerate(fourgrams):
        bx = 2.5 + i * 1.7
        rect = FancyBboxPatch((bx, 1.38), 1.5, 0.44,
                              boxstyle="round,pad=0.04", facecolor="white",
                              edgecolor=C_PURPLE, linewidth=1.2)
        ax_slide.add_patch(rect)
        ax_slide.text(bx + 0.75, 1.6, fg, fontsize=9, ha="center", va="center",
                      fontfamily="monospace", fontweight="bold", color=C_PURPLE)
    ax_slide.text(0.5, 0.7, "5-grams: ", fontsize=9, fontweight="bold", color=C_PRIMARY)
    fivegrams = ["crede", "reden", "ls.ba", "s.bak"]
    for i, fg in enumerate(fivegrams):
        bx = 2.5 + i * 1.9
        rect = FancyBboxPatch((bx, 0.48), 1.7, 0.44,
                              boxstyle="round,pad=0.04", facecolor="white",
                              edgecolor=C_GOLD, linewidth=1.2)
        ax_slide.add_patch(rect)
        ax_slide.text(bx + 0.85, 0.7, fg, fontsize=9, ha="center", va="center",
                      fontfamily="monospace", fontweight="bold", color="#b8860b")
    # ── Top-Right: N-gram frequency histogram ──
    ax_hist = fig.add_subplot(gs[0, 1])
    top_grams = ["bak", ".ba", "s.b", "cre", "ent", "den", "ial", "tia", "als", "red"]
    freqs = [3, 3, 2, 2, 2, 2, 1, 1, 1, 1]  # illustrative corpus counts
    gram_colors = [C_DANGER if g in ["bak", ".ba", "s.b"] else C_BLUE for g in top_grams]
    bars = ax_hist.barh(range(len(top_grams)), freqs, color=gram_colors,
                        edgecolor="white", linewidth=1.2, height=0.65)
    ax_hist.set_yticks(range(len(top_grams)))
    ax_hist.set_yticklabels([f'"{g}"' for g in top_grams], fontfamily="monospace", fontsize=10)
    ax_hist.set_xlabel("Frequency (across corpus)", fontsize=11, fontweight="bold")
    ax_hist.set_title("Step 2: N-Gram Frequency Distribution", fontsize=13,
                      fontweight="bold", color=C_PRIMARY, pad=10)
    ax_hist.invert_yaxis()  # most frequent gram on top
    ax_hist.spines["top"].set_visible(False)
    ax_hist.spines["right"].set_visible(False)
    for bar, f in zip(bars, freqs):
        ax_hist.text(bar.get_width() + 0.05, bar.get_y() + bar.get_height()/2,
                     str(f), va="center", fontsize=10, fontweight="bold", color="#6c757d")
    # ── Bottom: TF-IDF weight heatmap ──
    ax_heat = fig.add_subplot(gs[1, :])
    filenames_demo = ["credentials.bak", "api_secret.env", "readme.md", "logo.png", "ssh_key.pem"]
    features_demo = ["cre", "bak", ".ba", "api", "sec", "rea", "adm", "log", "ssh", "key", ".pe", "png"]
    np.random.seed(42)
    # One row per demo filename; weights are hand-filled illustrative values.
    weights = np.zeros((len(filenames_demo), len(features_demo)))
    # credentials.bak
    weights[0] = [0.42, 0.61, 0.55, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    # api_secret.env
    weights[1] = [0.0, 0.0, 0.0, 0.48, 0.53, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    # readme.md
    weights[2] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.39, 0.31, 0.0, 0.0, 0.0, 0.0, 0.0]
    # logo.png
    weights[3] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.35, 0.0, 0.0, 0.0, 0.28]
    # ssh_key.pem
    weights[4] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.57, 0.49, 0.44, 0.0]
    cmap = LinearSegmentedColormap.from_list("custom", ["#ffffff", "#c5cae9", C_BLUE, C_PRIMARY])
    im = ax_heat.imshow(weights, cmap=cmap, aspect="auto", vmin=0, vmax=0.65)
    ax_heat.set_xticks(range(len(features_demo)))
    ax_heat.set_xticklabels([f'"{f}"' for f in features_demo], fontfamily="monospace",
                            fontsize=10, rotation=35, ha="right")
    ax_heat.set_yticks(range(len(filenames_demo)))
    ax_heat.set_yticklabels(filenames_demo, fontfamily="monospace", fontsize=10.5)
    ax_heat.set_title("Step 3: TF-IDF Weight Matrix (Character N-Gram Features β†’ Numerical Vectors)",
                      fontsize=13, fontweight="bold", color=C_PRIMARY, pad=12)
    # Annotate cells (skip zero cells; white text on dark cells for contrast)
    for i in range(len(filenames_demo)):
        for j in range(len(features_demo)):
            val = weights[i, j]
            if val > 0.01:
                text_col = "white" if val > 0.35 else C_PRIMARY
                ax_heat.text(j, i, f"{val:.2f}", ha="center", va="center",
                             fontsize=9.5, fontweight="bold", color=text_col)
    cbar = fig.colorbar(im, ax=ax_heat, fraction=0.02, pad=0.02)
    cbar.set_label("TF-IDF Weight", fontsize=11, fontweight="bold")
    # Row labels for sensitivity
    for i, fn in enumerate(filenames_demo):
        is_sens = fn in ["credentials.bak", "api_secret.env", "ssh_key.pem"]
        marker_col = C_DANGER if is_sens else C_SAFE
        label = "SENS" if is_sens else "SAFE"
        ax_heat.text(len(features_demo) + 0.3, i, f" ← {label}",
                     fontsize=9, fontweight="bold", color=marker_col, va="center")
    fig.suptitle("Figure 3: Character N-Gram & TF-IDF Vectorization Process",
                 fontsize=16, fontweight="bold", color=C_PRIMARY, y=1.01)
    fig.tight_layout()
    path = os.path.join(OUT_DIR, "figure3_ngram_tfidf.png")
    fig.savefig(path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f" βœ“ Saved {path}")
# ════════════════════════════════════════════════════════════════════
# FIGURE 4 β€” 3D SVM Decision Hyperplane Visualization
# ════════════════════════════════════════════════════════════════════
def figure4_svm_hyperplane():
    """Render Figure 4: illustrative 3D view of the SVM decision hyperplane.

    Plots two synthetic Gaussian clusters (benign vs sensitive), a separating
    plane with wireframe margin planes, and diamond-marked support vectors.
    Fix vs. original: the legend proxy for "Support Vectors" now uses the same
    diamond marker ("D") as the plotted points and the "β—†" annotation text;
    it previously showed a square ("s"). Saves a 300-dpi PNG into OUT_DIR.
    """
    fig = plt.figure(figsize=(14, 9))
    ax = fig.add_subplot(111, projection="3d")
    np.random.seed(42)  # reproducible synthetic clusters
    # Benign cluster
    n_b = 60
    b_x = np.random.normal(2.5, 0.9, n_b)
    b_y = np.random.normal(2.0, 0.8, n_b)
    b_z = np.random.normal(1.5, 0.7, n_b)
    # Sensitive cluster
    n_s = 60
    s_x = np.random.normal(6.0, 0.9, n_s)
    s_y = np.random.normal(5.5, 0.8, n_s)
    s_z = np.random.normal(5.0, 0.7, n_s)
    # Plot benign
    ax.scatter(b_x, b_y, b_z, c=C_BLUE, s=50, alpha=0.7, edgecolors="white",
               linewidth=0.5, label="Benign (Flag=0)", depthshade=True)
    # Plot sensitive
    ax.scatter(s_x, s_y, s_z, c=C_DANGER, s=50, alpha=0.7, edgecolors="white",
               linewidth=0.5, label="Sensitive (Flag=1)", depthshade=True)
    # Decision hyperplane (tilted plane between clusters)
    xx, yy = np.meshgrid(np.linspace(0, 8, 20), np.linspace(0, 8, 20))
    # z = a*x + b*y + c defining a separating plane
    zz = 0.5 * xx + 0.3 * yy - 0.5
    ax.plot_surface(xx, yy, zz, alpha=0.18, color="#2ecc71", edgecolor="#27ae60",
                    linewidth=0.3, shade=True)
    # Margin planes (dashed effect via wireframe), offset Β±1 from the plane
    zz_upper = zz + 1.0
    zz_lower = zz - 1.0
    ax.plot_wireframe(xx, yy, zz_upper, alpha=0.08, color="#27ae60", linewidth=0.3,
                      rstride=5, cstride=5)
    ax.plot_wireframe(xx, yy, zz_lower, alpha=0.08, color="#27ae60", linewidth=0.3,
                      rstride=5, cstride=5)
    # Support vectors (highlighted points on margin), drawn as diamonds ("D")
    sv_b = [(3.5, 3.0, 2.5), (3.8, 2.8, 2.2), (4.0, 3.2, 2.8)]
    sv_s = [(5.0, 4.5, 4.0), (4.8, 4.2, 3.8), (5.2, 4.8, 4.5)]
    for pt in sv_b:
        ax.scatter(*pt, c=C_BLUE, s=150, edgecolors="black", linewidth=1.8,
                   zorder=10, marker="D")
    for pt in sv_s:
        ax.scatter(*pt, c=C_DANGER, s=150, edgecolors="black", linewidth=1.8,
                   zorder=10, marker="D")
    ax.set_xlabel("\nTF-IDF Feature Dim 1\n(credential-related n-grams)", fontsize=10,
                  labelpad=8)
    ax.set_ylabel("\nTF-IDF Feature Dim 2\n(extension-related n-grams)", fontsize=10,
                  labelpad=8)
    ax.set_zlabel("\nTF-IDF Feature Dim 3\n(config/key n-grams)", fontsize=10,
                  labelpad=8)
    ax.set_title("Figure 4: SVM Decision Hyperplane in Feature Space\n"
                 "LinearSVC with C=0.5 (increased regularization), class_weight='balanced'",
                 fontsize=14, fontweight="bold", color=C_PRIMARY, pad=20)
    # Legend via proxy artists; the support-vector proxy uses marker "D"
    # to match the diamonds actually plotted above (was "s" β€” a square).
    legend_elements = [
        plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=C_BLUE,
                   markersize=10, label="Benign (Flag=0)"),
        plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=C_DANGER,
                   markersize=10, label="Sensitive (Flag=1)"),
        mpatches.Patch(facecolor="#2ecc71", alpha=0.3, label="Decision Hyperplane"),
        plt.Line2D([0], [0], marker="D", color="w", markerfacecolor="gray",
                   markeredgecolor="black", markersize=10, label="Support Vectors"),
    ]
    ax.legend(handles=legend_elements, loc="upper left", fontsize=10,
              frameon=True, fancybox=True, shadow=True)
    ax.view_init(elev=22, azim=135)
    ax.set_xlim(0, 8)
    ax.set_ylim(0, 8)
    ax.set_zlim(0, 8)
    # Annotation box
    ax.text2D(0.73, 0.08,
              "Low C = 0.5 β†’ wider margin\n"
              "β†’ better generalization\n"
              "β—† = Support Vectors on margin",
              transform=ax.transAxes, fontsize=9.5,
              bbox=dict(boxstyle="round,pad=0.4", facecolor="#fff9c4",
                        edgecolor=C_GOLD, linewidth=1.5),
              color=C_PRIMARY, fontweight="bold")
    path = os.path.join(OUT_DIR, "figure4_svm_hyperplane.png")
    fig.savefig(path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f" βœ“ Saved {path}")
# ════════════════════════════════════════════════════════════════════
# FIGURE 5 β€” Confusion Matrix & Key Evaluation Metrics
# ════════════════════════════════════════════════════════════════════
def figure5_confusion_matrix():
    """Render Figure 5: confusion matrix, metric gauges, and recall rationale.

    Improvement vs. original: every displayed metric (recall, precision,
    accuracy, F1, the gauge fill and its percentage label, and the formula
    captions) is derived from the single confusion matrix `cm` instead of
    being re-typed as rounded literals, so the panels cannot drift out of
    sync if `cm` changes. Rendered values are unchanged (recall is exactly
    0.9). Saves a 300-dpi PNG into OUT_DIR.
    """
    fig = plt.figure(figsize=(16, 8))
    gs = gridspec.GridSpec(1, 3, width_ratios=[1.2, 0.8, 1], wspace=0.35)
    # ── Left: Confusion Matrix ──
    ax_cm = fig.add_subplot(gs[0])
    cm = np.array([[420, 80],   # TN, FP
                   [50, 450]])  # FN, TP
    # Single source of truth: derive all reported metrics from the matrix.
    tn, fp = int(cm[0, 0]), int(cm[0, 1])
    fn, tp = int(cm[1, 0]), int(cm[1, 1])
    total = tn + fp + fn + tp
    recall = tp / (tp + fn)       # 450/500 = 0.90
    precision = tp / (tp + fp)    # 450/530 β‰ˆ 0.849
    accuracy = (tp + tn) / total  # 870/1000 = 0.87
    f1 = 2 * precision * recall / (precision + recall)
    labels_pred = ["Predicted\nBenign (0)", "Predicted\nSensitive (1)"]
    labels_actual = ["Actual\nBenign (0)", "Actual\nSensitive (1)"]
    cell_colors = [
        ["#d4edda", "#f8d7da"],  # TN=green, FP=red-ish
        ["#ffe0b2", "#c8e6c9"],  # FN=orange warn, TP=green
    ]
    # Draw grid (row i=0 is "Actual Benign" and is rendered at the top)
    for i in range(2):
        for j in range(2):
            rect = FancyBboxPatch((j, 1-i), 0.95, 0.9,
                                  boxstyle="round,pad=0.04",
                                  facecolor=cell_colors[i][j],
                                  edgecolor="#adb5bd", linewidth=1.5)
            ax_cm.add_patch(rect)
            val = cm[i, j]
            cell_label = [["TN", "FP"], ["FN", "TP"]][i][j]
            # Highlight FN cell β€” the costly error this system minimizes
            if cell_label == "FN":
                ax_cm.text(j + 0.475, 1.68 - i, f"{val}",
                           fontsize=28, fontweight="bold", ha="center", va="center",
                           color=C_DANGER,
                           path_effects=[pe.withStroke(linewidth=3, foreground="white")])
                ax_cm.text(j + 0.475, 1.28 - i, f"[!] {cell_label}",
                           fontsize=14, fontweight="bold", ha="center", va="center",
                           color=C_DANGER)
            else:
                fc = C_SAFE if cell_label in ["TN", "TP"] else "#e65100"
                ax_cm.text(j + 0.475, 1.68 - i, f"{val}",
                           fontsize=26, fontweight="bold", ha="center", va="center",
                           color=fc)
                ax_cm.text(j + 0.475, 1.28 - i, cell_label,
                           fontsize=13, fontweight="bold", ha="center", va="center",
                           color=fc, alpha=0.7)
    ax_cm.set_xlim(-0.3, 2.2)
    ax_cm.set_ylim(-0.1, 2.3)
    ax_cm.set_xticks([0.475, 1.475])
    ax_cm.set_xticklabels(labels_pred, fontsize=10, fontweight="bold")
    ax_cm.set_yticks([0.45, 1.45])
    # Reversed: row 0 (Actual Benign) is drawn at the top (y β‰ˆ 1.45).
    ax_cm.set_yticklabels(labels_actual[::-1], fontsize=10, fontweight="bold")
    ax_cm.set_title("Confusion Matrix", fontsize=14, fontweight="bold",
                    color=C_PRIMARY, pad=12)
    ax_cm.spines["top"].set_visible(False)
    ax_cm.spines["right"].set_visible(False)
    ax_cm.spines["bottom"].set_visible(False)
    ax_cm.spines["left"].set_visible(False)
    ax_cm.tick_params(length=0)
    # ── Center: Metric Gauges ──
    ax_g = fig.add_subplot(gs[1])
    ax_g.axis("off")
    ax_g.set_xlim(0, 10)
    ax_g.set_ylim(0, 10)
    # (name, value, bar colour, formula caption) β€” all computed above.
    metrics = [
        ("Recall", recall, C_SAFE, f"TP / (TP+FN) = {tp}/{tp + fn}"),
        ("Precision", precision, C_BLUE, f"TP / (TP+FP) = {tp}/{tp + fp}"),
        ("Accuracy", accuracy, C_PURPLE, f"(TP+TN) / Total = {tp + tn}/{total}"),
        ("F1-Score", f1, "#e65100", "2Β·PΒ·R / (P+R)"),
    ]
    ax_g.text(5, 9.7, "Key Metrics", fontsize=14, fontweight="bold",
              ha="center", va="top", color=C_PRIMARY)
    for i, (name, value, color, formula) in enumerate(metrics):
        y = 8.5 - i * 2.2
        # Bar background
        bar_bg = FancyBboxPatch((1.0, y - 0.35), 8.0, 0.7,
                                boxstyle="round,pad=0.06", facecolor=C_LIGHT,
                                edgecolor=C_GRID, linewidth=1)
        ax_g.add_patch(bar_bg)
        # Bar fill β€” width proportional to the metric value
        bar_fill = FancyBboxPatch((1.0, y - 0.35), 8.0 * value, 0.7,
                                  boxstyle="round,pad=0.06", facecolor=color,
                                  edgecolor="white", linewidth=1, alpha=0.85)
        ax_g.add_patch(bar_fill)
        ax_g.text(5.0, y, f"{value:.1%}", fontsize=16, fontweight="bold",
                  ha="center", va="center", color="white",
                  path_effects=[pe.withStroke(linewidth=3, foreground=color)])
        ax_g.text(5.0, y + 0.6, name, fontsize=12, fontweight="bold",
                  ha="center", va="center", color=C_PRIMARY)
        ax_g.text(5.0, y - 0.65, formula, fontsize=7.5, ha="center",
                  va="center", color="#6c757d", fontfamily="monospace")
    # ── Right: Recall Optimization explanation ──
    ax_exp = fig.add_subplot(gs[2])
    ax_exp.axis("off")
    ax_exp.set_xlim(0, 10)
    ax_exp.set_ylim(0, 10)
    ax_exp.text(5, 9.5, "Why Recall is Prioritized", fontsize=14,
                fontweight="bold", ha="center", va="top", color=C_DANGER)
    # Big recall gauge (semicircle track; filled arc length = recall)
    theta = np.linspace(0, np.pi, 100)
    gauge_r = 2.5
    gx = 5 + gauge_r * np.cos(theta)
    gy = 6.5 + gauge_r * np.sin(theta)
    ax_exp.plot(gx, gy, color=C_LIGHT, linewidth=12, solid_capstyle="round")
    # Fill portion proportional to the computed recall (0.90)
    theta_fill = np.linspace(0, np.pi * recall, 100)
    gx_f = 5 + gauge_r * np.cos(theta_fill)
    gy_f = 6.5 + gauge_r * np.sin(theta_fill)
    ax_exp.plot(gx_f, gy_f, color=C_SAFE, linewidth=12, solid_capstyle="round")
    ax_exp.text(5, 7.0, f"{recall:.1%}", fontsize=28, fontweight="bold",
                ha="center", va="center", color=C_SAFE)
    ax_exp.text(5, 6.2, "RECALL", fontsize=12, fontweight="bold",
                ha="center", va="center", color=C_PRIMARY)
    # Explanation bullets
    bullets = [
        "β€’ Missing a sensitive file (FN) is far\n costlier than a false alarm (FP)",
        "β€’ FN = 50 β†’ actively minimized by\n class_weight='balanced' + low C",
        "β€’ Security-first: flag everything\n suspicious, triage later",
        "β€’ Regex fallback catches edge cases\n the ML model may miss",
    ]
    for i, txt in enumerate(bullets):
        y = 4.5 - i * 1.1
        ax_exp.text(0.5, y, txt, fontsize=9, color=C_PRIMARY, va="top",
                    fontweight="bold" if i == 0 else "normal")
    fig.suptitle("Figure 5: Confusion Matrix & Evaluation Metrics (Recall Optimization)",
                 fontsize=16, fontweight="bold", color=C_PRIMARY, y=1.01)
    fig.tight_layout()
    path = os.path.join(OUT_DIR, "figure5_confusion_matrix.png")
    fig.savefig(path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f" βœ“ Saved {path}")
# ════════════════════════════════════════════════════════════════════
# FIGURE 6 β€” Full End-to-End System Architecture Diagram
# ════════════════════════════════════════════════════════════════════
def figure6_architecture():
    """Render Figure 6: end-to-end S3Shastra system architecture.

    Top row: training path (dotted arrows) from keyword pools through the
    synthetic dataset generator, TF-IDF vectorizer and LinearSVC trainer to
    the saved .joblib model. Bottom row: inference path (solid arrows) from
    cloud buckets through the classifier, regex fallback and trigger
    explainer to the dashboard. Saves a 300-dpi PNG into OUT_DIR.
    """
    fig, ax = plt.subplots(figsize=(20, 11))
    ax.set_xlim(0, 20)
    ax.set_ylim(0, 11)
    ax.axis("off")
    fig.patch.set_facecolor("white")
    ax.text(10, 10.7, "Figure 6: End-to-End System Architecture β€” S3Shastra",
            fontsize=18, fontweight="bold", ha="center", va="top", color=C_PRIMARY)
    ax.text(10, 10.25, "Training Path (dotted) Β· Inference Path (solid)",
            fontsize=11, ha="center", va="top", color="#6c757d", style="italic")

    def draw_box(ax, x, y, w, h, label, sublabel="", color=C_BLUE, bg="#eef1ff",
                 fontsize=10, sublabel_size=7.5):
        """Draw a rounded component box with a bold label and optional italic sublabel."""
        rect = FancyBboxPatch((x, y), w, h,
                              boxstyle="round,pad=0.12", facecolor=bg,
                              edgecolor=color, linewidth=2)
        ax.add_patch(rect)
        if sublabel:
            # Two-line layout: label above centre, sublabel below
            ax.text(x + w/2, y + h/2 + 0.15, label, fontsize=fontsize,
                    fontweight="bold", ha="center", va="center", color=color)
            ax.text(x + w/2, y + h/2 - 0.22, sublabel, fontsize=sublabel_size,
                    ha="center", va="center", color="#6c757d", style="italic")
        else:
            ax.text(x + w/2, y + h/2, label, fontsize=fontsize,
                    fontweight="bold", ha="center", va="center", color=color)

    def arrow(ax, x1, y1, x2, y2, color=C_PRIMARY, style="-", lw=2):
        """Draw an arrow from (x1, y1) to (x2, y2); style="dotted" renders dashed."""
        ls = "--" if style == "dotted" else "-"
        ax.annotate("", xy=(x2, y2), xytext=(x1, y1),
                    arrowprops=dict(arrowstyle="-|>", color=color, lw=lw,
                                    linestyle=ls))
    # ═══ TRAINING PATH (top) ═══
    ax.text(1.0, 9.5, "TRAINING PATH", fontsize=12, fontweight="bold",
            color=C_PURPLE, rotation=0,
            bbox=dict(boxstyle="round,pad=0.2", facecolor="#f3e5f5",
                      edgecolor=C_PURPLE, linewidth=1.5))
    # Sensitive Keywords
    draw_box(ax, 0.3, 7.8, 2.8, 1.2, "Sensitive\nKeywords", "91 keywords + variations",
             color=C_DANGER, bg="#fce4ec")
    # Synthetic Generator
    draw_box(ax, 4.0, 7.8, 3.0, 1.2, "Synthetic Data\nGenerator", "build_dataset()",
             color=C_PURPLE, bg="#f3e5f5")
    # Benign Words
    draw_box(ax, 0.3, 6.2, 2.8, 1.1, "Benign Words\nPool", "90 words + extensions",
             color=C_SAFE, bg="#e8f5e9")
    # Arrows to synthetic gen
    arrow(ax, 3.1, 8.4, 4.0, 8.4, color=C_DANGER, style="dotted")
    arrow(ax, 3.1, 6.75, 4.5, 7.8, color=C_SAFE, style="dotted")
    # Explicit Examples
    draw_box(ax, 0.3, 4.8, 2.8, 1.0, "Explicit Examples", "49 sens + 48 benign",
             color=C_GOLD, bg="#fff8e1")
    arrow(ax, 3.1, 5.3, 4.5, 7.8, color=C_GOLD, style="dotted")
    # Dataset
    draw_box(ax, 8.0, 7.8, 2.5, 1.2, "Balanced\nDataset", "~4200 samples",
             color=C_PRIMARY, bg="#e3f2fd")
    arrow(ax, 7.0, 8.4, 8.0, 8.4, color=C_PURPLE, style="dotted")
    # TF-IDF Vectorizer (Training)
    draw_box(ax, 11.3, 7.8, 2.8, 1.2, "TF-IDF\nVectorizer", "char_wb, ngram(3,5)",
             color=C_BLUE, bg="#e8eaf6")
    arrow(ax, 10.5, 8.4, 11.3, 8.4, color=C_PRIMARY, style="dotted")
    # LinearSVC Training
    draw_box(ax, 14.8, 7.8, 2.8, 1.2, "LinearSVC\nTraining", "C=0.5, balanced",
             color=C_DANGER, bg="#fce4ec")
    arrow(ax, 14.1, 8.4, 14.8, 8.4, color=C_PRIMARY, style="dotted")
    # Saved Model
    draw_box(ax, 18.0, 7.8, 1.7, 1.2, "Saved\nModel", ".joblib",
             color="#6c757d", bg="#f5f5f5")
    arrow(ax, 17.6, 8.4, 18.0, 8.4, color=C_PRIMARY, style="dotted")
    # ═══ INFERENCE PATH (bottom) ═══
    ax.text(1.0, 5.4, "INFERENCE PATH", fontsize=12, fontweight="bold",
            color=C_BLUE, rotation=0,
            bbox=dict(boxstyle="round,pad=0.2", facecolor="#e3f2fd",
                      edgecolor=C_BLUE, linewidth=1.5))
    # Cloud Storage Input
    draw_box(ax, 0.3, 3.2, 2.8, 1.2, "Cloud Storage\nBuckets", "S3 / GCS / Azure / …",
             color=C_PRIMARY, bg="#e3f2fd", fontsize=10)
    # Object Lister
    draw_box(ax, 3.8, 3.2, 2.5, 1.2, "Object\nEnumerator", "async HTTP scanner",
             color=C_BLUE, bg="#e8eaf6")
    arrow(ax, 3.1, 3.8, 3.8, 3.8, color=C_PRIMARY)
    # Image Filter
    draw_box(ax, 7.0, 3.2, 2.3, 1.2, "Extension\nFilter", "skip images/media",
             color="#6c757d", bg="#f5f5f5")
    arrow(ax, 6.3, 3.8, 7.0, 3.8, color=C_PRIMARY)
    # ML Classifier
    draw_box(ax, 10.0, 3.2, 2.8, 1.2, "ML Classifier\n(Pipeline)", "TF-IDF β†’ LinearSVC",
             color=C_BLUE, bg="#e8eaf6")
    arrow(ax, 9.3, 3.8, 10.0, 3.8, color=C_PRIMARY)
    # Model load arrow from saved model (training output feeds inference)
    arrow(ax, 18.5, 7.8, 11.4, 4.4, color="#6c757d", style="dotted", lw=1.5)
    ax.text(15.5, 6.2, "load .joblib", fontsize=8, color="#6c757d", style="italic",
            rotation=-15)
    # Regex Fallback
    draw_box(ax, 13.5, 3.2, 2.8, 1.2, "Regex Fallback\nEngine", "keyword matching",
             color=C_GOLD, bg="#fff8e1")
    arrow(ax, 12.8, 3.8, 13.5, 3.8, color=C_PRIMARY)
    # Decision logic
    draw_box(ax, 13.5, 1.4, 2.8, 1.2, "Trigger\nExplainer", "get_trigger_explanation()",
             color="#e65100", bg="#fff3e0")
    arrow(ax, 14.9, 3.2, 14.9, 2.6, color="#e65100")
    # Output
    draw_box(ax, 17.0, 3.2, 2.7, 1.2, "Classification\nOutput", "flag + trigger word",
             color=C_DANGER, bg="#fce4ec", fontsize=10)
    arrow(ax, 16.3, 3.8, 17.0, 3.8, color=C_PRIMARY)
    arrow(ax, 16.3, 1.9, 17.3, 3.2, color="#e65100")
    # WebSocket / Dashboard
    draw_box(ax, 17.0, 1.4, 2.7, 1.2, "Dashboard\n& WebSocket", "real-time results",
             color=C_SAFE, bg="#e8f5e9", fontsize=10)
    arrow(ax, 18.35, 3.2, 18.35, 2.6, color=C_SAFE)
    # ═══ LEGEND ═══
    legend_y = 0.5
    ax.plot([0.5, 1.5], [legend_y, legend_y], color=C_PRIMARY, linewidth=2, linestyle="-")
    ax.text(1.7, legend_y, "Inference (Solid)", fontsize=9, va="center", color=C_PRIMARY)
    ax.plot([4.0, 5.0], [legend_y, legend_y], color=C_PURPLE, linewidth=2, linestyle="--")
    ax.text(5.2, legend_y, "Training (Dotted)", fontsize=9, va="center", color=C_PURPLE)
    # Colour-key patches matching the box background/edge colours used above.
    legend_patches = [
        mpatches.Patch(facecolor="#fce4ec", edgecolor=C_DANGER, label="Sensitive/Classifier"),
        mpatches.Patch(facecolor="#e8f5e9", edgecolor=C_SAFE, label="Benign/Output"),
        mpatches.Patch(facecolor="#e8eaf6", edgecolor=C_BLUE, label="Feature Extraction"),
        mpatches.Patch(facecolor="#fff8e1", edgecolor=C_GOLD, label="Fallback/Explainability"),
    ]
    ax.legend(handles=legend_patches, loc="lower right", fontsize=9,
              frameon=True, fancybox=True, shadow=True, ncol=4,
              bbox_to_anchor=(0.98, -0.02))
    fig.tight_layout(pad=0.5)
    path = os.path.join(OUT_DIR, "figure6_architecture.png")
    fig.savefig(path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f" βœ“ Saved {path}")
# ════════════════════════════════════════════════════════════════════
# MAIN β€” Generate all figures
# ════════════════════════════════════════════════════════════════════
if __name__ == "__main__":
    # Banner line reused for the header and footer framing.
    banner = "=" * 60
    print(banner)
    print(" S3Shastra β€” Research Paper Figure Generator")
    print(banner)
    print(f"\n Output directory: {OUT_DIR}\n")
    # Generate every figure in publication order.
    for make_figure in (
        figure1_conceptual_overview,
        figure2_dataset_balancing,
        figure3_ngram_tfidf,
        figure4_svm_hyperplane,
        figure5_confusion_matrix,
        figure6_architecture,
    ):
        make_figure()
    print(f"\n{banner}")
    print(f" All 6 figures saved to: {OUT_DIR}")
    print(banner)