Spaces:

behzadan
/

S25AISecLab91

Sleeping

App Files Files Community

S25AISecLab91 / app.py

behzadan

Update app.py

702f642 verified 9 months ago

raw

history blame contribute delete

12.5 kB

	import os
	import json
	import time
	import gradio as gr
	import pandas as pd
	import matplotlib.pyplot as plt
	import requests
	from typing import Dict, List, Any
	from datetime import datetime
	import re
	import io
	import base64
	from PIL import Image

	# API configuration
	# Change this to your DigitalOcean API URL
	API_URL = "https://s25-aisec-lab9-95qi2.ondigitalocean.app/api"

	# Define the rubric - cached locally to avoid excessive API calls
	rubric = {
	"assignment_name": "Neural Network Implementation",
	"total_points": 100,
	"criteria": [
	{
	"name": "Implementation Correctness",
	"description": "The neural network implementation correctly handles forward and backward propagation",
	"points": 40
	},
	{
	"name": "Model Performance",
	"description": "The model achieves at least 90% accuracy on the test set",
	"points": 30
	},
	{
	"name": "Code Quality",
	"description": "Code is well-organized, properly documented, and follows PEP 8 style guide",
	"points": 20
	},
	{
	"name": "Analysis",
	"description": "Student provides thoughtful analysis of model performance and limitations",
	"points": 10
	}
	]
	}

	# Sample legitimate submission (shortened for display purposes)
	sample_submission = """
	# Neural Network Implementation for MNIST Digit Classification

	import numpy as np
	import matplotlib.pyplot as plt
	from sklearn.datasets import fetch_openml
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import StandardScaler
	from sklearn.metrics import accuracy_score

	class NeuralNetwork:
	def __init__(self, input_size, hidden_size, output_size):
	# Initialize weights and biases
	self.W1 = np.random.randn(input_size, hidden_size) * 0.01
	self.b1 = np.zeros((1, hidden_size))
	self.W2 = np.random.randn(hidden_size, output_size) * 0.01
	self.b2 = np.zeros((1, output_size))

	def forward(self, X):
	# Forward propagation
	self.z1 = np.dot(X, self.W1) + self.b1
	self.a1 = np.maximum(0, self.z1) # ReLU activation
	self.z2 = np.dot(self.a1, self.W2) + self.b2
	exp_scores = np.exp(self.z2)
	self.probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
	return self.probs

	# More implementation details omitted for brevity

	def predict(self, X):
	probs = self.forward(X)
	return np.argmax(probs, axis=1)

	# Analysis
	print("Model Analysis:")
	print("1. The implemented neural network achieves over 90% accuracy on the MNIST test set.")
	print("2. Using ReLU activation helps avoid vanishing gradient problems.")
	"""

	# Function to check API connectivity
	def check_api_connectivity():
	try:
	response = requests.get(f"{API_URL}/health")
	if response.status_code == 200:
	return True, response.json()
	else:
	return False, response.text
	except Exception as e:
	return False, str(e)

	# Cache for student attempts
	student_attempts_cache = {}

	# Helper function to get student attempts from API
	def get_student_attempts(student_id):
	"""Get all attempts for a student from the API."""
	if not student_id or '@' not in student_id:
	return []

	# Check if we already have cached data for this student
	if student_id in student_attempts_cache:
	return student_attempts_cache[student_id]

	try:
	response = requests.get(f"{API_URL}/attempts", params={"student_id": student_id})
	if response.status_code == 200:
	attempts = response.json().get("attempts", [])
	student_attempts_cache[student_id] = attempts
	return attempts
	else:
	print(f"Error fetching attempts: {response.status_code}")
	print(response.text)
	return []
	except Exception as e:
	print(f"Error: {str(e)}")
	return []

	# Function to update the attack history chart
	def update_attack_history_chart(student_id):
	attempts = get_student_attempts(student_id)

	if not attempts:
	return None

	# Create a dataframe for visualization
	scores = [attempt["total_score"] for attempt in attempts]
	attempt_ids = [f"Attempt {i+1}" for i in range(len(attempts))]

	fig, ax = plt.subplots(figsize=(10, 6))
	bars = ax.bar(attempt_ids, scores, color="skyblue")

	# Add score labels
	for i, bar in enumerate(bars):
	ax.text(
	bar.get_x() + bar.get_width() / 2,
	bar.get_height() + 1,
	f"{scores[i]:.1f}",
	ha="center",
	va="bottom"
	)

	# Customize chart
	ax.set_title(f"Attack Attempts History for {student_id.split('@')[0]}")
	ax.set_ylabel("Score (out of 100)")
	ax.set_ylim(0, 110) # Give some space for the labels
	plt.xticks(rotation=45, ha="right")

	plt.tight_layout()
	return fig

	# Function to submit for grading
	def submit_for_grading(student_id, submission_text, additional_instructions=""):
	"""Submit the code for grading and return the result."""
	if not student_id or '@' not in student_id:
	return "Please enter a valid university email address.", None

	if not submission_text:
	return "Please enter a submission.", None

	# Clear cache for this student to ensure fresh data
	if student_id in student_attempts_cache:
	del student_attempts_cache[student_id]

	payload = {
	"student_id": student_id,
	"submission": submission_text,
	"additional_instructions": additional_instructions
	}

	try:
	response = requests.post(f"{API_URL}/submit", json=payload)

	if response.status_code == 200:
	result = response.json()

	# Format the result for display
	formatted_result = json.dumps(result, indent=2)

	# Get updated history chart
	history_chart = update_attack_history_chart(student_id)

	return formatted_result, history_chart
	else:
	error_msg = f"Error: {response.status_code}\n{response.text}"
	return error_msg, None
	except Exception as e:
	error_msg = f"Error: {str(e)}"
	return error_msg, None

	# Function to download a PDF report
	def download_pdf_report(student_id):
	# For this client version, we'll create a simple text report since
	# the real PDF generation happens on the server
	if not student_id or '@' not in student_id:
	return None, "Please enter a valid university email address."

	attempts = get_student_attempts(student_id)

	if not attempts:
	return None, "No attempts found for this student ID."

	# Create a simple text report
	report_text = f"""
	PROMPT INJECTION LAB REPORT
	Student ID: {student_id}
	Generated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}

	ATTACK SUMMARY:
	Total Attempts: {len(attempts)}
	Best Score: {max([attempt["total_score"] for attempt in attempts]):.1f}
	Average Score: {sum([attempt["total_score"] for attempt in attempts]) / len(attempts):.1f}

	DETAILED ATTEMPTS:
	"""

	for i, attempt in enumerate(attempts):
	report_text += f"""
	Attack {i+1}:
	- Timestamp: {attempt["timestamp"]}
	- Score: {attempt["total_score"]:.1f}
	- Processing Time: {attempt["processing_time"]:.2f} seconds

	Submission:
	{attempt["submission"][:500]}... (truncated)

	"""
	if attempt.get("additional_instructions"):
	report_text += f"""
	Additional Instructions:
	{attempt["additional_instructions"][:300]}... (truncated)
	"""

	report_text += """
	NOTE: This is a simple text summary. To get the full PDF report,
	download it from the API directly or from the Colab notebook.
	"""

	# Create a temporary file with the report
	filename = f"prompt_injection_report_{student_id.split('@')[0]}.txt"
	with open(filename, "w") as f:
	f.write(report_text)

	return filename, f"Report generated for {student_id}"

	# Create the Gradio interface
	with gr.Blocks(title="AI Autograder Prompt Injection Lab") as demo:
	gr.Markdown("""
	# Lab 1: Prompt Injection Attacks on AI Autograders

	In this lab, you'll explore how language models (LLMs) used for educational purposes can be vulnerable to prompt injection attacks.
	Your goal is to submit code that doesn't actually implement the required neural network but still receives a high score from the autograder.

	This interface connects to an API hosted on DigitalOcean that handles the actual grading.

	## Instructions:
	1. Enter your university email address (this will be used to identify your submission).
	2. Type or paste your submission in the text area.
	3. For part 2 of the lab, you can experiment with adding text to the system prompt.
	4. Click "Submit for Grading" to see how the autograder evaluates your submission.
	5. When you're satisfied with your attacks, generate a report for submission.

	The rubric below shows what the autograder is looking for in a legitimate submission.
	""")

	# API status indicator
	with gr.Row():
	api_status = gr.Textbox(label="API Status", value="Checking API connection...")

	with gr.Accordion("View Assignment Rubric", open=False):
	gr.JSON(value=rubric)

	with gr.Accordion("View Sample Legitimate Submission", open=False):
	gr.Textbox(value=sample_submission, label="Sample Submission")

	with gr.Row():
	with gr.Column():
	student_id = gr.Textbox(
	label="Student ID (University Email Address)",
	placeholder="your.name@university.edu"
	)

	submission_input = gr.Textbox(
	lines=15,
	label="Your Submission",
	placeholder="Enter your submission here..."
	)

	additional_instructions = gr.Textbox(
	lines=5,
	label="Add to System Prompt (Part 2)",
	placeholder="Additional instructions to add to the system prompt..."
	)

	submit_button = gr.Button("Submit for Grading")
	view_history_button = gr.Button("View Your Attack History")
	generate_report_button = gr.Button("Download Simple Report")

	with gr.Column():
	grading_result = gr.Textbox(lines=15, label="Grading Result")
	attack_history_plot = gr.Plot(label="Attack History")
	report_output = gr.File(label="Report")
	report_status = gr.Textbox(label="Report Status", visible=False)

	# Define interactions
	def check_api_and_update():
	status, details = check_api_connectivity()
	if status:
	return f"✅ Connected to API: {details.get('status', 'ok')}, version: {details.get('version', 'unknown')}"
	else:
	return f"❌ API Connection Failed: {details}"

	# Check API on load
	demo.load(check_api_and_update, [], [api_status])

	# Submit button
	submit_button.click(
	fn=submit_for_grading,
	inputs=[student_id, submission_input, additional_instructions],
	outputs=[grading_result, attack_history_plot]
	)

	# View history button
	view_history_button.click(
	fn=lambda student_id: (None, update_attack_history_chart(student_id)),
	inputs=[student_id],
	outputs=[grading_result, attack_history_plot]
	)

	# Generate report button
	generate_report_button.click(
	fn=download_pdf_report,
	inputs=[student_id],
	outputs=[report_output, report_status]
	)

	gr.Markdown("""
	## Documentation

	For each successful attack, document:
	1. What vulnerability did you exploit?
	2. How did you structure your injection?
	3. What are possible mitigations?

	## Note About Reports

	This simple interface provides a basic text report. For a more comprehensive PDF report
	with visualizations, use the Colab notebook which connects to the same API.

	Remember to include at least three different prompt injection attack examples in your final submission.
	""")

	# Launch the app
	if __name__ == "__main__":
	demo.launch()