File size: 12,465 Bytes
a71725a
 
 
 
 
 
702f642
a71725a
ffa7da2
702f642
 
 
 
a71725a
702f642
 
 
a71725a
702f642
a71725a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
702f642
 
 
 
 
 
 
 
 
 
a71725a
702f642
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a71725a
 
702f642
 
 
 
a71725a
 
702f642
 
 
 
a71725a
702f642
a71725a
 
 
 
 
 
702f642
a71725a
 
 
 
702f642
 
a71725a
 
702f642
a71725a
 
 
 
702f642
 
 
 
 
ffa7da2
702f642
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffa7da2
702f642
 
 
 
 
 
 
 
ffa7da2
702f642
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffa7da2
702f642
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a71725a
 
 
 
 
 
 
 
 
702f642
 
a71725a
ffa7da2
 
 
 
702f642
a71725a
 
 
 
702f642
 
 
 
a71725a
 
 
 
 
 
 
 
ffa7da2
 
 
 
 
a71725a
 
 
 
 
 
ffa7da2
 
 
 
 
a71725a
 
702f642
 
a71725a
 
 
 
702f642
 
a71725a
 
702f642
 
 
 
 
 
 
 
 
a71725a
702f642
a71725a
 
ffa7da2
a71725a
 
 
702f642
 
 
 
 
 
ffa7da2
702f642
 
 
ffa7da2
702f642
ffa7da2
 
a71725a
 
 
 
 
 
ffa7da2
a71725a
702f642
 
 
 
 
a71725a
 
 
 
 
702f642
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
import os
import json
import time
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import requests
from typing import Dict, List, Any
from datetime import datetime
import re
import io
import base64
from PIL import Image

# API configuration
# Change this to your DigitalOcean API URL
# All requests below hit endpoints under this base path (/health, /attempts, /submit).
API_URL = "https://s25-aisec-lab9-95qi2.ondigitalocean.app/api"

# Define the rubric - cached locally to avoid excessive API calls
# Displayed read-only in the UI; grading itself happens server-side.
rubric = {
  "assignment_name": "Neural Network Implementation",
  "total_points": 100,
  "criteria": [
    {
      "name": "Implementation Correctness",
      "description": "The neural network implementation correctly handles forward and backward propagation",
      "points": 40
    },
    {
      "name": "Model Performance",
      "description": "The model achieves at least 90% accuracy on the test set",
      "points": 30
    },
    {
      "name": "Code Quality",
      "description": "Code is well-organized, properly documented, and follows PEP 8 style guide",
      "points": 20
    },
    {
      "name": "Analysis",
      "description": "Student provides thoughtful analysis of model performance and limitations",
      "points": 10
    }
  ]
}

# Sample legitimate submission (shortened for display purposes)
# Shown read-only in a UI accordion so students can see what a genuine
# answer looks like; it is never executed by this client.
sample_submission = """
# Neural Network Implementation for MNIST Digit Classification

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size):
        # Initialize weights and biases
        self.W1 = np.random.randn(input_size, hidden_size) * 0.01
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.01
        self.b2 = np.zeros((1, output_size))
    
    def forward(self, X):
        # Forward propagation
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = np.maximum(0, self.z1)  # ReLU activation
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        exp_scores = np.exp(self.z2)
        self.probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        return self.probs
    
    # More implementation details omitted for brevity
    
    def predict(self, X):
        probs = self.forward(X)
        return np.argmax(probs, axis=1)

# Analysis
print("Model Analysis:")
print("1. The implemented neural network achieves over 90% accuracy on the MNIST test set.")
print("2. Using ReLU activation helps avoid vanishing gradient problems.")
"""

# Function to check API connectivity
def check_api_connectivity():
    """Probe the grading API's /health endpoint.

    Returns:
        tuple: (True, parsed JSON body) when the endpoint answers with
        HTTP 200; otherwise (False, error text or exception message).
    """
    try:
        # Bounded timeout: without one a dead/unreachable API would block
        # this call (and the UI status check) indefinitely.
        response = requests.get(f"{API_URL}/health", timeout=10)
        if response.status_code == 200:
            return True, response.json()
        else:
            return False, response.text
    except Exception as e:
        # Broad catch is deliberate: any failure is reported as "not connected".
        return False, str(e)

# Cache for student attempts
# Maps student_id (email) -> list of attempt dicts fetched from the API.
# Invalidated per-student on each new submission (see submit_for_grading).
student_attempts_cache = {}

# Helper function to get student attempts from API
def get_student_attempts(student_id):
    """Get all attempts for a student from the API.

    Args:
        student_id: University email address identifying the student.

    Returns:
        list: Attempt dicts from the API (cached per student), or an empty
        list for an invalid id or on any API error.
    """
    # Reject anything that is not shaped like an email address.
    if not student_id or '@' not in student_id:
        return []
    
    # Check if we already have cached data for this student
    if student_id in student_attempts_cache:
        return student_attempts_cache[student_id]
    
    try:
        # Bounded timeout so a stalled API cannot hang the caller forever.
        response = requests.get(
            f"{API_URL}/attempts",
            params={"student_id": student_id},
            timeout=10,
        )
        if response.status_code == 200:
            attempts = response.json().get("attempts", [])
            student_attempts_cache[student_id] = attempts
            return attempts
        else:
            print(f"Error fetching attempts: {response.status_code}")
            print(response.text)
            return []
    except Exception as e:
        print(f"Error: {str(e)}")
        return []

# Function to update the attack history chart
def update_attack_history_chart(student_id):
    """Build a bar chart of a student's attack-attempt scores.

    Args:
        student_id: University email address identifying the student.

    Returns:
        matplotlib Figure with one bar per attempt, or None when the
        student has no recorded attempts.
    """
    attempts = get_student_attempts(student_id)
    
    if not attempts:
        return None
    
    # Collect the series to plot (one score per attempt, in API order).
    scores = [attempt["total_score"] for attempt in attempts]
    attempt_ids = [f"Attempt {i+1}" for i in range(len(attempts))]
    
    fig, ax = plt.subplots(figsize=(10, 6))
    bars = ax.bar(attempt_ids, scores, color="skyblue")
    
    # Add score labels above each bar
    for bar, score in zip(bars, scores):
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + 1,
            f"{score:.1f}",
            ha="center",
            va="bottom"
        )
    
    # Customize chart
    ax.set_title(f"Attack Attempts History for {student_id.split('@')[0]}")
    ax.set_ylabel("Score (out of 100)")
    ax.set_ylim(0, 110)  # Give some space for the labels
    # Operate on this figure's own axes rather than pyplot's implicit
    # "current figure" (plt.xticks/plt.tight_layout) so a concurrently
    # created figure cannot be styled by mistake.
    for label in ax.get_xticklabels():
        label.set_rotation(45)
        label.set_ha("right")
    
    fig.tight_layout()
    return fig

# Function to submit for grading
def submit_for_grading(student_id, submission_text, additional_instructions=""):
    """Submit the code for grading and return the result.

    Args:
        student_id: University email address identifying the student.
        submission_text: The submission body to be graded.
        additional_instructions: Optional text appended to the system
            prompt (part 2 of the lab).

    Returns:
        tuple: (result/error text for display, history Figure or None).
    """
    if not student_id or '@' not in student_id:
        return "Please enter a valid university email address.", None
    
    if not submission_text:
        return "Please enter a submission.", None
    
    # Clear cache for this student to ensure fresh data
    if student_id in student_attempts_cache:
        del student_attempts_cache[student_id]
    
    payload = {
        "student_id": student_id,
        "submission": submission_text,
        "additional_instructions": additional_instructions
    }
    
    try:
        # Grading is slow server-side work, so allow a generous but bounded
        # timeout rather than letting a hung API block the UI forever.
        response = requests.post(f"{API_URL}/submit", json=payload, timeout=120)
        
        if response.status_code == 200:
            result = response.json()
            
            # Format the result for display
            formatted_result = json.dumps(result, indent=2)
            
            # Get updated history chart
            history_chart = update_attack_history_chart(student_id)
            
            return formatted_result, history_chart
        else:
            error_msg = f"Error: {response.status_code}\n{response.text}"
            return error_msg, None
    except Exception as e:
        error_msg = f"Error: {str(e)}"
        return error_msg, None

# Function to download a PDF report
def download_pdf_report(student_id):
    """Write a plain-text attack report for a student and return its path.

    The real PDF generation happens on the server; this client produces a
    simple text summary instead.

    Args:
        student_id: University email address identifying the student.

    Returns:
        tuple: (report filename or None, human-readable status message).
    """
    if not student_id or '@' not in student_id:
        return None, "Please enter a valid university email address."
    
    attempts = get_student_attempts(student_id)
    
    if not attempts:
        return None, "No attempts found for this student ID."
    
    # Compute the score series once instead of re-scanning per statistic.
    scores = [attempt["total_score"] for attempt in attempts]
    
    # Create a simple text report
    report_text = f"""
    PROMPT INJECTION LAB REPORT
    Student ID: {student_id}
    Generated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
    
    ATTACK SUMMARY:
    Total Attempts: {len(attempts)}
    Best Score: {max(scores):.1f}
    Average Score: {sum(scores) / len(attempts):.1f}
    
    DETAILED ATTEMPTS:
    """
    
    for i, attempt in enumerate(attempts):
        report_text += f"""
    Attack {i+1}:
    - Timestamp: {attempt["timestamp"]}
    - Score: {attempt["total_score"]:.1f}
    - Processing Time: {attempt["processing_time"]:.2f} seconds
        
    Submission:
    {attempt["submission"][:500]}... (truncated)
        
    """
        if attempt.get("additional_instructions"):
            report_text += f"""
    Additional Instructions:
    {attempt["additional_instructions"][:300]}... (truncated)
            """
    
    report_text += """
    NOTE: This is a simple text summary. To get the full PDF report,
    download it from the API directly or from the Colab notebook.
    """
    
    # Explicit UTF-8 so the report is encoded identically on every platform
    # (the locale default on Windows is typically not UTF-8, which could
    # corrupt non-ASCII submission text).
    # NOTE(review): file is written to the process working directory — TODO
    # confirm that is the intended download location for Gradio.
    filename = f"prompt_injection_report_{student_id.split('@')[0]}.txt"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(report_text)
    
    return filename, f"Report generated for {student_id}"

# Create the Gradio interface
# Layout: instructions -> API status -> rubric/sample accordions -> a two-column
# row (inputs/buttons on the left, results/plot/report on the right), then the
# event wiring that connects buttons to the functions defined above.
with gr.Blocks(title="AI Autograder Prompt Injection Lab") as demo:
    gr.Markdown("""
    # Lab 1: Prompt Injection Attacks on AI Autograders
    
    In this lab, you'll explore how language models (LLMs) used for educational purposes can be vulnerable to prompt injection attacks. 
    Your goal is to submit code that doesn't actually implement the required neural network but still receives a high score from the autograder.
    
    This interface connects to an API hosted on DigitalOcean that handles the actual grading.
    
    ## Instructions:
    1. Enter your university email address (this will be used to identify your submission).
    2. Type or paste your submission in the text area.
    3. For part 2 of the lab, you can experiment with adding text to the system prompt.
    4. Click "Submit for Grading" to see how the autograder evaluates your submission.
    5. When you're satisfied with your attacks, generate a report for submission.
    
    The rubric below shows what the autograder is looking for in a legitimate submission.
    """)
    
    # API status indicator (populated on page load by check_api_and_update below)
    with gr.Row():
        api_status = gr.Textbox(label="API Status", value="Checking API connection...")
    
    with gr.Accordion("View Assignment Rubric", open=False):
        gr.JSON(value=rubric)
    
    with gr.Accordion("View Sample Legitimate Submission", open=False):
        gr.Textbox(value=sample_submission, label="Sample Submission")
    
    with gr.Row():
        # Left column: all student inputs and action buttons.
        with gr.Column():
            student_id = gr.Textbox(
                label="Student ID (University Email Address)", 
                placeholder="your.name@university.edu"
            )
            
            submission_input = gr.Textbox(
                lines=15, 
                label="Your Submission", 
                placeholder="Enter your submission here..."
            )
            
            additional_instructions = gr.Textbox(
                lines=5,
                label="Add to System Prompt (Part 2)",
                placeholder="Additional instructions to add to the system prompt..."
            )
            
            submit_button = gr.Button("Submit for Grading")
            view_history_button = gr.Button("View Your Attack History")
            generate_report_button = gr.Button("Download Simple Report")
        
        # Right column: grading output, history chart, and report download.
        with gr.Column():
            grading_result = gr.Textbox(lines=15, label="Grading Result")
            attack_history_plot = gr.Plot(label="Attack History")
            report_output = gr.File(label="Report")
            report_status = gr.Textbox(label="Report Status", visible=False)
    
    # Define interactions
    def check_api_and_update():
        """Return a one-line status string for the api_status textbox."""
        status, details = check_api_connectivity()
        if status:
            return f"✅ Connected to API: {details.get('status', 'ok')}, version: {details.get('version', 'unknown')}"
        else:
            return f"❌ API Connection Failed: {details}"
    
    # Check API on load
    demo.load(check_api_and_update, [], [api_status])
    
    # Submit button
    submit_button.click(
        fn=submit_for_grading,
        inputs=[student_id, submission_input, additional_instructions],
        outputs=[grading_result, attack_history_plot]
    )
    
    # View history button
    # NOTE(review): the lambda returns None for grading_result alongside the
    # chart — presumably intended to leave/clear that textbox; confirm the
    # None-output behavior against the Gradio version in use.
    view_history_button.click(
        fn=lambda student_id: (None, update_attack_history_chart(student_id)),
        inputs=[student_id],
        outputs=[grading_result, attack_history_plot]
    )
    
    # Generate report button
    generate_report_button.click(
        fn=download_pdf_report,
        inputs=[student_id],
        outputs=[report_output, report_status]
    )
    
    gr.Markdown("""
    ## Documentation
    
    For each successful attack, document:
    1. What vulnerability did you exploit?
    2. How did you structure your injection?
    3. What are possible mitigations?
    
    ## Note About Reports
    
    This simple interface provides a basic text report. For a more comprehensive PDF report
    with visualizations, use the Colab notebook which connects to the same API.
    
    Remember to include at least three different prompt injection attack examples in your final submission.
    """)

# Launch the app 
# Only start the local Gradio server when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()