# ds110tutor / app.py
# Kevin Gold
# Don't reveal code now in system message
# bf42866
import csv
from openai import OpenAI
import gradio as gr
import os
import random
import pandas as pd
from datetime import datetime
from cryptography.fernet import Fernet
from huggingface_hub import Repository
#from huggingface_hub import HfApi, snapshot_download
from datasets import load_dataset
from hashlib import blake2b
# Model identifier passed to the chat-completions call in continue_session.
MODEL = 'gemini-3-flash-preview'
# OpenAI SDK client pointed at the Gemini OpenAI-compatible endpoint; the API
# key is read from the 'gemini' environment variable.
client = OpenAI(
    api_key=os.getenv('gemini'),
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)
# Fernet key used to decrypt the encrypted homework solution notebooks.
crypto_key=os.getenv('crypto_key')
# NOTE(review): leak_key is not referenced by any visible code -- presumably
# tied to the disabled "stop flag" answer-leak check; confirm before removing.
leak_key=os.getenv('leak_key')
# Dataset code follows example at:
# https://huggingface.co/spaces/julien-c/persistent-data/blob/main/app.py
DATASET_REPO_URL = "https://huggingface.co/datasets/klgold/tutor_data"
DATA_FILENAME = "data.csv"
# Path of the transcript CSV inside the local clone of the dataset repo.
DATA_FILE = os.path.join("data", DATA_FILENAME)
# Hugging Face access token, used for the dataset repo and profile lookups.
HF_TOKEN=os.environ.get("HF_TOKEN")
# NOTE(review): PROFILES_URL is not referenced by any visible code;
# authenticate() names the profiles dataset by repo id instead.
PROFILES_URL = "https://huggingface.co/datasets/klgold/tutor_profiles"
# Original prefix - no longer used
# PROMPT_PREFIX="For this query, answer with a single question that you haven't asked before that is meant to lead someone in the right direction, without directly answering the relevant homework question - unless the problem is solved completely, in which case, quit."
# Import-time side effect: sets up the local "data" directory as a clone of
# the transcript dataset repo so store_transcript can append and push.
repo = Repository(
    local_dir="data", clone_from=DATASET_REPO_URL, use_auth_token=HF_TOKEN
)
# NOTE(review): `name` is not read anywhere in the visible code.
name = None
def store_transcript(hw_num: int, is_ai: bool, transcript: str, state):
    """Append one utterance to the transcript CSV and push it to the Hub.

    Args:
        hw_num: Homework number the conversation is about.
        is_ai: True when ``transcript`` is the model's reply, False when it
            is the student's input.
        transcript: The text to record.
        state: Session state; its ``username`` and ``session`` attributes are
            combined with ``hw_num`` to form the row's ``session`` key.
    """
    row = {
        'session': f'{state.username}-{hw_num}-{state.session}',
        'is_ai': str(is_ai),
        'transcript': transcript,
        'time': str(datetime.now()),
    }
    # newline='' is what the csv module expects: it stops the platform from
    # rewriting the writer's own line endings and keeps newlines embedded in
    # quoted fields (multi-line student code) intact. The explicit encoding
    # keeps non-ASCII input from depending on the host's locale.
    with open(DATA_FILE, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=['session', 'is_ai', 'transcript', 'time'],
        )
        writer.writerow(row)
    # Push only after the file has been closed, so the new row is on disk.
    repo.push_to_hub()
    # Alternative upload path via HfApi, currently unused:
    # api.upload_folder(
    #     folder_path="data",
    #     repo_id=DATASET_REPO_ID,
    #     repo_type="dataset"
    # )
# "Stop flag" stuff is commented out because the extra query
# to check whether answers are leaked doubles the response time,
# which currently means going from 10s to 20s.
class SessionState:
    """Mutable per-conversation state passed between handler invocations."""

    def __init__(self):
        # Who is talking; replaced with the authenticated username once a
        # conversation starts.
        self.username: str = "unknown"
        # Conversation id; 0 until a random id is assigned for a new chat.
        self.session: int = 0
        # Homework number the current message history belongs to.
        self.hw: int = 0
        # Chat history as a list of {"role": ..., "content": ...} dicts.
        self.messages: list = []
        # Only touched by the disabled "stop flag" code paths.
        self.stop_flag: int = 0
# Instructions and worked examples prepended to every student query. Its
# length is also used to strip it back off when rendering the transcript.
_TUTOR_PROMPT_PREFIX = """
You are about to be given a question about one of these homework problems
from a student. You can assume the student is just beginning to learn
Python. Your answer should NOT use any complete line of code from the problem solution.
The student will get in trouble if they use any code you supply.
Instead, you should either ask a question that helps you determine what
the student knows, or ask a question that you think the student should be
able to answer that will send them in the right direction, or offer to
explain the relevant concepts for the problem. Every thing you say
should be either a question that reveals what the student knows, or a
question that is one step on the way to solving the problem, or an
explanation of a concept or how to do something. If you are asked
whether a solution is correct, refuse to confirm it, and instead ask what
is still bothering the student about the code. If you are asked to find
bugs in the code, you can indicate which line has the bug, but ask the
student a question that prompts them to solve the bug on their own. Ask
no more than one question in your reply. Don't be too aggressive in
hinting - try to get the student to solve as much of the original problem
as possible.
Example if problem is "Write a loop that will print 'Hello' three times.":
Human: I'm having trouble with this problem about for loops.
AI: Are you familiar with how to use a for loop to iterate over lists?
Human: Yes, I think I can do that.
AI: What about using a for loop to do something a set number of times?
Human: I don't know how to do that.
AI: You can say for i in range(n) to do something n times. For example, for i in range(5) will iterate 5 times, where i is 0, then 1, then 2, then 3, then 4.
Human : So can I write
for i in range(3)
print('Hello')
AI: Almost there. Take a closer look at your first line. Do you remember what punctuation you need in a for loop?
Human: So is it
for i in range(3):
print('Hello')
AI: I can't confirm whether answers are correct. Try testing your code. Once you do, let me know if there's anything you're still worried about.
Example if the problem is "Write a recursive function that will sum the numbers from m to n":
Human: My solution is
def my_sum(m, n)
return m + my_sum(m+1,n)
Is that right?
AI: What happens when you run that code?
Human: It just sits there
AI: What do you think is happening when it just sits there?
Human: I don't know
AI: Walk through your code line by line for def my_sum(3,3). What happens?
Human: Oh, I guess it gets called for 4,3
AI: And then what?
Human: And then 5,3, and 6,3, and it never stops
AI: So what did you forget?
Human: How about this
def my_sum(m, n)
if m == n:
return m
return m + my_sum(m+1,n)
AI: I can't confirm whether answers are correct, but consider trying it with some test values and then let me know if you have further concerns.
And now, here is the student's actual query:
"""


def continue_session(hw_num, user_input, state, request: gr.Request):
    """Send the student's message to the model and stream the reply.

    This is a generator used as a Gradio event handler. While the model is
    streaming it yields the partial reply; once the reply is complete it
    yields the whole conversation transcript.

    Args:
        hw_num: Selected homework number (the radio's string value, or None
            when nothing has been selected).
        user_input: The student's question.
        state: The SessionState for this conversation; updated in place.
        request: Gradio request object; supplies the authenticated username.

    Yields:
        Dicts mapping ``response_box`` to the text to display and
        ``state_var`` to the updated state.
    """
    if hw_num is None:
        # Nothing selected yet: ask for a selection instead of failing in
        # int(None) with an opaque TypeError.
        yield {response_box: 'Please choose a homework number first.',
               state_var: state}
        return
    hw_num_int = int(hw_num)

    if not state.messages or hw_num_int != state.hw:
        state.hw = hw_num_int
        solution_path = f'YourName_DS110_S26_HW{hw_num_int}_sol.ipynb.encrypt'
        # Context manager so the handle is released even if the read fails.
        with open(solution_path, 'rb') as solution_file:
            encrypted = solution_file.read()
        # Fernet.decrypt returns bytes. Decode them so the prompt carries the
        # notebook text itself rather than the "b'...'" repr that str() would
        # produce (with every newline and backslash escaped).
        homework_text = Fernet(crypto_key).decrypt(encrypted).decode(
            'utf-8', errors='replace')
        if not state.messages:  # Keep same session for change of HW
            state.session = random.randint(1, 999999)
        # Messages reset on change of HW - avoid really long messages
        state.messages = [
            {"role": "system", "content": "You are a helpful teaching assistant in a data science course. Your primary goal is to help the students learn."},
            {"role": "system", "content": "This is the homework the student is talking about -- do not reveal any code from it: " + homework_text},
        ]
        # profile_dataset = load_dataset("csv", data_files="https://huggingface.co/datasets/klgold/tutor_profiles/raw/main/S24hashed.csv", token=HF_TOKEN)
        # profile_df = pd.DataFrame(profile_dataset['train'])
        # profile_df = profile_df.set_index('user')
        state.username = request.username
        # Disabled stop-flag lookup:
        # if int(profile_df.loc[username, 'exp0']) == 0:
        #     state.stop_flag = 0

    # Every user message carries the same preamble, so one length is enough
    # for to_transcript to strip it from all of them.
    prefix_length = len(_TUTOR_PROMPT_PREFIX)
    store_transcript(hw_num_int, False, user_input, state)
    state.messages.append(
        {"role": "user", "content": _TUTOR_PROMPT_PREFIX + user_input})

    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=state.messages,
            stream=True
        )
        last_message = ""
        for chunk in response:
            # Some stream chunks may carry no choices; skip them rather than
            # letting an IndexError replace the whole reply with an error.
            if not chunk.choices:
                continue
            streamed = chunk.choices[0].delta.content
            if streamed is not None:
                last_message += streamed
                # The UI is updated through the yielded value only; the
                # shared response_box component is not mutated from a
                # per-request handler.
                yield {response_box: last_message, state_var: state}
    except Exception as e:
        # Best effort: show the error text to the student (and log it as the
        # AI turn) instead of aborting the handler.
        last_message = str(e)

    state.messages.append({'role': 'assistant', 'content': last_message})
    transcript = to_transcript(state.messages, prefix_length)
    store_transcript(hw_num_int, True, last_message, state)
    # DEBUG Replace transcript with state.messages for debugging
    yield {response_box: transcript, state_var: state}
def to_transcript(messages, prefix_length):
    """Render the user and assistant turns of a chat history as plain text.

    Args:
        messages: Sequence of dicts with 'role' and 'content' keys.
        prefix_length: Number of leading characters to drop from each user
            message (the instruction preamble added before the query).

    Returns:
        One string with a '\\nUser: ...' or '\\nAI: ...' entry per turn;
        messages with any other role are skipped.
    """
    pieces = []
    for message in messages:
        role = message['role']
        if role == 'user':
            # Need to remove the extra prompt verbiage
            pieces.append('\nUser: ' + message['content'][prefix_length:])
        elif role == 'assistant':
            pieces.append('\nAI: ' + message['content'])
    return ''.join(pieces)
def my_hash(text):
    """Return the hexadecimal BLAKE2b digest of *text* encoded as UTF-8."""
    return blake2b(bytes(text, 'utf-8')).hexdigest()
def authenticate(username, password):
    """Check a login against the hashed-password profile dataset.

    Args:
        username: Login name; must appear in the 'Username' column.
        password: Plain-text password; hashed with my_hash and compared with
            the stored 'hash' value for that user.

    Returns:
        True when the user exists and the hashes match, otherwise False.
    """
    # Function-scope import keeps this block self-contained.
    import hmac

    # profile_dataset = load_dataset("csv", data_files="https://huggingface.co/datasets/klgold/tutor_profiles/raw/main/S25hashed.csv", token=HF_TOKEN)
    profile_dataset = load_dataset("klgold/tutor_profiles", data_files={"train":"S26hashed.csv"}, token=HF_TOKEN)
    profile_df = pd.DataFrame(profile_dataset['train']).set_index('Username')
    if username not in profile_df.index:
        return False
    stored_hash = str(profile_df.loc[username, 'hash'])
    candidate_hash = str(my_hash(password))
    # Constant-time comparison: plain == can leak, through timing, how much
    # of the digest matched. Comparing bytes also avoids compare_digest's
    # ASCII-only restriction on str arguments.
    return hmac.compare_digest(
        stored_hash.encode('utf-8'), candidate_hash.encode('utf-8'))
# Build the UI. Components are laid out in the order they are created here.
with gr.Blocks() as demo:
    # NOTE(review): hw_num_var, transcript_var and transcript_fn_var are not
    # wired to any handler in the visible code.
    hw_num_var = gr.State(0)
    # Conversation state handed to continue_session and returned by it.
    state_var = gr.State(SessionState())
    transcript_var = gr.State('')
    transcript_fn_var = gr.State('')
    gr.Markdown('Which homework is this about?')
    # No default value is set, so the handler receives None until the student
    # picks a homework.
    hw_num_radio = gr.Radio(['1','2','3','4','5','6','7','8','9'], label="HWnum")
    gr.Markdown('Sample questions: <i>How should I approach 2c?</i>, <i>When should I use a tuple instead of a list?</i>, <i>Do I have to use datetime objects to do 3b?</i>')
    user_input = gr.Textbox(label='Input')
    response_button = gr.Button('Go')
    # Shows the streamed reply, then the full transcript once it completes.
    response_box = gr.Textbox(lines=20, label='Response')
    # continue_session refers to response_box and state_var as globals when
    # it builds the dicts it yields, so these names must stay as they are.
    response_button.click(fn=continue_session, inputs=[hw_num_radio, user_input, state_var], outputs=[response_box, state_var])
# Start the server with login handled by authenticate().
demo.launch(auth=authenticate, server_name="0.0.0.0", show_error=True)