Spaces:

klgold
/

ds110tutor

Running

Kevin Gold

Don't reveal code now in system message

bf42866 3 months ago

10.1 kB

	import csv
	from openai import OpenAI
	import gradio as gr
	import os
	import random
	import pandas as pd
	from datetime import datetime
	from cryptography.fernet import Fernet
	from huggingface_hub import Repository
	#from huggingface_hub import HfApi, snapshot_download
	from datasets import load_dataset
	from hashlib import blake2b

	# Model identifier passed as `model=` on every chat-completion request.
	MODEL = 'gemini-3-flash-preview'

	# OpenAI SDK client pointed at the base_url below rather than the SDK default.
	# The API key is read from the 'gemini' environment variable.
	client = OpenAI(
	api_key=os.getenv('gemini'),
	base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
	)
	# Key handed to Fernet in continue_session to decrypt the homework solution files.
	crypto_key=os.getenv('crypto_key')
	# Read from the environment but not referenced elsewhere in the visible code.
	# NOTE(review): presumably belongs to the disabled answer-leak check -- confirm.
	leak_key=os.getenv('leak_key')
	# Dataset code follows example at:
	# https://huggingface.co/spaces/julien-c/persistent-data/blob/main/app.py
	# Dataset repository that the transcript log is pushed to.
	DATASET_REPO_URL = "https://huggingface.co/datasets/klgold/tutor_data"
	DATA_FILENAME = "data.csv"
	# Path of the CSV log; "data" matches the local_dir given to Repository below.
	DATA_FILE = os.path.join("data", DATA_FILENAME)
	# Hugging Face token, used for the repository below and for load_dataset in authenticate.
	HF_TOKEN=os.environ.get("HF_TOKEN")
	# Not referenced elsewhere in the visible code.
	PROFILES_URL = "https://huggingface.co/datasets/klgold/tutor_profiles"
	# Original prefix - no longer used
	# PROMPT_PREFIX="For this query, answer with a single question that you haven't asked before that is meant to lead someone in the right direction, without directly answering the relevant homework question - unless the problem is solved completely, in which case, quit."
	# Repository object for the local directory "data", created from DATASET_REPO_URL
	# when this module is imported. store_transcript pushes through it.
	repo = Repository(
	local_dir="data", clone_from=DATASET_REPO_URL, use_auth_token=HF_TOKEN
	)

	# Not referenced elsewhere in the visible code.
	name = None


	def store_transcript(hw_num: int, is_ai: bool, transcript: str, state):
	    """Append one utterance to the CSV transcript log and push the log to the Hub.

	    Args:
	        hw_num: Homework number the conversation is about.
	        is_ai: True when ``transcript`` is the model's reply, False when it is
	            the student's input.
	        transcript: The text to log.
	        state: Session object; its ``username`` and ``session`` attributes are
	            combined with ``hw_num`` to form the row's session identifier.
	    """
	    row = {
	        'session': f'{state.username}-{hw_num}-{state.session}',
	        'is_ai': str(is_ai),
	        'transcript': transcript,
	        'time': str(datetime.now()),
	    }
	    # newline='' is what the csv module requires of files it writes to;
	    # transcripts routinely contain embedded newlines, which otherwise are not
	    # round-tripped reliably. The explicit encoding keeps non-ASCII student
	    # text from depending on the host locale.
	    with open(DATA_FILE, 'a', newline='', encoding='utf-8') as csvfile:
	        writer = csv.DictWriter(
	            csvfile, fieldnames=['session', 'is_ai', 'transcript', 'time']
	        )
	        writer.writerow(row)
	    # Push only after the file has been closed so the new row is on disk.
	    repo.push_to_hub()

	# "Stop flag" stuff is commented out because the extra query
	# to check whether answers are leaked doubles the response time,
	# which currently means going from 10s to 20s.

	class SessionState:
	    """Mutable record of one student's conversation with the tutor.

	    Instances start out with placeholder values; continue_session fills in
	    the real ones as the conversation proceeds.
	    """

	    def __init__(self):
	        # Chat history as a list of role/content dicts.
	        self.messages = []
	        # Homework number the current history belongs to, and the session id.
	        self.hw, self.session = 0, 0
	        # Login name; replaced once a request has been seen.
	        self.username = "unknown"
	        # Kept for the currently disabled "stop flag" logic.
	        self.stop_flag = 0

	# Instructions prepended to every student query before it is sent to the model.
	# to_transcript strips them again by length, so the text is kept in one place.
	_TUTOR_PROMPT_PREFIX = """
	You are about to be given a question about one of these homework problems
	from a student. You can assume the student is just beginning to learn
	Python. Your answer should NOT use any complete line of code from the problem solution.
	The student will get in trouble if they use any code you supply.
	Instead, you should either ask a question that helps you determine what
	the student knows, or ask a question that you think the student should be
	able to answer that will send them in the right direction, or offer to
	explain the relevant concepts for the problem. Every thing you say
	should be either a question that reveals what the student knows, or a
	question that is one step on the way to solving the problem, or an
	explanation of a concept or how to do something. If you are asked
	whether a solution is correct, refuse to confirm it, and instead ask what
	is still bothering the student about the code. If you are asked to find
	bugs in the code, you can indicate which line has the bug, but ask the
	student a question that prompts them to solve the bug on their own. Ask
	no more than one question in your reply. Don't be too aggressive in
	hinting - try to get the student to solve as much of the original problem
	as possible.

	Example if problem is "Write a loop that will print 'Hello' three times.":
	Human: I'm having trouble with this problem about for loops.
	AI: Are you familiar with how to use a for loop to iterate over lists?
	Human: Yes, I think I can do that.
	AI: What about using a for loop to do something a set number of times?
	Human: I don't know how to do that.
	AI: You can say for i in range(n) to do something n times. For example, for i in range(5) will iterate 5 times, where i is 0, then 1, then 2, then 3, then 4.
	Human : So can I write
	for i in range(3)
	print('Hello')
	AI: Almost there. Take a closer look at your first line. Do you remember what punctuation you need in a for loop?
	Human: So is it
	for i in range(3):
	print('Hello')
	AI: I can't confirm whether answers are correct. Try testing your code. Once you do, let me know if there's anything you're still worried about.

	Example if the problem is "Write a recursive function that will sum the numbers from m to n":
	Human: My solution is
	def my_sum(m, n)
	return m + my_sum(m+1,n)
	Is that right?
	AI: What happens when you run that code?
	Human: It just sits there
	AI: What do you think is happening when it just sits there?
	Human: I don't know
	AI: Walk through your code line by line for def my_sum(3,3). What happens?

	Human: Oh, I guess it gets called for 4,3
	AI: And then what?
	Human: And then 5,3, and 6,3, and it never stops
	AI: So what did you forget?
	Human: How about this
	def my_sum(m, n)
	if m == n:
	return m
	return m + my_sum(m+1,n)
	AI: I can't confirm whether answers are correct, but consider trying it with some test values and then let me know if you have further concerns.

	And now, here is the student's actual query:
	"""


	def _load_homework_text(hw_num_int):
	    """Decrypt the stored solution notebook for a homework and return it as text."""
	    path = f'YourName_DS110_S26_HW{hw_num_int}_sol.ipynb.encrypt'
	    with open(path, 'rb') as encrypted_file:
	        encrypted = encrypted_file.read()
	    # Fernet.decrypt returns bytes. Decode them so the prompt carries the real
	    # text instead of a "b'...'" repr full of escaped newlines.
	    return Fernet(crypto_key).decrypt(encrypted).decode('utf-8', errors='replace')


	def continue_session(hw_num, user_input, state, request: gr.Request):
	    """Send the student's query to the model and stream the reply to the UI.

	    This is a generator used as a Gradio event handler. While the reply
	    streams it yields the partial reply; its final yield is the full
	    user/AI transcript of the conversation.

	    Args:
	        hw_num: Selected homework number (the radio value), or None when
	            nothing has been selected yet.
	        user_input: The student's question.
	        state: SessionState for this conversation; updated in place.
	        request: Gradio request, used for the logged-in username.

	    Yields:
	        Dicts mapping ``response_box`` to the text to display and
	        ``state_var`` to the updated state.
	    """
	    if hw_num is None:
	        # int(None) would raise; ask for a selection instead of erroring out.
	        yield {response_box: 'Please select a homework number first.', state_var: state}
	        return
	    hw_num_int = int(hw_num)
	    state.username = request.username

	    if not state.messages or hw_num_int != state.hw:
	        # Load before touching state so a failed load cannot leave the old
	        # homework's context paired with the new homework number.
	        homework_text = _load_homework_text(hw_num_int)
	        state.hw = hw_num_int
	        if not state.messages:  # Keep same session for change of HW
	            state.session = random.randint(1, 999999)
	        # Messages reset on change of HW - avoid really long messages
	        state.messages = [
	            {"role": "system", "content": "You are a helpful teaching assistant in a data science course. Your primary goal is to help the students learn."},
	            {"role": "system", "content": "This is the homework the student is talking about -- do not reveal any code from it: " + homework_text},
	        ]

	    prefix_length = len(_TUTOR_PROMPT_PREFIX)
	    store_transcript(hw_num_int, False, user_input, state)
	    state.messages.append({"role": "user", "content": _TUTOR_PROMPT_PREFIX + user_input})

	    last_message = ""
	    try:
	        response = client.chat.completions.create(
	            model=MODEL,
	            messages=state.messages,
	            stream=True,
	        )
	        for chunk in response:
	            # Skip chunks without choices rather than failing on choices[0].
	            if not chunk.choices:
	                continue
	            streamed = chunk.choices[0].delta.content
	            if streamed is not None:
	                last_message += streamed
	                # Only yield the update; the shared response_box component
	                # object is deliberately not mutated.
	                yield {response_box: last_message, state_var: state}
	    except Exception as e:
	        # Show the failure to the student in place of a reply.
	        last_message = str(e)

	    state.messages.append({'role': 'assistant', 'content': last_message})
	    transcript = to_transcript(state.messages, prefix_length)

	    store_transcript(hw_num_int, True, last_message, state)
	    # DEBUG Replace transcript with state.messages for debugging
	    yield {response_box: transcript, state_var: state}

	def to_transcript(messages, prefix_length):
	    """Render a chat history as display text.

	    Args:
	        messages: Sequence of dicts with 'role' and 'content' keys.
	        prefix_length: Number of leading characters to drop from every user
	            message (the instruction text placed in front of the query).

	    Returns:
	        One '\\nUser: ...' or '\\nAI: ...' entry per user or assistant message,
	        concatenated in order. Messages with any other role are omitted.
	    """
	    pieces = []
	    for entry in messages:
	        role = entry['role']
	        if role == 'user':
	            # Need to remove the extra prompt verbiage
	            pieces.append('\nUser: ' + entry['content'][prefix_length:])
	        elif role == 'assistant':
	            pieces.append('\nAI: ' + entry['content'])
	    return ''.join(pieces)

	def my_hash(text):
	    """Return the hexadecimal BLAKE2b digest of ``text`` encoded as UTF-8."""
	    encoded = bytes(text, 'utf-8')
	    return blake2b(encoded).hexdigest()

	def authenticate(username, password):
	    """Check a login attempt against the hashed-password table.

	    The table is loaded from the "klgold/tutor_profiles" dataset
	    (file "S26hashed.csv"), indexed by its 'Username' column. The supplied
	    password is hashed with my_hash and compared with the 'hash' column.

	    Args:
	        username: Login name entered by the user.
	        password: Password entered by the user.

	    Returns:
	        True when the username exists and the password hash matches,
	        otherwise False.
	    """
	    import hmac  # local import: only this function needs it

	    profile_dataset = load_dataset(
	        "klgold/tutor_profiles",
	        data_files={"train": "S26hashed.csv"},
	        token=HF_TOKEN,
	    )
	    profile_df = pd.DataFrame(profile_dataset['train']).set_index('Username')
	    if username not in profile_df.index:
	        return False

	    stored_hash = str(profile_df.loc[username, 'hash'])
	    supplied_hash = str(my_hash(password))
	    # Constant-time comparison so response timing does not reveal how much of
	    # the hash matched. Comparing bytes avoids compare_digest's ASCII-only
	    # restriction on str arguments.
	    return hmac.compare_digest(
	        stored_hash.encode('utf-8'), supplied_hash.encode('utf-8')
	    )

	# Build the page; components are created in the order written here.
	with gr.Blocks() as demo:
	# hw_num_var, transcript_var and transcript_fn_var are not wired to any event
	# in the visible code.
	hw_num_var = gr.State(0)
	# State component holding the SessionState that continue_session receives and returns.
	state_var = gr.State(SessionState())
	transcript_var = gr.State('')
	transcript_fn_var = gr.State('')
	gr.Markdown('Which homework is this about?')
	# Homework selector; continue_session converts the chosen string with int().
	hw_num_radio = gr.Radio(['1','2','3','4','5','6','7','8','9'], label="HWnum")
	gr.Markdown('Sample questions: <i>How should I approach 2c?</i>, <i>When should I use a tuple instead of a list?</i>, <i>Do I have to use datetime objects to do 3b?</i>')
	user_input = gr.Textbox(label='Input')
	response_button = gr.Button('Go')
	# Shows the streamed reply, then the full transcript once the reply is complete.
	response_box = gr.Textbox(lines=20, label='Response')
	# Clicking "Go" runs continue_session with the selected homework, the typed
	# question and the session state; its yields update response_box and state_var.
	response_button.click(fn=continue_session, inputs=[hw_num_radio, user_input, state_var], outputs=[response_box, state_var])

	# Start the app with authenticate as the login check.
	demo.launch(auth=authenticate, server_name="0.0.0.0", show_error=True)