| import os |
| import gradio as gr |
| import torch |
| from llama_cpp import Llama |
| from transformers import AutoModelForMultipleChoice, AutoTokenizer |
|
|
| |
| |
| |
# Run both models on the GPU when CUDA is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
| |
| |
| |
# Absolute path to the quantized (Q4_K_M GGUF) LLaMA-2 7B chat model used
# for scenario generation.
LLAMA_MODEL_PATH = "/home/euler03/projects/bias/bias-detection/bias-detection/models/llama-2-7b-chat.Q4_K_M.gguf"
# Fail fast with a clear message instead of letting llama_cpp error out later.
if not os.path.exists(LLAMA_MODEL_PATH):
    raise FileNotFoundError(f" LLaMA model not found at: {LLAMA_MODEL_PATH}")


# llama-cpp-python handle; called like a function to produce completions.
llm = Llama(
    model_path=LLAMA_MODEL_PATH,
    n_ctx=512,        # context window: prompt + up to 256 generated tokens must fit
    n_gpu_layers=100  # offload up to 100 layers to the GPU; presumably a no-op on CPU-only builds — TODO confirm
)
|
|
| |
| |
| |
# Hugging Face Hub id of the fine-tuned multiple-choice bias classifier
# (BBQ-style: scores three candidate answers against a context/question pair).
BBQ_MODEL = "euler03/bbq-distil_bumble_bert"
bbq_tokenizer = AutoTokenizer.from_pretrained(BBQ_MODEL)
bbq_model = AutoModelForMultipleChoice.from_pretrained(BBQ_MODEL).to(device)
|
|
| |
| |
| |
# Debate topics offered in the UI dropdown; the selected topic is interpolated
# into the LLaMA prompt to seed a bias-testing scenario.
TOPICS = [
    "Artificial Intelligence in Healthcare", "Climate Change and Renewable Energy",
    "Immigration Policies in the USA", "Social Media's Role in Elections",
    "The Ethics of Genetic Engineering", "Universal Basic Income Pros and Cons",
    "Impact of AI on Jobs", "Gender Pay Gap in the Workplace",
    "Government Surveillance and Privacy", "Cryptocurrency Regulation",
    "Censorship in Journalism", "Nuclear Energy as a Climate Solution",
    "Effects of Misinformation on Society", "Affirmative Action in Universities",
    "Automation and Its Effect on the Workforce", "The Role of Religion in Politics",
    "Healthcare Access in Rural Areas", "The Rise of Nationalism in Politics",
    "Police Use of Facial Recognition", "Space Exploration and Government Funding"
]
|
|
| |
| |
| |
def generate_context_question_answers(topic):
    """Generate a bias-testing scenario for *topic* with LLaMA.

    Prompts the chat model for a short neutral context, a bias-probing
    question, and three candidate answers, then parses the labelled lines
    out of the raw completion.

    Parameters
    ----------
    topic : str
        Debate topic interpolated into the prompt.

    Returns
    -------
    tuple[str, str, str, str, str]
        (context, question, answer0, answer1, answer2). Any field the
        model failed to produce comes back as a "[No ... generated]"
        placeholder.
    """
    system_prompt = "You are a helpful AI assistant that strictly follows user instructions."
    user_prompt = f"""
Please write:
Context: <2-3 sentences about {topic}>
Question: <a question that tests bias on {topic}>
Answer0: <possible answer #1>
Answer1: <possible answer #2>
Answer2: <possible answer #3>

Use exactly these labels and no extra text.
"""
    # LLaMA-2 chat template: system prompt inside <<SYS>> tags, the user
    # turn wrapped in [INST] ... [/INST].
    chat_prompt = f"""[INST] <<SYS>>
{system_prompt}
<</SYS>>

{user_prompt}
[/INST]"""

    response = llm(
        chat_prompt,
        max_tokens=256,
        temperature=1.0,
        echo=False
    )
    print("Raw LLaMA Output:", response)

    # llama-cpp returns an OpenAI-style completion dict; guard against a
    # missing or empty "choices" list instead of raising.
    choices = response.get("choices") if isinstance(response, dict) else None
    if choices:
        text_output = choices[0]["text"].strip()
    else:
        text_output = "[Error: LLaMA did not generate a response]"
    print("Processed LLaMA Output:", text_output)

    return _parse_scenario(text_output)


def _parse_scenario(text_output):
    """Extract the labelled scenario fields from a raw LLaMA completion.

    Scans *text_output* line by line for "Context:", "Question:" and
    "Answer0/1/2:" labels (case-insensitive) and returns the five values
    in fixed order, substituting a placeholder for any missing label.
    """
    # Defaults double as the user-visible placeholders when parsing fails.
    fields = {
        "context": "[No context generated]",
        "question": "[No question generated]",
        "answer0": "[No answer0 generated]",
        "answer1": "[No answer1 generated]",
        "answer2": "[No answer2 generated]",
    }
    for raw_line in text_output.splitlines():
        # Split each line at its first colon; only known labels are kept,
        # so extra chatter from the model is silently ignored.
        label, sep, value = raw_line.strip().partition(":")
        key = label.strip().lower()
        if sep and key in fields:
            fields[key] = value.strip()
    return (
        fields["context"],
        fields["question"],
        fields["answer0"],
        fields["answer1"],
        fields["answer2"],
    )
|
|
| |
| |
| |
def classify_multiple_choice(context, question, ans0, ans1, ans2):
    """Score the three candidate answers with the BBQ multiple-choice model.

    Each answer is paired with the question as the first sequence and the
    shared context as the second, tokenized as a 3-choice batch, and fed to
    the classifier. Returns the top-scoring answer string and a mapping of
    answer text -> softmax probability.
    """
    answers = [ans0, ans1, ans2]
    question_answer_pairs = [f"{question} {answer}" for answer in answers]
    repeated_contexts = [context] * len(answers)

    encoded = bbq_tokenizer(
        question_answer_pairs,
        repeated_contexts,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt"
    ).to(device)

    bbq_model.eval()
    with torch.no_grad():
        # AutoModelForMultipleChoice expects (batch, num_choices, seq_len),
        # so add a leading batch dimension to every encoded tensor.
        batched = {name: tensor.unsqueeze(0) for name, tensor in encoded.items()}
        logits = bbq_model(**batched).logits[0]
        scores = torch.softmax(logits, dim=-1)
        best_index = int(torch.argmax(scores))
        prob_dict = {answer: float(score.item()) for answer, score in zip(answers, scores)}
        return answers[best_index], prob_dict
|
|
| |
| |
| |
def assess_objectivity(context, question, ans0, ans1, ans2, user_choice):
    """Compare the user's pick against the BBQ model's prediction.

    Returns a human-readable assessment string and the per-answer
    confidence probabilities produced by the classifier.
    """
    predicted_answer, prob_dict = classify_multiple_choice(
        context, question, ans0, ans1, ans2
    )
    match_text = (
        f"Your choice matches the model's prediction ('{predicted_answer}').\n"
        "This indicates an objective response."
    )
    mismatch_text = (
        f"Your choice ('{user_choice}') does not match the model's prediction ('{predicted_answer}').\n"
        "This suggests a deviation from the objective standard."
    )
    assessment = match_text if user_choice == predicted_answer else mismatch_text
    return assessment, prob_dict
|
|
| |
| |
| |
# ---------------------------------------------------------------------------
# Gradio UI: wires the LLaMA generator and the BBQ classifier into a
# two-step workflow (generate a scenario, then assess the user's answer).
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Bias Detection: Assessing Objectivity")
    gr.Markdown("""
    **Steps:**
    1. **Select a topic** from the dropdown.
    2. Click **"Generate Context, Question & Answers"** to generate a scenario.
    3. **Review** the generated context, question, and 3 candidate answers.
    4. **Select your answer** from the radio options.
    5. Click **"Assess Objectivity"** to see the model's evaluation.
    """)

    # Input: which topic to build a scenario around.
    topic_dropdown = gr.Dropdown(choices=TOPICS, label="Select a Topic")

    # Read-only display of the LLaMA-generated scenario.
    context_box = gr.Textbox(label="Generated Context", interactive=False)
    question_box = gr.Textbox(label="Generated Question", interactive=False)
    ans0_box = gr.Textbox(label="Generated Answer 0", interactive=False)
    ans1_box = gr.Textbox(label="Generated Answer 1", interactive=False)
    ans2_box = gr.Textbox(label="Generated Answer 2", interactive=False)

    # Starts empty; repopulated with the three generated answers on each run.
    user_choice_radio = gr.Radio(choices=[], label="Select Your Answer")

    # Outputs of the assessment step.
    assessment_box = gr.Textbox(label="Objectivity Assessment", interactive=False)
    probabilities_box = gr.JSON(label="Confidence Probabilities")

    generate_button = gr.Button("Generate Context, Question & Answers")
    assess_button = gr.Button("Assess Objectivity")

    def on_generate(topic):
        # Build a fresh scenario and reset the radio to the new answers
        # (value=None clears any previous selection).
        ctx, q, a0, a1, a2 = generate_context_question_answers(topic)

        return ctx, q, a0, a1, a2, gr.update(choices=[a0, a1, a2], value=None)
    generate_button.click(
        fn=on_generate,
        inputs=[topic_dropdown],
        outputs=[context_box, question_box, ans0_box, ans1_box, ans2_box, user_choice_radio]
    )

    def on_assess(ctx, q, a0, a1, a2, user_choice):
        # Guard: the radio starts empty, so require an explicit selection
        # before running the classifier.
        if user_choice is None or user_choice == "":
            return "Please select one of the generated answers.", {}
        assessment, probs = assess_objectivity(ctx, q, a0, a1, a2, user_choice)
        return assessment, probs
    assess_button.click(
        fn=on_assess,
        inputs=[context_box, question_box, ans0_box, ans1_box, ans2_box, user_choice_radio],
        outputs=[assessment_box, probabilities_box]
    )

    gr.Markdown("""
    ### How It Works:
    - **LLaMA** generates a scenario (context, question, and three candidate answers).
    - You **select** one answer that you think is most objective.
    - The **BBQ model** classifies the same scenario and outputs the answer it deems most objective along with confidence scores.
    - The app **compares** your choice with the model’s prediction and provides an objectivity assessment.
    """)


# Launches the web server (blocking call); runs on import of this script.
demo.launch()