Spaces:

yashm
/

Bioinformatics

Sleeping

App Files Files Community

Bioinformatics / app.py

yashm

Update app.py

5be6f35 verified about 1 month ago

raw

history blame contribute delete

19.5 kB

	import os
	import random
	import tempfile
	from functools import lru_cache
	from typing import Dict, List, Tuple

	import gradio as gr
	import pandas as pd
	from datasets import load_dataset

	# Dataset repository id handed to `load_dataset` when the data is loaded.
	DATASET_REPO = "yashm/bioinformatics-qa-dataset"
	# Seed value for the standard-library `random` module (applied just below).
	RANDOM_SEED = 42

	# Seed the global `random` generator once at import time so that every
	# later `random.*` call in this module draws from a reproducible sequence.
	random.seed(RANDOM_SEED)


	@lru_cache(maxsize=1)
	def load_data() -> pd.DataFrame:
	    """Load every split of the QA dataset into one cleaned DataFrame.

	    The result is cached, so the dataset is fetched and cleaned only once per
	    process. Every caller receives the same object and should treat it as
	    read-only.

	    Returns:
	        A frame with columns ``id``, ``topic``, ``question``, ``answer``,
	        ``split`` (name of the originating split) and ``answer_len``
	        (character count of ``answer``), indexed 0..n-1.

	    Raises:
	        ValueError: If any of ``id``, ``topic``, ``question`` or ``answer``
	            is missing from the loaded data.
	    """
	    ds = load_dataset(DATASET_REPO)

	    # Tag each split's rows with the split name before stacking them.
	    frames = [
	        ds[split_name].to_pandas().assign(split=split_name)
	        for split_name in ds.keys()
	    ]
	    df = pd.concat(frames, ignore_index=True)

	    required = ["id", "topic", "question", "answer"]
	    missing = [c for c in required if c not in df.columns]
	    if missing:
	        raise ValueError(f"Dataset is missing required columns: {missing}")

	    df = df[required + ["split"]].copy()

	    # Drop missing values BEFORE casting to str: astype(str) turns NaN/None
	    # into the strings "nan"/"None", which dropna() no longer recognises.
	    df = df.dropna(subset=["topic", "question", "answer"]).copy()
	    for col in ["topic", "question", "answer", "split"]:
	        df[col] = df[col].astype(str).str.strip()

	    # Rows whose question or answer is blank after stripping are useless.
	    df = df[(df["question"] != "") & (df["answer"] != "")].copy()
	    df["answer_len"] = df["answer"].str.len()

	    return df.reset_index(drop=True)


	# Load the cleaned dataset once at import time; the functions below read it
	# through this module-level name.
	DF = load_data()
	# Sorted distinct values, used as the choices of the topic/split dropdowns.
	ALL_TOPICS = sorted(set(DF["topic"]))
	ALL_SPLITS = sorted(set(DF["split"]))


	def compute_stats(df: pd.DataFrame) -> str:
	    """Summarise a QA frame as a single pipe-separated line.

	    Args:
	        df: Frame with at least the ``topic`` and ``answer_len`` columns.

	    Returns:
	        Text reporting the row count, the number of distinct topics and the
	        mean ``answer_len`` (one decimal). All three are reported as zero
	        for an empty frame.
	    """
	    total_rows = len(df)
	    if total_rows:
	        total_topics = df["topic"].nunique()
	        avg_answer_len = float(df["answer_len"].mean())
	    else:
	        # mean() of an empty column is NaN; report zeros instead.
	        total_topics = 0
	        avg_answer_len = 0.0

	    # Plain "|" separators: "\|" is an invalid escape sequence in Python and
	    # leaves a literal backslash in the rendered text.
	    return (
	        f"Total rows: {total_rows} | "
	        f"Unique topics: {total_topics} | "
	        f"Average answer length: {avg_answer_len:.1f} chars"
	    )


	def apply_filters(
	    topic: str,
	    split: str,
	    keyword: str,
	    min_len: int,
	    max_len: int,
	    sort_by: str,
	    sort_dir: str
	) -> pd.DataFrame:
	    """Return the rows of ``DF`` that match the Explore-tab filters, sorted.

	    Args:
	        topic: Exact topic to keep, or ``"All"`` for no topic filter.
	        split: Exact split to keep, or ``"All"`` for no split filter.
	        keyword: Case-insensitive literal substring searched in topic,
	            question and answer; blank or ``None`` disables the search.
	        min_len: Inclusive lower bound on ``answer_len``.
	        max_len: Inclusive upper bound on ``answer_len``.
	        sort_by: One of ``"ID"``, ``"Topic"``, ``"Question length"``,
	            ``"Answer length"``, ``"Split"``; anything else sorts by id.
	        sort_dir: ``"Ascending"`` for ascending order, otherwise descending.

	    Returns:
	        A new frame (``DF`` itself is never modified) with a fresh 0..n-1
	        index.
	    """
	    # Boolean indexing below always yields new frames, so no upfront copy
	    # of the whole dataset is needed.
	    out = DF

	    if topic != "All":
	        out = out[out["topic"] == topic]

	    if split != "All":
	        out = out[out["split"] == split]

	    needle = keyword.strip().lower() if keyword else ""
	    if needle:
	        # regex=False: the keyword is user input and must be matched
	        # literally; as a pattern, text such as "(" would raise.
	        hits = pd.Series(False, index=out.index)
	        for column in ("topic", "question", "answer"):
	            hits = hits | out[column].str.lower().str.contains(
	                needle, regex=False, na=False
	            )
	        out = out[hits]

	    out = out[(out["answer_len"] >= int(min_len)) & (out["answer_len"] <= int(max_len))]

	    col_map = {
	        "ID": "id",
	        "Topic": "topic",
	        "Question length": "question",
	        "Answer length": "answer_len",
	        "Split": "split",
	    }
	    sort_col = col_map.get(sort_by, "id")
	    # "Question length" must order by character count, not alphabetically
	    # by the question text, so a length key is applied for that option only.
	    sort_key = (lambda s: s.str.len()) if sort_by == "Question length" else None
	    out = out.sort_values(
	        by=sort_col,
	        ascending=(sort_dir == "Ascending"),
	        kind="stable",
	        key=sort_key,
	    )

	    return out.reset_index(drop=True)


	def run_explore(
	    topic: str,
	    split: str,
	    keyword: str,
	    min_len: int,
	    max_len: int,
	    sort_by: str,
	    sort_dir: str,
	    page_size: int,
	    page_number: int
	):
	    """Filter the dataset and build one page of results for the Explore tab.

	    Args:
	        topic, split, keyword, min_len, max_len, sort_by, sort_dir: Passed
	            unchanged to ``apply_filters``.
	        page_size: Rows per page; values below 1 are treated as 1.
	        page_number: 1-based page to show; clamped to the valid range.

	    Returns:
	        A 5-tuple: summary text, the page as a DataFrame, the same page as a
	        JSON ``records`` string, an update for the page slider (new maximum
	        and clamped value) and an update for the row-inspection slider.
	    """
	    filtered = apply_filters(topic, split, keyword, min_len, max_len, sort_by, sort_dir)
	    total = len(filtered)

	    # Coerce defensively: ``iloc`` slicing needs integers, and a page size
	    # of zero would divide by zero below.
	    page_size = max(1, int(page_size))
	    pages = max(1, (total + page_size - 1) // page_size)
	    page_number = min(max(1, int(page_number)), pages)

	    start = (page_number - 1) * page_size
	    end = min(start + page_size, total)

	    page_df = filtered.iloc[start:end].copy()
	    table_df = page_df[["id", "topic", "question", "answer", "split", "answer_len"]]

	    # With no matches, report "0 to 0" rather than "1 to 0".
	    first_shown = start + 1 if total else 0
	    summary = (
	        f"{compute_stats(filtered)}\n"
	        f"Showing rows {first_shown} to {end} of {total} | "
	        f"Page {page_number} of {pages}"
	    )

	    # The inspection slider needs a maximum of at least 1 even when the
	    # page is empty.
	    max_row_slider = max(1, len(page_df))
	    return (
	        summary,
	        table_df,
	        page_df.to_json(orient="records"),
	        gr.update(maximum=pages, value=page_number),
	        gr.update(maximum=max_row_slider, value=1),
	    )


	def show_row_detail(page_df_json: str, row_idx_1based: int):
	    """Return the display fields for one row of the current page.

	    Args:
	        page_df_json: The page serialised as a JSON ``records`` string, or
	            an empty value when no page has been loaded.
	        row_idx_1based: 1-based row position; clamped into the page.

	    Returns:
	        A 5-tuple ``(header, topic, question, answer, metadata)``. When
	        there is nothing to show, the header carries the explanation and the
	        other four fields are empty strings.
	    """
	    from io import StringIO

	    if not page_df_json:
	        return "No data loaded for this page.", "", "", "", ""

	    # Wrap the text in StringIO (passing literal JSON to read_json is
	    # deprecated) and switch off dtype/date inference so text that merely
	    # looks numeric, e.g. "1e3", is shown exactly as stored.
	    page_df = pd.read_json(
	        StringIO(page_df_json),
	        orient="records",
	        dtype=False,
	        convert_dates=False,
	    )
	    if page_df.empty:
	        return "No rows in this page.", "", "", "", ""

	    idx = int(row_idx_1based) - 1
	    idx = max(0, min(idx, len(page_df) - 1))
	    row = page_df.iloc[idx]

	    header = f"Record {idx + 1} on current page"
	    return (
	        header,
	        str(row["topic"]),
	        str(row["question"]),
	        str(row["answer"]),
	        f"Split: {row['split']} | ID: {row['id']} | Answer length: {row['answer_len']}",
	    )


	def export_filtered_csv(
	    topic: str,
	    split: str,
	    keyword: str,
	    min_len: int,
	    max_len: int,
	    sort_by: str,
	    sort_dir: str
	):
	    """Write the currently filtered rows to a temporary CSV file.

	    Args:
	        topic, split, keyword, min_len, max_len, sort_by, sort_dir: Passed
	            unchanged to ``apply_filters``.

	    Returns:
	        Path of the CSV file, holding the ``id``, ``topic``, ``question``,
	        ``answer`` and ``split`` columns without the index.
	    """
	    filtered = apply_filters(topic, split, keyword, min_len, max_len, sort_by, sort_dir)
	    export_df = filtered[["id", "topic", "question", "answer", "split"]]

	    # Write through the open handle instead of reopening the file by name:
	    # a second open of a still-open NamedTemporaryFile is not portable.
	    # delete=False keeps the file on disk after this function returns, so
	    # the returned path stays valid for the caller.
	    with tempfile.NamedTemporaryFile(
	        mode="w",
	        suffix=".csv",
	        delete=False,
	        newline="",
	        encoding="utf-8",
	    ) as tmp:
	        export_df.to_csv(tmp, index=False)
	    return tmp.name


	def related_examples(question_text: str, topic: str, k: int = 3) -> str:
	    """List up to ``k`` other questions from the same topic as bullet lines.

	    Candidates are ranked by how many lower-cased, whitespace-separated
	    words they share with ``question_text`` (ties broken by ascending id);
	    questions whose text equals ``question_text`` are left out.

	    Returns:
	        Newline-joined ``- question`` lines, or a fixed fallback sentence
	        when the topic has no rows, the question has no words, or nothing
	        remains after the exclusion.
	    """
	    nothing_found = "No related examples found."

	    same_topic = DF[DF["topic"] == topic].copy()
	    if same_topic.empty:
	        return nothing_found

	    query_words = set(str(question_text).lower().split())
	    if not query_words:
	        return nothing_found

	    # Score = number of distinct words shared with the query question.
	    same_topic["score"] = [
	        len(query_words & set(str(candidate).lower().split()))
	        for candidate in same_topic["question"]
	    ]
	    ranked = same_topic.sort_values(by=["score", "id"], ascending=[False, True])
	    picks = ranked.loc[ranked["question"] != question_text, "question"].head(k)

	    if picks.empty:
	        return nothing_found
	    return "\n".join(f"- {text}" for text in picks)


	def score_text(correct: int, total: int, streak: int, best_streak: int) -> str:
	    """Format the quiz scoreboard as a single pipe-separated line.

	    Args:
	        correct: Number of correctly answered questions.
	        total: Number of answered questions.
	        streak: Current run of consecutive correct answers.
	        best_streak: Longest run reached so far.

	    Returns:
	        Text with the score, the accuracy as a percentage (one decimal;
	        0.0 when nothing has been answered yet) and both streak values.
	    """
	    # Guard the division so a fresh session reports 0.0% instead of failing.
	    acc = (100.0 * correct / total) if total > 0 else 0.0
	    # Plain "|" separators: "\|" is an invalid escape sequence in Python and
	    # leaves a literal backslash in the rendered text.
	    return (
	        f"Score: {correct}/{total} | "
	        f"Accuracy: {acc:.1f}% | "
	        f"Streak: {streak} | "
	        f"Best streak: {best_streak}"
	    )


	def generate_question(topic_filter: str, difficulty: str):
	    """Build one four-option multiple-choice question from the dataset.

	    Distractors are answers of other rows, chosen by difficulty:

	    * ``"Easy"``: any answer in the dataset that differs from the correct one.
	    * ``"Medium"``: answers from the same topic when it offers at least
	      three distinct ones, otherwise the whole dataset.
	    * anything else ("Hard"): same-topic answers closest in length to the
	      correct answer, with the same whole-dataset fallback.

	    Args:
	        topic_filter: Topic to draw the question from, or ``"All"``.
	        difficulty: Difficulty label as described above.

	    Returns:
	        A 6-tuple: question text, an update for the choices radio, the
	        correct answer, the raw question, its topic and the teaching text.
	        When no question can be built, the first item is an explanatory
	        message, the radio is emptied and the other items are ``""``.
	    """
	    # Only the closest-length answers are eligible in hard mode. Sampling
	    # from the whole ranked list would discard the ranking entirely.
	    hard_pool_size = 6

	    def unavailable(message: str):
	        return (message, gr.update(choices=[], value=None), "", "", "", "")

	    pool = DF if topic_filter == "All" else DF[DF["topic"] == topic_filter]
	    if pool.empty:
	        return unavailable("No questions available for this filter.")

	    row = pool.sample(1, random_state=random.randint(0, 10_000_000)).iloc[0]
	    topic = str(row["topic"])
	    question = str(row["question"])
	    correct = str(row["answer"])

	    # Any row whose answer differs from the correct one may act as a distractor.
	    global_pool = DF[DF["answer"] != correct]
	    same_topic = global_pool[global_pool["topic"] == topic]
	    enough_same_topic = same_topic["answer"].nunique() >= 3

	    nearest_only = False
	    if difficulty == "Easy" or not enough_same_topic:
	        candidate = global_pool
	    elif difficulty == "Medium":
	        candidate = same_topic
	    else:
	        # Rank same-topic answers by how close their length is to the
	        # correct answer's, so length alone does not give the answer away.
	        len_gap = (same_topic["answer"].str.len() - len(correct)).abs()
	        candidate = same_topic.assign(len_gap=len_gap).sort_values(by=["len_gap", "id"])
	        nearest_only = True

	    # drop_duplicates keeps first occurrences, so the ranking order survives.
	    distractor_answers = candidate["answer"].dropna().astype(str).drop_duplicates().tolist()
	    if nearest_only:
	        distractor_answers = distractor_answers[:hard_pool_size]
	    if len(distractor_answers) < 3:
	        return unavailable("Not enough distractors to generate a 4-option question.")

	    distractors = random.sample(distractor_answers, 3)
	    options = distractors + [correct]
	    random.shuffle(options)

	    question_block = f"Topic: {topic}\n\nQuestion: {question}"

	    teach_note = (
	        f"Teaching note: This question belongs to {topic}. "
	        f"Focus on core definitions and tool usage terms."
	    )
	    related = related_examples(question, topic, k=3)

	    return (
	        question_block,
	        gr.update(choices=options, value=None),
	        correct,
	        question,
	        topic,
	        f"{teach_note}\n\nRelated questions:\n{related}",
	    )


	def start_quiz(
	    topic_filter: str,
	    difficulty: str,
	    correct_count: int,
	    total_count: int,
	    streak: int,
	    best_streak: int
	):
	    """Load the first question and show the current scoreboard.

	    Returns:
	        The six values produced by ``generate_question`` followed by the
	        scoreboard text for the unchanged counters and a status message.
	    """
	    question_outputs = generate_question(topic_filter, difficulty)
	    scoreboard = score_text(correct_count, total_count, streak, best_streak)
	    return (
	        *question_outputs,
	        scoreboard,
	        "Quiz started. Select an answer and submit.",
	    )


	def submit_and_next(
	    selected: str,
	    current_correct: str,
	    current_q: str,
	    current_topic: str,
	    topic_filter: str,
	    difficulty: str,
	    correct_count: int,
	    total_count: int,
	    streak: int,
	    best_streak: int
	):
	    """Grade the submitted answer, update the counters and load a new question.

	    Returns:
	        A 13-tuple: result text, question text, choices update, correct
	        answer, raw question, topic, teaching text, scoreboard text, status
	        message, then the four counters (correct, total, streak, best
	        streak). When there is no active question or nothing is selected,
	        the question and choices are left untouched, the teaching text is
	        blanked and the counters are returned unchanged.
	    """

	    def keep_current(message: str, status: str):
	        # Nothing was graded: hand back the existing state as it is.
	        return (
	            message,
	            gr.update(),
	            gr.update(),
	            current_correct,
	            current_q,
	            current_topic,
	            "",
	            score_text(correct_count, total_count, streak, best_streak),
	            status,
	            correct_count,
	            total_count,
	            streak,
	            best_streak,
	        )

	    if not (current_correct and current_q):
	        return keep_current("Click Start Quiz first.", "No active question.")

	    if not selected:
	        return keep_current(
	            "Please select one option before submitting.",
	            "Waiting for answer selection.",
	        )

	    total_count += 1
	    if selected != current_correct:
	        # A miss ends the running streak; the best streak is kept.
	        streak = 0
	        result = (
	            "Incorrect.\n\n"
	            f"Your answer: {selected}\n\n"
	            f"Correct answer: {current_correct}"
	        )
	    else:
	        correct_count += 1
	        streak += 1
	        best_streak = max(best_streak, streak)
	        result = (
	            "Correct.\n\n"
	            f"Your answer: {selected}\n\n"
	            f"Reference answer: {current_correct}"
	        )

	    upcoming = generate_question(topic_filter, difficulty)

	    return (
	        result,
	        *upcoming,
	        score_text(correct_count, total_count, streak, best_streak),
	        "Auto-loaded next question.",
	        correct_count,
	        total_count,
	        streak,
	        best_streak,
	    )


	def reset_score():
	    """Return zeroed quiz counters with matching scoreboard and status text.

	    Returns:
	        A 6-tuple: correct count, total count, streak and best streak (all
	        zero), the scoreboard text for those values, and a status message.
	    """
	    zeroed = (0, 0, 0, 0)
	    return (
	        *zeroed,
	        score_text(*zeroed),
	        "Score reset. Click Start Quiz.",
	    )


	# Stylesheet passed to `gr.Blocks(css=...)`. It defines a small colour
	# palette as CSS custom properties, a page background gradient, a maximum
	# container width, the `#hero` banner box and a generic `.card` box.
	# Of the declared variables, only `--text` and `--card` are referenced by
	# the rules in this string.
	CSS = """
	:root {
	--brand: #0f766e;
	--accent: #0ea5e9;
	--bg-soft: #f8fafc;
	--card: #ffffff;
	--text: #0f172a;
	--muted: #475569;
	}
	body {
	background: linear-gradient(180deg, #f0fdfa 0%, #f8fafc 35%, #ffffff 100%);
	}
	.gradio-container {
	max-width: 1280px !important;
	}
	#hero {
	background: linear-gradient(135deg, rgba(15,118,110,0.10), rgba(14,165,233,0.10));
	border: 1px solid rgba(15,118,110,0.20);
	border-radius: 16px;
	padding: 14px 16px;
	}
	#hero h1, #hero p {
	color: var(--text);
	}
	.card {
	background: var(--card);
	border-radius: 14px;
	border: 1px solid #e2e8f0;
	padding: 10px 12px;
	}
	"""

	# Longest answer in the dataset, shared by both answer-length sliders.
	# max() of an empty column is NaN, which int() cannot convert, so an empty
	# dataset falls back to 0 here instead of failing at import time.
	longest_answer = DF["answer_len"].max()
	longest_answer = 0 if pd.isna(longest_answer) else int(longest_answer)
	# Keep at least 20 steps on the sliders even for very short answers.
	length_slider_max = max(longest_answer, 20)

	# NOTE(review): the nesting below (which widgets share a row) was restored
	# from the blank-line grouping of the flattened source; confirm the layout.
	with gr.Blocks(
	    title="Bioinformatics QA Teaching Studio",
	    css=CSS,
	    theme=gr.themes.Soft(
	        primary_hue="teal",
	        secondary_hue="sky",
	        neutral_hue="slate"
	    ),
	) as demo:
	    gr.HTML(
	        """
	        <div id="hero">
	        <h1>Bioinformatics QA Teaching Studio</h1>
	        <p>
	        Explore the dataset, learn core concepts, and practice with teaching-mode multiple-choice quizzes.
	        This app is for learning and research purposes only. Validate content before high-stakes use.
	        </p>
	        </div>
	        """
	    )

	    with gr.Tabs():
	        # ---- Explore tab: filter, page through and export the dataset ----
	        with gr.Tab("Explore"):
	            with gr.Row():
	                topic_dd = gr.Dropdown(
	                    choices=["All"] + ALL_TOPICS,
	                    value="All",
	                    label="Topic"
	                )
	                split_dd = gr.Dropdown(
	                    choices=["All"] + ALL_SPLITS,
	                    value="All",
	                    label="Split"
	                )
	                keyword_tb = gr.Textbox(
	                    label="Keyword search",
	                    placeholder="Search topic, question, or answer"
	                )

	            with gr.Row():
	                min_len = gr.Slider(0, length_slider_max, value=0, step=1, label="Min answer length")
	                max_len = gr.Slider(0, length_slider_max, value=longest_answer, step=1, label="Max answer length")
	                sort_by = gr.Dropdown(
	                    choices=["ID", "Topic", "Question length", "Answer length", "Split"],
	                    value="ID",
	                    label="Sort by"
	                )
	                sort_dir = gr.Radio(
	                    choices=["Ascending", "Descending"],
	                    value="Ascending",
	                    label="Order"
	                )

	            with gr.Row():
	                page_size = gr.Slider(5, 100, value=15, step=5, label="Rows per page")
	                # The maximum is raised by run_explore once the page count is known.
	                page_number = gr.Slider(1, 1, value=1, step=1, label="Page")
	                run_btn = gr.Button("Apply filters", variant="primary")
	                export_btn = gr.Button("Export filtered CSV")

	            summary_md = gr.Markdown(value=compute_stats(DF))
	            table = gr.Dataframe(
	                headers=["id", "topic", "question", "answer", "split", "answer_len"],
	                wrap=True,
	                interactive=False,
	                label="Filtered results"
	            )

	            # Holds the current page as JSON so row inspection does not re-filter.
	            filtered_state = gr.State("")
	            row_slider = gr.Slider(1, 1, value=1, step=1, label="Inspect row on current page")
	            inspect_btn = gr.Button("Show row details")

	            detail_header = gr.Markdown(value="Select filters and click Apply.")
	            detail_topic = gr.Textbox(label="Topic", interactive=False)
	            detail_question = gr.Textbox(label="Question", lines=4, interactive=False)
	            detail_answer = gr.Textbox(label="Answer", lines=7, interactive=False)
	            detail_meta = gr.Textbox(label="Metadata", interactive=False)
	            csv_file = gr.File(label="Download CSV", interactive=False)

	            run_btn.click(
	                fn=run_explore,
	                inputs=[topic_dd, split_dd, keyword_tb, min_len, max_len, sort_by, sort_dir, page_size, page_number],
	                outputs=[summary_md, table, filtered_state, page_number, row_slider],
	            )

	            inspect_btn.click(
	                fn=show_row_detail,
	                inputs=[filtered_state, row_slider],
	                outputs=[detail_header, detail_topic, detail_question, detail_answer, detail_meta],
	            )

	            export_btn.click(
	                fn=export_filtered_csv,
	                inputs=[topic_dd, split_dd, keyword_tb, min_len, max_len, sort_by, sort_dir],
	                outputs=[csv_file],
	            )

	        # ---- Quiz tab: multiple-choice practice with score tracking ----
	        with gr.Tab("Quiz"):
	            with gr.Row():
	                quiz_topic = gr.Dropdown(
	                    choices=["All"] + ALL_TOPICS,
	                    value="All",
	                    label="Topic filter"
	                )
	                difficulty = gr.Radio(
	                    choices=["Easy", "Medium", "Hard"],
	                    value="Medium",
	                    label="Difficulty"
	                )
	                start_btn = gr.Button("Start quiz", variant="primary")
	                reset_btn = gr.Button("Reset score")

	            quiz_score = gr.Markdown(value=score_text(0, 0, 0, 0))
	            quiz_status = gr.Textbox(label="Status", interactive=False)

	            question_box = gr.Textbox(label="Question", lines=5, interactive=False)
	            choices_radio = gr.Radio(choices=[], label="Choose one answer")
	            submit_btn = gr.Button("Submit and load next question", variant="primary")

	            result_box = gr.Textbox(label="Result", lines=6, interactive=False)
	            teaching_box = gr.Textbox(label="Teaching support", lines=8, interactive=False)

	            # State for the active question.
	            correct_state = gr.State("")
	            q_state = gr.State("")
	            topic_state = gr.State("")

	            # State for the running score.
	            correct_count_state = gr.State(0)
	            total_count_state = gr.State(0)
	            streak_state = gr.State(0)
	            best_streak_state = gr.State(0)

	            start_btn.click(
	                fn=start_quiz,
	                inputs=[
	                    quiz_topic,
	                    difficulty,
	                    correct_count_state,
	                    total_count_state,
	                    streak_state,
	                    best_streak_state,
	                ],
	                outputs=[
	                    question_box,
	                    choices_radio,
	                    correct_state,
	                    q_state,
	                    topic_state,
	                    teaching_box,
	                    quiz_score,
	                    quiz_status,
	                ],
	            )

	            submit_btn.click(
	                fn=submit_and_next,
	                inputs=[
	                    choices_radio,
	                    correct_state,
	                    q_state,
	                    topic_state,
	                    quiz_topic,
	                    difficulty,
	                    correct_count_state,
	                    total_count_state,
	                    streak_state,
	                    best_streak_state,
	                ],
	                outputs=[
	                    result_box,
	                    question_box,
	                    choices_radio,
	                    correct_state,
	                    q_state,
	                    topic_state,
	                    teaching_box,
	                    quiz_score,
	                    quiz_status,
	                    correct_count_state,
	                    total_count_state,
	                    streak_state,
	                    best_streak_state,
	                ],
	            )

	            reset_btn.click(
	                fn=reset_score,
	                inputs=[],
	                outputs=[
	                    correct_count_state,
	                    total_count_state,
	                    streak_state,
	                    best_streak_state,
	                    quiz_score,
	                    quiz_status,
	                ],
	            )

	        # ---- About tab: static description and usage notice ----
	        with gr.Tab("About"):
	            gr.Markdown(
	                """
	                ## About this teaching app

	                This Space demonstrates:
	                - Practical dataset exploration for bioinformatics QA data
	                - Teaching-mode multiple-choice practice with topic-aware distractors
	                - Session score tracking with streak metrics

	                ## Important notice

	                This app is intended for learning and research use only.
	                Use with caution.
	                Do not use as a replacement for expert biomedical or clinical judgment.

	                ## Dataset

	                - Source: yashm/bioinformatics-qa-dataset
	                - Citation and DOI are listed in the project README
	                """
	            )

	# Launch the `demo` Blocks app defined above; this call runs at module
	# level, i.e. whenever this file is executed or imported.
	demo.launch()