# Bioinformatics / app.py
# yashm's picture
# Update app.py
# 5be6f35 verified
import os
import random
import tempfile
from functools import lru_cache
from typing import Dict, List, Tuple
import gradio as gr
import pandas as pd
from datasets import load_dataset
# Repository id handed to `load_dataset` when the app loads its data.
DATASET_REPO = "yashm/bioinformatics-qa-dataset"
# Seed applied to the global `random` module at import time, so the sequence of
# random draws made through `random` is reproducible from one process start to
# the next.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
@lru_cache(maxsize=1)
def load_data() -> pd.DataFrame:
    """Load every split of the QA dataset into one cleaned DataFrame.

    Each split returned by ``load_dataset(DATASET_REPO)`` is converted to
    pandas, tagged with a ``split`` column and concatenated. The result is
    cached, so the dataset is only fetched once per process.

    Returns:
        A DataFrame with the columns ``id``, ``topic``, ``question``,
        ``answer``, ``split`` and ``answer_len`` (character count of the
        stripped answer), indexed 0..n-1.

    Raises:
        ValueError: If any of ``id``, ``topic``, ``question`` or ``answer``
            is missing from the combined data.
    """
    ds = load_dataset(DATASET_REPO)

    frames = []
    for split_name in ds.keys():
        part = ds[split_name].to_pandas().copy()
        part["split"] = split_name
        frames.append(part)
    df = pd.concat(frames, ignore_index=True)

    required = ["id", "topic", "question", "answer"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Dataset is missing required columns: {missing}")

    df = df[["id", "topic", "question", "answer", "split"]].copy()

    # Drop missing values BEFORE casting to str: astype(str) turns NaN/None
    # into the literal text "nan"/"None", which dropna() can no longer detect.
    df = df.dropna(subset=["topic", "question", "answer"])
    for col in ["topic", "question", "answer", "split"]:
        df[col] = df[col].astype(str).str.strip()

    # Copy after filtering so the column assignment below targets an
    # independent frame rather than a slice of the previous one.
    df = df[(df["question"] != "") & (df["answer"] != "")].copy()
    df["answer_len"] = df["answer"].str.len()
    return df.reset_index(drop=True)
# Module-level snapshot of the cleaned dataset; loaded once at import time and
# read by the filter, quiz and UI code below.
DF = load_data()
# Sorted distinct values used to populate the topic and split dropdowns.
ALL_TOPICS = sorted(DF["topic"].unique().tolist())
ALL_SPLITS = sorted(DF["split"].unique().tolist())
def compute_stats(df: pd.DataFrame) -> str:
    """Summarise a QA frame as a single pipe-separated line.

    Args:
        df: Frame providing ``topic`` and ``answer_len`` columns.

    Returns:
        Row count, number of distinct topics and mean answer length
        (one decimal place). All figures are zero for an empty frame.
    """
    row_count = len(df)
    if row_count:
        topic_count = df["topic"].nunique()
        mean_len = float(df["answer_len"].mean())
    else:
        topic_count = 0
        mean_len = 0.0

    pieces = [
        f"Total rows: {row_count}",
        f"Unique topics: {topic_count}",
        f"Average answer length: {mean_len:.1f} chars",
    ]
    return " | ".join(pieces)
def apply_filters(
    topic: str,
    split: str,
    keyword: str,
    min_len: int,
    max_len: int,
    sort_by: str,
    sort_dir: str
) -> pd.DataFrame:
    """Filter and sort the module-level ``DF`` according to the Explore controls.

    Args:
        topic: Exact topic to keep, or ``"All"`` for no topic filter.
        split: Exact split to keep, or ``"All"`` for no split filter.
        keyword: Case-insensitive literal substring searched in topic,
            question and answer; blank or empty disables the search.
        min_len: Inclusive lower bound on ``answer_len``.
        max_len: Inclusive upper bound on ``answer_len``.
        sort_by: One of the UI labels ``"ID"``, ``"Topic"``,
            ``"Question length"``, ``"Answer length"`` or ``"Split"``;
            anything else falls back to sorting by ``id``.
        sort_dir: ``"Ascending"`` for ascending order, anything else for
            descending.

    Returns:
        A new DataFrame (``DF`` itself is never modified) indexed 0..n-1.
    """
    # Every step below produces a new frame, so no up-front copy is required.
    out = DF
    if topic != "All":
        out = out[out["topic"] == topic]
    if split != "All":
        out = out[out["split"] == split]

    needle = keyword.strip().lower() if keyword else ""
    if needle:
        # regex=False: the keyword is user input and must be matched literally;
        # otherwise characters such as "(" or "[" raise a regex error.
        hits = [
            out[col].str.lower().str.contains(needle, regex=False, na=False)
            for col in ("topic", "question", "answer")
        ]
        out = out[hits[0] | hits[1] | hits[2]]

    out = out[(out["answer_len"] >= int(min_len)) & (out["answer_len"] <= int(max_len))]

    col_map = {
        "ID": "id",
        "Topic": "topic",
        "Answer length": "answer_len",
        "Split": "split",
    }
    ascending = sort_dir == "Ascending"
    if sort_by == "Question length":
        # There is no stored question-length column, so sort on the character
        # count via `key` instead of sorting the question text alphabetically.
        out = out.sort_values(
            by="question",
            key=lambda col: col.str.len(),
            ascending=ascending,
            kind="stable",
        )
    else:
        sort_col = col_map.get(sort_by, "id")
        out = out.sort_values(by=sort_col, ascending=ascending, kind="stable")
    return out.reset_index(drop=True)
def run_explore(
    topic: str,
    split: str,
    keyword: str,
    min_len: int,
    max_len: int,
    sort_by: str,
    sort_dir: str,
    page_size: int,
    page_number: int
):
    """Apply the Explore filters and return one page of results for the UI.

    Args:
        topic, split, keyword, min_len, max_len, sort_by, sort_dir:
            Forwarded unchanged to ``apply_filters``.
        page_size: Rows per page; coerced to an integer of at least 1.
        page_number: Requested 1-based page; clamped to the valid range.

    Returns:
        A 5-tuple of: summary text, the page as a DataFrame, the same page
        serialised as a JSON records string, an update for the page slider
        (new maximum and clamped value), and an update for the row-inspection
        slider (maximum set to the page's row count, value reset to 1).
    """
    # Slider values may arrive as floats; positional slicing needs ints, and a
    # zero page size would divide by zero in the page count below.
    page_size = max(1, int(page_size))
    page_number = int(page_number)

    filtered = apply_filters(topic, split, keyword, min_len, max_len, sort_by, sort_dir)
    total = len(filtered)
    pages = max(1, (total + page_size - 1) // page_size)
    page_number = min(max(1, page_number), pages)

    start = (page_number - 1) * page_size
    end = min(start + page_size, total)
    page_df = filtered.iloc[start:end].copy()
    table_df = page_df[["id", "topic", "question", "answer", "split", "answer_len"]]

    # With no matches report "rows 0 to 0" rather than "rows 1 to 0".
    first_shown = start + 1 if total else 0
    summary = (
        f"{compute_stats(filtered)}\n"
        f"Showing rows {first_shown} to {end} of {total} | "
        f"Page {page_number} of {pages}"
    )

    # The sliders need a maximum of at least 1 even when the page is empty.
    max_row_slider = max(1, len(page_df))
    return (
        summary,
        table_df,
        page_df.to_json(orient="records"),
        gr.update(maximum=pages, value=page_number),
        gr.update(maximum=max_row_slider, value=1),
    )
def show_row_detail(page_df_json: str, row_idx_1based: int):
    """Return the detail fields for one row of the current page.

    Args:
        page_df_json: The current page as a JSON array of record objects
            (the string ``run_explore`` stores in state); empty when nothing
            has been loaded yet.
        row_idx_1based: 1-based row position on the page; clamped to the
            rows available.

    Returns:
        A 5-tuple of strings: header, topic, question, answer and a metadata
        line with split, id and answer length. When there is no data the
        header carries a message and the other four fields are empty.
    """
    # Local import keeps this block self-contained.
    import json

    if not page_df_json:
        return "No data loaded for this page.", "", "", "", ""

    # Parse with json rather than pd.read_json: read_json on a literal string
    # is deprecated and re-infers dtypes, so text such as "007" would be
    # displayed as the number 7.
    records = json.loads(page_df_json)
    if not records:
        return "No rows in this page.", "", "", "", ""

    idx = int(row_idx_1based) - 1
    idx = max(0, min(idx, len(records) - 1))
    row = records[idx]

    header = f"Record {idx + 1} on current page"
    return (
        header,
        str(row["topic"]),
        str(row["question"]),
        str(row["answer"]),
        f"Split: {row['split']} | ID: {row['id']} | Answer length: {row['answer_len']}",
    )
def export_filtered_csv(
    topic: str,
    split: str,
    keyword: str,
    min_len: int,
    max_len: int,
    sort_by: str,
    sort_dir: str
):
    """Write the currently filtered rows to a temporary CSV file.

    All arguments are forwarded unchanged to ``apply_filters``.

    Returns:
        Path of the ``.csv`` file that was written. The file is created with
        ``delete=False`` and is not removed by this function.
    """
    export_columns = ["id", "topic", "question", "answer", "split"]
    rows = apply_filters(topic, split, keyword, min_len, max_len, sort_by, sort_dir)
    export_df = rows[export_columns].copy()

    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as handle:
        csv_path = handle.name
        export_df.to_csv(csv_path, index=False)
    return csv_path
def related_examples(question_text: str, topic: str, k: int = 3) -> str:
    """List up to ``k`` other questions from the same topic as bullet lines.

    Candidates are the rows of ``DF`` whose topic equals ``topic``. They are
    ranked by how many whitespace-separated, lower-cased words they share
    with ``question_text`` (highest first, ties broken by ascending ``id``).
    Rows whose question equals ``question_text`` are excluded.

    Returns:
        Newline-joined ``- <question>`` lines, or
        ``"No related examples found."`` when there is nothing to show.
    """
    nothing = "No related examples found."

    query_words = set(str(question_text).lower().split())
    candidates = DF[DF["topic"] == topic].copy()
    if candidates.empty or not query_words:
        return nothing

    # Score each candidate by the size of its word overlap with the query.
    candidates["score"] = [
        len(query_words & set(str(text).lower().split()))
        for text in candidates["question"]
    ]
    ranked = candidates.sort_values(by=["score", "id"], ascending=[False, True])
    picked = ranked[ranked["question"] != question_text].head(k)
    if picked.empty:
        return nothing

    return "\n".join(f"- {text}" for text in picked["question"])
def score_text(correct: int, total: int, streak: int, best_streak: int) -> str:
    """Format the quiz scoreboard as one pipe-separated line.

    Accuracy is the percentage of correct answers to one decimal place and is
    reported as 0.0 while no questions have been answered.
    """
    if total > 0:
        accuracy = 100.0 * correct / total
    else:
        accuracy = 0.0

    fields = (
        f"Score: {correct}/{total}",
        f"Accuracy: {accuracy:.1f}%",
        f"Streak: {streak}",
        f"Best streak: {best_streak}",
    )
    return " | ".join(fields)
def generate_question(topic_filter: str, difficulty: str):
    """Build one four-option multiple-choice question from ``DF``.

    Args:
        topic_filter: Topic to draw the question from, or ``"All"``.
        difficulty: Controls where the three wrong options come from.
            ``"Easy"`` uses any other answer in the dataset. ``"Medium"``
            uses answers from the same topic. Any other value (the UI's
            ``"Hard"``) uses the same-topic answers whose length is closest
            to the correct answer. Medium and Hard fall back to the whole
            dataset when the topic has fewer than three distinct wrong
            answers.

    Returns:
        A 6-tuple of: question text block, an update for the choices radio,
        the correct answer, the raw question, the topic, and the teaching
        note with related questions. On failure the first element carries a
        message, the choices are cleared and the remaining four are ``""``.
    """
    # Hard mode draws its distractors from this many nearest-length answers,
    # which keeps them similar in length while still varying between rounds.
    hard_pool_size = 6

    pool = DF if topic_filter == "All" else DF[DF["topic"] == topic_filter]
    if pool.empty:
        return (
            "No questions available for this filter.",
            gr.update(choices=[], value=None),
            "",
            "",
            "",
            "",
        )

    row = pool.sample(1, random_state=random.randint(0, 10_000_000)).iloc[0]
    topic = str(row["topic"])
    question = str(row["question"])
    correct = str(row["answer"])

    global_pool = DF[DF["answer"] != correct]
    same_topic = global_pool[global_pool["topic"] == topic]
    enough_same_topic = same_topic["answer"].nunique() >= 3

    # Only the Hard branch relies on row order, so track it explicitly.
    nearest_only = False
    if difficulty == "Easy" or not enough_same_topic:
        candidate = global_pool
    elif difficulty == "Medium":
        candidate = same_topic
    else:
        len_gap = (same_topic["answer"].str.len() - len(correct)).abs()
        candidate = same_topic.assign(len_gap=len_gap).sort_values(by=["len_gap", "id"])
        nearest_only = True

    distractor_answers = candidate["answer"].dropna().astype(str).drop_duplicates().tolist()
    if nearest_only:
        # random.sample ignores list order, so the sort above only matters if
        # the list is truncated to the closest-length answers first.
        distractor_answers = distractor_answers[:hard_pool_size]

    if len(distractor_answers) < 3:
        return (
            "Not enough distractors to generate a 4-option question.",
            gr.update(choices=[], value=None),
            "",
            "",
            "",
            "",
        )

    distractors = random.sample(distractor_answers, 3)
    options = distractors + [correct]
    random.shuffle(options)

    question_block = f"Topic: {topic}\n\nQuestion: {question}"
    teach_note = (
        f"Teaching note: This question belongs to {topic}. "
        f"Focus on core definitions and tool usage terms."
    )
    related = related_examples(question, topic, k=3)
    return (
        question_block,
        gr.update(choices=options, value=None),
        correct,
        question,
        topic,
        f"{teach_note}\n\nRelated questions:\n{related}",
    )
def start_quiz(
    topic_filter: str,
    difficulty: str,
    correct_count: int,
    total_count: int,
    streak: int,
    best_streak: int
):
    """Load the first quiz question without changing the running score.

    Args:
        topic_filter: Forwarded to ``generate_question``.
        difficulty: Forwarded to ``generate_question``.
        correct_count, total_count, streak, best_streak: Current score
            counters; only used to render the scoreboard text.

    Returns:
        An 8-tuple of: question text, choices update, correct answer, raw
        question, topic, teaching note, scoreboard text and status message.
    """
    q, choices, correct, raw_q, raw_topic, teach = generate_question(topic_filter, difficulty)

    # generate_question returns an empty correct answer when it could not
    # build a question; do not claim the quiz started in that case.
    if correct:
        status = "Quiz started. Select an answer and submit."
    else:
        status = "Quiz could not start. Adjust the topic filter or difficulty."

    return (
        q,
        choices,
        correct,
        raw_q,
        raw_topic,
        teach,
        score_text(correct_count, total_count, streak, best_streak),
        status,
    )
def submit_and_next(
    selected: str,
    current_correct: str,
    current_q: str,
    current_topic: str,
    topic_filter: str,
    difficulty: str,
    correct_count: int,
    total_count: int,
    streak: int,
    best_streak: int
):
    """Grade the selected answer, update the score and load the next question.

    Args:
        selected: Option chosen in the radio group; falsy when nothing is
            selected.
        current_correct: Correct answer of the active question.
        current_q: Raw text of the active question.
        current_topic: Topic of the active question.
        topic_filter: Forwarded to ``generate_question`` for the next question.
        difficulty: Forwarded to ``generate_question`` for the next question.
        correct_count, total_count, streak, best_streak: Score counters.

    Returns:
        A 13-tuple of: result text, question text, choices update, correct
        answer, raw question, topic, teaching note, scoreboard text, status
        message and the four updated counters. When there is no active
        question or no selection, everything except the result and status
        text is left as it was.
    """

    def _unchanged(message: str, status: str):
        """Return outputs that show ``message`` and leave the quiz untouched."""
        return (
            message,
            gr.update(),
            gr.update(),
            current_correct,
            current_q,
            current_topic,
            # No-op update: keep the current teaching note instead of wiping it.
            gr.update(),
            score_text(correct_count, total_count, streak, best_streak),
            status,
            correct_count,
            total_count,
            streak,
            best_streak,
        )

    if not current_correct or not current_q:
        return _unchanged("Click Start Quiz first.", "No active question.")
    if not selected:
        return _unchanged(
            "Please select one option before submitting.",
            "Waiting for answer selection.",
        )

    total_count += 1
    if selected == current_correct:
        correct_count += 1
        streak += 1
        best_streak = max(best_streak, streak)
        result = (
            "Correct.\n\n"
            f"Your answer: {selected}\n\n"
            f"Reference answer: {current_correct}"
        )
    else:
        streak = 0
        result = (
            "Incorrect.\n\n"
            f"Your answer: {selected}\n\n"
            f"Correct answer: {current_correct}"
        )

    next_q, next_choices, next_correct, next_raw_q, next_raw_topic, next_teach = generate_question(
        topic_filter, difficulty
    )

    # An empty correct answer means generate_question could not build a
    # question (its message is shown in the question box).
    if next_correct:
        status = "Auto-loaded next question."
    else:
        status = "Could not load the next question."

    return (
        result,
        next_q,
        next_choices,
        next_correct,
        next_raw_q,
        next_raw_topic,
        next_teach,
        score_text(correct_count, total_count, streak, best_streak),
        status,
        correct_count,
        total_count,
        streak,
        best_streak,
    )
def reset_score():
    """Return zeroed score counters plus matching scoreboard and status text.

    Returns:
        A 6-tuple: correct count, total count, streak, best streak (all 0),
        the scoreboard text for those zeros, and a status message.
    """
    zeroed = (0, 0, 0, 0)
    scoreboard = score_text(*zeroed)
    return (*zeroed, scoreboard, "Score reset. Click Start Quiz.")
# Custom stylesheet passed to gr.Blocks(css=CSS): defines colour variables,
# the page background gradient, the container's maximum width, and the
# styling of the #hero banner and the .card class.
CSS = """
:root {
--brand: #0f766e;
--accent: #0ea5e9;
--bg-soft: #f8fafc;
--card: #ffffff;
--text: #0f172a;
--muted: #475569;
}
body {
background: linear-gradient(180deg, #f0fdfa 0%, #f8fafc 35%, #ffffff 100%);
}
.gradio-container {
max-width: 1280px !important;
}
#hero {
background: linear-gradient(135deg, rgba(15,118,110,0.10), rgba(14,165,233,0.10));
border: 1px solid rgba(15,118,110,0.20);
border-radius: 16px;
padding: 14px 16px;
}
#hero h1, #hero p {
color: var(--text);
}
.card {
background: var(--card);
border-radius: 14px;
border: 1px solid #e2e8f0;
padding: 10px 12px;
}
"""
# Build the UI: a hero banner followed by three tabs (Explore, Quiz, About).
# NOTE(review): the source this was recovered from had lost its indentation,
# so which components sit inside each gr.Row is a reconstruction — confirm the
# layout against the running app.
with gr.Blocks(
    title="Bioinformatics QA Teaching Studio",
    css=CSS,
    theme=gr.themes.Soft(
        primary_hue="teal",
        secondary_hue="sky",
        neutral_hue="slate"
    ),
) as demo:
    # Banner; the "hero" id is styled by the #hero rules in CSS.
    gr.HTML(
        """
<div id="hero">
<h1>Bioinformatics QA Teaching Studio</h1>
<p>
Explore the dataset, learn core concepts, and practice with teaching-mode multiple-choice quizzes.
This app is for learning and research purposes only. Validate content before high-stakes use.
</p>
</div>
"""
    )
    with gr.Tabs():
        # ---- Explore tab: filter, page through, inspect and export rows ----
        with gr.Tab("Explore"):
            with gr.Row():
                topic_dd = gr.Dropdown(
                    choices=["All"] + ALL_TOPICS,
                    value="All",
                    label="Topic"
                )
                split_dd = gr.Dropdown(
                    choices=["All"] + ALL_SPLITS,
                    value="All",
                    label="Split"
                )
                keyword_tb = gr.Textbox(
                    label="Keyword search",
                    placeholder="Search topic, question, or answer"
                )
            with gr.Row():
                # Both length sliders span 0..max(longest answer, 20).
                min_len = gr.Slider(0, int(max(DF["answer_len"].max(), 20)), value=0, step=1, label="Min answer length")
                max_len = gr.Slider(0, int(max(DF["answer_len"].max(), 20)), value=int(DF["answer_len"].max()), step=1, label="Max answer length")
                sort_by = gr.Dropdown(
                    choices=["ID", "Topic", "Question length", "Answer length", "Split"],
                    value="ID",
                    label="Sort by"
                )
                sort_dir = gr.Radio(
                    choices=["Ascending", "Descending"],
                    value="Ascending",
                    label="Order"
                )
            with gr.Row():
                page_size = gr.Slider(5, 100, value=15, step=5, label="Rows per page")
                # Starts with range 1..1; run_explore raises the maximum to the page count.
                page_number = gr.Slider(1, 1, value=1, step=1, label="Page")
                run_btn = gr.Button("Apply filters", variant="primary")
                export_btn = gr.Button("Export filtered CSV")
            summary_md = gr.Markdown(value=compute_stats(DF))
            table = gr.Dataframe(
                headers=["id", "topic", "question", "answer", "split", "answer_len"],
                wrap=True,
                interactive=False,
                label="Filtered results"
            )
            # Holds the current page as a JSON string (written by run_explore,
            # read by show_row_detail).
            filtered_state = gr.State("")
            # Starts with range 1..1; run_explore sets the maximum to the page's row count.
            row_slider = gr.Slider(1, 1, value=1, step=1, label="Inspect row on current page")
            inspect_btn = gr.Button("Show row details")
            detail_header = gr.Markdown(value="Select filters and click Apply.")
            detail_topic = gr.Textbox(label="Topic", interactive=False)
            detail_question = gr.Textbox(label="Question", lines=4, interactive=False)
            detail_answer = gr.Textbox(label="Answer", lines=7, interactive=False)
            detail_meta = gr.Textbox(label="Metadata", interactive=False)
            csv_file = gr.File(label="Download CSV", interactive=False)
            # The outputs list order must match the tuple returned by run_explore.
            run_btn.click(
                fn=run_explore,
                inputs=[topic_dd, split_dd, keyword_tb, min_len, max_len, sort_by, sort_dir, page_size, page_number],
                outputs=[summary_md, table, filtered_state, page_number, row_slider],
            )
            inspect_btn.click(
                fn=show_row_detail,
                inputs=[filtered_state, row_slider],
                outputs=[detail_header, detail_topic, detail_question, detail_answer, detail_meta],
            )
            export_btn.click(
                fn=export_filtered_csv,
                inputs=[topic_dd, split_dd, keyword_tb, min_len, max_len, sort_by, sort_dir],
                outputs=[csv_file],
            )
        # ---- Quiz tab: multiple-choice practice with score tracking ----
        with gr.Tab("Quiz"):
            with gr.Row():
                quiz_topic = gr.Dropdown(
                    choices=["All"] + ALL_TOPICS,
                    value="All",
                    label="Topic filter"
                )
                difficulty = gr.Radio(
                    choices=["Easy", "Medium", "Hard"],
                    value="Medium",
                    label="Difficulty"
                )
                start_btn = gr.Button("Start quiz", variant="primary")
                reset_btn = gr.Button("Reset score")
            quiz_score = gr.Markdown(value=score_text(0, 0, 0, 0))
            quiz_status = gr.Textbox(label="Status", interactive=False)
            question_box = gr.Textbox(label="Question", lines=5, interactive=False)
            choices_radio = gr.Radio(choices=[], label="Choose one answer")
            submit_btn = gr.Button("Submit and load next question", variant="primary")
            result_box = gr.Textbox(label="Result", lines=6, interactive=False)
            teaching_box = gr.Textbox(label="Teaching support", lines=8, interactive=False)
            # State for the active question: correct answer, raw question, topic.
            correct_state = gr.State("")
            q_state = gr.State("")
            topic_state = gr.State("")
            # State for the score counters.
            correct_count_state = gr.State(0)
            total_count_state = gr.State(0)
            streak_state = gr.State(0)
            best_streak_state = gr.State(0)
            # The outputs list order must match the 8-tuple returned by start_quiz.
            start_btn.click(
                fn=start_quiz,
                inputs=[
                    quiz_topic,
                    difficulty,
                    correct_count_state,
                    total_count_state,
                    streak_state,
                    best_streak_state,
                ],
                outputs=[
                    question_box,
                    choices_radio,
                    correct_state,
                    q_state,
                    topic_state,
                    teaching_box,
                    quiz_score,
                    quiz_status,
                ],
            )
            # The outputs list order must match the 13-tuple returned by submit_and_next.
            submit_btn.click(
                fn=submit_and_next,
                inputs=[
                    choices_radio,
                    correct_state,
                    q_state,
                    topic_state,
                    quiz_topic,
                    difficulty,
                    correct_count_state,
                    total_count_state,
                    streak_state,
                    best_streak_state,
                ],
                outputs=[
                    result_box,
                    question_box,
                    choices_radio,
                    correct_state,
                    q_state,
                    topic_state,
                    teaching_box,
                    quiz_score,
                    quiz_status,
                    correct_count_state,
                    total_count_state,
                    streak_state,
                    best_streak_state,
                ],
            )
            # reset_score only zeroes the counters; the active question is left in place.
            reset_btn.click(
                fn=reset_score,
                inputs=[],
                outputs=[
                    correct_count_state,
                    total_count_state,
                    streak_state,
                    best_streak_state,
                    quiz_score,
                    quiz_status,
                ],
            )
        # ---- About tab: static description and usage notice ----
        with gr.Tab("About"):
            gr.Markdown(
                """
## About this teaching app
This Space demonstrates:
- Practical dataset exploration for bioinformatics QA data
- Teaching-mode multiple-choice practice with topic-aware distractors
- Session score tracking with streak metrics
## Important notice
This app is intended for learning and research use only.
Use with caution.
Do not use as a replacement for expert biomedical or clinical judgment.
## Dataset
- Source: yashm/bioinformatics-qa-dataset
- Citation and DOI are listed in the project README
"""
            )
# Start the Gradio server; this runs whenever the module is executed or imported.
demo.launch()