"""Gradio teaching app for a bioinformatics question/answer dataset.

The module loads the dataset once at import time, then exposes three tabs:
an explorer (filter / sort / paginate / export), a multiple-choice quiz with
score tracking, and an "About" page.
"""

import json
import os
import random
import tempfile
from functools import lru_cache
from typing import Dict, List, Tuple

import gradio as gr
import pandas as pd
from datasets import load_dataset

DATASET_REPO = "yashm/bioinformatics-qa-dataset"
RANDOM_SEED = 42

# "Hard" quiz questions draw their distractors from this many same-topic
# answers whose length is closest to the correct answer's length.
HARD_DISTRACTOR_POOL = 6

# UI label -> DataFrame column used for sorting. "Question length" is handled
# separately in apply_filters because it sorts on a derived value.
_SORT_COLUMNS: Dict[str, str] = {
    "ID": "id",
    "Topic": "topic",
    "Answer length": "answer_len",
    "Split": "split",
}

random.seed(RANDOM_SEED)


@lru_cache(maxsize=1)
def load_data() -> pd.DataFrame:
    """Download every split of the dataset and return one cleaned DataFrame.

    The result has the columns ``id``, ``topic``, ``question``, ``answer``,
    ``split`` and ``answer_len`` (character count of ``answer``). Rows with a
    missing topic/question/answer or an empty question/answer are removed.

    Raises:
        ValueError: if any of ``id``, ``topic``, ``question``, ``answer`` is
            absent from the downloaded data.
    """
    ds = load_dataset(DATASET_REPO)
    frames = []
    for split_name in ds.keys():
        part = ds[split_name].to_pandas().copy()
        part["split"] = split_name
        frames.append(part)
    df = pd.concat(frames, ignore_index=True)

    required = ["id", "topic", "question", "answer"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Dataset is missing required columns: {missing}")

    df = df[["id", "topic", "question", "answer", "split"]].copy()

    # Drop missing values BEFORE casting to str: astype(str) turns NaN/None
    # into the literal strings "nan"/"None", which dropna can no longer see.
    df = df.dropna(subset=["topic", "question", "answer"])
    for col in ["topic", "question", "answer", "split"]:
        df[col] = df[col].astype(str).str.strip()

    df = df[(df["question"] != "") & (df["answer"] != "")].copy()
    df["answer_len"] = df["answer"].str.len()
    return df.reset_index(drop=True)


DF = load_data()
ALL_TOPICS = sorted(DF["topic"].unique().tolist())
ALL_SPLITS = sorted(DF["split"].unique().tolist())

# Longest answer in the dataset; 0 for an empty dataset so int() never sees NaN.
MAX_ANSWER_LEN = int(DF["answer_len"].max()) if len(DF) else 0
# Upper bound of the answer-length sliders (never below 20).
LEN_SLIDER_MAX = max(MAX_ANSWER_LEN, 20)


def compute_stats(df: pd.DataFrame) -> str:
    """Return a one-line summary: row count, unique topics, mean answer length."""
    total_rows = len(df)
    total_topics = df["topic"].nunique() if total_rows else 0
    avg_answer_len = float(df["answer_len"].mean()) if total_rows else 0.0
    return (
        f"Total rows: {total_rows} | "
        f"Unique topics: {total_topics} | "
        f"Average answer length: {avg_answer_len:.1f} chars"
    )


def apply_filters(
    topic: str,
    split: str,
    keyword: str,
    min_len: int,
    max_len: int,
    sort_by: str,
    sort_dir: str,
) -> pd.DataFrame:
    """Filter and sort the global dataset according to the Explore controls.

    Args:
        topic: exact topic to keep, or ``"All"``.
        split: exact split to keep, or ``"All"``.
        keyword: case-insensitive literal substring searched in topic,
            question and answer; blank disables the search.
        min_len: inclusive lower bound on ``answer_len``.
        max_len: inclusive upper bound on ``answer_len``.
        sort_by: one of the "Sort by" labels; unknown labels sort by ``id``.
        sort_dir: ``"Ascending"`` for ascending order, anything else descends.

    Returns:
        A new DataFrame with a fresh 0-based index; ``DF`` is not modified.
    """
    out = DF
    if topic != "All":
        out = out[out["topic"] == topic]
    if split != "All":
        out = out[out["split"] == split]

    needle = keyword.strip().lower() if keyword else ""
    if needle:
        hits = pd.Series(False, index=out.index)
        for col in ("topic", "question", "answer"):
            # regex=False: the keyword is user input, and characters such as
            # "+" or "(" would otherwise be parsed as a regular expression.
            hits |= out[col].str.lower().str.contains(needle, na=False, regex=False)
        out = out[hits]

    out = out[(out["answer_len"] >= int(min_len)) & (out["answer_len"] <= int(max_len))]

    ascending = sort_dir == "Ascending"
    if sort_by == "Question length":
        # Sort on the character count, not alphabetically on the text.
        out = out.sort_values(
            by="question",
            key=lambda col: col.str.len(),
            ascending=ascending,
            kind="stable",
        )
    else:
        out = out.sort_values(
            by=_SORT_COLUMNS.get(sort_by, "id"),
            ascending=ascending,
            kind="stable",
        )
    return out.reset_index(drop=True)


def run_explore(
    topic: str,
    split: str,
    keyword: str,
    min_len: int,
    max_len: int,
    sort_by: str,
    sort_dir: str,
    page_size: int,
    page_number: int,
):
    """Apply the filters and return one page of results for the Explore tab.

    Returns a 5-tuple matching the click handler's outputs: summary text, the
    page as a DataFrame, the same page serialised as JSON records, an update
    for the page slider, and an update for the row-inspection slider.
    """
    filtered = apply_filters(topic, split, keyword, min_len, max_len, sort_by, sort_dir)

    # Slider values are cast defensively: iloc slicing requires real ints,
    # and a page size below 1 would divide by zero.
    page_size = max(1, int(page_size))
    total = len(filtered)
    pages = max(1, (total + page_size - 1) // page_size)
    page_number = min(max(1, int(page_number)), pages)

    start = (page_number - 1) * page_size
    end = min(start + page_size, total)
    page_df = filtered.iloc[start:end].copy()
    table_df = page_df[["id", "topic", "question", "answer", "split", "answer_len"]]

    first_shown = start + 1 if total else 0  # avoid "rows 1 to 0 of 0"
    # Blank line between the two parts: Markdown collapses a single newline.
    summary = (
        f"{compute_stats(filtered)}\n\n"
        f"Showing rows {first_shown} to {end} of {total} | "
        f"Page {page_number} of {pages}"
    )

    max_row_slider = max(1, len(page_df))
    return (
        summary,
        table_df,
        page_df.to_json(orient="records"),
        gr.update(maximum=pages, value=page_number),
        gr.update(maximum=max_row_slider, value=1),
    )


def show_row_detail(page_df_json: str, row_idx_1based: int) -> Tuple[str, str, str, str, str]:
    """Return header, topic, question, answer and metadata for one page row.

    Args:
        page_df_json: the current page as JSON records (as produced by
            ``run_explore``); empty when no page has been loaded yet.
        row_idx_1based: 1-based row position; clamped into the page's range.
    """
    if not page_df_json:
        return "No data loaded for this page.", "", "", "", ""

    # Plain JSON decoding keeps every value exactly as it was serialised;
    # pd.read_json would re-infer dtypes and warns on literal-string input.
    records = json.loads(page_df_json)
    if not records:
        return "No rows in this page.", "", "", "", ""

    idx = max(0, min(int(row_idx_1based) - 1, len(records) - 1))
    row = records[idx]
    header = f"Record {idx + 1} on current page"
    return (
        header,
        str(row["topic"]),
        str(row["question"]),
        str(row["answer"]),
        f"Split: {row['split']} | ID: {row['id']} | Answer length: {row['answer_len']}",
    )


def export_filtered_csv(
    topic: str,
    split: str,
    keyword: str,
    min_len: int,
    max_len: int,
    sort_by: str,
    sort_dir: str,
) -> str:
    """Write the filtered rows to a temporary CSV file and return its path.

    The file is created with ``delete=False`` so it still exists when the
    path is handed to the download component; it is not cleaned up here.
    """
    filtered = apply_filters(topic, split, keyword, min_len, max_len, sort_by, sort_dir)
    export_df = filtered[["id", "topic", "question", "answer", "split"]].copy()

    # Write through the open handle instead of re-opening tmp.name: opening a
    # NamedTemporaryFile a second time by name fails on Windows.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".csv", delete=False, newline="", encoding="utf-8"
    ) as tmp:
        export_df.to_csv(tmp, index=False)
        csv_path = tmp.name
    return csv_path


def related_examples(question_text: str, topic: str, k: int = 3) -> str:
    """Return up to ``k`` same-topic questions as a Markdown bullet list.

    Candidates are ranked by the number of whitespace-separated, lower-cased
    words they share with ``question_text`` (ties broken by ``id``); the
    question itself is excluded.
    """
    subset = DF[DF["topic"] == topic].copy()
    if subset.empty:
        return "No related examples found."

    q_words = set(str(question_text).lower().split())
    if not q_words:
        return "No related examples found."

    def overlap_score(text: str) -> int:
        return len(q_words.intersection(str(text).lower().split()))

    subset["score"] = subset["question"].apply(overlap_score)
    subset = subset.sort_values(by=["score", "id"], ascending=[False, True])
    subset = subset[subset["question"] != question_text].head(k)
    if subset.empty:
        return "No related examples found."
    return "\n".join(f"- {q}" for q in subset["question"])


def score_text(correct: int, total: int, streak: int, best_streak: int) -> str:
    """Format the quiz scoreboard line; accuracy is 0.0% before any answer."""
    acc = (100.0 * correct / total) if total > 0 else 0.0
    return (
        f"Score: {correct}/{total} | "
        f"Accuracy: {acc:.1f}% | "
        f"Streak: {streak} | "
        f"Best streak: {best_streak}"
    )


def _unique_answers(frame: pd.DataFrame) -> List[str]:
    """Return the distinct answers of ``frame`` as strings, in row order."""
    return frame["answer"].dropna().astype(str).drop_duplicates().tolist()


def _distractor_candidates(correct: str, topic: str, difficulty: str) -> List[str]:
    """Return the wrong answers a question of this difficulty may sample from.

    Easy uses every other answer in the dataset. Medium uses same-topic
    answers. Hard uses only the same-topic answers closest in length to the
    correct one. Medium and Hard fall back to the whole dataset when the
    topic has fewer than three distinct wrong answers.
    """
    same_topic = DF[(DF["topic"] == topic) & (DF["answer"] != correct)]
    same_topic_answers = _unique_answers(same_topic)

    if difficulty == "Easy" or len(same_topic_answers) < 3:
        return _unique_answers(DF[DF["answer"] != correct])
    if difficulty == "Medium":
        return same_topic_answers

    # Hard: rank by length gap and keep only the nearest few, so the final
    # random sample cannot undo the ranking.
    ranked = same_topic.assign(
        len_gap=(same_topic["answer"].str.len() - len(correct)).abs()
    ).sort_values(by=["len_gap", "id"])
    return _unique_answers(ranked)[:HARD_DISTRACTOR_POOL]


def _no_question(message: str):
    """Return the generate_question tuple for the "cannot build one" case."""
    return (message, gr.update(choices=[], value=None), "", "", "", "")


def generate_question(topic_filter: str, difficulty: str):
    """Pick a random question and build a four-option multiple-choice item.

    Args:
        topic_filter: topic to draw from, or ``"All"``.
        difficulty: ``"Easy"``, ``"Medium"``; any other value is treated as
            the hardest level.

    Returns:
        A 6-tuple: display text, an update for the choices radio, the correct
        answer, the raw question, its topic, and the teaching-support text.
        When no question can be built, the first item is an explanatory
        message, the choices are emptied and the remaining items are ``""``.
    """
    pool = DF
    if topic_filter != "All":
        pool = pool[pool["topic"] == topic_filter]
    if pool.empty:
        return _no_question("No questions available for this filter.")

    row = pool.sample(1, random_state=random.randint(0, 10_000_000)).iloc[0]
    topic = str(row["topic"])
    question = str(row["question"])
    correct = str(row["answer"])

    distractor_answers = _distractor_candidates(correct, topic, difficulty)
    if len(distractor_answers) < 3:
        return _no_question("Not enough distractors to generate a 4-option question.")

    options = random.sample(distractor_answers, 3) + [correct]
    random.shuffle(options)

    question_block = f"Topic: {topic}\n\nQuestion: {question}"
    teach_note = (
        f"Teaching note: This question belongs to {topic}. "
        f"Focus on core definitions and tool usage terms."
    )
    related = related_examples(question, topic, k=3)
    return (
        question_block,
        gr.update(choices=options, value=None),
        correct,
        question,
        topic,
        f"{teach_note}\n\nRelated questions:\n{related}",
    )


def start_quiz(
    topic_filter: str,
    difficulty: str,
    correct_count: int,
    total_count: int,
    streak: int,
    best_streak: int,
):
    """Load the first question while leaving the running score unchanged."""
    q, choices, correct, raw_q, raw_topic, teach = generate_question(topic_filter, difficulty)
    return (
        q,
        choices,
        correct,
        raw_q,
        raw_topic,
        teach,
        score_text(correct_count, total_count, streak, best_streak),
        "Quiz started. Select an answer and submit.",
    )


def submit_and_next(
    selected: str,
    current_correct: str,
    current_q: str,
    current_topic: str,
    topic_filter: str,
    difficulty: str,
    correct_count: int,
    total_count: int,
    streak: int,
    best_streak: int,
):
    """Grade the selected option, update the score and load the next question.

    Returns a 13-tuple in the order of the submit handler's outputs: result
    text, question text, choices update, correct answer, raw question, topic,
    teaching text, score line, status line, and the four counters.
    If there is no active question or nothing is selected, the question and
    counters are returned unchanged with an explanatory message.
    """
    if not current_correct or not current_q:
        return (
            "Click Start Quiz first.",
            gr.update(),
            gr.update(),
            current_correct,
            current_q,
            current_topic,
            "",
            score_text(correct_count, total_count, streak, best_streak),
            "No active question.",
            correct_count,
            total_count,
            streak,
            best_streak,
        )

    if not selected:
        return (
            "Please select one option before submitting.",
            gr.update(),
            gr.update(),
            current_correct,
            current_q,
            current_topic,
            # The question is still active, so keep its teaching note on
            # screen instead of blanking it.
            gr.update(),
            score_text(correct_count, total_count, streak, best_streak),
            "Waiting for answer selection.",
            correct_count,
            total_count,
            streak,
            best_streak,
        )

    total_count += 1
    if selected == current_correct:
        correct_count += 1
        streak += 1
        best_streak = max(best_streak, streak)
        result = (
            "Correct.\n\n"
            f"Your answer: {selected}\n\n"
            f"Reference answer: {current_correct}"
        )
    else:
        streak = 0
        result = (
            "Incorrect.\n\n"
            f"Your answer: {selected}\n\n"
            f"Correct answer: {current_correct}"
        )

    (
        next_q,
        next_choices,
        next_correct,
        next_raw_q,
        next_raw_topic,
        next_teach,
    ) = generate_question(topic_filter, difficulty)

    return (
        result,
        next_q,
        next_choices,
        next_correct,
        next_raw_q,
        next_raw_topic,
        next_teach,
        score_text(correct_count, total_count, streak, best_streak),
        "Auto-loaded next question.",
        correct_count,
        total_count,
        streak,
        best_streak,
    )


def reset_score():
    """Return zeroed counters plus the matching score and status text."""
    return (
        0,
        0,
        0,
        0,
        score_text(0, 0, 0, 0),
        "Score reset. Click Start Quiz.",
    )


CSS = """
:root {
  --brand: #0f766e;
  --accent: #0ea5e9;
  --bg-soft: #f8fafc;
  --card: #ffffff;
  --text: #0f172a;
  --muted: #475569;
}
body {
  background: linear-gradient(180deg, #f0fdfa 0%, #f8fafc 35%, #ffffff 100%);
}
.gradio-container {
  max-width: 1280px !important;
}
#hero {
  background: linear-gradient(135deg, rgba(15,118,110,0.10), rgba(14,165,233,0.10));
  border: 1px solid rgba(15,118,110,0.20);
  border-radius: 16px;
  padding: 14px 16px;
}
#hero h1, #hero p {
  color: var(--text);
}
.card {
  background: var(--card);
  border-radius: 14px;
  border: 1px solid #e2e8f0;
  padding: 10px 12px;
}
"""

# NOTE(review): the markup of this banner was missing from the file; the
# wrapper below is reconstructed from the "#hero", "#hero h1" and "#hero p"
# selectors in CSS. The visible text is unchanged - confirm the structure.
HERO_HTML = """
<div id="hero">
  <h1>Bioinformatics QA Teaching Studio</h1>
  <p>Explore the dataset, learn core concepts, and practice with teaching-mode multiple-choice quizzes. This app is for learning and research purposes only. Validate content before high-stakes use.</p>
</div>
"""

ABOUT_MD = """
## About this teaching app

This Space demonstrates:

- Practical dataset exploration for bioinformatics QA data
- Teaching-mode multiple-choice practice with topic-aware distractors
- Session score tracking with streak metrics

## Important notice

This app is intended for learning and research use only.
Use with caution.
Do not use as a replacement for expert biomedical or clinical judgment.

## Dataset

- Source: yashm/bioinformatics-qa-dataset
- Citation and DOI are listed in the project README
"""

# NOTE(review): the original indentation was lost, so which components sit
# inside each gr.Row below is a reconstruction - confirm against the intended
# layout.
with gr.Blocks(
    title="Bioinformatics QA Teaching Studio",
    css=CSS,
    theme=gr.themes.Soft(
        primary_hue="teal",
        secondary_hue="sky",
        neutral_hue="slate",
    ),
) as demo:
    gr.HTML(HERO_HTML)

    with gr.Tabs():
        with gr.Tab("Explore"):
            with gr.Row():
                topic_dd = gr.Dropdown(
                    choices=["All"] + ALL_TOPICS,
                    value="All",
                    label="Topic",
                )
                split_dd = gr.Dropdown(
                    choices=["All"] + ALL_SPLITS,
                    value="All",
                    label="Split",
                )
                keyword_tb = gr.Textbox(
                    label="Keyword search",
                    placeholder="Search topic, question, or answer",
                )

            with gr.Row():
                min_len = gr.Slider(
                    0,
                    LEN_SLIDER_MAX,
                    value=0,
                    step=1,
                    label="Min answer length",
                )
                max_len = gr.Slider(
                    0,
                    LEN_SLIDER_MAX,
                    value=MAX_ANSWER_LEN,
                    step=1,
                    label="Max answer length",
                )
                sort_by = gr.Dropdown(
                    choices=["ID", "Topic", "Question length", "Answer length", "Split"],
                    value="ID",
                    label="Sort by",
                )
                sort_dir = gr.Radio(
                    choices=["Ascending", "Descending"],
                    value="Ascending",
                    label="Order",
                )

            with gr.Row():
                page_size = gr.Slider(5, 100, value=15, step=5, label="Rows per page")
                # The real page count is only known after filtering;
                # run_explore raises this slider's maximum on each run.
                page_number = gr.Slider(1, 1, value=1, step=1, label="Page")
                run_btn = gr.Button("Apply filters", variant="primary")
                export_btn = gr.Button("Export filtered CSV")

            summary_md = gr.Markdown(value=compute_stats(DF))
            table = gr.Dataframe(
                headers=["id", "topic", "question", "answer", "split", "answer_len"],
                wrap=True,
                interactive=False,
                label="Filtered results",
            )

            # Holds the current page as JSON records for show_row_detail.
            filtered_state = gr.State("")
            row_slider = gr.Slider(1, 1, value=1, step=1, label="Inspect row on current page")
            inspect_btn = gr.Button("Show row details")

            detail_header = gr.Markdown(value="Select filters and click Apply.")
            detail_topic = gr.Textbox(label="Topic", interactive=False)
            detail_question = gr.Textbox(label="Question", lines=4, interactive=False)
            detail_answer = gr.Textbox(label="Answer", lines=7, interactive=False)
            detail_meta = gr.Textbox(label="Metadata", interactive=False)
            csv_file = gr.File(label="Download CSV", interactive=False)

            run_btn.click(
                fn=run_explore,
                inputs=[
                    topic_dd,
                    split_dd,
                    keyword_tb,
                    min_len,
                    max_len,
                    sort_by,
                    sort_dir,
                    page_size,
                    page_number,
                ],
                outputs=[summary_md, table, filtered_state, page_number, row_slider],
            )
            inspect_btn.click(
                fn=show_row_detail,
                inputs=[filtered_state, row_slider],
                outputs=[
                    detail_header,
                    detail_topic,
                    detail_question,
                    detail_answer,
                    detail_meta,
                ],
            )
            export_btn.click(
                fn=export_filtered_csv,
                inputs=[topic_dd, split_dd, keyword_tb, min_len, max_len, sort_by, sort_dir],
                outputs=[csv_file],
            )

        with gr.Tab("Quiz"):
            with gr.Row():
                quiz_topic = gr.Dropdown(
                    choices=["All"] + ALL_TOPICS,
                    value="All",
                    label="Topic filter",
                )
                difficulty = gr.Radio(
                    choices=["Easy", "Medium", "Hard"],
                    value="Medium",
                    label="Difficulty",
                )
                start_btn = gr.Button("Start quiz", variant="primary")
                reset_btn = gr.Button("Reset score")

            quiz_score = gr.Markdown(value=score_text(0, 0, 0, 0))
            quiz_status = gr.Textbox(label="Status", interactive=False)
            question_box = gr.Textbox(label="Question", lines=5, interactive=False)
            choices_radio = gr.Radio(choices=[], label="Choose one answer")
            submit_btn = gr.Button("Submit and load next question", variant="primary")
            result_box = gr.Textbox(label="Result", lines=6, interactive=False)
            teaching_box = gr.Textbox(label="Teaching support", lines=8, interactive=False)

            # Active question (answer, raw text, topic) and running counters.
            correct_state = gr.State("")
            q_state = gr.State("")
            topic_state = gr.State("")
            correct_count_state = gr.State(0)
            total_count_state = gr.State(0)
            streak_state = gr.State(0)
            best_streak_state = gr.State(0)

            start_btn.click(
                fn=start_quiz,
                inputs=[
                    quiz_topic,
                    difficulty,
                    correct_count_state,
                    total_count_state,
                    streak_state,
                    best_streak_state,
                ],
                outputs=[
                    question_box,
                    choices_radio,
                    correct_state,
                    q_state,
                    topic_state,
                    teaching_box,
                    quiz_score,
                    quiz_status,
                ],
            )
            submit_btn.click(
                fn=submit_and_next,
                inputs=[
                    choices_radio,
                    correct_state,
                    q_state,
                    topic_state,
                    quiz_topic,
                    difficulty,
                    correct_count_state,
                    total_count_state,
                    streak_state,
                    best_streak_state,
                ],
                outputs=[
                    result_box,
                    question_box,
                    choices_radio,
                    correct_state,
                    q_state,
                    topic_state,
                    teaching_box,
                    quiz_score,
                    quiz_status,
                    correct_count_state,
                    total_count_state,
                    streak_state,
                    best_streak_state,
                ],
            )
            reset_btn.click(
                fn=reset_score,
                inputs=[],
                outputs=[
                    correct_count_state,
                    total_count_state,
                    streak_state,
                    best_streak_state,
                    quiz_score,
                    quiz_status,
                ],
            )

        with gr.Tab("About"):
            gr.Markdown(ABOUT_MD)

demo.launch()