import os import random import tempfile from functools import lru_cache from typing import Dict, List, Tuple import gradio as gr import pandas as pd from datasets import load_dataset DATASET_REPO = "yashm/bioinformatics-qa-dataset" RANDOM_SEED = 42 random.seed(RANDOM_SEED) @lru_cache(maxsize=1) def load_data() -> pd.DataFrame: ds = load_dataset(DATASET_REPO) frames = [] for split_name in ds.keys(): part = ds[split_name].to_pandas().copy() part["split"] = split_name frames.append(part) df = pd.concat(frames, ignore_index=True) required = ["id", "topic", "question", "answer"] missing = [c for c in required if c not in df.columns] if missing: raise ValueError(f"Dataset is missing required columns: {missing}") df = df[["id", "topic", "question", "answer", "split"]].copy() for col in ["topic", "question", "answer", "split"]: df[col] = df[col].astype(str).str.strip() df = df.dropna(subset=["topic", "question", "answer"]) df = df[(df["question"] != "") & (df["answer"] != "")] df["answer_len"] = df["answer"].str.len() df = df.reset_index(drop=True) return df DF = load_data() ALL_TOPICS = sorted(DF["topic"].unique().tolist()) ALL_SPLITS = sorted(DF["split"].unique().tolist()) def compute_stats(df: pd.DataFrame) -> str: total_rows = len(df) total_topics = df["topic"].nunique() if total_rows else 0 avg_answer_len = float(df["answer_len"].mean()) if total_rows else 0.0 return ( f"Total rows: {total_rows} | " f"Unique topics: {total_topics} | " f"Average answer length: {avg_answer_len:.1f} chars" ) def apply_filters( topic: str, split: str, keyword: str, min_len: int, max_len: int, sort_by: str, sort_dir: str ) -> pd.DataFrame: out = DF.copy() if topic != "All": out = out[out["topic"] == topic] if split != "All": out = out[out["split"] == split] if keyword and keyword.strip(): q = keyword.strip().lower() out = out[ out["topic"].str.lower().str.contains(q, na=False) | out["question"].str.lower().str.contains(q, na=False) | out["answer"].str.lower().str.contains(q, na=False) ] out = out[(out["answer_len"] >= int(min_len)) & (out["answer_len"] <= int(max_len))] col_map = { "ID": "id", "Topic": "topic", "Question length": "question", "Answer length": "answer_len", "Split": "split", } sort_col = col_map.get(sort_by, "id") ascending = sort_dir == "Ascending" out = out.sort_values(by=sort_col, ascending=ascending, kind="stable") return out.reset_index(drop=True) def run_explore( topic: str, split: str, keyword: str, min_len: int, max_len: int, sort_by: str, sort_dir: str, page_size: int, page_number: int ): filtered = apply_filters(topic, split, keyword, min_len, max_len, sort_by, sort_dir) total = len(filtered) pages = max(1, (total + page_size - 1) // page_size) page_number = min(max(1, page_number), pages) start = (page_number - 1) * page_size end = min(start + page_size, total) page_df = filtered.iloc[start:end].copy() table_df = page_df[["id", "topic", "question", "answer", "split", "answer_len"]] summary = ( f"{compute_stats(filtered)}\n" f"Showing rows {start + 1} to {end if total else 0} of {total} | " f"Page {page_number} of {pages}" ) max_row_slider = max(1, len(page_df)) return ( summary, table_df, page_df.to_json(orient="records"), gr.update(maximum=pages, value=page_number), gr.update(maximum=max_row_slider, value=1), ) def show_row_detail(page_df_json: str, row_idx_1based: int): if not page_df_json: return "No data loaded for this page.", "", "", "", "" page_df = pd.read_json(page_df_json) if page_df.empty: return "No rows in this page.", "", "", "", "" idx = int(row_idx_1based) - 1 idx = max(0, min(idx, len(page_df) - 1)) row = page_df.iloc[idx] header = f"Record {idx + 1} on current page" return ( header, str(row["topic"]), str(row["question"]), str(row["answer"]), f"Split: {row['split']} | ID: {row['id']} | Answer length: {row['answer_len']}", ) def export_filtered_csv( topic: str, split: str, keyword: str, min_len: int, max_len: int, sort_by: str, sort_dir: str ): filtered = apply_filters(topic, split, keyword, min_len, max_len, sort_by, sort_dir) export_df = filtered[["id", "topic", "question", "answer", "split"]].copy() with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp: export_df.to_csv(tmp.name, index=False) return tmp.name def related_examples(question_text: str, topic: str, k: int = 3) -> str: subset = DF[DF["topic"] == topic].copy() if subset.empty: return "No related examples found." q_words = set(str(question_text).lower().split()) if not q_words: return "No related examples found." def overlap_score(text: str) -> int: return len(q_words.intersection(set(str(text).lower().split()))) subset["score"] = subset["question"].apply(overlap_score) subset = subset.sort_values(by=["score", "id"], ascending=[False, True]) subset = subset[subset["question"] != question_text].head(k) if subset.empty: return "No related examples found." lines = [] for _, r in subset.iterrows(): lines.append(f"- {r['question']}") return "\n".join(lines) def score_text(correct: int, total: int, streak: int, best_streak: int) -> str: acc = (100.0 * correct / total) if total > 0 else 0.0 return ( f"Score: {correct}/{total} | " f"Accuracy: {acc:.1f}% | " f"Streak: {streak} | " f"Best streak: {best_streak}" ) def generate_question(topic_filter: str, difficulty: str): pool = DF.copy() if topic_filter != "All": pool = pool[pool["topic"] == topic_filter] if pool.empty: return ( "No questions available for this filter.", gr.update(choices=[], value=None), "", "", "", "", ) row = pool.sample(1, random_state=random.randint(0, 10_000_000)).iloc[0] topic = str(row["topic"]) question = str(row["question"]) correct = str(row["answer"]) same_topic = DF[(DF["topic"] == topic) & (DF["answer"] != correct)].copy() global_pool = DF[DF["answer"] != correct].copy() if difficulty == "Easy": candidate = global_pool elif difficulty == "Medium": candidate = same_topic if len(same_topic["answer"].unique()) >= 3 else global_pool else: target_len = len(correct) hard_pool = same_topic.copy() hard_pool["len_gap"] = (hard_pool["answer"].str.len() - target_len).abs() hard_pool = hard_pool.sort_values(by=["len_gap", "id"]) if len(hard_pool["answer"].unique()) >= 3: candidate = hard_pool elif len(same_topic["answer"].unique()) >= 3: candidate = same_topic else: candidate = global_pool distractor_answers = candidate["answer"].dropna().astype(str).drop_duplicates().tolist() if len(distractor_answers) < 3: return ( "Not enough distractors to generate a 4-option question.", gr.update(choices=[], value=None), "", "", "", "", ) distractors = random.sample(distractor_answers, 3) options = distractors + [correct] random.shuffle(options) question_block = f"Topic: {topic}\n\nQuestion: {question}" teach_note = ( f"Teaching note: This question belongs to {topic}. " f"Focus on core definitions and tool usage terms." ) related = related_examples(question, topic, k=3) return ( question_block, gr.update(choices=options, value=None), correct, question, topic, f"{teach_note}\n\nRelated questions:\n{related}", ) def start_quiz( topic_filter: str, difficulty: str, correct_count: int, total_count: int, streak: int, best_streak: int ): q, choices, correct, raw_q, raw_topic, teach = generate_question(topic_filter, difficulty) return ( q, choices, correct, raw_q, raw_topic, teach, score_text(correct_count, total_count, streak, best_streak), "Quiz started. Select an answer and submit.", ) def submit_and_next( selected: str, current_correct: str, current_q: str, current_topic: str, topic_filter: str, difficulty: str, correct_count: int, total_count: int, streak: int, best_streak: int ): if not current_correct or not current_q: return ( "Click Start Quiz first.", gr.update(), gr.update(), current_correct, current_q, current_topic, "", score_text(correct_count, total_count, streak, best_streak), "No active question.", correct_count, total_count, streak, best_streak, ) if not selected: return ( "Please select one option before submitting.", gr.update(), gr.update(), current_correct, current_q, current_topic, "", score_text(correct_count, total_count, streak, best_streak), "Waiting for answer selection.", correct_count, total_count, streak, best_streak, ) total_count += 1 if selected == current_correct: correct_count += 1 streak += 1 best_streak = max(best_streak, streak) result = ( "Correct.\n\n" f"Your answer: {selected}\n\n" f"Reference answer: {current_correct}" ) else: streak = 0 result = ( "Incorrect.\n\n" f"Your answer: {selected}\n\n" f"Correct answer: {current_correct}" ) next_q, next_choices, next_correct, next_raw_q, next_raw_topic, next_teach = generate_question( topic_filter, difficulty ) return ( result, next_q, next_choices, next_correct, next_raw_q, next_raw_topic, next_teach, score_text(correct_count, total_count, streak, best_streak), "Auto-loaded next question.", correct_count, total_count, streak, best_streak, ) def reset_score(): return ( 0, 0, 0, 0, score_text(0, 0, 0, 0), "Score reset. Click Start Quiz." ) CSS = """ :root { --brand: #0f766e; --accent: #0ea5e9; --bg-soft: #f8fafc; --card: #ffffff; --text: #0f172a; --muted: #475569; } body { background: linear-gradient(180deg, #f0fdfa 0%, #f8fafc 35%, #ffffff 100%); } .gradio-container { max-width: 1280px !important; } #hero { background: linear-gradient(135deg, rgba(15,118,110,0.10), rgba(14,165,233,0.10)); border: 1px solid rgba(15,118,110,0.20); border-radius: 16px; padding: 14px 16px; } #hero h1, #hero p { color: var(--text); } .card { background: var(--card); border-radius: 14px; border: 1px solid #e2e8f0; padding: 10px 12px; } """ with gr.Blocks( title="Bioinformatics QA Teaching Studio", css=CSS, theme=gr.themes.Soft( primary_hue="teal", secondary_hue="sky", neutral_hue="slate" ), ) as demo: gr.HTML( """
Explore the dataset, learn core concepts, and practice with teaching-mode multiple-choice quizzes. This app is for learning and research purposes only. Validate content before high-stakes use.