Spaces:
Sleeping
Sleeping
import io
import os
import random
import tempfile
from functools import lru_cache
from typing import Dict, List, Tuple

import gradio as gr
import pandas as pd
from datasets import load_dataset
# Dataset identifier handed to `load_dataset` in `load_data`.
DATASET_REPO = "yashm/bioinformatics-qa-dataset"
# Seed applied once, at import time, to the module-level `random` generator
# that the quiz code uses for sampling and shuffling.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
def load_data() -> pd.DataFrame:
    """Load every split of the QA dataset into one cleaned DataFrame.

    Each split is converted to pandas, tagged with a ``split`` column and
    concatenated. Rows with a missing topic, question or answer, or with an
    empty question or answer after stripping whitespace, are discarded.

    Returns:
        A frame with columns ``id``, ``topic``, ``question``, ``answer``,
        ``split`` and ``answer_len`` (character count of ``answer``), with a
        fresh 0..n-1 index.

    Raises:
        ValueError: if any of ``id``, ``topic``, ``question`` or ``answer``
            is absent from the combined data.
    """
    ds = load_dataset(DATASET_REPO)
    frames = []
    for split_name in ds.keys():
        part = ds[split_name].to_pandas().copy()
        part["split"] = split_name
        frames.append(part)
    df = pd.concat(frames, ignore_index=True)

    required = ["id", "topic", "question", "answer"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Dataset is missing required columns: {missing}")

    df = df[["id", "topic", "question", "answer", "split"]].copy()

    # Drop missing values BEFORE casting to str: astype(str) turns NaN/None
    # into the literal strings "nan"/"None", after which dropna finds nothing.
    df = df.dropna(subset=["topic", "question", "answer"])
    for col in ["topic", "question", "answer", "split"]:
        df[col] = df[col].astype(str).str.strip()

    # .copy() so the column assignment below targets an owned frame rather
    # than a slice of the previous one.
    df = df[(df["question"] != "") & (df["answer"] != "")].copy()
    df["answer_len"] = df["answer"].str.len()
    return df.reset_index(drop=True)
# Load and clean the dataset once at import time; the filter, quiz and UI code
# below all read this module-level frame.
DF = load_data()
# Sorted distinct topic and split labels; used as dropdown choices in the UI.
ALL_TOPICS = sorted(DF["topic"].unique().tolist())
ALL_SPLITS = sorted(DF["split"].unique().tolist())
def compute_stats(df: pd.DataFrame) -> str:
    """Summarise a frame as a single ``" | "``-separated line.

    Reports the row count, the number of distinct ``topic`` values and the
    mean of ``answer_len`` (one decimal place). An empty frame reports zeros
    for all three without touching its columns.
    """
    n_rows = len(df)
    if n_rows:
        n_topics = df["topic"].nunique()
        mean_len = float(df["answer_len"].mean())
    else:
        n_topics, mean_len = 0, 0.0

    parts = [
        f"Total rows: {n_rows}",
        f"Unique topics: {n_topics}",
        f"Average answer length: {mean_len:.1f} chars",
    ]
    return " | ".join(parts)
def apply_filters(
    topic: str,
    split: str,
    keyword: str,
    min_len: int,
    max_len: int,
    sort_by: str,
    sort_dir: str
) -> pd.DataFrame:
    """Return the rows of ``DF`` that match the Explore-tab filters, sorted.

    Args:
        topic: exact topic to keep, or ``"All"`` for no topic filter.
        split: exact split to keep, or ``"All"`` for no split filter.
        keyword: case-insensitive literal substring searched in topic,
            question and answer; blank or ``None`` disables the search.
        min_len: inclusive lower bound on ``answer_len``.
        max_len: inclusive upper bound on ``answer_len``.
        sort_by: UI label of the sort key; unknown labels fall back to ``id``.
        sort_dir: ``"Ascending"`` sorts ascending, anything else descending.

    Returns:
        A new frame with a fresh 0..n-1 index; ``DF`` itself is not modified.
    """
    # No upfront copy needed: every step below (boolean indexing, sort_values,
    # reset_index) produces a new frame.
    out = DF
    if topic != "All":
        out = out[out["topic"] == topic]
    if split != "All":
        out = out[out["split"] == split]

    if keyword and keyword.strip():
        q = keyword.strip().lower()

        def has_keyword(col: str) -> pd.Series:
            # regex=False: the keyword is user text, not a pattern. With the
            # default regex matching, input such as "c++" or "(" raises or
            # matches the wrong rows.
            return out[col].str.lower().str.contains(q, regex=False, na=False)

        out = out[has_keyword("topic") | has_keyword("question") | has_keyword("answer")]

    out = out[(out["answer_len"] >= int(min_len)) & (out["answer_len"] <= int(max_len))]

    col_map = {
        "ID": "id",
        "Topic": "topic",
        "Question length": "question",
        "Answer length": "answer_len",
        "Split": "split",
    }
    sort_col = col_map.get(sort_by, "id")
    ascending = sort_dir == "Ascending"
    # "Question length" must order by character count; sorting the raw text
    # column would order the questions alphabetically instead.
    sort_key = (lambda s: s.str.len()) if sort_by == "Question length" else None
    out = out.sort_values(by=sort_col, ascending=ascending, kind="stable", key=sort_key)
    return out.reset_index(drop=True)
def run_explore(
    topic: str,
    split: str,
    keyword: str,
    min_len: int,
    max_len: int,
    sort_by: str,
    sort_dir: str,
    page_size: int,
    page_number: int
):
    """Filter the dataset and build the outputs for one page of results.

    Args:
        topic, split, keyword, min_len, max_len, sort_by, sort_dir: forwarded
            unchanged to ``apply_filters``.
        page_size: rows per page; coerced to an int of at least 1.
        page_number: requested 1-based page; clamped to the valid range.

    Returns:
        A 5-tuple: summary text, the page as a frame for the table, the same
        page serialised as JSON records, an update for the page slider
        (maximum and value) and an update for the row-inspection slider
        (maximum, value reset to 1).
    """
    filtered = apply_filters(topic, split, keyword, min_len, max_len, sort_by, sort_dir)
    total = len(filtered)

    # The values are used as slice bounds and as a divisor below, so force
    # them to ints and keep the page size positive.
    page_size = max(1, int(page_size))
    page_number = int(page_number or 1)

    pages = max(1, (total + page_size - 1) // page_size)
    page_number = min(max(1, page_number), pages)
    start = (page_number - 1) * page_size
    end = min(start + page_size, total)

    page_df = filtered.iloc[start:end].copy()
    table_df = page_df[["id", "topic", "question", "answer", "split", "answer_len"]]

    # With no matches report "rows 0 to 0", not "rows 1 to 0".
    first_shown = start + 1 if total else 0
    summary = (
        f"{compute_stats(filtered)}\n"
        f"Showing rows {first_shown} to {end} of {total} | "
        f"Page {page_number} of {pages}"
    )

    # The slider range must stay valid even when the page is empty.
    max_row_slider = max(1, len(page_df))
    return (
        summary,
        table_df,
        page_df.to_json(orient="records"),
        gr.update(maximum=pages, value=page_number),
        gr.update(maximum=max_row_slider, value=1),
    )
def show_row_detail(page_df_json: str, row_idx_1based: int):
    """Return the detail fields for one row of the current page.

    Args:
        page_df_json: the page serialised as JSON records (as produced with
            ``DataFrame.to_json(orient="records")``); empty means no page yet.
        row_idx_1based: 1-based row position; clamped into the page.

    Returns:
        A 5-tuple ``(header, topic, question, answer, metadata)``. When there
        is no page or the page has no rows, the header carries a message and
        the other four fields are empty strings.
    """
    if not page_df_json:
        return "No data loaded for this page.", "", "", "", ""

    # Wrap the text in StringIO: passing a literal JSON string to read_json is
    # deprecated. dtype=False keeps cells as parsed, so a string such as "007"
    # is not coerced to the number 7.
    page_df = pd.read_json(io.StringIO(page_df_json), dtype=False)
    if page_df.empty:
        return "No rows in this page.", "", "", "", ""

    idx = int(row_idx_1based) - 1
    idx = max(0, min(idx, len(page_df) - 1))
    row = page_df.iloc[idx]

    header = f"Record {idx + 1} on current page"
    return (
        header,
        str(row["topic"]),
        str(row["question"]),
        str(row["answer"]),
        f"Split: {row['split']} | ID: {row['id']} | Answer length: {row['answer_len']}",
    )
def export_filtered_csv(
    topic: str,
    split: str,
    keyword: str,
    min_len: int,
    max_len: int,
    sort_by: str,
    sort_dir: str
):
    """Write the currently filtered rows to a temporary CSV file.

    All arguments are forwarded unchanged to ``apply_filters``. The export
    contains ``id``, ``topic``, ``question``, ``answer`` and ``split`` (no
    index column, no ``answer_len``).

    Returns:
        Path of the CSV file. It is created with ``delete=False``, so it is
        not removed when the handle closes.
    """
    filtered = apply_filters(topic, split, keyword, min_len, max_len, sort_by, sort_dir)
    export_df = filtered[["id", "topic", "question", "answer", "split"]]

    # Write through the open handle instead of reopening tmp.name while it is
    # still open; reopening by name is not possible on every platform.
    # newline="" lets the CSV writer control line endings itself.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".csv", delete=False, newline="", encoding="utf-8"
    ) as tmp:
        export_df.to_csv(tmp, index=False)
    return tmp.name
def related_examples(question_text: str, topic: str, k: int = 3) -> str:
    """List up to ``k`` other questions from the same topic as bullet lines.

    Candidates are the rows of ``DF`` whose topic equals ``topic``. They are
    ranked by how many whitespace-separated, lower-cased words they share
    with ``question_text`` (ties broken by ascending ``id``), and rows whose
    question equals ``question_text`` are excluded. Returns a fixed
    "not found" message when there is nothing to show.
    """
    nothing_found = "No related examples found."

    same_topic = DF[DF["topic"] == topic].copy()
    if same_topic.empty:
        return nothing_found

    query_words = set(str(question_text).lower().split())
    if not query_words:
        return nothing_found

    # Score = number of distinct words shared with the query question.
    same_topic["score"] = same_topic["question"].apply(
        lambda text: len(query_words & set(str(text).lower().split()))
    )
    ranked = same_topic.sort_values(by=["score", "id"], ascending=[False, True])
    picks = ranked[ranked["question"] != question_text].head(k)
    if picks.empty:
        return nothing_found

    return "\n".join(f"- {candidate}" for candidate in picks["question"])
def score_text(correct: int, total: int, streak: int, best_streak: int) -> str:
    """Format the quiz scoreboard as one ``" | "``-separated line.

    Accuracy is ``correct / total`` as a percentage with one decimal place,
    and 0.0 when ``total`` is not positive.
    """
    accuracy = 0.0
    if total > 0:
        accuracy = 100.0 * correct / total

    fields = (
        f"Score: {correct}/{total}",
        f"Accuracy: {accuracy:.1f}%",
        f"Streak: {streak}",
        f"Best streak: {best_streak}",
    )
    return " | ".join(fields)
def generate_question(topic_filter: str, difficulty: str):
    """Pick a random question and build a 4-option multiple-choice item.

    Args:
        topic_filter: topic to draw the question from, or ``"All"``.
        difficulty: controls where the three wrong answers come from.
            ``"Easy"`` uses any other answer in the dataset. ``"Medium"``
            uses other answers from the same topic. Any other value is
            treated as hard and uses the same-topic answers closest in
            length to the correct one. Medium and hard fall back to the
            whole dataset when the topic has fewer than three distinct
            other answers.

    Returns:
        A 6-tuple: display text, an update for the choices radio, the correct
        answer, the raw question, the topic and the teaching text. When no
        question can be built, the first element is a message, the radio is
        emptied and the remaining four are empty strings.
    """
    # Hard mode samples its distractors from this many nearest-length answers.
    hard_candidates = 6

    def no_question(message: str):
        return (message, gr.update(choices=[], value=None), "", "", "", "")

    pool = DF if topic_filter == "All" else DF[DF["topic"] == topic_filter]
    if pool.empty:
        return no_question("No questions available for this filter.")

    row = pool.sample(1, random_state=random.randint(0, 10_000_000)).iloc[0]
    topic = str(row["topic"])
    question = str(row["question"])
    correct = str(row["answer"])

    global_pool = DF[DF["answer"] != correct]
    same_topic = global_pool[global_pool["topic"] == topic]
    enough_in_topic = len(same_topic["answer"].unique()) >= 3

    if difficulty == "Easy" or not enough_in_topic:
        candidate = global_pool
    elif difficulty == "Medium":
        candidate = same_topic
    else:
        # Keep only the nearest-length distinct answers. Sorting alone has no
        # effect, because the distractors are drawn by random sampling below.
        len_gap = (same_topic["answer"].str.len() - len(correct)).abs()
        candidate = (
            same_topic.assign(len_gap=len_gap)
            .sort_values(by=["len_gap", "id"])
            .drop_duplicates(subset="answer")
            .head(hard_candidates)
        )

    distractor_answers = candidate["answer"].dropna().astype(str).drop_duplicates().tolist()
    if len(distractor_answers) < 3:
        return no_question("Not enough distractors to generate a 4-option question.")

    distractors = random.sample(distractor_answers, 3)
    options = distractors + [correct]
    random.shuffle(options)

    question_block = f"Topic: {topic}\n\nQuestion: {question}"
    teach_note = (
        f"Teaching note: This question belongs to {topic}. "
        f"Focus on core definitions and tool usage terms."
    )
    related = related_examples(question, topic, k=3)
    return (
        question_block,
        gr.update(choices=options, value=None),
        correct,
        question,
        topic,
        f"{teach_note}\n\nRelated questions:\n{related}",
    )
def start_quiz(
    topic_filter: str,
    difficulty: str,
    correct_count: int,
    total_count: int,
    streak: int,
    best_streak: int
):
    """Generate a first question and pair it with the current scoreboard.

    Returns the six values from ``generate_question`` followed by the
    formatted score line and a fixed status message. The counters are only
    read, never changed.
    """
    generated = generate_question(topic_filter, difficulty)
    prompt, options, answer, raw_question, raw_topic, teaching = generated

    scoreboard = score_text(correct_count, total_count, streak, best_streak)
    status = "Quiz started. Select an answer and submit."
    return prompt, options, answer, raw_question, raw_topic, teaching, scoreboard, status
def submit_and_next(
    selected: str,
    current_correct: str,
    current_q: str,
    current_topic: str,
    topic_filter: str,
    difficulty: str,
    correct_count: int,
    total_count: int,
    streak: int,
    best_streak: int
):
    """Grade the selected answer, update the counters and load the next question.

    Args:
        selected: the option the user picked; empty or ``None`` if none.
        current_correct, current_q, current_topic: state of the question
            being answered.
        topic_filter, difficulty: forwarded to ``generate_question`` for the
            next question.
        correct_count, total_count, streak, best_streak: running counters.

    Returns:
        A 13-tuple: result text, question text, choices update, correct
        answer, raw question, topic, teaching text, score line, status
        message, then the four counters. When there is no active question or
        nothing is selected, the counters and question state are returned
        unchanged and the question, choices and teaching components are left
        as they are.
    """

    def keep_current(message: str, status: str):
        # gr.update() with no arguments leaves a component untouched. Using
        # it for the teaching box (instead of "") keeps the current
        # question's teaching note visible when nothing was graded.
        return (
            message,
            gr.update(),
            gr.update(),
            current_correct,
            current_q,
            current_topic,
            gr.update(),
            score_text(correct_count, total_count, streak, best_streak),
            status,
            correct_count,
            total_count,
            streak,
            best_streak,
        )

    if not current_correct or not current_q:
        return keep_current("Click Start Quiz first.", "No active question.")
    if not selected:
        return keep_current(
            "Please select one option before submitting.",
            "Waiting for answer selection.",
        )

    total_count += 1
    if selected == current_correct:
        correct_count += 1
        streak += 1
        best_streak = max(best_streak, streak)
        result = (
            "Correct.\n\n"
            f"Your answer: {selected}\n\n"
            f"Reference answer: {current_correct}"
        )
    else:
        streak = 0
        result = (
            "Incorrect.\n\n"
            f"Your answer: {selected}\n\n"
            f"Correct answer: {current_correct}"
        )

    next_q, next_choices, next_correct, next_raw_q, next_raw_topic, next_teach = generate_question(
        topic_filter, difficulty
    )
    return (
        result,
        next_q,
        next_choices,
        next_correct,
        next_raw_q,
        next_raw_topic,
        next_teach,
        score_text(correct_count, total_count, streak, best_streak),
        "Auto-loaded next question.",
        correct_count,
        total_count,
        streak,
        best_streak,
    )
def reset_score():
    """Return zeroed counters, the matching score line and a status message.

    The tuple is ordered as: correct count, total count, streak, best streak,
    score text, status text.
    """
    zeroed = (0, 0, 0, 0)
    return (*zeroed, score_text(*zeroed), "Score reset. Click Start Quiz.")
# Stylesheet passed to gr.Blocks(css=...). It declares colour variables on
# :root, a gradient page background, a 1280px cap on .gradio-container, the
# tinted "#hero" banner used by the header HTML, and a bordered ".card" style.
CSS = """
:root {
--brand: #0f766e;
--accent: #0ea5e9;
--bg-soft: #f8fafc;
--card: #ffffff;
--text: #0f172a;
--muted: #475569;
}
body {
background: linear-gradient(180deg, #f0fdfa 0%, #f8fafc 35%, #ffffff 100%);
}
.gradio-container {
max-width: 1280px !important;
}
#hero {
background: linear-gradient(135deg, rgba(15,118,110,0.10), rgba(14,165,233,0.10));
border: 1px solid rgba(15,118,110,0.20);
border-radius: 16px;
padding: 14px 16px;
}
#hero h1, #hero p {
color: var(--text);
}
.card {
background: var(--card);
border-radius: 14px;
border: 1px solid #e2e8f0;
padding: 10px 12px;
}
"""
# Build the three-tab UI (Explore, Quiz, About) and wire the buttons to the
# callbacks defined above.
# NOTE(review): the nesting below (which widgets sit inside which gr.Row) was
# reconstructed from a listing without indentation — confirm the layout.
with gr.Blocks(
    title="Bioinformatics QA Teaching Studio",
    css=CSS,
    theme=gr.themes.Soft(
        primary_hue="teal",
        secondary_hue="sky",
        neutral_hue="slate"
    ),
) as demo:
    # Header banner; styled by the "#hero" rules in CSS.
    gr.HTML(
        """
<div id="hero">
<h1>Bioinformatics QA Teaching Studio</h1>
<p>
Explore the dataset, learn core concepts, and practice with teaching-mode multiple-choice quizzes.
This app is for learning and research purposes only. Validate content before high-stakes use.
</p>
</div>
"""
    )
    with gr.Tabs():
        with gr.Tab("Explore"):
            # Filter controls: topic, split and free-text keyword.
            with gr.Row():
                topic_dd = gr.Dropdown(
                    choices=["All"] + ALL_TOPICS,
                    value="All",
                    label="Topic"
                )
                split_dd = gr.Dropdown(
                    choices=["All"] + ALL_SPLITS,
                    value="All",
                    label="Split"
                )
                keyword_tb = gr.Textbox(
                    label="Keyword search",
                    placeholder="Search topic, question, or answer"
                )
            # Answer-length range and sort order. Slider upper bounds come from
            # the longest answer in DF, with a floor of 20.
            with gr.Row():
                min_len = gr.Slider(0, int(max(DF["answer_len"].max(), 20)), value=0, step=1, label="Min answer length")
                max_len = gr.Slider(0, int(max(DF["answer_len"].max(), 20)), value=int(DF["answer_len"].max()), step=1, label="Max answer length")
                sort_by = gr.Dropdown(
                    choices=["ID", "Topic", "Question length", "Answer length", "Split"],
                    value="ID",
                    label="Sort by"
                )
                sort_dir = gr.Radio(
                    choices=["Ascending", "Descending"],
                    value="Ascending",
                    label="Order"
                )
            # Paging controls and action buttons. The page slider starts as
            # 1..1; run_explore returns an update that sets its maximum.
            with gr.Row():
                page_size = gr.Slider(5, 100, value=15, step=5, label="Rows per page")
                page_number = gr.Slider(1, 1, value=1, step=1, label="Page")
                run_btn = gr.Button("Apply filters", variant="primary")
                export_btn = gr.Button("Export filtered CSV")
            summary_md = gr.Markdown(value=compute_stats(DF))
            table = gr.Dataframe(
                headers=["id", "topic", "question", "answer", "split", "answer_len"],
                wrap=True,
                interactive=False,
                label="Filtered results"
            )
            # Holds the current page as JSON records (third output of
            # run_explore); read by show_row_detail.
            filtered_state = gr.State("")
            row_slider = gr.Slider(1, 1, value=1, step=1, label="Inspect row on current page")
            inspect_btn = gr.Button("Show row details")
            detail_header = gr.Markdown(value="Select filters and click Apply.")
            detail_topic = gr.Textbox(label="Topic", interactive=False)
            detail_question = gr.Textbox(label="Question", lines=4, interactive=False)
            detail_answer = gr.Textbox(label="Answer", lines=7, interactive=False)
            detail_meta = gr.Textbox(label="Metadata", interactive=False)
            csv_file = gr.File(label="Download CSV", interactive=False)
            # Output order must match the tuple returned by run_explore.
            run_btn.click(
                fn=run_explore,
                inputs=[topic_dd, split_dd, keyword_tb, min_len, max_len, sort_by, sort_dir, page_size, page_number],
                outputs=[summary_md, table, filtered_state, page_number, row_slider],
            )
            inspect_btn.click(
                fn=show_row_detail,
                inputs=[filtered_state, row_slider],
                outputs=[detail_header, detail_topic, detail_question, detail_answer, detail_meta],
            )
            # Export re-applies the filters rather than reusing the page state,
            # so the CSV covers all matching rows, not just the visible page.
            export_btn.click(
                fn=export_filtered_csv,
                inputs=[topic_dd, split_dd, keyword_tb, min_len, max_len, sort_by, sort_dir],
                outputs=[csv_file],
            )
        with gr.Tab("Quiz"):
            with gr.Row():
                quiz_topic = gr.Dropdown(
                    choices=["All"] + ALL_TOPICS,
                    value="All",
                    label="Topic filter"
                )
                difficulty = gr.Radio(
                    choices=["Easy", "Medium", "Hard"],
                    value="Medium",
                    label="Difficulty"
                )
                start_btn = gr.Button("Start quiz", variant="primary")
                reset_btn = gr.Button("Reset score")
            quiz_score = gr.Markdown(value=score_text(0, 0, 0, 0))
            quiz_status = gr.Textbox(label="Status", interactive=False)
            question_box = gr.Textbox(label="Question", lines=5, interactive=False)
            choices_radio = gr.Radio(choices=[], label="Choose one answer")
            submit_btn = gr.Button("Submit and load next question", variant="primary")
            result_box = gr.Textbox(label="Result", lines=6, interactive=False)
            teaching_box = gr.Textbox(label="Teaching support", lines=8, interactive=False)
            # State for the active question: correct answer, raw question, topic.
            correct_state = gr.State("")
            q_state = gr.State("")
            topic_state = gr.State("")
            # Running counters passed to and returned from submit_and_next.
            correct_count_state = gr.State(0)
            total_count_state = gr.State(0)
            streak_state = gr.State(0)
            best_streak_state = gr.State(0)
            # Output order must match the 8-tuple returned by start_quiz.
            start_btn.click(
                fn=start_quiz,
                inputs=[
                    quiz_topic,
                    difficulty,
                    correct_count_state,
                    total_count_state,
                    streak_state,
                    best_streak_state,
                ],
                outputs=[
                    question_box,
                    choices_radio,
                    correct_state,
                    q_state,
                    topic_state,
                    teaching_box,
                    quiz_score,
                    quiz_status,
                ],
            )
            # Output order must match the 13-tuple returned by submit_and_next.
            submit_btn.click(
                fn=submit_and_next,
                inputs=[
                    choices_radio,
                    correct_state,
                    q_state,
                    topic_state,
                    quiz_topic,
                    difficulty,
                    correct_count_state,
                    total_count_state,
                    streak_state,
                    best_streak_state,
                ],
                outputs=[
                    result_box,
                    question_box,
                    choices_radio,
                    correct_state,
                    q_state,
                    topic_state,
                    teaching_box,
                    quiz_score,
                    quiz_status,
                    correct_count_state,
                    total_count_state,
                    streak_state,
                    best_streak_state,
                ],
            )
            # Resets only the counters, score line and status; the question
            # currently on screen is not among the outputs.
            reset_btn.click(
                fn=reset_score,
                inputs=[],
                outputs=[
                    correct_count_state,
                    total_count_state,
                    streak_state,
                    best_streak_state,
                    quiz_score,
                    quiz_status,
                ],
            )
        with gr.Tab("About"):
            gr.Markdown(
                """
## About this teaching app
This Space demonstrates:
- Practical dataset exploration for bioinformatics QA data
- Teaching-mode multiple-choice practice with topic-aware distractors
- Session score tracking with streak metrics
## Important notice
This app is intended for learning and research use only.
Use with caution.
Do not use as a replacement for expert biomedical or clinical judgment.
## Dataset
- Source: yashm/bioinformatics-qa-dataset
- Citation and DOI are listed in the project README
"""
            )
# Runs at import time (there is no __main__ guard): starts the Gradio server.
demo.launch()