# Hugging Face Space: yashm/Bioinformatics (status: Sleeping)
# File size: 19,527 Bytes

import io
import os
import random
import tempfile
from functools import lru_cache
from typing import Dict, List, Tuple

import gradio as gr
import pandas as pd
from datasets import load_dataset

# Dataset repository id handed to `load_dataset` inside `load_data`.
DATASET_REPO = "yashm/bioinformatics-qa-dataset"
# Seed applied to Python's global `random` generator at import time.
RANDOM_SEED = 42

# Seeding happens once, when the module is imported; every later call into
# the `random` module from this file draws from that single seeded stream.
random.seed(RANDOM_SEED)


@lru_cache(maxsize=1)
def load_data() -> pd.DataFrame:
    """Load every split of the QA dataset into one cleaned DataFrame.

    Each split is converted to pandas, tagged with a ``split`` column and
    concatenated. Rows with a missing topic, question or answer are dropped,
    text columns are stripped, and rows whose question or answer is empty
    after stripping are removed. An ``answer_len`` column (character count of
    the answer) is added.

    The result is cached, so callers share one DataFrame instance and must
    not mutate it in place.

    Returns:
        DataFrame with columns ``id``, ``topic``, ``question``, ``answer``,
        ``split`` and ``answer_len``, indexed 0..n-1.

    Raises:
        ValueError: if any of ``id``, ``topic``, ``question``, ``answer`` is
            absent from the loaded data.
    """
    ds = load_dataset(DATASET_REPO)

    frames = []
    for split_name, split_ds in ds.items():
        part = split_ds.to_pandas().copy()
        part["split"] = split_name
        frames.append(part)

    df = pd.concat(frames, ignore_index=True)

    required = ["id", "topic", "question", "answer"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Dataset is missing required columns: {missing}")

    df = df[["id", "topic", "question", "answer", "split"]].copy()

    # Drop missing values BEFORE casting to str: astype(str) turns NaN/None
    # into the literal text "nan"/"None", after which dropna finds nothing.
    df = df.dropna(subset=["topic", "question", "answer"]).copy()

    for col in ["topic", "question", "answer", "split"]:
        df[col] = df[col].astype(str).str.strip()

    # .copy() so the column assignment below works on an independent frame.
    df = df[(df["question"] != "") & (df["answer"] != "")].copy()
    df["answer_len"] = df["answer"].str.len()

    return df.reset_index(drop=True)


# The dataset is loaded at import time; the functions below read these
# module-level globals directly.
DF = load_data()
# Sorted distinct topic / split values, used as dropdown choices in the UI.
ALL_TOPICS = sorted(DF["topic"].unique().tolist())
ALL_SPLITS = sorted(DF["split"].unique().tolist())


def compute_stats(df: pd.DataFrame) -> str:
    """Build the one-line summary shown above the results table.

    Args:
        df: frame with at least ``topic`` and ``answer_len`` columns.

    Returns:
        A string reporting row count, distinct topic count and mean answer
        length; an empty frame reports zeros for all three.
    """
    row_count = len(df)
    if row_count:
        topic_count = df["topic"].nunique()
        mean_len = float(df["answer_len"].mean())
    else:
        topic_count = 0
        mean_len = 0.0

    segments = [
        f"Total rows: {row_count}",
        f"Unique topics: {topic_count}",
        f"Average answer length: {mean_len:.1f} chars",
    ]
    return " | ".join(segments)


def apply_filters(
    topic: str,
    split: str,
    keyword: str,
    min_len: int,
    max_len: int,
    sort_by: str,
    sort_dir: str
) -> pd.DataFrame:
    """Filter and sort the module-level dataset according to the UI controls.

    Args:
        topic: exact topic to keep, or ``"All"`` for no topic filter.
        split: exact split to keep, or ``"All"`` for no split filter.
        keyword: case-insensitive literal substring searched in topic,
            question and answer; blank or whitespace-only disables the search.
        min_len: inclusive lower bound on ``answer_len``.
        max_len: inclusive upper bound on ``answer_len``.
        sort_by: one of ``"ID"``, ``"Topic"``, ``"Question length"``,
            ``"Answer length"``, ``"Split"``; anything else sorts by id.
        sort_dir: ``"Ascending"`` for ascending order, any other value for
            descending.

    Returns:
        A new DataFrame (index reset) with the matching rows in sorted order.
    """
    # Boolean indexing and sort_values return new frames, so DF is never
    # mutated even though no up-front copy is taken.
    out = DF

    if topic != "All":
        out = out[out["topic"] == topic]

    if split != "All":
        out = out[out["split"] == split]

    needle = keyword.strip().lower() if keyword else ""
    if needle:
        def contains(col: str) -> pd.Series:
            # regex=False: the keyword is user text, not a pattern. Without it
            # an input such as "(" raises re.error and "." matches anything.
            return out[col].str.lower().str.contains(needle, regex=False, na=False)

        out = out[contains("topic") | contains("question") | contains("answer")]

    out = out[out["answer_len"].between(int(min_len), int(max_len))]

    ascending = sort_dir == "Ascending"
    if sort_by == "Question length":
        # Sort on the actual character count; sorting the "question" column
        # itself would order rows alphabetically, not by length.
        out = (
            out.assign(_question_len=out["question"].str.len())
            .sort_values(by="_question_len", ascending=ascending, kind="stable")
            .drop(columns="_question_len")
        )
    else:
        col_map = {
            "ID": "id",
            "Topic": "topic",
            "Answer length": "answer_len",
            "Split": "split",
        }
        sort_col = col_map.get(sort_by, "id")
        out = out.sort_values(by=sort_col, ascending=ascending, kind="stable")

    return out.reset_index(drop=True)


def run_explore(
    topic: str,
    split: str,
    keyword: str,
    min_len: int,
    max_len: int,
    sort_by: str,
    sort_dir: str,
    page_size: int,
    page_number: int
):
    """Apply the filters and return one page of results for the Explore tab.

    Args:
        topic, split, keyword, min_len, max_len, sort_by, sort_dir: forwarded
            unchanged to ``apply_filters``.
        page_size: rows per page; coerced to an int of at least 1.
        page_number: requested 1-based page; clamped to the valid range.

    Returns:
        A 5-tuple: summary text, the page as a DataFrame for display, the
        same page serialised as JSON records, an update for the page slider
        (maximum and value), and an update for the row-inspection slider.
    """
    # Slider values can arrive as floats; iloc and floor division need ints.
    page_size = max(1, int(page_size))
    page_number = int(page_number)

    filtered = apply_filters(topic, split, keyword, min_len, max_len, sort_by, sort_dir)
    total = len(filtered)
    pages = max(1, (total + page_size - 1) // page_size)
    page_number = min(max(1, page_number), pages)

    start = (page_number - 1) * page_size
    end = min(start + page_size, total)

    page_df = filtered.iloc[start:end].copy()
    table_df = page_df[["id", "topic", "question", "answer", "split", "answer_len"]]

    # With no matches, report "0 to 0 of 0" rather than "1 to 0 of 0".
    first_shown = start + 1 if total else 0
    summary = (
        f"{compute_stats(filtered)}\n"
        f"Showing rows {first_shown} to {end} of {total} | "
        f"Page {page_number} of {pages}"
    )

    # The row slider must keep a maximum of at least 1 even on an empty page.
    max_row_slider = max(1, len(page_df))
    return (
        summary,
        table_df,
        page_df.to_json(orient="records"),
        gr.update(maximum=pages, value=page_number),
        gr.update(maximum=max_row_slider, value=1),
    )


def show_row_detail(page_df_json: str, row_idx_1based: int):
    """Return the detail fields for one row of the currently displayed page.

    Args:
        page_df_json: the page serialised as JSON records (as produced by
            ``DataFrame.to_json(orient="records")``); empty/None means no
            page has been loaded yet.
        row_idx_1based: 1-based row position on the page; out-of-range
            values are clamped to the first/last row.

    Returns:
        A 5-tuple ``(header, topic, question, answer, metadata)`` of strings.
        When there is nothing to show, the header carries the explanation and
        the other four fields are empty.
    """
    blank = ("", "", "", "")
    if not page_df_json:
        return ("No data loaded for this page.", *blank)

    # Wrap in StringIO: passing literal JSON text to read_json is deprecated
    # in recent pandas. dtype/convert_dates are disabled so values come back
    # exactly as serialised (e.g. an id of "007" is not turned into 7).
    page_df = pd.read_json(
        io.StringIO(page_df_json),
        orient="records",
        dtype=False,
        convert_dates=False,
    )
    if page_df.empty:
        return ("No rows in this page.", *blank)

    # Convert to 0-based and clamp into [0, len - 1].
    idx = int(row_idx_1based) - 1
    idx = max(0, min(idx, len(page_df) - 1))
    row = page_df.iloc[idx]

    header = f"Record {idx + 1} on current page"
    return (
        header,
        str(row["topic"]),
        str(row["question"]),
        str(row["answer"]),
        f"Split: {row['split']} | ID: {row['id']} | Answer length: {row['answer_len']}",
    )


def export_filtered_csv(
    topic: str,
    split: str,
    keyword: str,
    min_len: int,
    max_len: int,
    sort_by: str,
    sort_dir: str
):
    """Write the currently filtered rows to a temporary CSV file.

    All arguments are forwarded unchanged to ``apply_filters``.

    Returns:
        Path of the ``.csv`` temp file. The file is created with
        ``delete=False``, so it is left on disk after this function returns.
    """
    export_columns = ["id", "topic", "question", "answer", "split"]
    matching = apply_filters(topic, split, keyword, min_len, max_len, sort_by, sort_dir)
    payload = matching[export_columns].copy()

    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as handle:
        csv_path = handle.name
        payload.to_csv(csv_path, index=False)
        return csv_path


def related_examples(question_text: str, topic: str, k: int = 3) -> str:
    """List up to ``k`` other questions from the same topic, best match first.

    Similarity is the number of distinct lowercase whitespace-separated words
    a candidate question shares with ``question_text``; ties are ordered by
    ascending ``id``. Questions identical to ``question_text`` are excluded.

    Returns:
        A newline-separated bullet list ("- <question>"), or the text
        "No related examples found." when nothing qualifies.
    """
    nothing_found = "No related examples found."

    candidates = DF[DF["topic"] == topic].copy()
    if candidates.empty:
        return nothing_found

    query_tokens = set(str(question_text).lower().split())
    if not query_tokens:
        return nothing_found

    candidates["score"] = [
        len(query_tokens & set(str(text).lower().split()))
        for text in candidates["question"]
    ]
    ranked = candidates.sort_values(by=["score", "id"], ascending=[False, True])
    picks = ranked.loc[ranked["question"] != question_text, "question"].head(k)

    if picks.empty:
        return nothing_found

    return "\n".join(f"- {text}" for text in picks)


def score_text(correct: int, total: int, streak: int, best_streak: int) -> str:
    """Format the quiz scoreboard line.

    Accuracy is ``correct / total`` as a percentage with one decimal place,
    and is reported as 0.0 when no question has been answered yet.
    """
    accuracy = 0.0
    if total > 0:
        accuracy = 100.0 * correct / total

    fields = (
        f"Score: {correct}/{total}",
        f"Accuracy: {accuracy:.1f}%",
        f"Streak: {streak}",
        f"Best streak: {best_streak}",
    )
    return " | ".join(fields)


# In "Hard" mode, distractors are drawn only from this many same-topic
# answers whose length is closest to the correct answer's length.
_HARD_POOL_SIZE = 6


def _no_question(message: str):
    """Return the 6-tuple used when no question can be produced."""
    return (message, gr.update(choices=[], value=None), "", "", "", "")


def _unique_answers(frame: pd.DataFrame) -> List[str]:
    """Return the distinct answer strings of ``frame`` in row order."""
    return frame["answer"].dropna().astype(str).drop_duplicates().tolist()


def generate_question(topic_filter: str, difficulty: str):
    """Pick a random question and build a 4-option multiple-choice item.

    Distractor selection depends on ``difficulty``:

    * ``"Easy"``   - any other answer in the dataset.
    * ``"Medium"`` - other answers from the same topic.
    * otherwise (``"Hard"``) - same-topic answers whose length is closest to
      the correct answer, so options cannot be told apart by length alone.

    Medium and Hard fall back to the whole dataset when the topic offers
    fewer than three distinct alternative answers.

    Args:
        topic_filter: topic to draw the question from, or ``"All"``.
        difficulty: ``"Easy"``, ``"Medium"`` or ``"Hard"``.

    Returns:
        A 6-tuple ``(question_block, choices_update, correct_answer,
        raw_question, topic, teaching_text)``. On failure the first element
        is an explanatory message, the choices are cleared and the remaining
        four elements are empty strings.
    """
    pool = DF if topic_filter == "All" else DF[DF["topic"] == topic_filter]
    if pool.empty:
        return _no_question("No questions available for this filter.")

    row = pool.sample(1, random_state=random.randint(0, 10_000_000)).iloc[0]
    topic = str(row["topic"])
    question = str(row["question"])
    correct = str(row["answer"])

    # Candidate distractors never include the correct answer text.
    global_pool = DF[DF["answer"] != correct]
    same_topic = global_pool[global_pool["topic"] == topic]
    topic_answers = _unique_answers(same_topic)

    if difficulty == "Easy" or len(topic_answers) < 3:
        distractor_answers = _unique_answers(global_pool)
    elif difficulty == "Medium":
        distractor_answers = topic_answers
    else:
        # Previously the length-ranked pool was sampled uniformly, which
        # discarded the ranking and made Hard identical to Medium. Keeping
        # only the nearest answers makes the ranking actually matter.
        target_len = len(correct)
        ranked = sorted(topic_answers, key=lambda ans: abs(len(ans) - target_len))
        distractor_answers = ranked[:_HARD_POOL_SIZE]

    if len(distractor_answers) < 3:
        return _no_question("Not enough distractors to generate a 4-option question.")

    options = random.sample(distractor_answers, 3) + [correct]
    random.shuffle(options)

    question_block = f"Topic: {topic}\n\nQuestion: {question}"

    teach_note = (
        f"Teaching note: This question belongs to {topic}. "
        f"Focus on core definitions and tool usage terms."
    )
    related = related_examples(question, topic, k=3)

    return (
        question_block,
        gr.update(choices=options, value=None),
        correct,
        question,
        topic,
        f"{teach_note}\n\nRelated questions:\n{related}",
    )


def start_quiz(
    topic_filter: str,
    difficulty: str,
    correct_count: int,
    total_count: int,
    streak: int,
    best_streak: int
):
    """Load the first question of a quiz session.

    The score counters are only read to render the scoreboard; they are not
    modified here.

    Returns:
        An 8-tuple: the six values produced by ``generate_question``
        followed by the scoreboard text and a status message.
    """
    q, choices, correct, raw_q, raw_topic, teach = generate_question(topic_filter, difficulty)

    # generate_question returns an empty correct answer when it could not
    # build a question; do not claim the quiz started in that case.
    if correct:
        status = "Quiz started. Select an answer and submit."
    else:
        status = "Could not start quiz. See the question box for details."

    return (
        q,
        choices,
        correct,
        raw_q,
        raw_topic,
        teach,
        score_text(correct_count, total_count, streak, best_streak),
        status,
    )


def submit_and_next(
    selected: str,
    current_correct: str,
    current_q: str,
    current_topic: str,
    topic_filter: str,
    difficulty: str,
    correct_count: int,
    total_count: int,
    streak: int,
    best_streak: int
):
    """Grade the selected option, update the score and load the next question.

    Args:
        selected: option chosen by the user (empty/None if nothing chosen).
        current_correct, current_q, current_topic: state of the question
            currently on screen.
        topic_filter, difficulty: settings used to generate the next question.
        correct_count, total_count, streak, best_streak: running counters.

    Returns:
        A 13-tuple: result text, question text, choices update, correct
        answer, raw question, topic, teaching text, scoreboard text, status
        text, then the four updated counters. When there is no active
        question or nothing was selected, the on-screen question, choices and
        teaching text are left untouched and the counters are unchanged.
    """

    def stay_on_current(result_msg: str, status_msg: str):
        # gr.update() with no arguments leaves a component as it is. The
        # teaching box uses it too, so a forgotten selection no longer wipes
        # the notes for the question still on screen.
        return (
            result_msg,
            gr.update(),
            gr.update(),
            current_correct,
            current_q,
            current_topic,
            gr.update(),
            score_text(correct_count, total_count, streak, best_streak),
            status_msg,
            correct_count,
            total_count,
            streak,
            best_streak,
        )

    if not current_correct or not current_q:
        return stay_on_current("Click Start Quiz first.", "No active question.")

    if not selected:
        return stay_on_current(
            "Please select one option before submitting.",
            "Waiting for answer selection.",
        )

    total_count += 1
    if selected == current_correct:
        correct_count += 1
        streak += 1
        best_streak = max(best_streak, streak)
        result = (
            "Correct.\n\n"
            f"Your answer: {selected}\n\n"
            f"Reference answer: {current_correct}"
        )
    else:
        streak = 0
        result = (
            "Incorrect.\n\n"
            f"Your answer: {selected}\n\n"
            f"Correct answer: {current_correct}"
        )

    next_q, next_choices, next_correct, next_raw_q, next_raw_topic, next_teach = generate_question(
        topic_filter, difficulty
    )

    # An empty correct answer means generate_question could not build a
    # question; report that instead of claiming one was loaded.
    if next_correct:
        status = "Auto-loaded next question."
    else:
        status = "Could not load the next question."

    return (
        result,
        next_q,
        next_choices,
        next_correct,
        next_raw_q,
        next_raw_topic,
        next_teach,
        score_text(correct_count, total_count, streak, best_streak),
        status,
        correct_count,
        total_count,
        streak,
        best_streak,
    )


def reset_score():
    """Zero all four score counters.

    Returns:
        A 6-tuple: the four counters (all 0), the scoreboard text rendered
        for those zeros, and a status message.
    """
    zeroed = (0, 0, 0, 0)
    scoreboard = score_text(*zeroed)
    return (*zeroed, scoreboard, "Score reset. Click Start Quiz.")


# Custom stylesheet passed to gr.Blocks(css=...). It declares colour
# variables, the page background gradient, the container max width, and the
# styles for the "#hero" banner and the ".card" class.
CSS = """
:root {
  --brand: #0f766e;
  --accent: #0ea5e9;
  --bg-soft: #f8fafc;
  --card: #ffffff;
  --text: #0f172a;
  --muted: #475569;
}
body {
  background: linear-gradient(180deg, #f0fdfa 0%, #f8fafc 35%, #ffffff 100%);
}
.gradio-container {
  max-width: 1280px !important;
}
#hero {
  background: linear-gradient(135deg, rgba(15,118,110,0.10), rgba(14,165,233,0.10));
  border: 1px solid rgba(15,118,110,0.20);
  border-radius: 16px;
  padding: 14px 16px;
}
#hero h1, #hero p {
  color: var(--text);
}
.card {
  background: var(--card);
  border-radius: 14px;
  border: 1px solid #e2e8f0;
  padding: 10px 12px;
}
"""

# Build the three-tab UI (Explore, Quiz, About) and wire each button to the
# handler functions defined above.
with gr.Blocks(
    title="Bioinformatics QA Teaching Studio",
    css=CSS,
    theme=gr.themes.Soft(
        primary_hue="teal",
        secondary_hue="sky",
        neutral_hue="slate"
    ),
) as demo:
    # Banner; the "hero" id is styled by the CSS string above.
    gr.HTML(
        """
        <div id="hero">
          <h1>Bioinformatics QA Teaching Studio</h1>
          <p>
            Explore the dataset, learn core concepts, and practice with teaching-mode multiple-choice quizzes.
            This app is for learning and research purposes only. Validate content before high-stakes use.
          </p>
        </div>
        """
    )

    with gr.Tabs():
        with gr.Tab("Explore"):
            # Filter controls: topic, split and free-text keyword.
            with gr.Row():
                topic_dd = gr.Dropdown(
                    choices=["All"] + ALL_TOPICS,
                    value="All",
                    label="Topic"
                )
                split_dd = gr.Dropdown(
                    choices=["All"] + ALL_SPLITS,
                    value="All",
                    label="Split"
                )
                keyword_tb = gr.Textbox(
                    label="Keyword search",
                    placeholder="Search topic, question, or answer"
                )

            # Answer-length bounds and sort options. The slider upper limit is
            # the longest answer in DF, but never below 20.
            with gr.Row():
                min_len = gr.Slider(0, int(max(DF["answer_len"].max(), 20)), value=0, step=1, label="Min answer length")
                max_len = gr.Slider(0, int(max(DF["answer_len"].max(), 20)), value=int(DF["answer_len"].max()), step=1, label="Max answer length")
                sort_by = gr.Dropdown(
                    choices=["ID", "Topic", "Question length", "Answer length", "Split"],
                    value="ID",
                    label="Sort by"
                )
                sort_dir = gr.Radio(
                    choices=["Ascending", "Descending"],
                    value="Ascending",
                    label="Order"
                )

            # Paging controls. The page slider starts with maximum 1;
            # run_explore returns an update that raises it to the page count.
            with gr.Row():
                page_size = gr.Slider(5, 100, value=15, step=5, label="Rows per page")
                page_number = gr.Slider(1, 1, value=1, step=1, label="Page")
                run_btn = gr.Button("Apply filters", variant="primary")
                export_btn = gr.Button("Export filtered CSV")

            summary_md = gr.Markdown(value=compute_stats(DF))
            table = gr.Dataframe(
                headers=["id", "topic", "question", "answer", "split", "answer_len"],
                wrap=True,
                interactive=False,
                label="Filtered results"
            )

            # Holds the current page as a JSON string (third value returned by
            # run_explore); read back by show_row_detail.
            filtered_state = gr.State("")
            row_slider = gr.Slider(1, 1, value=1, step=1, label="Inspect row on current page")
            inspect_btn = gr.Button("Show row details")

            detail_header = gr.Markdown(value="Select filters and click Apply.")
            detail_topic = gr.Textbox(label="Topic", interactive=False)
            detail_question = gr.Textbox(label="Question", lines=4, interactive=False)
            detail_answer = gr.Textbox(label="Answer", lines=7, interactive=False)
            detail_meta = gr.Textbox(label="Metadata", interactive=False)
            csv_file = gr.File(label="Download CSV", interactive=False)

            # Output order must match the tuple returned by run_explore.
            run_btn.click(
                fn=run_explore,
                inputs=[topic_dd, split_dd, keyword_tb, min_len, max_len, sort_by, sort_dir, page_size, page_number],
                outputs=[summary_md, table, filtered_state, page_number, row_slider],
            )

            inspect_btn.click(
                fn=show_row_detail,
                inputs=[filtered_state, row_slider],
                outputs=[detail_header, detail_topic, detail_question, detail_answer, detail_meta],
            )

            export_btn.click(
                fn=export_filtered_csv,
                inputs=[topic_dd, split_dd, keyword_tb, min_len, max_len, sort_by, sort_dir],
                outputs=[csv_file],
            )

        with gr.Tab("Quiz"):
            with gr.Row():
                quiz_topic = gr.Dropdown(
                    choices=["All"] + ALL_TOPICS,
                    value="All",
                    label="Topic filter"
                )
                difficulty = gr.Radio(
                    choices=["Easy", "Medium", "Hard"],
                    value="Medium",
                    label="Difficulty"
                )
                start_btn = gr.Button("Start quiz", variant="primary")
                reset_btn = gr.Button("Reset score")

            quiz_score = gr.Markdown(value=score_text(0, 0, 0, 0))
            quiz_status = gr.Textbox(label="Status", interactive=False)

            question_box = gr.Textbox(label="Question", lines=5, interactive=False)
            choices_radio = gr.Radio(choices=[], label="Choose one answer")
            submit_btn = gr.Button("Submit and load next question", variant="primary")

            result_box = gr.Textbox(label="Result", lines=6, interactive=False)
            teaching_box = gr.Textbox(label="Teaching support", lines=8, interactive=False)

            # State for the question currently on screen: correct answer,
            # raw question text and topic.
            correct_state = gr.State("")
            q_state = gr.State("")
            topic_state = gr.State("")

            # Running score counters, passed into and returned from the quiz
            # handlers.
            correct_count_state = gr.State(0)
            total_count_state = gr.State(0)
            streak_state = gr.State(0)
            best_streak_state = gr.State(0)

            # Output order must match the 8-tuple returned by start_quiz.
            start_btn.click(
                fn=start_quiz,
                inputs=[
                    quiz_topic,
                    difficulty,
                    correct_count_state,
                    total_count_state,
                    streak_state,
                    best_streak_state,
                ],
                outputs=[
                    question_box,
                    choices_radio,
                    correct_state,
                    q_state,
                    topic_state,
                    teaching_box,
                    quiz_score,
                    quiz_status,
                ],
            )

            # Output order must match the 13-tuple returned by submit_and_next.
            submit_btn.click(
                fn=submit_and_next,
                inputs=[
                    choices_radio,
                    correct_state,
                    q_state,
                    topic_state,
                    quiz_topic,
                    difficulty,
                    correct_count_state,
                    total_count_state,
                    streak_state,
                    best_streak_state,
                ],
                outputs=[
                    result_box,
                    question_box,
                    choices_radio,
                    correct_state,
                    q_state,
                    topic_state,
                    teaching_box,
                    quiz_score,
                    quiz_status,
                    correct_count_state,
                    total_count_state,
                    streak_state,
                    best_streak_state,
                ],
            )

            # Resets the counters and scoreboard only; the question state
            # components are not among the outputs.
            reset_btn.click(
                fn=reset_score,
                inputs=[],
                outputs=[
                    correct_count_state,
                    total_count_state,
                    streak_state,
                    best_streak_state,
                    quiz_score,
                    quiz_status,
                ],
            )

        with gr.Tab("About"):
            gr.Markdown(
                """
## About this teaching app

This Space demonstrates:
- Practical dataset exploration for bioinformatics QA data
- Teaching-mode multiple-choice practice with topic-aware distractors
- Session score tracking with streak metrics

## Important notice

This app is intended for learning and research use only.
Use with caution.
Do not use as a replacement for expert biomedical or clinical judgment.

## Dataset

- Source: yashm/bioinformatics-qa-dataset
- Citation and DOI are listed in the project README
"""
            )

# Starts the app at import time (there is no __main__ guard).
demo.launch()