# app.py — Study Buddy (transparent TF-IDF MCQ helper)
# Hugging Face Space: Gradio app (no API keys required)

from dataclasses import dataclass
from typing import List, Tuple
import re
import numpy as np
import gradio as gr

# scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --------------------------
# Default corpus (your text)
# --------------------------
# Built-in notes as (doc_id, text) pairs.  parse_corpus_to_notes() returns
# this list when the user's corpus yields no chunk of at least 30 characters.
DEFAULT_NOTES = [
    ("drought_planning", """
Drought planning should consider risk, cash flow, forage, and animal welfare.
Short term options include purchasing supplemental feed, leasing additional pasture, or modestly destocking (e.g., 10–15%).
Leasing pasture helps preserve home forage and avoids overgrazing, often with lower upfront cash than large feed purchases.
Waiting for rain without a contingency is risky due to price and yield volatility.
"""),
    ("depreciation_basics", """
Straight line depreciation allocates (Cost − Salvage) evenly over an asset's useful life.
Annual Depreciation = (Cost − Salvage) / Life.
Ending book value should not fall below the salvage value.
"""),
    ("stocking_rate", """
Rotational grazing with adequate rest improves pasture condition and resilience.
During drought or slow growth, reduce stocking rate by 10–15% to protect ground cover and avoid overuse.
Short term leasing of nearby pasture can also relieve pressure and stabilize production.
"""),
    ("budgeting_cashflow", """
Enterprise budgets compare revenues and costs per activity; sensitivity analysis helps evaluate risk.
When cash flow is tight, options with lower upfront cost or flexible payment terms can be preferable.
"""),
]

# Seed text for the editable corpus textbox; the startup index and the demo
# tab's index are also built from it.  Lines starting with "###" begin a new
# section whose name becomes the doc_id of the text that follows.
DEFAULT_CORPUS_TEXT = """# Study Buddy Corpus (edit me)
# Format suggestions:
# - You can leave this as simple text. Paragraphs/sentences become "passages".
# - Optionally add section headings like: ### drought_planning

### drought_planning
Drought planning should consider risk, cash flow, forage, and animal welfare.
Short term options include purchasing supplemental feed, leasing additional pasture, or modestly destocking (e.g., 10–15%).
Leasing pasture helps preserve home forage and avoids overgrazing, often with lower upfront cash than large feed purchases.
Waiting for rain without a contingency is risky due to price and yield volatility.

### depreciation_basics
Straight line depreciation allocates (Cost − Salvage) evenly over an asset's useful life.
Annual Depreciation = (Cost − Salvage) / Life.
Ending book value should not fall below the salvage value.

### stocking_rate
Rotational grazing with adequate rest improves pasture condition and resilience.
During drought or slow growth, reduce stocking rate by 10–15% to protect ground cover and avoid overuse.
Short term leasing of nearby pasture can also relieve pressure and stabilize production.

### budgeting_cashflow
Enterprise budgets compare revenues and costs per activity; sensitivity analysis helps evaluate risk.
When cash flow is tight, options with lower upfront cost or flexible payment terms can be preferable.
"""

# --------------------------
# Passage splitting & index
# --------------------------
def split_passages(text: str, min_len=30) -> List[str]:
    """Split free text into sentence-like passages.

    A passage ends at a newline, or at a run of periods that is followed by
    whitespace or by the end of the text.  Periods inside a token ("e.g.,",
    "2.5") do not end a passage, so such sentences are no longer cut in half
    and then discarded for being too short.  The terminating periods are not
    part of the returned passages.

    Args:
        text: Raw note text.
        min_len: Minimum length (after stripping) for a piece to be kept.

    Returns:
        The stripped passages of at least ``min_len`` characters, in their
        original order.
    """
    # Naive sentence-ish split: only sentence-final periods and newlines count.
    pieces = re.split(r'(?:\.+(?=\s|$)|\n)+', text)
    return [p for p in (piece.strip() for piece in pieces) if len(p) >= min_len]

def parse_corpus_to_notes(corpus_text: str) -> List[Tuple[str, str]]:
    """Parse the editable textarea into ``(doc_id, text)`` chunks.

    Line handling:
      * ``### section name`` (three or more ``#``) starts a new chunk.  The
        name is lower-cased and every run of characters outside ``[a-z0-9_]``
        becomes one ``_``; leading/trailing underscores are trimmed, and an
        empty result falls back to ``"notes"``.
      * Any other line whose first non-blank character is ``#`` is treated as
        a comment and skipped, so template help text is never indexed.
      * Everything else is appended to the current chunk.  Text before the
        first heading belongs to the chunk ``"notes"``.

    Args:
        corpus_text: Full contents of the corpus textbox.

    Returns:
        The chunks whose text is at least 30 characters long, in document
        order.  If there are none, ``DEFAULT_NOTES`` is returned instead.
    """
    heading_re = re.compile(r'^\s*#{3,}\s*(.+?)\s*$')  # ### heading

    # Each section is (doc_id, collected lines); start with the implicit one.
    sections: List[Tuple[str, List[str]]] = [("notes", [])]
    for ln in corpus_text.splitlines():
        m = heading_re.match(ln)
        if m:
            # Normalize id
            slug = re.sub(r'[^a-z0-9_]+', '_', m.group(1).strip().lower()).strip('_')
            sections.append((slug or "notes", []))
        elif ln.lstrip().startswith("#"):
            # Comment line (e.g. "# Format suggestions:"), not note content.
            continue
        else:
            sections[-1][1].append(ln)

    notes = [(doc_id, "\n".join(buf).strip()) for doc_id, buf in sections]

    # Fallback to defaults if parsed content is too small
    ok_notes = [(doc_id, body) for doc_id, body in notes if len(body) >= 30]
    return ok_notes if ok_notes else DEFAULT_NOTES

@dataclass
class PassageIndex:
    """A fitted TF-IDF vectorizer bundled with the passages it was fitted on.

    ``ids`` and ``texts`` are parallel lists: entry ``i`` of each describes
    the same passage, which corresponds to row ``i`` of ``X``.
    """
    # Vectorizer fitted on ``texts``; also used to transform queries.
    vectorizer: TfidfVectorizer
    # TF-IDF matrix produced by ``vectorizer.fit_transform(texts)``.
    # NOTE(review): fit_transform normally returns a SciPy sparse matrix, not
    # an ndarray — confirm whether this annotation should be widened.
    X: np.ndarray
    # doc_id of the note each passage came from.
    ids: List[str]
    # Passage text.
    texts: List[str]

def build_corpus_passages(notes: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Expand notes into passages.

    Every ``(doc_id, text)`` note contributes one ``(doc_id, passage)`` pair
    per passage that ``split_passages`` extracts from its text; note order and
    passage order are preserved.
    """
    return [
        (doc_id, passage)
        for doc_id, body in notes
        for passage in split_passages(body)
    ]

def build_index_from_corpus(corpus_text: str) -> PassageIndex:
    """Parse ``corpus_text``, split it into passages and fit a TF-IDF index.

    The vectorizer uses lower-cased unigrams and bigrams.  It is first fitted
    with ``max_df=0.9`` so that terms present in almost every passage are
    ignored.  On very small corpora that setting makes scikit-learn raise
    ``ValueError`` (with a single passage, ``0.9 * 1`` documents is below
    ``min_df=1``; with a handful of passages every term may be pruned), so the
    fit is retried with ``max_df=1.0``.  If no vocabulary can be built at all,
    a one-passage ``"placeholder"`` index is returned so callers always get a
    usable index.

    Args:
        corpus_text: Full contents of the corpus textbox.

    Returns:
        A ``PassageIndex`` whose ``ids``/``texts`` line up with the rows of
        its TF-IDF matrix.
    """
    notes = parse_corpus_to_notes(corpus_text)
    passages = build_corpus_passages(notes)
    ids = [d for d, _ in passages]
    texts = [t for _, t in passages]
    if not texts:
        # Ensure vectorizer has at least one doc
        ids, texts = ["placeholder"], ["placeholder"]

    # Prefer the stricter document-frequency cap; relax it when the corpus is
    # too small for the cap to leave any term.
    for max_df in (0.9, 1.0):
        vec = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), max_df=max_df, min_df=1)
        try:
            X = vec.fit_transform(texts)
        except ValueError:
            continue
        return PassageIndex(vec, X, ids, texts)

    # The passages contain no usable tokens at all; index the placeholder.
    ids, texts = ["placeholder"], ["placeholder"]
    vec = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), max_df=1.0, min_df=1)
    return PassageIndex(vec, vec.fit_transform(texts), ids, texts)

# --------------------------
# MCQ scoring
# --------------------------
@dataclass
class OptionScore:
    """Score of one multiple-choice option together with its evidence."""
    # Option label shown to the user, e.g. "A".
    label: str
    # Option text as entered.
    text: str
    # Mean cosine similarity of the evidence passages (0.0 when there are none).
    total: float
    # (similarity, passage text) pairs, most similar first.
    evidence: List[Tuple[float, str]]

def score_option(index: PassageIndex, question: str, option_text: str, top_k=3) -> Tuple[float, List[Tuple[float, str]]]:
    """Score one answer option against the indexed passages.

    The question and the option are joined into a single query, vectorised
    with the index's vectorizer and compared with every passage by cosine
    similarity.

    Args:
        index: Index to search.
        question: Question text.
        option_text: Text of the option being scored.
        top_k: Number of best-matching passages to keep as evidence.

    Returns:
        ``(total, evidence)`` where ``evidence`` lists the ``top_k`` most
        similar passages as ``(similarity, passage)`` pairs, best first, and
        ``total`` is the mean of those similarities (0.0 without evidence).
    """
    combined = " ".join((question, option_text)).strip()
    query_vec = index.vectorizer.transform([combined])
    similarities = cosine_similarity(query_vec, index.X)[0]

    # Passage positions from most to least similar.
    ranked = np.argsort(similarities)[::-1]
    evidence = [(float(similarities[j]), index.texts[j]) for j in ranked[:top_k]]

    if not evidence:
        return 0.0, evidence
    mean_score = float(np.mean([score for score, _ in evidence]))
    return mean_score, evidence

def answer_mcq(index: PassageIndex, question: str, options: List[Tuple[str, str]], top_k=3) -> OptionScore:
    """Pick the option whose evidence passages match the question best.

    Args:
        index: Index to search.
        question: Question text.
        options: ``(label, option text)`` pairs to compare.
        top_k: Number of evidence passages averaged per option.

    Returns:
        The highest-scoring ``OptionScore``; on a tie the option listed first
        wins.  With no options, a ``"-"`` / ``"(no options)"`` entry scoring
        0.0 is returned.
    """
    candidates = [
        OptionScore(label, text, *score_option(index, question, text, top_k=top_k))
        for label, text in options
    ]
    if not candidates:
        return OptionScore("-", "(no options)", 0.0, [])
    # sorted() is stable, so equal totals keep their input order.
    ranked = sorted(candidates, key=lambda cand: cand.total, reverse=True)
    return ranked[0]

# --------------------------
# Gradio UI logic
# --------------------------
def rebuild_index(corpus_text):
    """Re-index the corpus textbox contents.

    Returns the new index plus a status line reporting how many passages it
    holds, in the order expected by the "Rebuild Corpus Index" button outputs.
    """
    fresh_index = build_index_from_corpus(corpus_text)
    passage_count = len(fresh_index.texts)
    status_line = "✅ Corpus indexed (passages = {})".format(passage_count)
    return fresh_index, status_line

def run_mcq(idx: PassageIndex, question: str, opts: List[str], top_k: int):
    """Answer one multiple-choice question and format the result as Markdown.

    Args:
        idx: Index to search.
        question: Question text; ``None`` or blank counts as missing.
        opts: Option texts in display order.  Blank entries are skipped, but
            labels follow the original position, so the third box is always
            "C" even when the second one is empty.
        top_k: Number of evidence passages averaged per option.

    Returns:
        ``(headline, evidence)`` strings.  When the question or all options
        are missing, the headline is a prompt to fill them in and the
        evidence string is empty.
    """
    def label_for(position: int) -> str:
        # A..Z, then plain numbers, so any number of options can be labelled.
        return chr(ord("A") + position) if position < 26 else str(position + 1)

    question = (question or "").strip()
    pairs = [(label_for(i), o.strip()) for i, o in enumerate(opts or []) if o and o.strip()]
    if not question or not pairs:
        return "Enter a question and at least one option.", ""

    best = answer_mcq(idx, question, pairs, top_k=top_k)

    # Pretty output
    head = f"→ PICK: {best.label}) {best.text}   [score={best.total:.4f}]"
    # The result is shown in gr.Markdown components, where a single newline is
    # only a soft break; emit a real Markdown list (blank line + "- " items)
    # so every passage is rendered on its own line.
    ev = "\n".join(f"- ({s:.4f}) {p}" for s, p in best.evidence)
    return head, f"Evidence passages:\n\n{ev}"

# UI layout: three tabs (corpus editor, MCQ form, canned demos).
with gr.Blocks(title="Study Buddy — Transparent MCQ Helper") as demo:
    gr.Markdown("# Study Buddy — Transparent MCQ Helper\nA tiny TF-IDF + cosine similarity bot. Paste your notes, build an index, then ask a multiple-choice question. The bot shows the top evidence it used.")

    # The default corpus is constant, so index it once: it seeds the session
    # state and serves every demo click, instead of being re-fitted per click.
    default_index = build_index_from_corpus(DEFAULT_CORPUS_TEXT)

    with gr.Tab("Corpus"):
        corpus_text = gr.Textbox(value=DEFAULT_CORPUS_TEXT, lines=20, label="Your notes (editable)")
        build_btn = gr.Button("Rebuild Corpus Index")
        status = gr.Markdown()
        # Per-session index; replaced whenever the user rebuilds the corpus.
        state_idx = gr.State(default_index)
        build_btn.click(fn=rebuild_index, inputs=corpus_text, outputs=[state_idx, status])

    with gr.Tab("Ask MCQ"):
        q = gr.Textbox(lines=3, label="Question")
        with gr.Row():
            oA = gr.Textbox(label="Option A")
            oB = gr.Textbox(label="Option B")
        with gr.Row():
            oC = gr.Textbox(label="Option C")
            oD = gr.Textbox(label="Option D")
        with gr.Accordion("Add more options (optional)", open=False):
            with gr.Row():
                oE = gr.Textbox(label="Option E")
                oF = gr.Textbox(label="Option F")
        topk = gr.Slider(1, 5, value=3, step=1, label="Evidence passages to average (k)")
        go = gr.Button("Answer")
        pick = gr.Markdown()
        evidence = gr.Markdown()

        def _run(idx, question, a, b, c, d, e, f, k):
            """Adapt the flat list of component values to run_mcq()."""
            # The slider value is coerced to int before use as a slice bound.
            return run_mcq(idx, question, [a, b, c, d, e, f], int(k))

        go.click(_run, inputs=[state_idx, q, oA, oB, oC, oD, oE, oF, topk], outputs=[pick, evidence])

    with gr.Tab("Demo (3 sample MCQs)"):
        gr.Markdown("Click a button to load a sample Q with options.")
        demo_out = gr.Markdown()

        # Sample questions keyed by demo id: (question, options in A..D order).
        demo_questions = {
            "Q1": (
                "Given elevated drought risk and limited cash, which short term strategy is best?",
                ["Purchase a large quantity of supplemental feed immediately",
                 "Wait two months and hope for rain",
                 "Lease additional pasture short term to relieve pressure",
                 "Increase stocking rate to maintain output"],
            ),
            "Q2": (
                "Under straight line depreciation, what is the annual depreciation formula?",
                ["Annual Depreciation = Cost / Life",
                 "Annual Depreciation = (Cost − Salvage) / Life",
                 "Annual Depreciation = (Cost − Salvage) × Life",
                 "Annual Depreciation equals Ending BV − Beginning BV"],
            ),
            "Q3": (
                "During slow pasture growth in a rotational system, which action is most conservative?",
                ["Reduce stocking rate by around 10–15%",
                 "Wait for rain without changing anything",
                 "Graze all paddocks continuously to keep animals fed",
                 "Overseed high traffic paddocks only"],
            ),
        }

        def load_demo(which):
            """Run sample question ``which`` against the default corpus.

            Any id other than "Q1" or "Q2" selects the third sample.  Returns
            one Markdown string: question, lettered options, pick, evidence.
            """
            question, options = demo_questions.get(which, demo_questions["Q3"])
            head, ev = run_mcq(default_index, question, options, top_k=3)
            option_lines = "\n".join(f"- {chr(65+i)}) {opt}" for i, opt in enumerate(options))
            return f"**Q:** {question}\n\n{option_lines}\n\n{head}\n\n{ev}"

        with gr.Row():
            b1 = gr.Button("Load demo Q1")
            b2 = gr.Button("Load demo Q2")
            b3 = gr.Button("Load demo Q3")
        b1.click(lambda: load_demo("Q1"), outputs=demo_out)
        b2.click(lambda: load_demo("Q2"), outputs=demo_out)
        b3.click(lambda: load_demo("Q3"), outputs=demo_out)

# Start the Gradio server only when this file is run as a script; importing
# the module just builds ``demo`` without launching it.
if __name__ == "__main__":
    demo.launch()