# app.py — Study Buddy (transparent TF-IDF MCQ helper)
# Hugging Face Space: Gradio app (no API keys required)

from dataclasses import dataclass
from typing import Any, List, Optional, Tuple
import re

import numpy as np
import gradio as gr

# scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --------------------------
# Default corpus (your text)
# --------------------------
DEFAULT_NOTES = [
    ("drought_planning", """
Drought planning should consider risk, cash flow, forage, and animal welfare.
Short term options include purchasing supplemental feed, leasing additional pasture, or modestly destocking (e.g., 10–15%).
Leasing pasture helps preserve home forage and avoids overgrazing, often with lower upfront cash than large feed purchases.
Waiting for rain without a contingency is risky due to price and yield volatility.
"""),
    ("depreciation_basics", """
Straight line depreciation allocates (Cost − Salvage) evenly over an asset's useful life.
Annual Depreciation = (Cost − Salvage) / Life.
Ending book value should not fall below the salvage value.
"""),
    ("stocking_rate", """
Rotational grazing with adequate rest improves pasture condition and resilience.
During drought or slow growth, reduce stocking rate by 10–15% to protect ground cover and avoid overuse.
Short term leasing of nearby pasture can also relieve pressure and stabilize production.
"""),
    ("budgeting_cashflow", """
Enterprise budgets compare revenues and costs per activity; sensitivity analysis helps evaluate risk.
When cash flow is tight, options with lower upfront cost or flexible payment terms can be preferable.
"""),
]

DEFAULT_CORPUS_TEXT = """# Study Buddy Corpus (edit me)
# Format suggestions:
# - You can leave this as simple text. Paragraphs/sentences become "passages".
# - Optionally add section headings like: ### drought_planning

### drought_planning
Drought planning should consider risk, cash flow, forage, and animal welfare.
Short term options include purchasing supplemental feed, leasing additional pasture, or modestly destocking (e.g., 10–15%).
Leasing pasture helps preserve home forage and avoids overgrazing, often with lower upfront cash than large feed purchases.
Waiting for rain without a contingency is risky due to price and yield volatility.

### depreciation_basics
Straight line depreciation allocates (Cost − Salvage) evenly over an asset's useful life.
Annual Depreciation = (Cost − Salvage) / Life.
Ending book value should not fall below the salvage value.

### stocking_rate
Rotational grazing with adequate rest improves pasture condition and resilience.
During drought or slow growth, reduce stocking rate by 10–15% to protect ground cover and avoid overuse.
Short term leasing of nearby pasture can also relieve pressure and stabilize production.

### budgeting_cashflow
Enterprise budgets compare revenues and costs per activity; sensitivity analysis helps evaluate risk.
When cash flow is tight, options with lower upfront cost or flexible payment terms can be preferable.
"""

# --------------------------
# Passage splitting & index
# --------------------------

# A passage ends at a run of periods that is followed by whitespace (or the end
# of the text), or at a newline. Requiring the whitespace keeps abbreviations
# and decimals such as "e.g., 10" or "3.5" inside a single passage.
_PASSAGE_BOUNDARY = re.compile(r'\.+(?=\s|$)|\n+')

# '### heading' lines (three or more hashes) start a new section.
_HEADING_RE = re.compile(r'^\s*#{3,}\s*(.+?)\s*$')

# Stand-in document used when nothing in the corpus can be indexed.
_PLACEHOLDER = "placeholder"


def split_passages(text: str, min_len=30) -> List[str]:
    """Split *text* into sentence-like passages.

    Args:
        text: Raw note text.
        min_len: Minimum stripped length for a passage to be kept.

    Returns:
        The stripped passages of at least ``min_len`` characters, in order.
        The sentence-ending periods are consumed by the split.
    """
    pieces = (piece.strip() for piece in _PASSAGE_BOUNDARY.split(text))
    return [piece for piece in pieces if len(piece) >= min_len]


def parse_corpus_to_notes(corpus_text: str) -> List[Tuple[str, str]]:
    """Parse the editable textarea into ``(doc_id, text)`` chunks.

    Lines like ``### section_name`` start a new chunk whose id is the heading
    lower-cased with runs of other characters replaced by ``_``. Text before
    the first heading goes to the chunk ``"notes"``. Other lines starting with
    ``#`` are treated as comments and skipped, so the instruction header of the
    default corpus is not indexed as evidence.

    Returns:
        The chunks whose text is at least 30 characters long, or
        ``DEFAULT_NOTES`` when no chunk qualifies.
    """
    sections: List[Tuple[str, List[str]]] = [("notes", [])]
    for line in corpus_text.splitlines():
        heading = _HEADING_RE.match(line)
        if heading:
            doc_id = re.sub(r'[^a-z0-9_]+', '_', heading.group(1).strip().lower())
            sections.append((doc_id, []))
        elif line.lstrip().startswith("#"):
            # Comment line (one or two hashes, or a bare hash run): not note text.
            continue
        else:
            sections[-1][1].append(line)

    notes = [(doc_id, "\n".join(buf).strip()) for doc_id, buf in sections if buf]
    # Fallback to defaults if parsed content is too small
    ok_notes = [(doc_id, txt) for doc_id, txt in notes if len(txt) >= 30]
    return ok_notes or DEFAULT_NOTES


@dataclass
class PassageIndex:
    """A fitted TF-IDF vectorizer together with the passages it indexed."""

    vectorizer: TfidfVectorizer
    # Document-term matrix returned by fit_transform (a SciPy sparse matrix,
    # one row per entry of `texts`).
    X: Any
    # ids[i] is the doc_id of the note that texts[i] came from.
    ids: List[str]
    texts: List[str]


def build_corpus_passages(notes: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Expand ``(doc_id, text)`` notes into ``(doc_id, passage)`` pairs."""
    return [(doc_id, passage) for doc_id, txt in notes for passage in split_passages(txt)]


def _fit_tfidf(texts: List[str]) -> Optional[Tuple[TfidfVectorizer, Any]]:
    """Fit a uni+bigram TF-IDF model on *texts*.

    ``max_df=0.9`` is tried first. scikit-learn raises ``ValueError`` when
    ``max_df * n_docs`` is below ``min_df`` (always the case for one document)
    or when pruning leaves no terms, so the fit is retried without the
    document-frequency cap.

    Returns:
        ``(vectorizer, matrix)``, or ``None`` if no vocabulary could be built.
    """
    for max_df in (0.9, 1.0):
        vec = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), max_df=max_df, min_df=1)
        try:
            return vec, vec.fit_transform(texts)
        except ValueError:
            continue
    return None


def build_index_from_corpus(corpus_text: str) -> PassageIndex:
    """Parse *corpus_text*, split it into passages and fit a TF-IDF index.

    Never raises for small or unindexable corpora: if no passage can be
    indexed, the index contains the single document ``"placeholder"``.
    """
    notes = parse_corpus_to_notes(corpus_text)
    passages = build_corpus_passages(notes)
    ids = [doc_id for doc_id, _ in passages]
    texts = [passage for _, passage in passages]

    fitted = _fit_tfidf(texts) if texts else None
    if fitted is None:
        # Ensure vectorizer has at least one doc
        ids = [_PLACEHOLDER]
        texts = [_PLACEHOLDER]
        fitted = _fit_tfidf(texts)
    vec, X = fitted
    return PassageIndex(vec, X, ids, texts)


# --------------------------
# MCQ scoring
# --------------------------
@dataclass
class OptionScore:
    """Score of one answer option plus the passages that produced it."""

    label: str
    text: str
    total: float
    # (similarity, passage) pairs, highest similarity first.
    evidence: List[Tuple[float, str]]


def score_option(index: PassageIndex, question: str, option_text: str,
                 top_k=3) -> Tuple[float, List[Tuple[float, str]]]:
    """Score one option against the index.

    The question and option are concatenated into one query, compared to every
    passage by cosine similarity, and the ``top_k`` most similar passages are
    kept as evidence.

    Returns:
        ``(total, evidence)`` where ``total`` is the mean similarity of the
        evidence passages (0.0 when there is none).
    """
    query = (question + " " + option_text).strip()
    qv = index.vectorizer.transform([query])
    sims = cosine_similarity(qv, index.X)[0]
    top_idx = np.argsort(sims)[::-1][:top_k]
    evidence = [(float(sims[i]), index.texts[i]) for i in top_idx]
    total = float(np.mean([s for s, _ in evidence])) if evidence else 0.0
    return total, evidence


def answer_mcq(index: PassageIndex, question: str, options: List[Tuple[str, str]],
               top_k=3) -> OptionScore:
    """Return the highest-scoring option; the earliest option wins ties.

    Args:
        options: ``(label, text)`` pairs.

    Returns:
        The best ``OptionScore``, or a ``"-"`` / ``"(no options)"`` stand-in
        when *options* is empty.
    """
    scored = [
        OptionScore(label, text, *score_option(index, question, text, top_k=top_k))
        for label, text in options
    ]
    if not scored:
        return OptionScore("-", "(no options)", 0.0, [])
    return max(scored, key=lambda result: result.total)


# --------------------------
# Gradio UI logic
# --------------------------
def rebuild_index(corpus_text):
    """Build a fresh index from the textarea; return it with a status line."""
    idx = build_index_from_corpus(corpus_text)
    return idx, "✅ Corpus indexed (passages = {})".format(len(idx.texts))


def run_mcq(idx: PassageIndex, question: str, opts: List[str], top_k: int):
    """Answer one multiple-choice question.

    Args:
        idx: Index to search.
        question: Question text.
        opts: Option texts by position; blank or ``None`` entries are skipped
            but still consume their letter, so labels match the input slots.
        top_k: Number of evidence passages to average per option.

    Returns:
        ``(headline, evidence_text)``; a prompt and ``""`` when the question
        or all options are empty.
    """
    # Parse options: keep non-empty; label A, B, C, ... by position
    pairs = [
        (chr(ord("A") + i), opt.strip())
        for i, opt in enumerate(opts)
        if opt and opt.strip()
    ]
    if not (question or "").strip() or not pairs:
        return "Enter a question and at least one option.", ""

    best = answer_mcq(idx, question, pairs, top_k=top_k)

    # Pretty output
    head = f"→ PICK: {best.label}) {best.text} [score={best.total:.4f}]"
    ev = "\n".join([f" • ({s:.4f}) {p}" for s, p in best.evidence])
    return head, f"Evidence passages:\n{ev}"


# Built once at import: initial per-session state and the index the demo tab uses.
_DEFAULT_INDEX = build_index_from_corpus(DEFAULT_CORPUS_TEXT)

with gr.Blocks(title="Study Buddy — Transparent MCQ Helper") as demo:
    gr.Markdown(
        "# Study Buddy — Transparent MCQ Helper\n"
        "A tiny TF-IDF + cosine similarity bot. Paste your notes, build an index, "
        "then ask a multiple-choice question. The bot shows the top evidence it used."
    )

    with gr.Tab("Corpus"):
        corpus_text = gr.Textbox(value=DEFAULT_CORPUS_TEXT, lines=20, label="Your notes (editable)")
        build_btn = gr.Button("Rebuild Corpus Index")
        status = gr.Markdown()
        state_idx = gr.State(_DEFAULT_INDEX)
        build_btn.click(fn=rebuild_index, inputs=corpus_text, outputs=[state_idx, status])

    with gr.Tab("Ask MCQ"):
        q = gr.Textbox(lines=3, label="Question")
        with gr.Row():
            oA = gr.Textbox(label="Option A")
            oB = gr.Textbox(label="Option B")
        with gr.Row():
            oC = gr.Textbox(label="Option C")
            oD = gr.Textbox(label="Option D")
        with gr.Accordion("Add more options (optional)", open=False):
            with gr.Row():
                oE = gr.Textbox(label="Option E")
                oF = gr.Textbox(label="Option F")
        topk = gr.Slider(1, 5, value=3, step=1, label="Evidence passages to average (k)")
        go = gr.Button("Answer")
        pick = gr.Markdown()
        evidence = gr.Markdown()

        def _run(idx, q, a, b, c, d, e, f, k):
            """Adapt the individual widget values to run_mcq's signature."""
            return run_mcq(idx, q, [a, b, c, d, e, f], int(k))

        go.click(_run, inputs=[state_idx, q, oA, oB, oC, oD, oE, oF, topk], outputs=[pick, evidence])

    with gr.Tab("Demo (3 sample MCQs)"):
        gr.Markdown("Click a button to load a sample Q with options.")
        demo_out = gr.Markdown()

        def load_demo(which):
            """Answer sample question "Q1" or "Q2" (anything else selects the
            third sample) against the default corpus and return Markdown."""
            if which == "Q1":
                q = "Given elevated drought risk and limited cash, which short term strategy is best?"
                opts = [
                    "Purchase a large quantity of supplemental feed immediately",
                    "Wait two months and hope for rain",
                    "Lease additional pasture short term to relieve pressure",
                    "Increase stocking rate to maintain output",
                ]
            elif which == "Q2":
                q = "Under straight line depreciation, what is the annual depreciation formula?"
                opts = [
                    "Annual Depreciation = Cost / Life",
                    "Annual Depreciation = (Cost − Salvage) / Life",
                    "Annual Depreciation = (Cost − Salvage) × Life",
                    "Annual Depreciation equals Ending BV − Beginning BV",
                ]
            else:
                q = "During slow pasture growth in a rotational system, which action is most conservative?"
                opts = [
                    "Reduce stocking rate by around 10–15%",
                    "Wait for rain without changing anything",
                    "Graze all paddocks continuously to keep animals fed",
                    "Overseed high traffic paddocks only",
                ]
            # Reuse the index built at import instead of refitting on every click.
            head, ev = run_mcq(_DEFAULT_INDEX, q, opts, top_k=3)
            listing = "\n".join([f"- {chr(65+i)}) {opt}" for i, opt in enumerate(opts)])
            return f"**Q:** {q}\n\n" + listing + f"\n\n{head}\n\n{ev}"

        with gr.Row():
            b1 = gr.Button("Load demo Q1")
            b2 = gr.Button("Load demo Q2")
            b3 = gr.Button("Load demo Q3")
        b1.click(lambda: load_demo("Q1"), outputs=demo_out)
        b2.click(lambda: load_demo("Q2"), outputs=demo_out)
        b3.click(lambda: load_demo("Q3"), outputs=demo_out)

if __name__ == "__main__":
    demo.launch()