Study_Budy / app.py
jeffrey1963's picture
Create app.py
937ebf4 verified
# app.py — Study Buddy (transparent TF-IDF MCQ helper)
# Hugging Face Space: Gradio app (no API keys required)
from dataclasses import dataclass
from typing import List, Tuple
import re
import numpy as np
import gradio as gr
# scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# --------------------------
# Default corpus (your text)
# --------------------------
# Built-in notes as (doc_id, text) pairs. parse_corpus_to_notes() falls back to
# these when the user-supplied corpus yields no usable section.
# The minus signs in the depreciation formulas are U+2212 (they were previously
# stored as mis-decoded bytes).
DEFAULT_NOTES = [
    ("drought_planning", """
Drought planning should consider risk, cash flow, forage, and animal welfare.
Short term options include purchasing supplemental feed, leasing additional pasture, or modestly destocking (e.g., 10–15%).
Leasing pasture helps preserve home forage and avoids overgrazing, often with lower upfront cash than large feed purchases.
Waiting for rain without a contingency is risky due to price and yield volatility.
"""),
    ("depreciation_basics", """
Straight line depreciation allocates (Cost − Salvage) evenly over an asset's useful life.
Annual Depreciation = (Cost − Salvage) / Life.
Ending book value should not fall below the salvage value.
"""),
    ("stocking_rate", """
Rotational grazing with adequate rest improves pasture condition and resilience.
During drought or slow growth, reduce stocking rate by 10–15% to protect ground cover and avoid overuse.
Short term leasing of nearby pasture can also relieve pressure and stabilize production.
"""),
    ("budgeting_cashflow", """
Enterprise budgets compare revenues and costs per activity; sensitivity analysis helps evaluate risk.
When cash flow is tight, options with lower upfront cost or flexible payment terms can be preferable.
"""),
]
# Initial content of the editable corpus textbox. Lines of the form
# "### name" start a new section; the minus signs in the depreciation formulas
# are U+2212 (they were previously stored as mis-decoded bytes).
DEFAULT_CORPUS_TEXT = """# Study Buddy Corpus (edit me)
# Format suggestions:
# - You can leave this as simple text. Paragraphs/sentences become "passages".
# - Optionally add section headings like: ### drought_planning
### drought_planning
Drought planning should consider risk, cash flow, forage, and animal welfare.
Short term options include purchasing supplemental feed, leasing additional pasture, or modestly destocking (e.g., 10–15%).
Leasing pasture helps preserve home forage and avoids overgrazing, often with lower upfront cash than large feed purchases.
Waiting for rain without a contingency is risky due to price and yield volatility.
### depreciation_basics
Straight line depreciation allocates (Cost − Salvage) evenly over an asset's useful life.
Annual Depreciation = (Cost − Salvage) / Life.
Ending book value should not fall below the salvage value.
### stocking_rate
Rotational grazing with adequate rest improves pasture condition and resilience.
During drought or slow growth, reduce stocking rate by 10–15% to protect ground cover and avoid overuse.
Short term leasing of nearby pasture can also relieve pressure and stabilize production.
### budgeting_cashflow
Enterprise budgets compare revenues and costs per activity; sensitivity analysis helps evaluate risk.
When cash flow is tight, options with lower upfront cost or flexible payment terms can be preferable.
"""
# --------------------------
# Passage splitting & index
# --------------------------
def split_passages(text: str, min_len=30) -> List[str]:
    """Split free text into sentence-like passages.

    The text is cut at newlines and at periods that end a sentence, i.e. a
    period (or run of periods) followed by whitespace or the end of the text.
    Periods inside abbreviations or numbers such as "e.g.," or "1.5" do not
    split, so a sentence containing them stays in one piece.

    Args:
        text: Raw notes text. Empty or None yields an empty list.
        min_len: Minimum length (after stripping whitespace) for a piece to
            be kept; shorter fragments are discarded.

    Returns:
        The kept passages, in order, stripped of surrounding whitespace and
        without their terminating period.
    """
    if not text:
        return []
    # The lookahead keeps "e.g.," and decimals intact: only a period that is
    # followed by whitespace or end-of-text counts as a sentence terminator.
    pieces = re.split(r'(?:\.+(?=\s|$)|\n)+', text)
    stripped = (piece.strip() for piece in pieces)
    return [passage for passage in stripped if len(passage) >= min_len]
def parse_corpus_to_notes(corpus_text: str) -> List[Tuple[str, str]]:
    """Parse the editable corpus text into (doc_id, text) chunks.

    A line of the form '### section_name' starts a new chunk whose doc_id is
    the heading lowercased, with every run of characters outside [a-z0-9_]
    replaced by '_'. Text before the first heading goes into a chunk with
    doc_id 'notes'. Any other line whose first non-blank character is '#'
    is treated as a comment and skipped, so the instruction lines at the top
    of the textbox are not indexed as study material.

    Args:
        corpus_text: Raw textbox content. Empty or None is treated as "".

    Returns:
        The chunks whose text is at least 30 characters long, in order of
        appearance. If none qualifies, a copy of DEFAULT_NOTES is returned.
    """
    heading_re = re.compile(r'^\s*#{3,}\s*(.+?)\s*$')  # ### heading
    # Each entry is (doc_id, collected lines); a heading opens a new entry.
    sections: List[Tuple[str, List[str]]] = [("notes", [])]
    for ln in (corpus_text or "").splitlines():
        m = heading_re.match(ln)
        if m:
            # Normalize id
            doc_id = re.sub(r'[^a-z0-9_]+', '_', m.group(1).strip().lower())
            sections.append((doc_id, []))
        elif ln.lstrip().startswith("#"):
            # Comment line (e.g. "# Format suggestions:"): not study content.
            continue
        else:
            sections[-1][1].append(ln)
    notes = [(doc_id, "\n".join(buf).strip()) for doc_id, buf in sections if buf]
    # Fallback to defaults if parsed content is too small
    ok_notes = [(doc_id, body) for doc_id, body in notes if len(body) >= 30]
    # Return a copy so callers cannot mutate the shared module-level default.
    return ok_notes if ok_notes else list(DEFAULT_NOTES)
@dataclass
class PassageIndex:
    """A fitted TF-IDF vectorizer together with the passages it was fitted on."""
    # Fitted vectorizer; score_option() reuses it to transform queries.
    vectorizer: TfidfVectorizer
    # Result of vectorizer.fit_transform(texts); row i corresponds to texts[i].
    # NOTE(review): annotated as np.ndarray, but scikit-learn documents
    # fit_transform as returning a sparse matrix — confirm and adjust the hint.
    X: np.ndarray
    # Source doc_id of each passage, parallel to `texts`.
    ids: List[str]
    # Passage strings, parallel to `ids` and to the rows of `X`.
    texts: List[str]
def build_corpus_passages(notes: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Expand (doc_id, text) notes into (doc_id, passage) pairs.

    Each note's text is run through split_passages(); every resulting passage
    is paired with the doc_id of the note it came from, preserving order.
    """
    return [
        (doc_id, passage)
        for doc_id, body in notes
        for passage in split_passages(body)
    ]
def build_index_from_corpus(corpus_text: str) -> PassageIndex:
    """Parse the corpus text, split it into passages and fit a TF-IDF index.

    Args:
        corpus_text: Raw notes text (optionally with '### heading' lines).

    Returns:
        A PassageIndex holding the fitted vectorizer, the TF-IDF matrix and
        the parallel lists of doc_ids and passage texts. If no passage
        survives splitting, a single "placeholder" passage is indexed.

    Raises:
        ValueError: If scikit-learn cannot build any vocabulary from the
            passages even without document-frequency pruning.
    """
    passages = build_corpus_passages(parse_corpus_to_notes(corpus_text))
    ids = [doc_id for doc_id, _ in passages]
    texts = [passage for _, passage in passages]
    if not texts:
        # Ensure vectorizer has at least one doc
        texts = ["placeholder"]
        ids = ["placeholder"]

    def fit(max_df):
        # Fit a fresh vectorizer on `texts` with the given max_df threshold.
        vec = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), max_df=max_df, min_df=1)
        return vec, vec.fit_transform(texts)

    try:
        vec, X = fit(0.9)
    except ValueError:
        # With very few passages max_df=0.9 is unsatisfiable: for a single
        # document it corresponds to fewer documents than min_df=1, and for a
        # handful of near-identical passages it can prune every term. Retry
        # without pruning instead of crashing the app.
        vec, X = fit(1.0)
    return PassageIndex(vec, X, ids, texts)
# --------------------------
# MCQ scoring
# --------------------------
@dataclass
class OptionScore:
    """Scoring result for one answer option of a multiple-choice question."""
    # Label of the option (run_mcq() assigns letters starting at "A").
    label: str
    # The option text as entered.
    text: str
    # Mean similarity of the evidence passages, as computed by score_option().
    total: float
    # (similarity, passage text) pairs, highest similarity first.
    evidence: List[Tuple[float, str]]
def score_option(index: PassageIndex, question: str, option_text: str, top_k=3) -> Tuple[float, List[Tuple[float, str]]]:
    """Score one answer option against the indexed passages.

    The question and option are joined into a single query, transformed with
    the index's vectorizer and compared to every passage by cosine similarity.

    Args:
        index: The passage index to search.
        question: Question text.
        option_text: Text of the candidate answer.
        top_k: Number of best-matching passages to keep as evidence.

    Returns:
        (total, evidence), where evidence is a list of (similarity, passage)
        pairs ordered from most to least similar and total is the mean of
        those similarities (0.0 when there is no evidence).
    """
    query_vec = index.vectorizer.transform([" ".join((question, option_text)).strip()])
    similarities = cosine_similarity(query_vec, index.X)[0]
    # Ascending argsort reversed gives passage rows from best to worst match.
    ranked_rows = np.argsort(similarities)[::-1]
    evidence = []
    for row in ranked_rows[:top_k]:
        evidence.append((float(similarities[row]), index.texts[row]))
    if not evidence:
        return 0.0, evidence
    mean_score = float(np.mean([score for score, _ in evidence]))
    return mean_score, evidence
def answer_mcq(index: PassageIndex, question: str, options: List[Tuple[str, str]], top_k=3) -> OptionScore:
    """Pick the best-supported option for a multiple-choice question.

    Args:
        index: The passage index to search.
        question: Question text.
        options: (label, option text) pairs to evaluate.
        top_k: Number of evidence passages averaged per option.

    Returns:
        The OptionScore with the highest total; on ties the option that comes
        first in `options` wins. With no options, a placeholder OptionScore
        labelled "-" is returned.
    """
    candidates = [
        OptionScore(label, text, *score_option(index, question, text, top_k=top_k))
        for label, text in options
    ]
    if not candidates:
        return OptionScore("-", "(no options)", 0.0, [])
    # max() returns the first maximal element, matching a stable descending sort.
    return max(candidates, key=lambda candidate: candidate.total)
# --------------------------
# Gradio UI logic
# --------------------------
def rebuild_index(corpus_text):
    """Rebuild the passage index from the corpus textbox content.

    Returns:
        (index, status message) — the new PassageIndex and a short Markdown
        status line reporting how many passages were indexed.
    """
    idx = build_index_from_corpus(corpus_text)
    # The check mark was previously emitted as mis-decoded bytes.
    return idx, f"✅ Corpus indexed (passages = {len(idx.texts)})"
def run_mcq(idx: PassageIndex, question: str, opts: List[str], top_k: int):
    """Answer a multiple-choice question and format the result for display.

    Args:
        idx: The passage index to search.
        question: Question text; blank or None triggers the prompt message.
        opts: Option texts in display order. Blank or None entries are
            skipped but still consume a letter, so the remaining options keep
            the label of the box they were typed into.
        top_k: Number of evidence passages averaged per option.

    Returns:
        (headline, evidence text). When the question is blank or no option is
        filled in, the headline asks for input and the evidence text is "".
    """
    # Parse options: keep non-empty; label A, B, C, ... by position. Letters
    # are generated rather than looked up in a fixed "ABCDEF" table so more
    # than six options do not raise IndexError.
    pairs = [
        (chr(ord("A") + i), o.strip())
        for i, o in enumerate(opts)
        if o and o.strip()
    ]
    if not question or not question.strip() or not pairs:
        return "Enter a question and at least one option.", ""
    best = answer_mcq(idx, question, pairs, top_k=top_k)
    # Pretty output (arrow and bullet were previously mis-decoded bytes)
    head = f"→ PICK: {best.label}) {best.text} [score={best.total:.4f}]"
    ev = "\n".join(f" • ({s:.4f}) {p}" for s, p in best.evidence)
    return head, f"Evidence passages:\n{ev}"
# Gradio UI: three tabs (corpus editor, MCQ form, canned demo questions).
with gr.Blocks(title="Study Buddy — Transparent MCQ Helper") as demo:
    gr.Markdown("# Study Buddy — Transparent MCQ Helper\nA tiny TF-IDF + cosine similarity bot. Paste your notes, build an index, then ask a multiple-choice question. The bot shows the top evidence it used.")

    # Index of the built-in corpus, built once at startup. It seeds the
    # per-session state and is reused by the demo tab instead of being
    # rebuilt on every button click.
    default_index = build_index_from_corpus(DEFAULT_CORPUS_TEXT)

    with gr.Tab("Corpus"):
        corpus_text = gr.Textbox(value=DEFAULT_CORPUS_TEXT, lines=20, label="Your notes (editable)")
        build_btn = gr.Button("Rebuild Corpus Index")
        status = gr.Markdown()
        state_idx = gr.State(default_index)
        build_btn.click(fn=rebuild_index, inputs=corpus_text, outputs=[state_idx, status])

    with gr.Tab("Ask MCQ"):
        q = gr.Textbox(lines=3, label="Question")
        with gr.Row():
            oA = gr.Textbox(label="Option A")
            oB = gr.Textbox(label="Option B")
        with gr.Row():
            oC = gr.Textbox(label="Option C")
            oD = gr.Textbox(label="Option D")
        with gr.Accordion("Add more options (optional)", open=False):
            with gr.Row():
                oE = gr.Textbox(label="Option E")
                oF = gr.Textbox(label="Option F")
        topk = gr.Slider(1, 5, value=3, step=1, label="Evidence passages to average (k)")
        go = gr.Button("Answer")
        pick = gr.Markdown()
        evidence = gr.Markdown()

        def _run(idx, q, a, b, c, d, e, f, k):
            # Collect the six option boxes into a list; the slider value is
            # converted to int before being used as top_k.
            return run_mcq(idx, q, [a, b, c, d, e, f], int(k))

        go.click(_run, inputs=[state_idx, q, oA, oB, oC, oD, oE, oF, topk], outputs=[pick, evidence])

    with gr.Tab("Demo (3 sample MCQs)"):
        gr.Markdown("Click a button to load a sample Q with options.")
        demo_out = gr.Markdown()

        def load_demo(which):
            """Answer sample question "Q1" or "Q2" (anything else means Q3)
            against the built-in corpus and return the result as Markdown."""
            if which == "Q1":
                question = "Given elevated drought risk and limited cash, which short term strategy is best?"
                opts = ["Purchase a large quantity of supplemental feed immediately",
                        "Wait two months and hope for rain",
                        "Lease additional pasture short term to relieve pressure",
                        "Increase stocking rate to maintain output"]
            elif which == "Q2":
                question = "Under straight line depreciation, what is the annual depreciation formula?"
                # Minus and multiplication signs were previously mis-decoded bytes.
                opts = ["Annual Depreciation = Cost / Life",
                        "Annual Depreciation = (Cost − Salvage) / Life",
                        "Annual Depreciation = (Cost − Salvage) × Life",
                        "Annual Depreciation equals Ending BV − Beginning BV"]
            else:
                question = "During slow pasture growth in a rotational system, which action is most conservative?"
                opts = ["Reduce stocking rate by around 10–15%",
                        "Wait for rain without changing anything",
                        "Graze all paddocks continuously to keep animals fed",
                        "Overseed high traffic paddocks only"]
            # Demo answers always use the built-in corpus, not the user's edits.
            head, ev = run_mcq(default_index, question, opts, top_k=3)
            listing = "\n".join(f"- {chr(65 + i)}) {opt}" for i, opt in enumerate(opts))
            return f"**Q:** {question}\n\n" + listing + f"\n\n{head}\n\n{ev}"

        with gr.Row():
            b1 = gr.Button("Load demo Q1")
            b2 = gr.Button("Load demo Q2")
            b3 = gr.Button("Load demo Q3")
        b1.click(lambda: load_demo("Q1"), outputs=demo_out)
        b2.click(lambda: load_demo("Q2"), outputs=demo_out)
        b3.click(lambda: load_demo("Q3"), outputs=demo_out)

if __name__ == "__main__":
    demo.launch()