Spaces:

jeffrey1963
/

Study_Budy

Sleeping

App Files Files Community

Study_Budy / app.py

jeffrey1963

Create app.py

937ebf4 verified 7 months ago

raw

history blame contribute delete

10.9 kB

	# app.py — Study Buddy (transparent TF-IDF MCQ helper)
	# Hugging Face Space: Gradio app (no API keys required)

	from dataclasses import dataclass
	from typing import List, Tuple
	import re
	import numpy as np
	import gradio as gr

	# scikit-learn
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity

	# --------------------------
	# Default corpus (your text)
	# --------------------------
	# Built-in notes as (doc_id, text) pairs. parse_corpus_to_notes returns this
	# list as a fallback when the user-supplied corpus yields no chunk of at
	# least 30 characters.
	DEFAULT_NOTES = [
	("drought_planning", """
	Drought planning should consider risk, cash flow, forage, and animal welfare.
	Short term options include purchasing supplemental feed, leasing additional pasture, or modestly destocking (e.g., 10–15%).
	Leasing pasture helps preserve home forage and avoids overgrazing, often with lower upfront cash than large feed purchases.
	Waiting for rain without a contingency is risky due to price and yield volatility.
	"""),
	("depreciation_basics", """
	Straight line depreciation allocates (Cost − Salvage) evenly over an asset's useful life.
	Annual Depreciation = (Cost − Salvage) / Life.
	Ending book value should not fall below the salvage value.
	"""),
	("stocking_rate", """
	Rotational grazing with adequate rest improves pasture condition and resilience.
	During drought or slow growth, reduce stocking rate by 10–15% to protect ground cover and avoid overuse.
	Short term leasing of nearby pasture can also relieve pressure and stabilize production.
	"""),
	("budgeting_cashflow", """
	Enterprise budgets compare revenues and costs per activity; sensitivity analysis helps evaluate risk.
	When cash flow is tight, options with lower upfront cost or flexible payment terms can be preferable.
	"""),
	]

	# Initial contents of the editable "Your notes" textbox and the text that is
	# indexed at startup. Lines of the form "### name" are section headings that
	# parse_corpus_to_notes is meant to turn into doc_ids.
	DEFAULT_CORPUS_TEXT = """# Study Buddy Corpus (edit me)
	# Format suggestions:
	# - You can leave this as simple text. Paragraphs/sentences become "passages".
	# - Optionally add section headings like: ### drought_planning

	### drought_planning
	Drought planning should consider risk, cash flow, forage, and animal welfare.
	Short term options include purchasing supplemental feed, leasing additional pasture, or modestly destocking (e.g., 10–15%).
	Leasing pasture helps preserve home forage and avoids overgrazing, often with lower upfront cash than large feed purchases.
	Waiting for rain without a contingency is risky due to price and yield volatility.

	### depreciation_basics
	Straight line depreciation allocates (Cost − Salvage) evenly over an asset's useful life.
	Annual Depreciation = (Cost − Salvage) / Life.
	Ending book value should not fall below the salvage value.

	### stocking_rate
	Rotational grazing with adequate rest improves pasture condition and resilience.
	During drought or slow growth, reduce stocking rate by 10–15% to protect ground cover and avoid overuse.
	Short term leasing of nearby pasture can also relieve pressure and stabilize production.

	### budgeting_cashflow
	Enterprise budgets compare revenues and costs per activity; sensitivity analysis helps evaluate risk.
	When cash flow is tight, options with lower upfront cost or flexible payment terms can be preferable.
	"""

	# --------------------------
	# Passage splitting & index
	# --------------------------
	def split_passages(text: str, min_len: int = 30) -> List[str]:
	    """Split free text into sentence-like passages.

	    The text is cut at line breaks and at periods that end a sentence, i.e.
	    a run of periods followed by whitespace or by the end of the text.
	    Periods inside abbreviations or numbers ("e.g.," or "2.5") are not
	    boundaries, so such passages are no longer truncated mid-sentence.

	    Args:
	        text: Raw note text.
	        min_len: Minimum length, in characters after stripping, that a
	            fragment needs in order to be kept.

	    Returns:
	        The kept fragments in their original order, stripped of surrounding
	        whitespace and without the sentence-ending period.
	    """
	    # The lookahead keeps "e.g.," and "2.5" intact: their periods are
	    # followed by a non-space character, so they are not split points.
	    fragments = re.split(r'\.+(?=\s|\Z)|\n+', text)
	    stripped = (fragment.strip() for fragment in fragments)
	    return [fragment for fragment in stripped if len(fragment) >= min_len]

	def parse_corpus_to_notes(corpus_text: str) -> List[Tuple[str, str]]:
	    """Parse the editable textarea into (doc_id, text) chunks.

	    A line such as '### section_name' (three or more '#', optionally
	    indented, followed by whitespace and a title) starts a new chunk. Its
	    doc_id is the title lowercased, with every run of characters outside
	    [a-z0-9_] replaced by '_'. Text that precedes the first heading is filed
	    under the doc_id 'notes'.

	    Args:
	        corpus_text: The raw notes text.

	    Returns:
	        The chunks whose stripped text is at least 30 characters long, in
	        document order. If no chunk qualifies, a copy of DEFAULT_NOTES.
	    """
	    # Leading whitespace is optional so that headings written at column 0
	    # (the common case) and indented headings are both recognised.
	    heading_re = re.compile(r'^\s*#{3,}\s+(.+?)\s*$')

	    notes: List[Tuple[str, str]] = []
	    current_id = "notes"
	    current_buf: List[str] = []

	    for line in corpus_text.splitlines():
	        heading = heading_re.match(line)
	        if heading is None:
	            current_buf.append(line)
	            continue
	        # A heading closes the chunk collected so far (if any).
	        if current_buf:
	            notes.append((current_id, "\n".join(current_buf).strip()))
	            current_buf = []
	        current_id = re.sub(r'[^a-z0-9_]+', '_', heading.group(1).strip().lower())

	    if current_buf:
	        notes.append((current_id, "\n".join(current_buf).strip()))

	    # Fallback to defaults if parsed content is too small. Return a copy so
	    # callers cannot mutate the module-level default list.
	    ok_notes = [(doc_id, text) for doc_id, text in notes if len(text.strip()) >= 30]
	    return ok_notes if ok_notes else list(DEFAULT_NOTES)

	@dataclass
	class PassageIndex:
	    """A fitted TF-IDF vectorizer together with the passages it indexed."""

	    # Vectorizer fitted on `texts`; reused to transform queries.
	    vectorizer: TfidfVectorizer
	    # Result of vectorizer.fit_transform(texts), one row per passage.
	    # NOTE(review): scikit-learn presumably returns a SciPy sparse matrix
	    # here rather than a dense ndarray; confirm before using ndarray-only APIs.
	    X: np.ndarray
	    # doc_id of the note each passage came from, parallel to `texts`.
	    ids: List[str]
	    # The passage strings, in row order of `X`.
	    texts: List[str]

	def build_corpus_passages(notes: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
	    """Expand (doc_id, text) notes into one (doc_id, passage) pair per passage.

	    Each note's text is run through split_passages; every resulting passage
	    is tagged with the doc_id of the note it came from. Order is preserved.
	    """
	    return [
	        (doc_id, passage)
	        for doc_id, body in notes
	        for passage in split_passages(body)
	    ]

	def build_index_from_corpus(corpus_text: str) -> PassageIndex:
	    """Parse notes text, split it into passages and fit a TF-IDF index.

	    The vectorizer uses lowercased unigrams and bigrams. Terms that occur in
	    more than 90% of passages are dropped (max_df=0.9) when possible. For
	    very small corpora that cap cannot be applied, so the index is refitted
	    without it instead of failing.

	    Args:
	        corpus_text: Raw notes text, optionally with '### heading' sections.

	    Returns:
	        A PassageIndex over the passages. If nothing in the corpus can be
	        indexed, the index holds the single document "placeholder".
	    """
	    notes = parse_corpus_to_notes(corpus_text)
	    passages = build_corpus_passages(notes)
	    ids = [doc_id for doc_id, _ in passages]
	    texts = [passage for _, passage in passages]

	    if texts:
	        # scikit-learn raises ValueError when max_df leaves fewer documents
	        # than min_df (a single passage: 0.9 * 1 < 1) or when pruning
	        # removes every term, so retry with the cap disabled (max_df=1.0).
	        for max_df in (0.9, 1.0):
	            vec = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), max_df=max_df, min_df=1)
	            try:
	                X = vec.fit_transform(texts)
	            except ValueError:
	                continue
	            return PassageIndex(vec, X, ids, texts)

	    # Ensure vectorizer has at least one doc: nothing usable was found
	    # (no passages at all, or passages without any indexable token).
	    texts = ["placeholder"]
	    ids = ["placeholder"]
	    vec = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), max_df=1.0, min_df=1)
	    X = vec.fit_transform(texts)
	    return PassageIndex(vec, X, ids, texts)

	# --------------------------
	# MCQ scoring
	# --------------------------
	@dataclass
	class OptionScore:
	    """Score of one multiple-choice option plus the passages behind it."""

	    # Option letter shown to the user, e.g. "A".
	    label: str
	    # The option's text.
	    text: str
	    # Aggregate score used to rank the options.
	    total: float
	    # (similarity, passage) pairs that produced `total`.
	    evidence: List[Tuple[float, str]]

	def score_option(index: PassageIndex, question: str, option_text: str, top_k=3) -> Tuple[float, List[Tuple[float, str]]]:
	    """Score one answer option against the indexed passages.

	    The question and the option are joined into a single query, vectorized
	    with the index's vectorizer and compared to every passage by cosine
	    similarity.

	    Returns:
	        (total, evidence) where evidence is the top_k (similarity, passage)
	        pairs, best first, and total is the mean of those similarities
	        (0.0 when there is no evidence).
	    """
	    query_text = (question + " " + option_text).strip()
	    query_vec = index.vectorizer.transform([query_text])
	    similarities = cosine_similarity(query_vec, index.X)[0]

	    # argsort is ascending, so reverse it to rank the best passages first.
	    ranked = np.argsort(similarities)[::-1]
	    evidence = []
	    for pos in ranked[:top_k]:
	        evidence.append((float(similarities[pos]), index.texts[pos]))

	    if not evidence:
	        return 0.0, evidence
	    mean_score = float(np.mean([score for score, _ in evidence]))
	    return mean_score, evidence

	def answer_mcq(index: PassageIndex, question: str, options: List[Tuple[str, str]], top_k=3) -> OptionScore:
	    """Score every (label, text) option and return the best-scoring one.

	    Ties keep the option that appears first in `options`. When `options` is
	    empty, a placeholder OptionScore labelled "-" is returned.
	    """
	    candidates = []
	    for label, option_text in options:
	        total, evidence = score_option(index, question, option_text, top_k=top_k)
	        candidates.append(OptionScore(label, option_text, total, evidence))

	    if not candidates:
	        return OptionScore("-", "(no options)", 0.0, [])

	    # Stable descending sort: the first of several equal totals wins.
	    ranked = sorted(candidates, key=lambda cand: cand.total, reverse=True)
	    return ranked[0]

	# --------------------------
	# Gradio UI logic
	# --------------------------
	def rebuild_index(corpus_text):
	    """Rebuild the passage index from the textbox contents.

	    Returns the new index together with a status message that reports how
	    many passages were indexed.
	    """
	    new_index = build_index_from_corpus(corpus_text)
	    passage_count = len(new_index.texts)
	    status_message = "✅ Corpus indexed (passages = {})".format(passage_count)
	    return new_index, status_message

	def run_mcq(idx: PassageIndex, question: str, opts: List[str], top_k: int):
	# Parse options: keep non-empty; label A..F
	labels = list("ABCDEF")
	pairs = [(labels[i], o.strip()) for i, o in enumerate(opts) if o and o.strip()]
	if not question.strip() or not pairs:
	return "Enter a question and at least one option.", ""
	best = answer_mcq(idx, question, pairs, top_k=top_k)
	# Pretty output
	head = f"→ PICK: {best.label}) {best.text} [score={best.total:.4f}]"
	ev = "\n".join([f" • ({s:.4f}) {p}" for s, p in best.evidence])
	return head, f"Evidence passages:\n{ev}"

	with gr.Blocks(title="Study Buddy — Transparent MCQ Helper") as demo:
	    gr.Markdown("# Study Buddy — Transparent MCQ Helper\nA tiny TF-IDF + cosine similarity bot. Paste your notes, build an index, then ask a multiple-choice question. The bot shows the top evidence it used.")

	    # Fit the default corpus once. It seeds the gr.State below and is reused
	    # by the demo tab, which previously re-fitted TF-IDF on every click.
	    default_index = build_index_from_corpus(DEFAULT_CORPUS_TEXT)

	    with gr.Tab("Corpus"):
	        corpus_text = gr.Textbox(value=DEFAULT_CORPUS_TEXT, lines=20, label="Your notes (editable)")
	        build_btn = gr.Button("Rebuild Corpus Index")
	        status = gr.Markdown()
	        # Holds the current PassageIndex; replaced by rebuild_index.
	        state_idx = gr.State(default_index)
	        build_btn.click(fn=rebuild_index, inputs=corpus_text, outputs=[state_idx, status])

	    with gr.Tab("Ask MCQ"):
	        q = gr.Textbox(lines=3, label="Question")
	        with gr.Row():
	            oA = gr.Textbox(label="Option A")
	            oB = gr.Textbox(label="Option B")
	        with gr.Row():
	            oC = gr.Textbox(label="Option C")
	            oD = gr.Textbox(label="Option D")
	        with gr.Accordion("Add more options (optional)", open=False):
	            with gr.Row():
	                oE = gr.Textbox(label="Option E")
	                oF = gr.Textbox(label="Option F")
	        topk = gr.Slider(1, 5, value=3, step=1, label="Evidence passages to average (k)")
	        go = gr.Button("Answer")
	        pick = gr.Markdown()
	        evidence = gr.Markdown()

	        def _run(idx, q, a, b, c, d, e, f, k):
	            """Adapt the individual UI inputs to run_mcq's list-based signature."""
	            # The slider value is coerced to int before it is used as top_k.
	            return run_mcq(idx, q, [a, b, c, d, e, f], int(k))

	        go.click(_run, inputs=[state_idx, q, oA, oB, oC, oD, oE, oF, topk], outputs=[pick, evidence])

	    with gr.Tab("Demo (3 sample MCQs)"):
	        gr.Markdown("Click a button to load a sample Q with options.")
	        demo_out = gr.Markdown()

	        # Sample questions keyed by demo id: (question, options).
	        demo_questions = {
	            "Q1": (
	                "Given elevated drought risk and limited cash, which short term strategy is best?",
	                [
	                    "Purchase a large quantity of supplemental feed immediately",
	                    "Wait two months and hope for rain",
	                    "Lease additional pasture short term to relieve pressure",
	                    "Increase stocking rate to maintain output",
	                ],
	            ),
	            "Q2": (
	                "Under straight line depreciation, what is the annual depreciation formula?",
	                [
	                    "Annual Depreciation = Cost / Life",
	                    "Annual Depreciation = (Cost − Salvage) / Life",
	                    "Annual Depreciation = (Cost − Salvage) × Life",
	                    "Annual Depreciation equals Ending BV − Beginning BV",
	                ],
	            ),
	            "Q3": (
	                "During slow pasture growth in a rotational system, which action is most conservative?",
	                [
	                    "Reduce stocking rate by around 10–15%",
	                    "Wait for rain without changing anything",
	                    "Graze all paddocks continuously to keep animals fed",
	                    "Overseed high traffic paddocks only",
	                ],
	            ),
	        }

	        def load_demo(which):
	            """Answer a sample question against the default corpus.

	            Any id other than "Q1" or "Q2" selects the third sample. Returns
	            Markdown with the question, its lettered options, the pick and
	            the evidence passages.
	            """
	            question, opts = demo_questions.get(which, demo_questions["Q3"])
	            head, ev = run_mcq(default_index, question, opts, top_k=3)
	            option_lines = "\n".join([f"- {chr(65+i)}) {opt}" for i, opt in enumerate(opts)])
	            return f"Q: {question}\n\n" + option_lines + f"\n\n{head}\n\n{ev}"

	        with gr.Row():
	            b1 = gr.Button("Load demo Q1")
	            b2 = gr.Button("Load demo Q2")
	            b3 = gr.Button("Load demo Q3")
	        b1.click(lambda: load_demo("Q1"), outputs=demo_out)
	        b2.click(lambda: load_demo("Q2"), outputs=demo_out)
	        b3.click(lambda: load_demo("Q3"), outputs=demo_out)

	# Launch the Gradio app only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    demo.launch()