PaperProf / share_trace.py
Mehdi
feat: add share_trace.py — run session + push agent trace to HF Hub dataset
3f8b85e
Raw
History Blame Contribute Delete
9.82 kB
"""
share_trace.py — Run a live PaperProf session and push the agent trace to HF Hub.
Records each LLM step (question generation, answer evaluation, MCQ generation)
as a structured dataset so the community can see how PaperProf works end-to-end.
Usage:
python share_trace.py
Output:
Dataset pushed to build-small-hackathon/PaperProf-traces
"""
import json
import time
import uuid
import os
import sys
from datetime import datetime, timezone
# Put this script's own directory at the front of sys.path before the
# function-level `core.*` / `model.*` imports further down run.
# NOTE(review): presumably those packages sit next to this file — confirm.
sys.path.insert(0, os.path.dirname(__file__))
# Hub repo id that push_trace() creates (repo_type="dataset") and uploads to.
TRACE_REPO = "build-small-hackathon/PaperProf-traces"
# Demo inputs for the traced sessions: three text chunks, one each on
# operating systems, machine learning, and networking.
# Every entry carries the keys that run_session() reads:
#   "topic"           — label printed to the console and stored in each trace row
#   "chunk"           — source passage handed to the question / evaluation / MCQ calls
#   "student_answers" — canned answers; "open" is evaluated as the expected-correct
#                       answer and "wrong" as the expected-incorrect one
DEMO_CHUNKS = [
{
"topic": "Operating Systems — Virtual Memory",
"chunk": (
"Virtual memory is a memory management technique that gives each process the "
"illusion of having access to a large, contiguous block of memory. The OS maps "
"virtual addresses used by programs to physical addresses in RAM using a page table. "
"When a process accesses a page not currently in RAM, a page fault occurs and the OS "
"loads the required page from disk (swap space). This allows systems to run programs "
"larger than physical RAM and provides memory isolation between processes."
),
"student_answers": {
"open": "Virtual memory allows programs to use more memory than physically available by mapping virtual addresses to physical ones using a page table.",
"wrong": "Virtual memory is just another name for RAM, it speeds up the CPU cache.",
},
},
{
"topic": "Machine Learning — Gradient Descent",
"chunk": (
"Gradient descent is an iterative optimization algorithm used to minimize a loss "
"function by updating model parameters in the direction opposite to the gradient. "
"In each iteration, the gradient of the loss with respect to the parameters is "
"computed, and the parameters are updated as θ = θ − α∇L(θ), where α is the "
"learning rate. Too large a learning rate causes divergence; too small slows "
"convergence. Stochastic gradient descent (SGD) approximates the true gradient "
"using a random mini-batch at each step, making it scalable to large datasets."
),
"student_answers": {
"open": "Gradient descent minimizes the loss by repeatedly moving parameters opposite to the gradient, scaled by the learning rate.",
"wrong": "Gradient descent always finds the global minimum of any function.",
},
},
{
"topic": "Networking — TCP Three-Way Handshake",
"chunk": (
"The TCP three-way handshake establishes a reliable connection between a client "
"and server before data transfer begins. The client sends a SYN segment, the server "
"responds with SYN-ACK, and the client completes the handshake with an ACK. Each "
"side advertises its initial sequence number during this exchange, which is used to "
"order and acknowledge packets throughout the connection. This ensures both parties "
"are ready to send and receive before any application data flows."
),
"student_answers": {
"open": "The TCP handshake uses SYN, SYN-ACK, and ACK to synchronize sequence numbers and confirm both sides are ready to communicate.",
"wrong": "TCP uses a two-way handshake: SYN from client and ACK from server.",
},
},
]
def timed(fn, *args, **kwargs):
    """Call ``fn(*args, **kwargs)`` and measure how long the call takes.

    The elapsed time is taken from ``time.perf_counter()``, a monotonic
    high-resolution clock, so the reported duration cannot be skewed (or
    become negative) if the system clock is adjusted while ``fn`` runs —
    something ``time.time()`` does not guarantee.

    Args:
        fn: Callable to invoke.
        *args: Positional arguments forwarded to ``fn``.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        A ``(result, seconds)`` tuple: whatever ``fn`` returned, and the
        elapsed time in seconds rounded to two decimals.

    Raises:
        Whatever ``fn`` raises; nothing is caught here.
    """
    started = time.perf_counter()
    result = fn(*args, **kwargs)
    return result, round(time.perf_counter() - started, 2)
def run_session(chunk_info: dict, session_id: str, model_id: str) -> list[dict]:
    """Run the four-step demo session on one chunk and return its trace rows.

    The steps, each wrapped in ``timed``, are:
      1. ``generate_question`` on the chunk (English, "Normal" difficulty)
      2. ``evaluate_answer`` on the canned ``"open"`` answer (expected correct)
      3. ``evaluate_answer`` on the canned ``"wrong"`` answer (expected incorrect)
      4. ``generate_mcq`` on the chunk (English)

    Progress is printed to stdout as the steps run.

    Args:
        chunk_info: Mapping with ``"chunk"``, ``"topic"`` and
            ``"student_answers"`` (itself holding ``"open"`` and ``"wrong"``).
        session_id: Identifier copied into every row of this session.
        model_id: Model identifier copied into every row.

    Returns:
        Four dicts, in step order, each with the keys ``session_id``,
        ``step``, ``type``, ``topic``, ``input``, ``output``, ``duration_s``,
        ``model`` and ``timestamp`` (UTC, ISO 8601).
    """
    from core.questioner import generate_question, generate_mcq
    from core.evaluator import evaluate_answer

    trace = []
    text = chunk_info["chunk"]
    subject = chunk_info["topic"]
    canned = chunk_info["student_answers"]

    def record(index, kind, sent, received, seconds):
        # One trace row per LLM call; the timestamp is taken when the row is
        # recorded, i.e. after the call has finished and been printed.
        trace.append({
            "session_id": session_id,
            "step": index,
            "type": kind,
            "topic": subject,
            "input": sent,
            "output": received,
            "duration_s": seconds,
            "model": model_id,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        })

    rule = "=" * 60
    print(f"\n{rule}")
    print(f"Topic: {subject}")
    print(f"{rule}")

    # Step 1 — open question generation
    print("[1/4] Generating open question…")
    question, seconds = timed(generate_question, text, language="English", difficulty="Normal")
    print(f" Q: {question} ({seconds}s)")
    record(
        1,
        "question_generation",
        {"chunk": text, "difficulty": "Normal", "language": "English"},
        {"question": question},
        seconds,
    )

    # Steps 2 and 3 — evaluate the canned correct answer, then the wrong one.
    # Tuple layout: (step index, answer key, expected quality, console label).
    evaluations = (
        (2, "open", "correct", "correct"),
        (3, "wrong", "incorrect", "wrong"),
    )
    for index, answer_key, quality, label in evaluations:
        print(f"[{index}/4] Evaluating {quality} answer…")
        feedback, seconds = timed(
            evaluate_answer, question, text, canned[answer_key], language="English"
        )
        print(f" Feedback ({label}): {feedback[:80]}… ({seconds}s)")
        record(
            index,
            "answer_evaluation",
            {
                "chunk": text,
                "question": question,
                "student_answer": canned[answer_key],
                "expected_quality": quality,
            },
            {"feedback": feedback},
            seconds,
        )

    # Step 4 — MCQ generation
    print("[4/4] Generating MCQ…")
    mcq, seconds = timed(generate_mcq, text, language="English")
    preview = str(mcq.get("question", ""))[:80]
    print(f" MCQ question: {preview} ({seconds}s)")
    record(
        4,
        "mcq_generation",
        {"chunk": text, "language": "English"},
        {"mcq": mcq},
        seconds,
    )

    return trace
def push_trace(all_steps: list[dict], model_id: str):
    """Upload the trace rows and a dataset card to the ``TRACE_REPO`` dataset.

    The Hub client is built with the value of the ``HF_TOKEN`` environment
    variable (``None`` when unset). The dataset repo is created as public
    with ``exist_ok=True``, then two files are uploaded in this order:

      * ``paperprof_trace.jsonl`` — one JSON object per step, newline-joined,
        serialized with ``ensure_ascii=False``
      * ``README.md`` — the dataset card, with ``model_id`` interpolated

    Args:
        all_steps: Trace rows to serialize, one JSON line each.
        model_id: Model identifier written into the dataset card.
    """
    from huggingface_hub import HfApi

    hub = HfApi(token=os.environ.get("HF_TOKEN"))
    hub.create_repo(TRACE_REPO, repo_type="dataset", exist_ok=True, private=False)

    def upload(payload, destination, message):
        # Both files go to the same dataset repo; only these three values vary.
        hub.upload_file(
            path_or_fileobj=payload,
            path_in_repo=destination,
            repo_id=TRACE_REPO,
            repo_type="dataset",
            commit_message=message,
        )

    # JSONL trace file
    rows = [json.dumps(step, ensure_ascii=False) for step in all_steps]
    upload(
        "\n".join(rows).encode(),
        "paperprof_trace.jsonl",
        "chore: upload PaperProf agent trace",
    )

    # Dataset card (YAML front matter followed by Markdown)
    card = f"""---
license: apache-2.0
task_categories:
- question-answering
- text-generation
language:
- en
tags:
- agent-trace
- education
- paperprof
- build-small-hackathon
---
# PaperProf Agent Trace
Step-by-step trace of [PaperProf](https://huggingface.co/spaces/build-small-hackathon/PaperProf),
an AI study buddy that turns course PDFs into interactive quiz sessions.
## What's in this dataset
Each row in `paperprof_trace.jsonl` is one LLM call. Fields:
| Field | Description |
|---|---|
| `session_id` | Groups steps from the same session |
| `step` | Step index within the session (1–4) |
| `type` | `question_generation` / `answer_evaluation` / `mcq_generation` |
| `topic` | Domain of the source chunk |
| `input` | Exact input sent to the model (chunk, question, student answer…) |
| `output` | Raw model output |
| `duration_s` | Wall-clock inference time |
| `model` | Model ID used |
## Session structure
Each session runs 4 steps on one text chunk:
1. **Open question generation** — the model writes a focused exam question
2. **Correct answer evaluation** — structured tutor feedback on a good answer
3. **Wrong answer evaluation** — structured tutor feedback on a bad answer
4. **MCQ generation** — 4-option question with per-option explanations
Three sessions are included, covering: Operating Systems, Machine Learning, and Networking.
## Model
`{model_id}`
Built for the Build Small Hackathon, June 2026, by Team PaperProf (EPITA).
"""
    upload(card.encode(), "README.md", "chore: add dataset card")

    print(f"\n✅ Trace pushed → https://huggingface.co/datasets/{TRACE_REPO}")
def main():
    """Run one traced session per entry in ``DEMO_CHUNKS`` and push the result.

    Calls ``get_llm()`` once up front, resolves the model id from the
    ``PAPERPROF_MODEL`` environment variable (falling back to
    ``DEFAULT_MODEL_ID``), collects the rows of every session in order, and
    hands them to ``push_trace``.
    """
    from model.llm import get_llm, DEFAULT_MODEL_ID

    print("Loading model (first call may take 60–90s locally)…")
    get_llm()  # warm up; the return value is not needed here

    model_id = os.environ.get("PAPERPROF_MODEL", DEFAULT_MODEL_ID)

    # Each session gets a fresh 8-character id: the leading hex digits of a
    # random UUID.
    all_steps = [
        row
        for chunk_info in DEMO_CHUNKS
        for row in run_session(chunk_info, uuid.uuid4().hex[:8], model_id)
    ]

    print(f"\n[push] {len(all_steps)} steps captured across {len(DEMO_CHUNKS)} sessions…")
    push_trace(all_steps, model_id)


if __name__ == "__main__":
    main()