Spaces:

Nipun
/

agent-101

Sleeping

Nipun Claude Opus 4.6 (1M context) commited on Apr 15

Commit

4c218d7

1 Parent(s): 4068623

Add Mermaid flowchart tab, simplify How-it-works, new tools + hard examples

- New tab: Flowchart — a Mermaid diagram showing the plain-LLM path vs.
the tool-calling agent loop, rendered via Mermaid.js from CDN.
- How it works tab trimmed: short piece-table + 7-line pseudocode up
front; the long system-prompt and per-tool schemas moved into a
collapsible accordion.
- Two new tools: get_time (timezones/cities) and get_exchange_rate
(demo-rate currency conversion) with matching JSON schemas.
- Harder arithmetic examples (powers, percentages, square roots,
mixed-unit multi-step) to make plain-LLM failures more obvious.
- Parse tool-call leaks in three shapes now — JSON, Python
[tool(k=v)] (Gemma 3 symptom), and <function=foo {...}> (Llama).
Gemma was emitting [convert_units(from_unit=celsius, ...)] as final
content; the loop now re-enters and runs it.
- Tightened system prompt to forbid Python/JSON tool-call syntax in the
natural-language reply.
- Custom CSS: gradient title card, colour-coded plain vs. agent panels,
highlighted final-answer box.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (1) hide show

app.py +286 -56

app.py CHANGED Viewed

@@ -15,6 +15,7 @@ needed on the Space. Set HF_TOKEN as a Space secret.
 from __future__ import annotations
 import inspect
 import json
 import os
@@ -47,21 +48,26 @@ SYSTEM_PROMPT = """You are a helpful assistant with access to tools.
 Rules you must follow:
 1. For ANY arithmetic — even simple multiplication like 5 * 7 — call the
    `calculate` tool. Never compute numbers in your head.
-2. For real-time facts like weather, call the matching tool (`get_weather`).
-   Do not guess or say you don't know.
 3. For unit conversions, call `convert_units` — don't approximate.
-4. For questions about the CS 203 course ("what week did we cover X?"),
    call `search_notes`.
-5. For definitions of CS / ML terms, call `define_word`.
-6. Multi-step questions need multiple tool calls. Example:
    "5 km/day for a week in miles" → first `convert_units`, then
    `calculate` to multiply by 7.
-7. After every tool result, decide: do I need another tool, or can I
    write the final answer? Only answer once you have ALL the numbers.
-8. If the question genuinely doesn't need any tool (e.g. "capital of
    France"), answer directly.
-Always return your final answer as plain, natural English — no JSON.
 """
@@ -139,6 +145,63 @@ def search_notes(query: str) -> str:
     return json.dumps({"message": f"No results for '{query}'. Try: {', '.join(list(topics.keys())[:5])}"})
 def define_word(word: str) -> str:
     definitions = {
         "overfitting":    "When a model learns training data too well (including noise) and performs poorly on new data.",
@@ -221,6 +284,34 @@ TOOL_SCHEMAS = [
             },
         },
     },
 ]
 TOOL_FUNCTIONS = {
@@ -229,6 +320,8 @@ TOOL_FUNCTIONS = {
     "convert_units": convert_units,
     "search_notes": search_notes,
     "define_word": define_word,
 }
@@ -281,20 +374,59 @@ def _parse_args(raw):
     return {}
-# Fallback: some models (small Llamas especially) emit tool calls as
-# plain-text JSON in `content` instead of structured `tool_calls`.
-# We scrape the content for those shapes so the loop doesn't silently die.
-_LEAKED_CALL_RE = re.compile(
-    r'\{[^{}]*"name"\s*:\s*"(?P<name>[a-z_]+)"[^{}]*"arguments"\s*:\s*(?P<args>\{[^{}]*\})[^{}]*\}',
-    re.IGNORECASE,
 )
 def _extract_leaked_tool_calls(content: str):
     if not content:
         return []
     calls = []
-    for m in _LEAKED_CALL_RE.finditer(content):
         name = m.group("name")
         try:
             args = json.loads(m.group("args"))
@@ -302,6 +434,13 @@ def _extract_leaked_tool_calls(content: str):
             continue
         if name in TOOL_FUNCTIONS:
             calls.append({"name": name, "arguments": args})
     return calls
@@ -445,78 +584,160 @@ TOOLS_MARKDOWN = "\n\n".join(
 HOW_IT_WORKS = f"""
-## The agent loop
 ```
-messages = [system prompt, user question]
-repeat up to max_steps:
-    response = model.chat(messages, tools=tool_schemas)
-    if response has NO tool_calls:
-        return response as final answer
-    for each tool_call in response:
-        result = TOOL_FUNCTIONS[tool_call.name](**tool_call.args)
-        append result to messages
-    go back to top
 ```
-That's it. The model is a decision-maker — **your** code executes the
-tools.
----
-## System prompt
-```text
-{SYSTEM_PROMPT}
-```
----
-## Tools
-{TOOLS_MARKDOWN}
 """
 # --- UI ------------------------------------------------------------------
 EXAMPLES = [
     "What is 4729 times 8314?",
     "What's the temperature in Gandhinagar in Fahrenheit?",
     "How much hotter is Delhi than Bangalore right now, in degrees Celsius?",
     "If I run 5 km every day for a week, how many miles is that total?",
     "Which week of CS 203 covered Docker?",
     "What does 'overfitting' mean?",
     "What is the capital of France?",
 ]
-DESCRIPTION = """
-# Agent 101 — LLM vs. LLM-with-tools
-Both panels use the **same model**. The only difference: the right panel
-hands the model a toolkit (calculator, weather lookup, unit converter,
-course-notes search, CS/ML dictionary) and runs an **agent loop**.
-Watch what happens on questions the plain LLM can't answer from memory —
-exact arithmetic, real-time weather, course-specific facts.
-Based on the Week 12 lab for CS 203 at IIT Gandhinagar —
-[colab notebook](https://github.com/nipunbatra/stt-ai-teaching/blob/master/lecture-demos/week12/colab-notebooks/01-agents-from-scratch.ipynb).
 """
-with gr.Blocks(title="Agent 101", theme=gr.themes.Soft()) as demo:
-    gr.Markdown(DESCRIPTION)
     with gr.Tabs():
         with gr.Tab("Demo"):
             with gr.Row():
                 question = gr.Textbox(
                     label="Ask something",
-                    placeholder="e.g. How much hotter is Delhi than Bangalore right now?",
                     lines=2,
                     scale=4,
                 )
@@ -526,25 +747,34 @@ with gr.Blocks(title="Agent 101", theme=gr.themes.Soft()) as demo:
                     value=DEFAULT_MODEL,
                     scale=2,
                 )
-            go = gr.Button("Run both", variant="primary")
             with gr.Row():
-                with gr.Column():
                     gr.Markdown("### 🚫 Without tools\n_Plain LLM — no calculator, no weather, no notes._")
                     out_plain = gr.Markdown()
-                with gr.Column():
                     gr.Markdown("### 🛠️ With tools (agent loop)\n_Same model, but it can call functions._")
-                    out_final = gr.Markdown()
                     with gr.Accordion("Step-by-step trace", open=True):
                         out_trace = gr.Markdown()
-            gr.Examples(examples=EXAMPLES, inputs=question)
             go.click(run_both, inputs=[question, model_choice], outputs=[out_plain, out_trace, out_final])
             question.submit(run_both, inputs=[question, model_choice], outputs=[out_plain, out_trace, out_final])
         with gr.Tab("How it works"):
             gr.Markdown(HOW_IT_WORKS)
 if __name__ == "__main__":
     demo.queue().launch()

 from __future__ import annotations
+import datetime as _dt
 import inspect
 import json
 import os
 Rules you must follow:
 1. For ANY arithmetic — even simple multiplication like 5 * 7 — call the
    `calculate` tool. Never compute numbers in your head.
+2. For real-time facts like weather or current time, call the matching
+   tool (`get_weather`, `get_time`). Do not guess or say you don't know.
 3. For unit conversions, call `convert_units` — don't approximate.
+4. For currency conversion, call `get_exchange_rate`.
+5. For questions about the CS 203 course ("what week did we cover X?"),
    call `search_notes`.
+6. For definitions of CS / ML terms, call `define_word`.
+7. Multi-step questions need multiple tool calls. Example:
    "5 km/day for a week in miles" → first `convert_units`, then
    `calculate` to multiply by 7.
+8. After every tool result, decide: do I need another tool, or can I
    write the final answer? Only answer once you have ALL the numbers.
+9. If the question genuinely doesn't need any tool (e.g. "capital of
    France"), answer directly.
+CRITICAL: when you ARE calling a tool, use the provider's structured
+tool_calls interface — do NOT write tool calls as Python-style text like
+`[get_weather(city="Delhi")]` or as JSON in your reply. If you want to
+call a tool, emit a real tool_call; otherwise write a final natural-
+English answer with the numbers spelled out.
 """
     return json.dumps({"message": f"No results for '{query}'. Try: {', '.join(list(topics.keys())[:5])}"})
+def get_time(timezone: str = "UTC") -> str:
+    """Return the current wall-clock time in a named timezone or UTC offset."""
+    offsets = {
+        "utc": 0, "ist": 5.5, "gmt": 0, "bst": 1,
+        "edt": -4, "est": -5, "pdt": -7, "pst": -8,
+        "jst": 9, "kst": 9, "cst": 8, "cet": 1, "eet": 2,
+        "sgt": 8, "hkt": 8, "aest": 10,
+        "gandhinagar": 5.5, "mumbai": 5.5, "delhi": 5.5, "bangalore": 5.5,
+        "tokyo": 9, "seoul": 9, "singapore": 8, "london": 0, "paris": 1,
+        "new york": -4, "san francisco": -7, "sydney": 10,
+    }
+    key = timezone.lower().strip()
+    offset_h = offsets.get(key)
+    if offset_h is None:
+        m = re.match(r"^utc([+-])(\d+(?:\.\d+)?)$", key)
+        if m:
+            offset_h = float(m.group(2)) * (1 if m.group(1) == "+" else -1)
+    if offset_h is None:
+        return json.dumps({"error": f"Unknown timezone '{timezone}'. Try UTC, IST, JST, 'Tokyo', 'New York', etc."})
+    now_utc = _dt.datetime.utcnow()
+    local = now_utc + _dt.timedelta(hours=offset_h)
+    return json.dumps({
+        "timezone": timezone,
+        "utc_offset_hours": offset_h,
+        "iso": local.strftime("%Y-%m-%d %H:%M:%S"),
+        "day_of_week": local.strftime("%A"),
+    })
+def get_exchange_rate(from_currency: str, to_currency: str, amount: float = 1.0) -> str:
+    """Mock currency converter — rates are approximate and fixed for the demo."""
+    # Rates expressed as "1 unit of X in USD".
+    usd_rates = {
+        "usd": 1.0, "eur": 1.08, "gbp": 1.27, "jpy": 0.0067,
+        "inr": 0.012, "cny": 0.14, "aud": 0.66, "cad": 0.73,
+        "sgd": 0.74, "chf": 1.13, "krw": 0.00075,
+    }
+    try:
+        amount = float(amount)
+    except (TypeError, ValueError):
+        return json.dumps({"error": f"amount '{amount}' is not a number"})
+    f = str(from_currency).lower().strip()
+    t = str(to_currency).lower().strip()
+    if f not in usd_rates or t not in usd_rates:
+        return json.dumps({
+            "error": f"Unsupported currency pair {from_currency}->{to_currency}",
+            "supported": sorted(usd_rates.keys()),
+        })
+    usd = amount * usd_rates[f]
+    converted = usd / usd_rates[t]
+    return json.dumps({
+        "from": f.upper(), "to": t.upper(),
+        "amount": amount, "converted": round(converted, 2),
+        "note": "rates are approximate demo values, not live market data",
+    })
 def define_word(word: str) -> str:
     definitions = {
         "overfitting":    "When a model learns training data too well (including noise) and performs poorly on new data.",
             },
         },
     },
+    {
+        "type": "function",
+        "function": {
+            "name": "get_time",
+            "description": "Get the current wall-clock time in a given timezone or city. Accepts UTC, IST, JST, EST, PDT, 'Tokyo', 'New York', etc.",
+            "parameters": {
+                "type": "object",
+                "properties": {"timezone": {"type": "string", "description": "Timezone code or city, e.g. 'IST' or 'Tokyo'"}},
+                "required": ["timezone"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "get_exchange_rate",
+            "description": "Convert a currency amount using fixed demo rates. Supports USD, EUR, GBP, JPY, INR, CNY, AUD, CAD, SGD, CHF, KRW.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "from_currency": {"type": "string", "description": "Source ISO code, e.g. 'USD'"},
+                    "to_currency":   {"type": "string", "description": "Target ISO code, e.g. 'INR'"},
+                    "amount":        {"type": "number", "description": "Amount to convert"},
+                },
+                "required": ["from_currency", "to_currency", "amount"],
+            },
+        },
+    },
 ]
 TOOL_FUNCTIONS = {
     "convert_units": convert_units,
     "search_notes": search_notes,
     "define_word": define_word,
+    "get_time": get_time,
+    "get_exchange_rate": get_exchange_rate,
 }
     return {}
+# Fallback: some models emit tool calls as plain text instead of as
+# structured `tool_calls`. We scrape a few common shapes so the loop
+# doesn't silently give up:
+#   - JSON     : {"name": "foo", "arguments": {...}}
+#   - Python   : [foo(k=v, k="v")]   (Gemma 3)
+#   - Llama    : <function=foo {"k": "v"}>
+_JSON_LEAK_RE = re.compile(
+    r'\{[^{}]*"name"\s*:\s*"(?P<name>[a-zA-Z_]+)"[^{}]*"arguments"\s*:\s*(?P<args>\{[^{}]*\})[^{}]*\}'
 )
+_PY_LEAK_RE = re.compile(r"\[?\s*(?P<name>[a-zA-Z_]\w*)\s*\((?P<args>[^)]*)\)\s*\]?")
+_LLAMA_LEAK_RE = re.compile(r"<function=(?P<name>[a-zA-Z_]\w*)\s*(?P<args>\{.*?\})\s*>", re.DOTALL)
+def _coerce_scalar(s: str):
+    s = s.strip()
+    if not s:
+        return s
+    if (s[0] == s[-1]) and s[0] in {'"', "'"}:
+        return s[1:-1]
+    try:
+        if "." in s:
+            return float(s)
+        return int(s)
+    except ValueError:
+        return s
+def _parse_py_args(arg_str: str) -> dict:
+    """Parse a loose k=v, k=v string used in Python-like tool-call leaks."""
+    args = {}
+    # Split on commas that are not inside quotes.
+    parts = re.findall(r'(?:[^,"\']|"[^"]*"|\'[^\']*\')+', arg_str)
+    for part in parts:
+        if "=" not in part:
+            continue
+        k, v = part.split("=", 1)
+        args[k.strip()] = _coerce_scalar(v)
+    return args
 def _extract_leaked_tool_calls(content: str):
     if not content:
         return []
     calls = []
+    for m in _JSON_LEAK_RE.finditer(content):
+        name = m.group("name")
+        try:
+            args = json.loads(m.group("args"))
+        except Exception:
+            continue
+        if name in TOOL_FUNCTIONS:
+            calls.append({"name": name, "arguments": args})
+    for m in _LLAMA_LEAK_RE.finditer(content):
         name = m.group("name")
         try:
             args = json.loads(m.group("args"))
             continue
         if name in TOOL_FUNCTIONS:
             calls.append({"name": name, "arguments": args})
+    if not calls:
+        for m in _PY_LEAK_RE.finditer(content):
+            name = m.group("name")
+            if name not in TOOL_FUNCTIONS:
+                continue
+            args = _parse_py_args(m.group("args"))
+            calls.append({"name": name, "arguments": args})
     return calls
 HOW_IT_WORKS = f"""
+## The three pieces of an agent
+| Piece | What it does |
+|:--|:--|
+| **LLM** | Decides what to do next |
+| **Tools** | Python functions the LLM can call |
+| **Loop** | Keep asking the LLM until it's done |
+## Pseudocode
+```python
+while True:
+    response = llm.chat(messages, tools=schemas)
+    if not response.tool_calls:
+        return response.text          # done
+    for call in response.tool_calls:
+        result = TOOLS[call.name](**call.args)
+        messages.append(result)       # feed back, loop again
 ```
+The LLM never runs code. It just *names* a tool and *your* code runs it.
+"""
+TOOLS_DETAIL = f"""
+## System prompt
+```text
+{SYSTEM_PROMPT}
 ```
+## Tools the agent has
+{TOOLS_MARKDOWN}
+"""
+# Mermaid flowchart — rendered inside a raw HTML block with CDN Mermaid.
+FLOWCHART_HTML = """
+<div style="padding: 1rem 0;">
+<h2>Without tools vs. with tools</h2>
+<p>Same model on both sides. The only difference is the loop on the right.</p>
+<div class="mermaid">
+flowchart TB
+  subgraph PLAIN["🚫 Without tools"]
+    direction TB
+    U1["User question"] --> L1["LLM generates<br/>text answer"]
+    L1 --> A1["Final answer<br/>(may hallucinate,<br/>can't use real data)"]
+  end
+  subgraph AGENT["🛠️ With tools (agent loop)"]
+    direction TB
+    U2["User question"] --> L2["LLM sees question<br/>+ tool menu"]
+    L2 --> D{"Tool call<br/>needed?"}
+    D -- "No" --> F["Final answer<br/>(grounded in<br/>tool results)"]
+    D -- "Yes" --> T["Run tool<br/>in your code"]
+    T --> R["Append result<br/>to messages"]
+    R --> L2
+  end
+  classDef plain fill:#fdecea,stroke:#c0392b,color:#2c1810;
+  classDef agent fill:#e8f5e9,stroke:#1e8449,color:#0b2e13;
+  class PLAIN plain
+  class AGENT agent
+</div>
+<script type="module">
+  import mermaid from "https://cdn.jsdelivr.net/npm/mermaid@11/dist/mermaid.esm.min.mjs";
+  mermaid.initialize({ startOnLoad: true, theme: "default", securityLevel: "loose" });
+  // re-run when Gradio swaps tabs
+  const obs = new MutationObserver(() => mermaid.run());
+  obs.observe(document.body, { childList: true, subtree: true });
+</script>
+</div>
+"""
+CUSTOM_CSS = """
+#title-card {
+  background: linear-gradient(135deg, #4f46e5 0%, #ec4899 100%);
+  color: white;
+  padding: 1.25rem 1.5rem;
+  border-radius: 16px;
+  margin-bottom: 1rem;
+}
+#title-card h1 { color: white; margin: 0 0 .25rem 0; font-size: 1.8rem; }
+#title-card p  { color: rgba(255,255,255,.92); margin: 0; font-size: .95rem; }
+.panel {
+  border-radius: 14px;
+  padding: 1rem 1.1rem;
+  min-height: 220px;
+}
+.panel-plain { background: #fdecea; border: 1px solid #f5b7b1; }
+.panel-agent { background: #e8f5e9; border: 1px solid #a9dfbf; }
+.panel h3 { margin-top: 0 !important; }
+.final-box {
+  background: white;
+  border-left: 4px solid #1e8449;
+  padding: .75rem 1rem;
+  border-radius: 8px;
+  margin-bottom: .75rem;
+}
 """
+# --- UI ------------------------------------------------------------------
 # --- UI ------------------------------------------------------------------
 EXAMPLES = [
+    # --- Hard arithmetic — plain LLMs routinely get these wrong ---
     "What is 4729 times 8314?",
+    "Compute (127 ** 3) + (49 ** 4).",
+    "What is 31.5% of 128,400?",
+    "What is the square root of 987654, to 4 decimals?",
+    "Evaluate 2847 * 9183 - 17^5.",
+    # --- Single-tool: real-time data ---
     "What's the temperature in Gandhinagar in Fahrenheit?",
+    "What time is it in Tokyo right now?",
+    # --- Multi-tool chains ---
     "How much hotter is Delhi than Bangalore right now, in degrees Celsius?",
     "If I run 5 km every day for a week, how many miles is that total?",
+    "I have 1000 USD. How much is that in INR, and what time is it in Delhi?",
+    "If Paris is 18 C and New York is 22 C, what's the average in Fahrenheit?",
+    "I earn 2500 EUR/month. After converting to INR, what is my annual salary?",
+    # --- Course knowledge base ---
     "Which week of CS 203 covered Docker?",
     "What does 'overfitting' mean?",
+    # --- No-tool question (sanity check) ---
     "What is the capital of France?",
 ]
+TITLE_CARD = """
+<div id="title-card">
+  <h1>🤖 Agent 101 — LLM vs. LLM-with-tools</h1>
+  <p>Same model both sides. Left: plain LLM. Right: the same model with a toolkit and an agent loop. Watch what happens on questions the LLM can't answer from memory.</p>
+  <p style="margin-top:.4rem;font-size:.85rem;">Built for CS 203 at IIT Gandhinagar · <a href="https://github.com/nipunbatra/stt-ai-teaching/blob/master/lecture-demos/week12/colab-notebooks/01-agents-from-scratch.ipynb" style="color:white;text-decoration:underline;">companion Colab</a></p>
+</div>
 """
+with gr.Blocks(title="Agent 101", theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
+    gr.HTML(TITLE_CARD)
     with gr.Tabs():
         with gr.Tab("Demo"):
             with gr.Row():
                 question = gr.Textbox(
                     label="Ask something",
+                    placeholder="e.g. I have 1000 USD. How much is that in INR, and what time is it in Delhi?",
                     lines=2,
                     scale=4,
                 )
                     value=DEFAULT_MODEL,
                     scale=2,
                 )
+            go = gr.Button("Run both", variant="primary", size="lg")
             with gr.Row():
+                with gr.Column(elem_classes="panel panel-plain"):
                     gr.Markdown("### 🚫 Without tools\n_Plain LLM — no calculator, no weather, no notes._")
                     out_plain = gr.Markdown()
+                with gr.Column(elem_classes="panel panel-agent"):
                     gr.Markdown("### 🛠️ With tools (agent loop)\n_Same model, but it can call functions._")
+                    out_final = gr.Markdown(elem_classes="final-box")
                     with gr.Accordion("Step-by-step trace", open=True):
                         out_trace = gr.Markdown()
+            gr.Examples(
+                examples=EXAMPLES,
+                inputs=question,
+                label="Click an example to try it",
+            )
             go.click(run_both, inputs=[question, model_choice], outputs=[out_plain, out_trace, out_final])
             question.submit(run_both, inputs=[question, model_choice], outputs=[out_plain, out_trace, out_final])
+        with gr.Tab("Flowchart"):
+            gr.HTML(FLOWCHART_HTML)
         with gr.Tab("How it works"):
             gr.Markdown(HOW_IT_WORKS)
+            with gr.Accordion("System prompt + tool definitions", open=False):
+                gr.Markdown(TOOLS_DETAIL)
 if __name__ == "__main__":
     demo.queue().launch()