Final_Assignment_Template

Sleeping

App Files Files Community

johnnychiang committed on Jan 9

Commit

6051f37

verified ·

1 Parent(s): dd67f62

Update app.py

Browse files

Files changed (1) hide show

app.py +439 -125

app.py CHANGED Viewed

@@ -1,178 +1,492 @@
 import os
-import gradio as gr
 import requests
 import pandas as pd
-import re
-from huggingface_hub import InferenceClient
-# ===============================
-# Constants
-# ===============================
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
-# ===============================
-# Basic Agent (PASS VERSION)
-# ===============================
-class BasicAgent:
-    """
-    Minimal GAIA Level-1 agent.
-    Target: >=30% exact match
-    """
-    def __init__(self):
-        print("BasicAgent initialized (PASS MODE).")
-        # 必須在 Space → Settings → Secrets 設定
-        self.hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
-        if not self.hf_token:
-            raise RuntimeError("HF_TOKEN missing. Set it in Space Settings → Secrets.")
-        # 模型（可在 Variables 改）
-        self.model_id = os.getenv("MODEL_ID", "Qwen/Qwen2.5-7B-Instruct")
-        # ✅ 正確用法：不要給 base_url
-        self.client = InferenceClient(
-            model=self.model_id,
-            token=self.hf_token,
-            timeout=120,
-        )
-        self.system = (
-            "You answer questions with EXACT MATCH.\n"
-            "Return ONLY the final answer.\n"
-            "No explanation.\n"
-            "No extra words.\n"
-            "No punctuation unless required.\n"
-            "No quotes.\n"
-        )
-    def _sanitize(self, text: str) -> str:
-        if not text:
-            return ""
-        t = str(text).strip()
-        t = re.sub(r"(?i)final answer\s*[:\-]*", "", t)
-        t = re.sub(r"(?i)answer\s*[:\-]*", "", t)
-        lines = [ln.strip() for ln in t.splitlines() if ln.strip()]
-        if lines:
-            t = lines[-1]
-        t = t.strip().strip('"').strip("'")
-        t = re.sub(r"[.,;:!?]$", "", t)
-        return t
-    def __call__(self, question: str) -> str:
-        print(f"Q: {question[:60]}")
-        prompt = f"{self.system}\nQuestion: {question}\nAnswer:"
         try:
-            out = self.client.text_generation(
                 prompt,
-                max_new_tokens=64,
                 temperature=0.0,
                 do_sample=False,
                 return_full_text=False,
             )
-        except Exception:
-            out = self.client.chat_completion(
-                messages=[
-                    {"role": "system", "content": self.system},
-                    {"role": "user", "content": question},
-                ],
-                max_tokens=64,
-                temperature=0.0,
-            ).choices[0].message.content
-        ans = self._sanitize(out)
-        print(f"A: {ans}")
-        return ans
-# ===============================
-# Run & Submit
-# ===============================
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     space_id = os.getenv("SPACE_ID")
-    if not profile:
-        return "Please login with Hugging Face.", None
-    username = profile.username
-    print(f"User: {username}")
-    questions_url = f"{DEFAULT_API_URL}/questions"
-    submit_url = f"{DEFAULT_API_URL}/submit"
     try:
-        agent = BasicAgent()
     except Exception as e:
-        return f"Agent init error: {e}", None
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
-    resp = requests.get(questions_url, timeout=20)
-    resp.raise_for_status()
-    questions = resp.json()
     answers_payload = []
-    log_rows = []
-    for q in questions:
-        task_id = q["task_id"]
-        question = q["question"]
         try:
-            ans = agent(question)
-        except Exception:
-            ans = ""
-        answers_payload.append({
-            "task_id": task_id,
-            "submitted_answer": ans
-        })
-        log_rows.append({
-            "Task ID": task_id,
-            "Question": question,
-            "Submitted Answer": ans
-        })
-    submission = {
-        "username": username,
         "agent_code": agent_code,
-        "answers": answers_payload
     }
-    resp = requests.post(submit_url, json=submission, timeout=60)
-    resp.raise_for_status()
-    result = resp.json()
-    status = (
-        f"Submission Successful!\n"
-        f"User: {result.get('username')}\n"
-        f"Score: {result.get('score')}% "
-        f"({result.get('correct_count')}/{result.get('total_attempted')})\n"
-        f"{result.get('message')}"
-    )
-    return status, pd.DataFrame(log_rows)
-# ===============================
-# Gradio UI
-# ===============================
 with gr.Blocks() as demo:
-    gr.Markdown("# Basic Agent Evaluation Runner (PASS MODE)")
     gr.LoginButton()
-    run_btn = gr.Button("Run Evaluation & Submit All Answers")
-    status = gr.Textbox(label="Result", lines=6)
-    table = gr.DataFrame(label="Answers", wrap=True)
-    run_btn.click(fn=run_and_submit_all, outputs=[status, table])
 if __name__ == "__main__":
-    demo.launch()

 import os
+import re
+import json
+import math
 import requests
 import pandas as pd
+import gradio as gr
+from bs4 import BeautifulSoup
+from sympy import sympify
+from pint import UnitRegistry
+try:
+    from huggingface_hub import InferenceClient
+except Exception:
+    InferenceClient = None
+# Base URL of the scoring service; the /questions, /submit and /files endpoints are built from it.
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
+# Endpoints used by the lookup helpers (Wikidata SPARQL, Hugging Face Hub API, Open-Meteo forecast).
+WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
+HF_API_BASE = "https://huggingface.co/api"
+OPEN_METEO = "https://api.open-meteo.com/v1/forecast"
+# Single shared pint registry; Q is the quantity constructor used by try_unit_convert.
+ureg = UnitRegistry()
+Q = ureg.Quantity
+def http_get(url, timeout=20, headers=None, params=None):
+    """
+    Issue a GET request and return the response object.
+
+    Caller-supplied ``headers`` are layered on top of the default User-Agent
+    instead of replacing it, so a caller that only sets ``Accept`` (as
+    wikidata_query does) still sends an identifying User-Agent. Passing an
+    explicit "User-Agent" key still overrides the default.
+
+    Raises requests.HTTPError (via raise_for_status) on a 4xx/5xx response.
+    """
+    merged_headers = {
+        "User-Agent": "Mozilla/5.0 (compatible; GAIA-Agent/1.0; +https://huggingface.co)"
+    }
+    if headers:
+        merged_headers.update(headers)
+    r = requests.get(url, timeout=timeout, headers=merged_headers, params=params)
+    r.raise_for_status()
+    return r
+def wikidata_query(sparql: str):
+    """Send ``sparql`` to the Wikidata SPARQL endpoint and return the decoded JSON body."""
+    request_params = {"format": "json", "query": sparql}
+    accept_json = {"Accept": "application/sparql-results+json"}
+    response = http_get(WIKIDATA_SPARQL, params=request_params, headers=accept_json)
+    return response.json()
+def clean_answer(s: str) -> str:
+    """
+    Normalize a raw tool/model output into a short exact-match answer string.
+
+    Steps, in order: drop any "FINAL ANSWER" marker, drop fenced code blocks,
+    keep only the last non-empty line, strip surrounding quotes, and collapse
+    runs of whitespace. ``None`` yields "".
+
+    If dropping fenced blocks would leave nothing (the whole answer was wrapped
+    in a fence), only the fence markers are removed and the content is kept,
+    so the answer is not silently discarded.
+    """
+    if s is None:
+        return ""
+    s = str(s).strip()
+    # remove FINAL ANSWER patterns
+    s = re.sub(r"(?i)\bFINAL\s*ANSWER\b\s*[:\-]*\s*", "", s).strip()
+    # remove markdown/code fences together with their content
+    unfenced = re.sub(r"```.*?```", "", s, flags=re.S).strip()
+    if not unfenced and "```" in s:
+        # Everything was fenced: unwrap instead. A language tag is dropped only
+        # when it sits alone on the opening line, so "```42```" still yields "42".
+        unfenced = re.sub(r"```(?:[\w+-]*\n)?(.*?)```", r"\1", s, flags=re.S).strip()
+    s = unfenced
+    # keep last non-empty line (common for model outputs)
+    lines = [ln.strip() for ln in s.splitlines() if ln.strip()]
+    if lines:
+        s = lines[-1]
+    # strip quotes
+    s = s.strip().strip('"').strip("'").strip()
+    # collapse spaces
+    s = re.sub(r"\s+", " ", s).strip()
+    return s
+def looks_like_math(q: str) -> bool:
+    """Cheap heuristic: true when the text holds both a digit and an operator-like character."""
+    has_digit = re.search(r"\d", q) is not None
+    has_operator = re.search(r"[+\-*/^=()]", q) is not None
+    return has_digit and has_operator
+def try_solve_math(q: str):
+    """
+    Try to extract an arithmetic expression from the question and evaluate it.
+
+    Every run of digits/operators/parentheses in ``q`` is considered in order;
+    the first one that contains "<digit> <operator> <operand>" and evaluates
+    cleanly wins. Returns the result as a string (whole numbers without a
+    decimal part), or None when nothing evaluates.
+    """
+    # Scan all candidates: a plain re.search stops at the first match, which in
+    # a sentence like "What is 2+2?" is just the space after the first word.
+    for m in re.finditer(r"[-+*/^().\d\s]+", q):
+        expr = m.group(0).strip()
+        if len(expr) < 3:
+            continue
+        # Skip bare numbers / punctuation runs; require an actual binary operation.
+        if not re.search(r"\d\s*[-+*/^]\s*[-+(.\d]", expr):
+            continue
+        expr = expr.replace("^", "**")
+        try:
+            # NOTE(review): sympify() evaluates strings with eval(). The character
+            # class above limits input to digits, operators and parentheses, but a
+            # power tower such as 9**9**9**9 could still run for a very long time.
+            val = sympify(expr).evalf()
+            # if near int, output int
+            if abs(val - int(val)) < 1e-10:
+                return str(int(val))
+            return str(val)
+        except Exception:
+            # Unbalanced parentheses, division by zero, etc.: try the next candidate.
+            continue
+    return None
+def try_unit_convert(q: str):
+    """
+    Handle simple "convert <number> <unit> to <unit>" questions,
+    e.g. "Convert 5 miles to km".
+
+    Returns only the converted magnitude as a string (whole numbers without a
+    decimal part), or None when the question does not match or the conversion
+    raises.
+    """
+    pattern = r"(?i)\bconvert\s+([-+]?\d+(?:\.\d+)?)\s*([a-zA-Z°]+)\s+to\s+([a-zA-Z°]+)\b"
+    found = re.search(pattern, q)
+    if found is None:
+        return None
+    amount_text, source_unit, target_unit = found.groups()
+    amount = float(amount_text)
+    try:
+        # Unit text is dropped on purpose: only the number is returned.
+        magnitude = Q(amount, source_unit).to(target_unit).magnitude
+        is_whole = abs(magnitude - int(magnitude)) < 1e-10
+        return str(int(magnitude)) if is_whole else str(magnitude)
+    except Exception:
+        return None
+def ddg_search_snippet(query: str, max_results=5):
+    """
+    DuckDuckGo HTML scraping (no paid key).
+
+    Returns a list of (title, url, snippet) tuples for at most ``max_results``
+    result entries; the snippet is "" when an entry has none. Entries without
+    a title link are skipped. Errors from http_get propagate to the caller.
+    """
+    # Local import: only needed for the parser fallback below.
+    from bs4 import FeatureNotFound
+
+    url = "https://duckduckgo.com/html/"
+    r = http_get(url, params={"q": query}, timeout=20)
+    try:
+        soup = BeautifulSoup(r.text, "lxml")
+    except FeatureNotFound:
+        # lxml is an optional extra for BeautifulSoup; fall back to the parser
+        # bundled with Python rather than failing every search.
+        soup = BeautifulSoup(r.text, "html.parser")
+    results = []
+    for res in soup.select(".result")[:max_results]:
+        a = res.select_one(".result__a")
+        if not a:
+            continue
+        sn = res.select_one(".result__snippet")
+        title = a.get_text(" ", strip=True)
+        link = a.get("href")
+        snippet = sn.get_text(" ", strip=True) if sn else ""
+        results.append((title, link, snippet))
+    return results
+def hf_model_info(model_id: str):
+    """Fetch the Hub API record for ``model_id`` and return it as parsed JSON."""
+    endpoint = f"{HF_API_BASE}/models/{model_id}"
+    return http_get(endpoint, timeout=20).json()
+def hf_search_models(query: str, limit=5):
+    """Query the Hub API model listing for ``query`` with the given ``limit``; returns parsed JSON."""
+    search_params = {"search": query, "limit": limit}
+    response = http_get(f"{HF_API_BASE}/models", params=search_params, timeout=20)
+    return response.json()
+def open_meteo_weather(city: str):
+    """
+    Return the current "temperature_2m" reading for ``city`` as a string.
+
+    The city is first geocoded with the Open-Meteo geocoding API (top hit
+    only), then the forecast API is queried for current conditions. Returns
+    None when the city is not found or the reading is absent; whole-number
+    temperatures are returned without a decimal part.
+    """
+    geocode = http_get(
+        "https://geocoding-api.open-meteo.com/v1/search",
+        params={"name": city, "count": 1, "language": "en", "format": "json"},
+        timeout=20,
+    ).json()
+    matches = geocode.get("results")
+    if not matches:
+        return None
+    best = matches[0]
+    forecast_params = {
+        "latitude": best["latitude"],
+        "longitude": best["longitude"],
+        "current": "temperature_2m,weather_code,wind_speed_10m",
+    }
+    forecast = http_get(OPEN_METEO, params=forecast_params, timeout=20).json()
+    current = forecast.get("current", {})
+    # Only the temperature is reported, even though more fields are requested.
+    if "temperature_2m" not in current:
+        return None
+    temperature = current["temperature_2m"]
+    if abs(temperature - int(temperature)) < 1e-10:
+        return str(int(temperature))
+    return str(temperature)
+def wikidata_simple_lookup(entity: str, prop: str):
+    """
+    Use Wikidata to fetch a single property for a named entity.
+
+    prop: one of 'capital', 'population', 'area', 'birth', 'death', 'country',
+    'founder', 'headquarters'; it is mapped to a Wikidata property ID.
+    The entity is matched by its exact English label.
+
+    Returns the cleaned value string, or None when the property name is
+    unknown, nothing matches, or the query fails.
+    """
+    prop_map = {
+        "capital": "P36",
+        "population": "P1082",
+        "area": "P2046",
+        "birth": "P569",
+        "death": "P570",
+        "country": "P17",
+        "founder": "P112",
+        "headquarters": "P159",
+    }
+    pid = prop_map.get(prop)
+    if not pid:
+        return None
+    # Escape characters that would terminate or corrupt the SPARQL string literal.
+    safe_entity = (
+        str(entity)
+        .replace("\\", "\\\\")
+        .replace('"', '\\"')
+        .replace("\r", " ")
+        .replace("\n", " ")
+    )
+    # The property triple is required (not OPTIONAL): with LIMIT 1, an optional
+    # match could pick an item that shares the label but lacks the property.
+    sparql = f"""
+    SELECT ?valueLabel WHERE {{
+      ?item rdfs:label "{safe_entity}"@en .
+      ?item wdt:{pid} ?value .
+      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
+    }}
+    LIMIT 1
+    """
+    try:
+        data = wikidata_query(sparql)
+        bindings = data.get("results", {}).get("bindings", [])
+        if not bindings:
+            return None
+        v = bindings[0].get("valueLabel", {}).get("value")
+        return clean_answer(v) if v else None
+    except Exception:
+        # Best-effort lookup: network or parsing problems simply mean "no answer".
+        return None
+def download_task_file(task_id: str, save_dir="/tmp"):
+    """
+    Download the file attached to ``task_id`` from the scoring API into ``save_dir``.
+
+    The file name is taken from the Content-Disposition header when present,
+    otherwise "<task_id>.bin" is used. Returns the saved path, or None on any
+    failure (including a missing file on the server).
+    """
+    url = f"{DEFAULT_API_URL}/files/{task_id}"
+    try:
+        r = http_get(url, timeout=30)
+        # try detect filename from headers
+        fname = f"{task_id}.bin"
+        cd = r.headers.get("content-disposition", "")
+        m = re.search(r'filename="?([^";]+)"?', cd)
+        if m:
+            # The header is server-controlled: keep only the final path component
+            # so values like "../../x" or "/etc/x" cannot escape save_dir.
+            candidate = os.path.basename(m.group(1).strip().replace("\\", "/"))
+            if candidate and candidate not in (".", ".."):
+                fname = candidate
+        path = os.path.join(save_dir, fname)
+        with open(path, "wb") as f:
+            f.write(r.content)
+        return path
+    except Exception:
+        # Best-effort download: any failure just means no file is available.
+        return None
+class ToolFirstAgent:
+    """
+    Tool-first agent for GAIA Level-1 exact-match scoring.
+    Designed to work WITHOUT paid models.
+    Optional fallback to a free small model if HF_TOKEN is set.
+    """
+    def __init__(self):
+        """
+        Read HF_TOKEN / HUGGINGFACEHUB_API_TOKEN and MODEL_ID from the environment
+        and, when a token is present and huggingface_hub imported successfully,
+        create the InferenceClient used as the last-resort fallback.
+        Otherwise self.llm stays None and only the tools are used.
+        """
+        self.hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
+        self.model_id = os.getenv("MODEL_ID", "Qwen/Qwen2.5-7B-Instruct")
+        self.llm = None
+        if self.hf_token and InferenceClient is not None:
+            # IMPORTANT: do NOT pass both model and base_url in constructor.
+            # We'll use router and pass model at call-time (supported by huggingface_hub client).
+            try:
+                self.llm = InferenceClient(token=self.hf_token, base_url="https://router.huggingface.co", timeout=120)
+                print("✅ LLM fallback enabled via HF router.")
+            except Exception as e:
+                print("⚠️ LLM fallback init failed, continue tool-only:", e)
+                self.llm = None
+        else:
+            print("ℹ️ Running in tool-only mode (no HF_TOKEN or huggingface_hub missing).")
+    def llm_answer(self, question: str) -> str:
+        """
+        Ask the fallback model for a bare final answer.
+
+        Returns "" when no client is configured or the generation call raises;
+        otherwise the model output passed through clean_answer.
+        """
+        if not self.llm:
+            return ""
+        system = (
+            "Return ONLY the final answer for this question.\n"
+            "No explanation. No extra words.\n"
+            "If it is a name/number/date, output it exactly.\n"
+        )
+        prompt = f"{system}\nQuestion: {question}\nAnswer:"
         try:
+            out = self.llm.text_generation(
                 prompt,
+                model=self.model_id,
+                max_new_tokens=96,
                 temperature=0.0,
                 do_sample=False,
                 return_full_text=False,
             )
+            return clean_answer(out)
+        except Exception as e:
+            print("LLM text_generation failed:", e)
+            return ""
+    def answer(self, question: str, task_id: str = None) -> str:
+        """
+        Answer one question by trying each tool in a fixed order and returning
+        the first non-empty result: math, unit conversion, weather, Hugging Face
+        model downloads, Wikidata lookups, web-search snippets, then the
+        optional LLM. Returns "I don't know" when every step comes up empty.
+
+        task_id, when given, triggers a download attempt for the task's file.
+        """
+        q = question.strip()
+        # 0) if task has a file, try download (some GAIA Qs rely on it)
+        if task_id:
+            fpath = download_task_file(task_id)
+            # For now, just note: without knowing file types, we won't parse deeply.
+            # But downloading sometimes is required; you can extend later.
+            # NOTE(review): the downloaded path is only printed; no later step reads the file.
+            if fpath:
+                print(f"Downloaded file for task {task_id}: {fpath}")
+        # 1) math
+        if looks_like_math(q):
+            m = try_solve_math(q)
+            if m:
+                return clean_answer(m)
+        # 2) unit conversion
+        u = try_unit_convert(q)
+        if u:
+            return clean_answer(u)
+        # 3) weather questions: "weather in <city>"
+        m = re.search(r"(?i)\bweather in ([A-Za-z \-]+)\b", q)
+        if m:
+            city = m.group(1).strip()
+            w = open_meteo_weather(city)
+            if w:
+                return clean_answer(w)
+        # 4) Hugging Face / model popularity questions
+        # e.g. "most downloaded model", "downloads of Qwen/..."
+        if "hugging face" in q.lower() or "download" in q.lower() or "downloads" in q.lower():
+            # look for an "owner/name" style model id in the question
+            mm = re.search(r"([A-Za-z0-9_.-]+\/[A-Za-z0-9_.-]+)", q)
+            if mm:
+                mid = mm.group(1)
+                try:
+                    info = hf_model_info(mid)
+                    # common: downloads field
+                    if "downloads" in info:
+                        return clean_answer(str(info["downloads"]))
+                except Exception:
+                    # lookup failure is not fatal; fall through to the next tool
+                    pass
+        # 5) Wikidata lookups (capitals, birth, etc.)
+        # Capital of X
+        m = re.search(r"(?i)\bcapital of ([A-Za-z \-]+)\b", q)
+        if m:
+            ent = m.group(1).strip()
+            v = wikidata_simple_lookup(ent, "capital")
+            if v:
+                return clean_answer(v)
+        # Birth date of X
+        m = re.search(r"(?i)\bwhen was ([A-Za-z .\-]+) born\b", q)
+        if m:
+            ent = m.group(1).strip()
+            v = wikidata_simple_lookup(ent, "birth")
+            if v:
+                # often wikidata returns ISO datetime; keep only date part
+                v = v.split("T")[0]
+                return clean_answer(v)
+        # Population of X
+        m = re.search(r"(?i)\bpopulation of ([A-Za-z \-]+)\b", q)
+        if m:
+            ent = m.group(1).strip()
+            v = wikidata_simple_lookup(ent, "population")
+            if v:
+                # sometimes returns "1,234,567" vs "1234567"; exact match varies.
+                # keep as-is; but remove commas if question likely expects plain digits
+                if re.search(r"(?i)\bhow many\b|\bpopulation\b", q):
+                    v2 = v.replace(",", "")
+                    return clean_answer(v2)
+                return clean_answer(v)
+        # 6) lightweight web search fallback (snippets)
+        # Works for factoid questions with clear short answers
+        try:
+            results = ddg_search_snippet(q, max_results=3)
+            if results:
+                # Heuristic: if question asks for a year, grab 4-digit year from snippet
+                # NOTE(review): the test below fires when the question itself contains a
+                # 19xx/20xx year, not when it asks for one - confirm this is intended.
+                if re.search(r"\b(19|20)\d{2}\b", q):
+                    for _, __, sn in results:
+                        yy = re.search(r"\b(19|20)\d{2}\b", sn)
+                        if yy:
+                            return clean_answer(yy.group(0))
+                # If asks "Who is ..." try first snippet capitalized name chunk
+                if q.lower().startswith("who is") or "who was" in q.lower():
+                    # naive: take first result title before "-" or "|"
+                    title = results[0][0]
+                    title = re.split(r"[-|–]", title)[0].strip()
+                    if title:
+                        return clean_answer(title)
+        except Exception as e:
+            print("DDG fallback failed:", e)
+        # 7) optional LLM fallback (free small model) — last resort
+        llm = self.llm_answer(q)
+        if llm:
+            # If too long, ask again implicitly by trimming to last line already done.
+            # Also strip trailing punctuation
+            llm = re.sub(r"[.。!！]+$", "", llm).strip()
+            return clean_answer(llm)
+        # 8) final fallback
+        return "I don't know"
 def run_and_submit_all(profile: gr.OAuthProfile | None):
+    """
+    Fetch all questions, answer each with ToolFirstAgent, submit the answers,
+    and report the outcome.
+
+    Returns a (status message, results) pair. The results element is a
+    DataFrame of task id / question / submitted answer once answers exist,
+    and None when the run stops earlier (not logged in, agent init failure,
+    question fetch failure or empty question list).
+    """
+    # SPACE_ID is only used to build the agent_code link included in the submission.
     space_id = os.getenv("SPACE_ID")
+    if profile:
+        username = f"{profile.username}"
+        print(f"User logged in: {username}")
+    else:
+        return "Please Login to Hugging Face with the button.", None
+    api_url = DEFAULT_API_URL
+    questions_url = f"{api_url}/questions"
+    submit_url = f"{api_url}/submit"
     try:
+        agent = ToolFirstAgent()
     except Exception as e:
+        return f"Error initializing agent: {e}", None
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
+    # Fetch Questions
+    try:
+        response = requests.get(questions_url, timeout=20)
+        response.raise_for_status()
+        questions_data = response.json()
+        if not questions_data:
+            return "Fetched questions list is empty.", None
+    except Exception as e:
+        return f"Error fetching questions: {e}", None
+    results_log = []
     answers_payload = []
+    for item in questions_data:
+        task_id = item.get("task_id")
+        question_text = item.get("question")
+        # skip malformed entries rather than aborting the whole run
+        if not task_id or question_text is None:
+            continue
         try:
+            submitted_answer = agent.answer(question_text, task_id=task_id)
+            submitted_answer = clean_answer(submitted_answer)
+        except Exception as e:
+            # NOTE(review): the error text is submitted as the answer for this task.
+            submitted_answer = f"AGENT ERROR: {e}"
+        answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
+        results_log.append(
+            {"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer}
+        )
+    submission_data = {
+        "username": username.strip(),
         "agent_code": agent_code,
+        "answers": answers_payload,
     }
+    # Submit everything in one request; the answers table is returned even if this fails.
+    try:
+        response = requests.post(submit_url, json=submission_data, timeout=90)
+        response.raise_for_status()
+        result_data = response.json()
+        final_status = (
+            f"Submission Successful!\n"
+            f"User: {result_data.get('username')}\n"
+            f"Overall Score: {result_data.get('score', 'N/A')}% "
+            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
+            f"Message: {result_data.get('message', 'No message received.')}"
+        )
+        return final_status, pd.DataFrame(results_log)
+    except Exception as e:
+        return f"Submission Failed: {e}", pd.DataFrame(results_log)
+# Gradio UI: a login button, one run button, and two outputs (status text and answers table).
 with gr.Blocks() as demo:
+    gr.Markdown("# Basic Agent Evaluation Runner (Tool-first, no paid model)")
+    gr.Markdown(
+        """
+        **Instructions**
+        1. Login with the button.
+        2. Click Run to fetch questions, answer them, submit, and get score.
+        **Notes**
+        - Works without paid models.
+        - Optional HF_TOKEN enables small-model fallback (free tier permitting).
+        """
+    )
     gr.LoginButton()
+    run_button = gr.Button("Run Evaluation & Submit All Answers")
+    status_output = gr.Textbox(label="Run Status / Submission Result", lines=6, interactive=False)
+    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
+    # No explicit inputs are wired; the two return values of run_and_submit_all fill the outputs in order.
+    run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
+# Launch the app only when this file is executed as a script.
 if __name__ == "__main__":
+    demo.launch(debug=True, share=False)