Paperbag commited on
Commit
963c7d3
·
1 Parent(s): 9e3bdbf

Add scripts for environment setup and Node.js installation; include AGENTS.md for Windows shell configuration

Browse files
Files changed (6) hide show
  1. AGENTS.md +2 -0
  2. check_env.ps1 +15 -0
  3. check_env2.ps1 +15 -0
  4. gaia_matcher.py +160 -0
  5. install_node.sh +8 -0
  6. setupext.sh +7 -0
AGENTS.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [windows]
2
+ shell = "cmd"
check_env.ps1 ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $badVars = @()
2
+ Get-ItemProperty -Path 'HKLM:\SYSTEM\CurrentControlSet\Control\Session Manager\Environment' |
3
+ Get-Member -MemberType NoteProperty |
4
+ ForEach-Object {
5
+ $name = $_.Name
6
+ $val = (Get-ItemProperty -Path 'HKLM:\SYSTEM\CurrentControlSet\Control\Session Manager\Environment').$name
7
+ if ($val -match '[^\x09\x20-\x7E]') {
8
+ $badVars += "$name`: $val"
9
+ }
10
+ }
11
+ if ($badVars.Count -eq 0) {
12
+ Write-Output "No corrupted env vars found in HKLM"
13
+ } else {
14
+ $badVars | ForEach-Object { Write-Output $_ }
15
+ }
check_env2.ps1 ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $badVars = @()
2
+ Get-ItemProperty -Path 'HKCU:\Environment' |
3
+ Get-Member -MemberType NoteProperty |
4
+ ForEach-Object {
5
+ $name = $_.Name
6
+ $val = (Get-ItemProperty -Path 'HKCU:\Environment').$name
7
+ if ($val -match '[^\x09\x20-\x7E]') {
8
+ $badVars += "$name`: $val"
9
+ }
10
+ }
11
+ if ($badVars.Count -eq 0) {
12
+ Write-Output "No corrupted env vars found in HKCU"
13
+ } else {
14
+ $badVars | ForEach-Object { Write-Output $_ }
15
+ }
gaia_matcher.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import shutil
4
+ import requests
5
+ import pandas as pd
6
+ from huggingface_hub import hf_hub_download
7
+
8
+ QUESTIONS_URL = "https://agents-course-unit4-scoring.hf.space/questions"
9
+ GAIA_REPO_ID = "gaia-benchmark/GAIA"
10
+ GAIA_VAL_FILENAME = "2023/validation/metadata.parquet"
11
+ CACHE_DIR = os.path.join(os.path.dirname(__file__), "data")
12
+ CACHE_PATH = os.path.join(CACHE_DIR, "gaia_metadata.parquet")
13
+ OUT_CSV = os.path.join(os.path.dirname(__file__), "gaia_ground_truth.csv")
14
+ OUT_JSON = os.path.join(os.path.dirname(__file__), "gaia_ground_truth.json")
15
+
16
+
17
+ def get_hf_token():
18
+ # Precedence of env vars and helpful message
19
+ for k in ("HUGGINGFACEHUB_API_TOKEN", "HF_TOKEN", "HUGGINGFACE_TOKEN"):
20
+ v = os.getenv(k)
21
+ if v:
22
+ return v
23
+ return None
24
+
25
+
26
+ def download_parquet(token: str, dest_path: str) -> str:
27
+ # Download via hf_hub_download to a temporary location then move to dest
28
+ tmp = hf_hub_download(
29
+ repo_id=GAIA_REPO_ID,
30
+ filename=GAIA_VAL_FILENAME,
31
+ repo_type="dataset",
32
+ token=token,
33
+ )
34
+ os.makedirs(os.path.dirname(dest_path), exist_ok=True)
35
+ shutil.copy(tmp, dest_path)
36
+ return dest_path
37
+
38
+
39
+ def detect_columns(df: pd.DataFrame):
40
+ cols = [c for c in df.columns]
41
+ # task id candidates
42
+ task_candidates = ["task_id", "id", "Task ID", "TaskID", "taskid"]
43
+ answer_candidates = ["Final answer", "final_answer", "answer", "Final Answer", "final answer", "Final_answer"]
44
+
45
+ task_col = next((c for c in cols if c in task_candidates), None)
46
+ answer_col = next((c for c in cols if c in answer_candidates), None)
47
+
48
+ # Fallback: case-insensitive match
49
+ if not task_col:
50
+ lowered = {c.lower(): c for c in cols}
51
+ for cand in task_candidates:
52
+ if cand.lower() in lowered:
53
+ task_col = lowered[cand.lower()]
54
+ break
55
+ if not answer_col:
56
+ lowered = {c.lower(): c for c in cols}
57
+ for cand in answer_candidates:
58
+ if cand.lower() in lowered:
59
+ answer_col = lowered[cand.lower()]
60
+ break
61
+
62
+ return task_col, answer_col
63
+
64
+
65
+ def safe_read_parquet(path: str) -> pd.DataFrame:
66
+ # Try pandas default, then pyarrow engine if needed
67
+ try:
68
+ return pd.read_parquet(path)
69
+ except Exception:
70
+ try:
71
+ return pd.read_parquet(path, engine="pyarrow")
72
+ except Exception as e:
73
+ raise
74
+
75
+
76
+ def main():
77
+ print(f"Fetching questions from {QUESTIONS_URL}...")
78
+ try:
79
+ resp = requests.get(QUESTIONS_URL, timeout=10)
80
+ resp.raise_for_status()
81
+ current_questions = resp.json()
82
+ except Exception as e:
83
+ print(f"Error fetching questions: {e}")
84
+ current_questions = []
85
+
86
+ token = get_hf_token()
87
+ if not token:
88
+ print("Warning: No HF token found in env (HUGGINGFACEHUB_API_TOKEN/HF_TOKEN). Trying public access; gated datasets may fail.")
89
+
90
+ # Ensure parquet is cached locally
91
+ if os.path.exists(CACHE_PATH):
92
+ print(f"Using cached GAIA parquet at {CACHE_PATH}")
93
+ parquet_path = CACHE_PATH
94
+ else:
95
+ try:
96
+ print("Downloading GAIA validation metadata (this may require a HF token)...")
97
+ parquet_path = download_parquet(token, CACHE_PATH)
98
+ print(f"Downloaded and cached to {parquet_path}")
99
+ except Exception as e:
100
+ print(f"Error downloading parquet: {e}")
101
+ print("Abort: could not obtain GAIA metadata. Consider setting HF_TOKEN or using an offline parquet in data/")
102
+ return
103
+
104
+ try:
105
+ df = safe_read_parquet(parquet_path)
106
+ except Exception as e:
107
+ print(f"Error reading parquet: {e}")
108
+ return
109
+
110
+ task_col, answer_col = detect_columns(df)
111
+ if not task_col or not answer_col:
112
+ print("Could not detect task_id or answer column. Available columns:\n", df.columns.tolist())
113
+ return
114
+
115
+ # Normalize to strings
116
+ df[task_col] = df[task_col].astype(str).str.strip()
117
+ df[answer_col] = df[answer_col].astype(str).str.strip()
118
+ answer_map = dict(zip(df[task_col], df[answer_col]))
119
+
120
+ results = []
121
+ found = 0
122
+ total = len(current_questions)
123
+ for i, q in enumerate(current_questions):
124
+ task_id = q.get("task_id")
125
+ task_id_str = str(task_id).strip() if task_id is not None else ""
126
+ answer = None
127
+ if task_id_str and task_id_str in answer_map:
128
+ answer = answer_map[task_id_str]
129
+ else:
130
+ # Try relaxed matching: maybe numeric vs string formatting
131
+ # Check without leading zeros and as int
132
+ try:
133
+ tid_int = str(int(task_id_str))
134
+ answer = answer_map.get(tid_int)
135
+ except Exception:
136
+ answer = None
137
+
138
+ ok = answer is not None and answer.lower() != "nan"
139
+ if ok:
140
+ found += 1
141
+ results.append({
142
+ "index": i + 1,
143
+ "task_id": task_id_str,
144
+ "question": (q.get("question") or "")[:1000],
145
+ "answer": answer if ok else None,
146
+ "found": bool(ok),
147
+ })
148
+
149
+ # Save outputs
150
+ out_df = pd.DataFrame(results)
151
+ out_df.to_csv(OUT_CSV, index=False)
152
+ with open(OUT_JSON, "w", encoding="utf-8") as f:
153
+ json.dump(results, f, ensure_ascii=False, indent=2)
154
+
155
+ print(f"Matched answers: {found}/{total}")
156
+ print(f"Results saved to: {OUT_CSV} and {OUT_JSON}")
157
+
158
+
159
+ if __name__ == "__main__":
160
+ main()
install_node.sh ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ cd /tmp
3
+ tar -xf node-v20.18.0-linux-x64.tar.xz
4
+ sudo cp -r node-v20.18.0-linux-x64/* /usr/local/
5
+ /usr/local/bin/node --version
6
+ /usr/local/bin/npm --version
7
+ sudo /usr/local/bin/npm install -g @openai/codex
8
+ codex --version
setupext.sh ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ curl -fsSL https://deb.nodesource.com/setup_20.x | sudo -E bash -
3
+ sudo apt-get install -y nodejs
4
+ npm -v
5
+ node -v
6
+ npm install -g @openai/codex
7
+ codex --version