Update README.md
Browse files
README.md
CHANGED
|
@@ -37,6 +37,17 @@ WebJudge preserves critical intermediate screenshots while mitigating the token
|
|
| 37 |
|
| 38 |
### Comparison against Existing Evaluation Methods
|
| 39 |
<table>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
<tr>
|
| 41 |
<th rowspan="4">GPT-4o</th>
|
| 42 |
<td>Autonomous Eval</td>
|
|
@@ -112,7 +123,7 @@ WebJudge preserves critical intermediate screenshots while mitigating the token
|
|
| 112 |
</tr>
|
| 113 |
|
| 114 |
<tr>
|
| 115 |
-
<th
|
| 116 |
<td>WebJudge-7B</td>
|
| 117 |
<td>86.0</td>
|
| 118 |
<td>87.3</td>
|
|
|
|
| 37 |
|
| 38 |
### Comparison against Existing Evaluation Methods
|
| 39 |
<table>
|
| 40 |
+
<tr>
|
| 41 |
+
<th>Model</th>
|
| 42 |
+
<th>Auto-Eval</th>
|
| 43 |
+
<td>SeeAct</td>
|
| 44 |
+
<td>Agent-E</td>
|
| 45 |
+
<td>Browser Use</td>
|
| 46 |
+
<td>Claude 3.5</td>
|
| 47 |
+
<td>Claude 3.7</td>
|
| 48 |
+
<td>Operator</td>
|
| 49 |
+
<th>Avg AR</th>
|
| 50 |
+
</tr>
|
| 51 |
<tr>
|
| 52 |
<th rowspan="4">GPT-4o</th>
|
| 53 |
<td>Autonomous Eval</td>
|
|
|
|
| 123 |
</tr>
|
| 124 |
|
| 125 |
<tr>
|
| 126 |
+
<th></th>
|
| 127 |
<td>WebJudge-7B</td>
|
| 128 |
<td>86.0</td>
|
| 129 |
<td>87.3</td>
|