Spaces:
Running
Running
jedick
committed on
Commit
·
d790ca4
1
Parent(s):
48c27bb
Update for Gradio 6
Browse files
- app.py +63 -86
- requirements.txt +1 -1
app.py
CHANGED
|
@@ -180,16 +180,43 @@ def run_fewshot_classifier(old_revision: str, new_revision: str):
|
|
| 180 |
return run_classifier(old_revision, new_revision, prompt_style="few-shot")
|
| 181 |
|
| 182 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
@logfire.instrument("Step 4: Run judge")
|
| 184 |
def run_judge(
|
| 185 |
old_revision: str,
|
| 186 |
new_revision: str,
|
| 187 |
-
|
| 188 |
-
|
|
|
|
|
|
|
| 189 |
judge_mode: str,
|
| 190 |
):
|
| 191 |
"""
|
| 192 |
-
Run
|
| 193 |
|
| 194 |
Args:
|
| 195 |
old_revision: Old revision text
|
|
@@ -199,18 +226,18 @@ def run_judge(
|
|
| 199 |
judge_mode: Mode for judge function ("unaligned", "aligned-fewshot", "aligned-heuristic")
|
| 200 |
|
| 201 |
Returns:
|
| 202 |
-
Tuple of (noteworthy, reasoning) (bool, str)
|
| 203 |
"""
|
| 204 |
|
| 205 |
# Values to return if there is an error
|
| 206 |
-
noteworthy, reasoning = None, None
|
| 207 |
if (
|
| 208 |
not old_revision
|
| 209 |
or not new_revision
|
| 210 |
or not heuristic_rationale
|
| 211 |
or not fewshot_rationale
|
| 212 |
):
|
| 213 |
-
return noteworthy, reasoning
|
| 214 |
|
| 215 |
try:
|
| 216 |
# Run judge
|
|
@@ -232,52 +259,27 @@ def run_judge(
|
|
| 232 |
error_msg = f"Error running judge: {str(e)}"
|
| 233 |
raise gr.Error(error_msg, print_exception=False)
|
| 234 |
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
def format_noteworthy(noteworthy, reasoning):
|
| 239 |
-
"""
|
| 240 |
-
Format judge's noteworthy label as text.
|
| 241 |
-
"""
|
| 242 |
if not reasoning:
|
| 243 |
-
|
| 244 |
-
return None
|
| 245 |
-
else:
|
| 246 |
-
# Format noteworthy boolean as text
|
| 247 |
-
return str(noteworthy)
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
def compute_confidence(
|
| 251 |
-
heuristic_noteworthy,
|
| 252 |
-
fewshot_noteworthy,
|
| 253 |
-
judge_noteworthy,
|
| 254 |
-
heuristic_rationale,
|
| 255 |
-
fewshot_rationale,
|
| 256 |
-
judge_reasoning,
|
| 257 |
-
):
|
| 258 |
-
"""
|
| 259 |
-
Compute a confidence label using the noteworthy booleans.
|
| 260 |
-
"""
|
| 261 |
-
# Return None if any of the rationales or reasoning is missing.
|
| 262 |
-
if not heuristic_rationale or not fewshot_rationale or not judge_reasoning:
|
| 263 |
-
return None
|
| 264 |
-
if heuristic_noteworthy == fewshot_noteworthy == judge_noteworthy:
|
| 265 |
-
# Classifiers and judge all agree
|
| 266 |
-
return "High"
|
| 267 |
-
elif heuristic_noteworthy != fewshot_noteworthy:
|
| 268 |
-
# Classifiers disagree, judge decides
|
| 269 |
-
return "Moderate"
|
| 270 |
else:
|
| 271 |
-
|
| 272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
|
|
|
|
| 274 |
|
| 275 |
-
# Setup theme without background image
|
| 276 |
-
theme = gr.Theme.from_hub("NoCrypt/miku")
|
| 277 |
-
theme.set(body_background_fill="#FFFFFF", body_background_fill_dark="#000000")
|
| 278 |
|
| 279 |
# Create Gradio interface
|
| 280 |
-
with gr.Blocks(theme=theme, title="Noteworthy Differences") as demo:
|
| 281 |
with gr.Row():
|
| 282 |
with gr.Column(scale=2):
|
| 283 |
gr.Markdown(
|
|
@@ -360,12 +362,12 @@ with gr.Blocks(theme=theme, title="Noteworthy Differences") as demo:
|
|
| 360 |
lines=1,
|
| 361 |
interactive=False,
|
| 362 |
)
|
| 363 |
-
rerun_btn = gr.Button("Rerun Model"
|
| 364 |
|
| 365 |
-
#
|
| 366 |
-
heuristic_noteworthy = gr.
|
| 367 |
-
fewshot_noteworthy = gr.
|
| 368 |
-
judge_noteworthy = gr.
|
| 369 |
|
| 370 |
random_btn.click(
|
| 371 |
fn=get_random_wikipedia_title,
|
|
@@ -407,28 +409,13 @@ with gr.Blocks(theme=theme, title="Noteworthy Differences") as demo:
|
|
| 407 |
inputs=[
|
| 408 |
old_revision,
|
| 409 |
new_revision,
|
| 410 |
-
heuristic_rationale,
|
| 411 |
-
fewshot_rationale,
|
| 412 |
-
judge_mode_dropdown,
|
| 413 |
-
],
|
| 414 |
-
outputs=[judge_noteworthy, judge_reasoning],
|
| 415 |
-
api_name=False,
|
| 416 |
-
).then(
|
| 417 |
-
fn=format_noteworthy,
|
| 418 |
-
inputs=[judge_noteworthy, judge_reasoning],
|
| 419 |
-
outputs=[noteworthy_text],
|
| 420 |
-
api_name=False,
|
| 421 |
-
).then(
|
| 422 |
-
fn=compute_confidence,
|
| 423 |
-
inputs=[
|
| 424 |
heuristic_noteworthy,
|
| 425 |
fewshot_noteworthy,
|
| 426 |
-
judge_noteworthy,
|
| 427 |
heuristic_rationale,
|
| 428 |
fewshot_rationale,
|
| 429 |
-
|
| 430 |
],
|
| 431 |
-
outputs=[confidence],
|
| 432 |
api_name=False,
|
| 433 |
)
|
| 434 |
|
|
@@ -449,30 +436,20 @@ with gr.Blocks(theme=theme, title="Noteworthy Differences") as demo:
|
|
| 449 |
inputs=[
|
| 450 |
old_revision,
|
| 451 |
new_revision,
|
| 452 |
-
heuristic_rationale,
|
| 453 |
-
fewshot_rationale,
|
| 454 |
-
judge_mode_dropdown,
|
| 455 |
-
],
|
| 456 |
-
outputs=[judge_noteworthy, judge_reasoning],
|
| 457 |
-
api_name=False,
|
| 458 |
-
).then(
|
| 459 |
-
fn=format_noteworthy,
|
| 460 |
-
inputs=[judge_noteworthy, judge_reasoning],
|
| 461 |
-
outputs=[noteworthy_text],
|
| 462 |
-
api_name=False,
|
| 463 |
-
).then(
|
| 464 |
-
fn=compute_confidence,
|
| 465 |
-
inputs=[
|
| 466 |
heuristic_noteworthy,
|
| 467 |
fewshot_noteworthy,
|
| 468 |
-
judge_noteworthy,
|
| 469 |
heuristic_rationale,
|
| 470 |
fewshot_rationale,
|
| 471 |
-
|
| 472 |
],
|
| 473 |
-
outputs=[confidence],
|
| 474 |
api_name=False,
|
| 475 |
)
|
| 476 |
|
| 477 |
if __name__ == "__main__":
|
| 478 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
return run_classifier(old_revision, new_revision, prompt_style="few-shot")
|
| 181 |
|
| 182 |
|
| 183 |
+
def compute_confidence(
    heuristic_noteworthy,
    fewshot_noteworthy,
    judge_noteworthy,
    heuristic_rationale,
    fewshot_rationale,
    judge_reasoning,
):
    """
    Compute a confidence label using the noteworthy booleans.

    Args:
        heuristic_noteworthy: Noteworthy label from the heuristic classifier
        fewshot_noteworthy: Noteworthy label from the few-shot classifier
        judge_noteworthy: Noteworthy label from the judge
        heuristic_rationale: Rationale text from the heuristic classifier
        fewshot_rationale: Rationale text from the few-shot classifier
        judge_reasoning: Reasoning text from the judge

    Returns:
        "High" if both classifiers and the judge agree, "Moderate" if the
        classifiers disagree with each other, "Questionable" if the
        classifiers agree but the judge does not, or None if any rationale,
        reasoning, or label is missing.
    """
    # Return None if any of the rationales or reasoning is missing.
    if not heuristic_rationale or not fewshot_rationale or not judge_reasoning:
        return None

    # A missing label is not a vote: without this guard three unset labels
    # compare equal and would be reported as "High" confidence.
    labels = (heuristic_noteworthy, fewshot_noteworthy, judge_noteworthy)
    if any(label is None for label in labels):
        return None

    if heuristic_noteworthy == fewshot_noteworthy == judge_noteworthy:
        # Classifiers and judge all agree
        return "High"
    if heuristic_noteworthy != fewshot_noteworthy:
        # Classifiers disagree, judge decides
        return "Moderate"
    # Classifiers agree, judge vetoes
    return "Questionable"
|
| 206 |
+
|
| 207 |
+
|
| 208 |
@logfire.instrument("Step 4: Run judge")
|
| 209 |
def run_judge(
|
| 210 |
old_revision: str,
|
| 211 |
new_revision: str,
|
| 212 |
+
heuristic_noteworthy: bool,
|
| 213 |
+
fewshot_noteworthy: bool,
|
| 214 |
+
heuristic_rationale: str,
|
| 215 |
+
fewshot_rationale: str,
|
| 216 |
judge_mode: str,
|
| 217 |
):
|
| 218 |
"""
|
| 219 |
+
Run judge on the revisions and classifiers' rationales.
|
| 220 |
|
| 221 |
Args:
|
| 222 |
old_revision: Old revision text
|
|
|
|
| 226 |
judge_mode: Mode for judge function ("unaligned", "aligned-fewshot", "aligned-heuristic")
|
| 227 |
|
| 228 |
Returns:
|
| 229 |
+
Tuple of (noteworthy, noteworthy_text, reasoning, confidence) (bool, str, str, str)
|
| 230 |
"""
|
| 231 |
|
| 232 |
# Values to return if there is an error
|
| 233 |
+
noteworthy, noteworthy_text, reasoning, confidence = None, None, None, None
|
| 234 |
if (
|
| 235 |
not old_revision
|
| 236 |
or not new_revision
|
| 237 |
or not heuristic_rationale
|
| 238 |
or not fewshot_rationale
|
| 239 |
):
|
| 240 |
+
return noteworthy, noteworthy_text, reasoning, confidence
|
| 241 |
|
| 242 |
try:
|
| 243 |
# Run judge
|
|
|
|
| 259 |
error_msg = f"Error running judge: {str(e)}"
|
| 260 |
raise gr.Error(error_msg, print_exception=False)
|
| 261 |
|
| 262 |
+
# Format noteworthy label (boolean) as text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
if not reasoning:
|
| 264 |
+
noteworthy_text = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
else:
|
| 266 |
+
noteworthy_text = str(noteworthy)
|
| 267 |
+
|
| 268 |
+
# Get confidence score
|
| 269 |
+
confidence = compute_confidence(
|
| 270 |
+
heuristic_noteworthy,
|
| 271 |
+
fewshot_noteworthy,
|
| 272 |
+
noteworthy,
|
| 273 |
+
heuristic_rationale,
|
| 274 |
+
fewshot_rationale,
|
| 275 |
+
reasoning,
|
| 276 |
+
)
|
| 277 |
|
| 278 |
+
return noteworthy, noteworthy_text, reasoning, confidence
|
| 279 |
|
|
|
|
|
|
|
|
|
|
| 280 |
|
| 281 |
# Create Gradio interface
|
| 282 |
+
with gr.Blocks(title="Noteworthy Differences") as demo:
|
| 283 |
with gr.Row():
|
| 284 |
with gr.Column(scale=2):
|
| 285 |
gr.Markdown(
|
|
|
|
| 362 |
lines=1,
|
| 363 |
interactive=False,
|
| 364 |
)
|
| 365 |
+
rerun_btn = gr.Button("Rerun Model")
|
| 366 |
|
| 367 |
+
# States to store boolean values
|
| 368 |
+
heuristic_noteworthy = gr.State()
|
| 369 |
+
fewshot_noteworthy = gr.State()
|
| 370 |
+
judge_noteworthy = gr.State()
|
| 371 |
|
| 372 |
random_btn.click(
|
| 373 |
fn=get_random_wikipedia_title,
|
|
|
|
| 409 |
inputs=[
|
| 410 |
old_revision,
|
| 411 |
new_revision,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
heuristic_noteworthy,
|
| 413 |
fewshot_noteworthy,
|
|
|
|
| 414 |
heuristic_rationale,
|
| 415 |
fewshot_rationale,
|
| 416 |
+
judge_mode_dropdown,
|
| 417 |
],
|
| 418 |
+
outputs=[judge_noteworthy, noteworthy_text, judge_reasoning, confidence],
|
| 419 |
api_name=False,
|
| 420 |
)
|
| 421 |
|
|
|
|
| 436 |
inputs=[
|
| 437 |
old_revision,
|
| 438 |
new_revision,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
heuristic_noteworthy,
|
| 440 |
fewshot_noteworthy,
|
|
|
|
| 441 |
heuristic_rationale,
|
| 442 |
fewshot_rationale,
|
| 443 |
+
judge_mode_dropdown,
|
| 444 |
],
|
| 445 |
+
outputs=[judge_noteworthy, noteworthy_text, judge_reasoning, confidence],
|
| 446 |
api_name=False,
|
| 447 |
)
|
| 448 |
|
| 449 |
if __name__ == "__main__":
|
| 450 |
+
|
| 451 |
+
# Setup theme without background image
|
| 452 |
+
theme = gr.Theme.from_hub("NoCrypt/miku")
|
| 453 |
+
theme.set(body_background_fill="#FFFFFF", body_background_fill_dark="#000000")
|
| 454 |
+
|
| 455 |
+
demo.launch(theme=theme)
|
requirements.txt
CHANGED
|
@@ -2,7 +2,7 @@ google-genai
|
|
| 2 |
pydantic
|
| 3 |
pandas
|
| 4 |
dotenv
|
| 5 |
-
gradio
|
| 6 |
requests
|
| 7 |
logfire
|
| 8 |
opentelemetry-instrumentation-google-genai
|
|
|
|
| 2 |
pydantic
|
| 3 |
pandas
|
| 4 |
dotenv
|
| 5 |
+
gradio>=6.0.1
|
| 6 |
requests
|
| 7 |
logfire
|
| 8 |
opentelemetry-instrumentation-google-genai
|