File size: 19,190 Bytes
48c27bb
 
 
 
 
 
 
 
 
30b1c80
8d79272
30b1c80
 
 
48c27bb
 
 
 
 
 
30b1c80
 
 
 
8d79272
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48c27bb
 
f395a83
8d79272
48c27bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c02ce2
48c27bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d79272
 
 
 
 
 
 
f395a83
8d79272
48c27bb
 
 
 
 
3c02ce2
8d79272
48c27bb
 
 
 
 
 
 
 
 
 
 
8d79272
 
48c27bb
3c02ce2
8d79272
48c27bb
 
 
8d79272
48c27bb
 
 
 
 
 
 
 
 
 
 
 
 
8d79272
3c02ce2
48c27bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d79272
 
 
 
 
f395a83
8d79272
48c27bb
 
 
8d79272
 
 
 
 
f395a83
8d79272
48c27bb
 
 
d790ca4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48c27bb
 
 
d790ca4
 
 
 
48c27bb
8d79272
 
 
 
 
 
 
 
 
 
 
 
 
 
f395a83
8d79272
 
 
 
 
 
 
 
48c27bb
 
d790ca4
48c27bb
 
 
 
 
 
 
 
 
d790ca4
48c27bb
 
 
d790ca4
48c27bb
 
 
 
 
 
d790ca4
48c27bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d790ca4
48c27bb
d790ca4
48c27bb
d790ca4
 
 
 
 
 
 
 
 
 
 
48c27bb
d790ca4
48c27bb
 
 
d790ca4
48c27bb
3e97aef
48c27bb
3c02ce2
3e97aef
 
 
 
 
48c27bb
 
 
 
 
3c02ce2
8d79272
3c02ce2
48c27bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3e97aef
 
 
3c02ce2
3e97aef
48c27bb
 
 
 
 
 
 
3e97aef
 
 
 
 
 
 
48c27bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d790ca4
48c27bb
30b1c80
 
 
 
 
 
 
 
 
 
 
d790ca4
 
 
 
8d79272
 
48c27bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d79272
 
 
 
 
48c27bb
 
8d79272
48c27bb
 
 
 
8d79272
48c27bb
 
 
 
8d79272
48c27bb
 
 
 
8d79272
48c27bb
 
 
 
 
 
 
 
 
 
 
d790ca4
8d79272
48c27bb
d790ca4
48c27bb
 
 
 
 
 
 
8d79272
48c27bb
 
 
 
8d79272
48c27bb
 
 
 
 
 
 
 
 
 
 
d790ca4
8d79272
48c27bb
d790ca4
48c27bb
 
 
30b1c80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48c27bb
d790ca4
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
import gradio as gr
from wiki_data_fetcher import (
    get_previous_revisions,
    get_revision_from_age,
    get_wikipedia_introduction,
    extract_revision_info,
    get_revisions_behind,
    get_random_wikipedia_title,
)
from feedback import save_feedback_agree, save_feedback_disagree
from contextlib import nullcontext
from dotenv import load_dotenv
import logfire
import os

# Load API keys (environment variables) via python-dotenv
load_dotenv()
# Setup logging with Logfire; this must run before anything below is instrumented
logfire.configure()

# Import this after logfire.configure() so we don't get
# LogfireNotConfiguredWarning: Instrumentation will have no effect
# (this is why the import is deliberately not at the top of the file)
from models import classifier, judge


def start_parent_span(title: str, number: int, units: str):
    """
    Start a parent span and return the context for propagation to children.
    See https://logfire.pydantic.dev/docs/how-to-guides/distributed-tracing/#manual-context-propagation

    Args:
        title: Wikipedia article title (used in the span name)
        number: Number of revisions or days behind (used in the span name)
        units: "revisions" or "days" (used in the span name)

    Returns:
        The Logfire context captured while the parent span is active
    """
    span_name = f"{title} - {number} {units}"
    # The with statement enters the span for us, so no explicit
    # span.__enter__() call is needed before capturing the context.
    with logfire.span(span_name):
        context = logfire.get_context()
    return context


def fetch_current_revision(title: str, context=None):
    """
    Call _fetch_current_revision, attaching the given Logfire context if there is one.
    Handling the context here keeps the wrapped function free of an extra indentation level.
    """
    # Fall back to a do-nothing context manager when no context was provided
    scope = logfire.attach_context(context) if context else nullcontext()
    with scope:
        result = _fetch_current_revision(title)
    return result


@logfire.instrument("Fetch current revision")
def _fetch_current_revision(title: str):
    """
    Fetch current revision of a Wikipedia article and return its introduction.

    Args:
        title: Wikipedia article title

    Returns:
        Tuple of (introduction, timestamp), where timestamp is formatted
        as Markdown for display (or "" if no timestamp is available)

    Raises:
        gr.Error: If the title is empty, the page cannot be found,
            or any other exception occurs while fetching
    """
    if not title or not title.strip():
        error_msg = "Please enter a Wikipedia page title."
        raise gr.Error(error_msg, print_exception=False)

    try:
        # Get current revision (revision 0)
        json_data = get_previous_revisions(title, revisions=0)
        revision_info = extract_revision_info(json_data, revnum=0)

        if not revision_info.get("revid"):
            error_msg = f"Error: Could not find Wikipedia page '{title}'. Please check the title."
            raise gr.Error(error_msg, print_exception=False)

        revid = revision_info["revid"]
        timestamp = revision_info["timestamp"]

        # Get introduction
        introduction = get_wikipedia_introduction(revid)

        if introduction is None:
            introduction = f"Error: Could not retrieve introduction for current revision (revid: {revid})"

        # Format timestamp for display
        timestamp = f"**Timestamp:** {timestamp}" if timestamp else ""

        # Return introduction text and timestamp
        return introduction, timestamp

    except gr.Error:
        # Already a user-facing error raised above; re-raise it unchanged
        # so its message is not wrapped a second time ("Error occurred: Error: ...")
        raise
    except Exception as e:
        error_msg = f"Error occurred: {str(e)}"
        raise gr.Error(error_msg, print_exception=False) from e


def fetch_previous_revision(
    title: str, number: int, units: str, new_revision: str, context=None
):
    """
    Call _fetch_previous_revision, attaching the given Logfire context if there is one.
    """
    # Fall back to a do-nothing context manager when no context was provided
    scope = logfire.attach_context(context) if context else nullcontext()
    with scope:
        result = _fetch_previous_revision(title, number, units, new_revision)
    return result


@logfire.instrument("Fetch previous revision")
def _fetch_previous_revision(title: str, number: int, units: str, new_revision: str):
    """
    Fetch previous revision of a Wikipedia article and return its introduction.

    Args:
        title: Wikipedia article title
        number: Number of revisions or days behind
        units: "revisions" or "days"
        new_revision: Text of the current revision; if empty, nothing is fetched

    Returns:
        Tuple of (introduction, timestamp), where timestamp is formatted
        as Markdown for display and includes the number of revisions behind.
        Returns (None, None) if new_revision is empty.

    Raises:
        gr.Error: If the requested revision cannot be found,
            or any other exception occurs while fetching
    """

    # If we get here with an empty new revision, then an error should have been raised
    # in fetch_current_revision, so just return empty values without raising another error
    if not new_revision:
        return None, None

    try:
        # Get previous revision based on units
        if units == "revisions":
            json_data = get_previous_revisions(title, revisions=number)
            revision_info = extract_revision_info(json_data, revnum=number)
        else:  # units == "days"
            revision_info = get_revision_from_age(title, age_days=number)

        if not revision_info.get("revid"):
            error_msg = f"Error: Could not find revision {number} {'revisions' if units == 'revisions' else 'days'} behind for '{title}'."
            raise gr.Error(error_msg, print_exception=False)

        revid = revision_info["revid"]
        timestamp = revision_info["timestamp"]

        # Get introduction
        introduction = get_wikipedia_introduction(revid)

        if introduction is None:
            introduction = f"Error: Could not retrieve introduction for previous revision (revid: {revid})"

        # Get revisions_behind
        if units == "revisions":
            revisions_behind = revision_info["revnum"]
        else:
            revisions_behind = get_revisions_behind(title, revid)
            # For a negative number, replace the negative sign with ">"
            if revisions_behind < 0:
                revisions_behind = str(revisions_behind).replace("-", ">")

        # Format timestamp for display
        timestamp = (
            f"**Timestamp:** {timestamp}, {revisions_behind} revisions behind"
            if timestamp
            else ""
        )

        # Return introduction text and timestamp
        return introduction, timestamp

    except gr.Error:
        # Already a user-facing error raised above; re-raise it unchanged
        # so its message is not wrapped a second time ("Error occurred: Error: ...")
        raise
    except Exception as e:
        error_msg = f"Error occurred: {str(e)}"
        raise gr.Error(error_msg, print_exception=False) from e


def run_classifier(old_revision: str, new_revision: str, prompt_style: str):
    """
    Run a classification model on the revisions.

    Args:
        old_revision: Old revision text
        new_revision: New revision text
        prompt_style: heuristic or few-shot

    Returns:
        Tuple of (noteworthy, rationale) (bool, str);
        (None, None) if either revision is empty

    Raises:
        gr.Error: If the model returns no result or raises an exception
    """

    # Values to return if either revision is missing
    noteworthy, rationale = None, None
    if not old_revision or not new_revision:
        return noteworthy, rationale

    try:
        # Run classifier model
        result = classifier(old_revision, new_revision, prompt_style=prompt_style)
        if result:
            noteworthy = result.get("noteworthy", None)
            rationale = result.get("rationale", "")
        else:
            error_msg = f"Error: Could not get {prompt_style} model result"
            raise gr.Error(error_msg, print_exception=False)

    except gr.Error:
        # Already a user-facing error raised above; re-raise it unchanged
        # so its message is not wrapped a second time ("Error running model: Error: ...")
        raise
    except Exception as e:
        error_msg = f"Error running model: {str(e)}"
        raise gr.Error(error_msg, print_exception=False) from e

    return noteworthy, rationale


def run_heuristic_classifier(old_revision: str, new_revision: str, context=None):
    """
    Call _run_heuristic_classifier, attaching the given Logfire context if there is one.
    """
    # Fall back to a do-nothing context manager when no context was provided
    scope = logfire.attach_context(context) if context else nullcontext()
    with scope:
        result = _run_heuristic_classifier(old_revision, new_revision)
    return result


@logfire.instrument("Run heuristic classifier")
def _run_heuristic_classifier(old_revision: str, new_revision: str):
    """
    Run run_classifier with the "heuristic" prompt style and return its result.
    """
    outcome = run_classifier(old_revision, new_revision, prompt_style="heuristic")
    return outcome


def run_fewshot_classifier(old_revision: str, new_revision: str, context=None):
    """
    Call _run_fewshot_classifier, attaching the given Logfire context if there is one.
    """
    # Fall back to a do-nothing context manager when no context was provided
    scope = logfire.attach_context(context) if context else nullcontext()
    with scope:
        result = _run_fewshot_classifier(old_revision, new_revision)
    return result


@logfire.instrument("Run few-shot classifier")
def _run_fewshot_classifier(old_revision: str, new_revision: str):
    """
    Run run_classifier with the "few-shot" prompt style and return its result.
    """
    outcome = run_classifier(old_revision, new_revision, prompt_style="few-shot")
    return outcome


def compute_confidence(
    heuristic_noteworthy,
    fewshot_noteworthy,
    judge_noteworthy,
    heuristic_rationale,
    fewshot_rationale,
    judge_reasoning,
):
    """
    Derive a confidence label from the three noteworthy values.

    Returns:
        None if any rationale or the judge's reasoning is empty, otherwise
        "High" (classifiers and judge all agree),
        "Moderate" (classifiers disagree, judge decides) or
        "Questionable" (classifiers agree, judge vetoes)
    """
    # All three explanations must be present to report a confidence
    if not (heuristic_rationale and fewshot_rationale and judge_reasoning):
        return None

    unanimous = heuristic_noteworthy == fewshot_noteworthy == judge_noteworthy
    if unanimous:
        return "High"

    # Not unanimous: either the classifiers split, or the judge overrode them
    classifiers_split = heuristic_noteworthy != fewshot_noteworthy
    return "Moderate" if classifiers_split else "Questionable"


def run_judge(
    old_revision: str,
    new_revision: str,
    heuristic_noteworthy: bool,
    fewshot_noteworthy: bool,
    heuristic_rationale: str,
    fewshot_rationale: str,
    judge_mode: str,
    context=None,
):
    """
    Wrapper to run _run_judge in provided Logfire context (if any).
    All other arguments are forwarded unchanged; see _run_judge for their meaning.
    """
    with logfire.attach_context(context) if context else nullcontext():
        return _run_judge(
            old_revision,
            new_revision,
            heuristic_noteworthy,
            # Forward the few-shot label here: passing the heuristic label twice
            # makes the classifiers always appear to agree in the confidence score
            fewshot_noteworthy,
            heuristic_rationale,
            fewshot_rationale,
            judge_mode,
        )


@logfire.instrument("Run judge")
def _run_judge(
    old_revision: str,
    new_revision: str,
    heuristic_noteworthy: bool,
    fewshot_noteworthy: bool,
    heuristic_rationale: str,
    fewshot_rationale: str,
    judge_mode: str,
):
    """
    Run judge on the revisions and classifiers' rationales.

    Args:
        old_revision: Old revision text
        new_revision: New revision text
        heuristic_noteworthy: Heuristic model's noteworthy label (used for confidence only)
        fewshot_noteworthy: Few-shot model's noteworthy label (used for confidence only)
        heuristic_rationale: Heuristic model's rationale
        fewshot_rationale: Few-shot model's rationale
        judge_mode: Mode for judge function ("unaligned", "aligned-fewshot", "aligned-heuristic")

    Returns:
        Tuple of (noteworthy, noteworthy_text, reasoning, confidence) (bool, str, str, str);
        all None if a revision or rationale is empty

    Raises:
        gr.Error: If the judge returns no result or raises an exception
    """

    # Values to return if any required input is missing
    noteworthy, noteworthy_text, reasoning, confidence = None, None, None, None
    if (
        not old_revision
        or not new_revision
        or not heuristic_rationale
        or not fewshot_rationale
    ):
        return noteworthy, noteworthy_text, reasoning, confidence

    try:
        # Run judge
        result = judge(
            old_revision,
            new_revision,
            heuristic_rationale,
            fewshot_rationale,
            mode=judge_mode,
        )
        if result:
            noteworthy = result.get("noteworthy", "")
            reasoning = result.get("reasoning", "")
        else:
            error_msg = "Error: Could not get judge's result"
            raise gr.Error(error_msg, print_exception=False)

    except gr.Error:
        # Already a user-facing error raised above; re-raise it unchanged
        # so its message is not wrapped a second time ("Error running judge: Error: ...")
        raise
    except Exception as e:
        error_msg = f"Error running judge: {str(e)}"
        raise gr.Error(error_msg, print_exception=False) from e

    # Format noteworthy label (boolean) as text; leave it empty when there is no reasoning
    noteworthy_text = str(noteworthy) if reasoning else None

    # Get confidence score
    confidence = compute_confidence(
        heuristic_noteworthy,
        fewshot_noteworthy,
        noteworthy,
        heuristic_rationale,
        fewshot_rationale,
        reasoning,
    )

    return noteworthy, noteworthy_text, reasoning, confidence


# Create Gradio interface
with gr.Blocks(title="Noteworthy Differences") as demo:
    with gr.Row():
        gr.Markdown(
            """
        Compare current and old revisions of a Wikipedia article - you choose the number of revisions or days behind.<br>
        Two classifier models (with heuristic and few-shot prompts) and a judge predict the noteworthiness of the differences.<br>
        The judge was aligned with human preferences as described in the
        [GitHub repository](https://github.com/jedick/noteworthy-differences).
        """
        )

    # Query controls
    with gr.Row():
        title_input = gr.Textbox(
            label="Wikipedia Page Title", placeholder="e.g., Albert Einstein", value=""
        )
        number_input = gr.Number(label="Number", value=50, minimum=0, precision=0)
        units_dropdown = gr.Dropdown(
            choices=["revisions", "days"], value="revisions", label="Unit"
        )
        judge_mode_dropdown = gr.Dropdown(
            choices=["unaligned", "aligned-fewshot", "aligned-heuristic"],
            value="aligned-heuristic",
            label="Judge Mode",
        )
        with gr.Column():
            random_btn = gr.Button("Get Random Page Title")
            submit_btn = gr.Button("Fetch Revisions and Run Model", variant="primary")

    # Revisions (old, current) and model output, side by side
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Old Revision")
            old_timestamp = gr.Markdown("")
            old_revision = gr.Textbox(label="", lines=15, max_lines=30, container=False)
            gr.Markdown(
                """#### Query Instructions
            - Page title is case sensitive; use underscores or spaces
            - Specify any number of days or up to 499 revisions behind
              - The closest available revision is retrieved
            - Only article introductions are downloaded
            """
            )

        with gr.Column():
            gr.Markdown("### Current Revision")
            new_timestamp = gr.Markdown("")
            new_revision = gr.Textbox(label="", lines=15, max_lines=30, container=False)
            gr.Markdown(
                """#### Confidence Key
                - **High:** heuristic = few-shot, judge agrees
                - **Moderate:** heuristic ≠ few-shot, judge decides
                - **Questionable:** heuristic = few-shot, judge vetoes
                """
            )

        with gr.Column():
            gr.Markdown("### Model Output")
            heuristic_rationale = gr.Textbox(
                label="Heuristic Model's Rationale",
                lines=2,
                max_lines=7,
            )
            fewshot_rationale = gr.Textbox(
                label="Few-shot Model's Rationale",
                lines=2,
                max_lines=7,
            )
            judge_reasoning = gr.Textbox(
                label="Judge's Reasoning",
                lines=2,
                max_lines=7,
            )
            with gr.Row(variant="default"):
                noteworthy_text = gr.Textbox(
                    label="Noteworthy Differences",
                    lines=1,
                    interactive=False,
                )
                confidence = gr.Textbox(
                    label="Confidence",
                    lines=1,
                    interactive=False,
                )
            rerun_btn = gr.Button("Rerun Model")

            gr.Markdown("### Your feedback")
            feedback_status = gr.Textbox(
                label="",
                lines=1,
                interactive=False,
                visible=True,
            )
            with gr.Row():
                thumbs_up_btn = gr.Button("👍 Agree", variant="primary")
                thumbs_down_btn = gr.Button("👎 Disagree", variant="secondary")

    # States to store boolean values
    heuristic_noteworthy = gr.State()
    fewshot_noteworthy = gr.State()
    judge_noteworthy = gr.State()
    # State to store Logfire context
    context = gr.State()

    # Component lists shared by the submit and rerun event chains
    classifier_inputs = [old_revision, new_revision, context]
    judge_inputs = [
        old_revision,
        new_revision,
        heuristic_noteworthy,
        fewshot_noteworthy,
        heuristic_rationale,
        fewshot_rationale,
        judge_mode_dropdown,
        context,
    ]
    judge_outputs = [judge_noteworthy, noteworthy_text, judge_reasoning, confidence]
    # Component list shared by the agree and disagree feedback handlers
    feedback_inputs = [
        title_input,
        number_input,
        units_dropdown,
        judge_mode_dropdown,
        old_revision,
        new_revision,
        old_timestamp,
        new_timestamp,
        heuristic_rationale,
        fewshot_rationale,
        judge_reasoning,
        noteworthy_text,
        confidence,
        heuristic_noteworthy,
        fewshot_noteworthy,
        judge_noteworthy,
    ]

    random_btn.click(
        fn=get_random_wikipedia_title,
        inputs=None,
        outputs=[title_input],
    )

    gr.on(
        # Press Enter in textbox or use button to submit
        triggers=[title_input.submit, submit_btn.click],
        # Clear the new_revision and new_timestamp values before proceeding.
        # The empty values will propagate to the other components (through function return values) if there is an error.
        fn=lambda: (gr.update(value=""), gr.update(value="")),
        inputs=None,
        outputs=[new_revision, new_timestamp],
        api_name=False,
    ).then(
        # Initialize Logfire context
        fn=start_parent_span,
        inputs=[title_input, number_input, units_dropdown],
        outputs=context,
    ).then(
        fn=fetch_current_revision,
        inputs=[title_input, context],
        outputs=[new_revision, new_timestamp],
        api_name=False,
    ).then(
        fn=fetch_previous_revision,
        inputs=[title_input, number_input, units_dropdown, new_revision, context],
        outputs=[old_revision, old_timestamp],
        api_name=False,
    ).then(
        fn=run_heuristic_classifier,
        inputs=classifier_inputs,
        outputs=[heuristic_noteworthy, heuristic_rationale],
        api_name=False,
    ).then(
        fn=run_fewshot_classifier,
        inputs=classifier_inputs,
        outputs=[fewshot_noteworthy, fewshot_rationale],
        api_name=False,
    ).then(
        fn=run_judge,
        inputs=judge_inputs,
        outputs=judge_outputs,
        api_name=False,
    )

    # Rerun model when rerun button is clicked
    # (same classifier -> judge steps as above, without refetching the revisions)
    gr.on(
        triggers=[rerun_btn.click],
        fn=run_heuristic_classifier,
        inputs=classifier_inputs,
        outputs=[heuristic_noteworthy, heuristic_rationale],
        api_name=False,
    ).then(
        fn=run_fewshot_classifier,
        inputs=classifier_inputs,
        outputs=[fewshot_noteworthy, fewshot_rationale],
        api_name=False,
    ).then(
        fn=run_judge,
        inputs=judge_inputs,
        outputs=judge_outputs,
        api_name=False,
    )

    # Feedback button handlers
    thumbs_up_btn.click(
        fn=save_feedback_agree,
        inputs=feedback_inputs,
        outputs=[feedback_status],
        api_name=False,
    )

    thumbs_down_btn.click(
        fn=save_feedback_disagree,
        inputs=feedback_inputs,
        outputs=[feedback_status],
        api_name=False,
    )

if __name__ == "__main__":
    # Load the hub theme, then override the body background fills
    # (light and dark) so the page has no background image
    miku_theme = gr.Theme.from_hub("NoCrypt/miku")
    miku_theme.set(
        body_background_fill="#FFFFFF",
        body_background_fill_dark="#000000",
    )

    demo.launch(theme=miku_theme)