"""Gradio app that compares two revisions of a Wikipedia article introduction.

Two classifier models (heuristic and few-shot prompts) and a judge predict
whether the differences between the revisions are noteworthy.
"""

import gradio as gr
from wiki_data_fetcher import (
    get_previous_revisions,
    get_revision_from_age,
    get_wikipedia_introduction,
    extract_revision_info,
    get_revisions_behind,
    get_random_wikipedia_title,
)
from models import classifier, judge
import logfire
from dotenv import load_dotenv

# Load API keys
load_dotenv()

# Setup logging with Logfire
logfire.configure()

# If running a standalone Gradio app via `demo.launch()` within a script,
# Logfire's auto-instrumentation for FastAPI is often automatically handled
# if installed. If mounting within a separate FastAPI app, use:
# logfire.instrument_fastapi(app)


@logfire.instrument("Step 1: Fetch current revision")
def fetch_current_revision(title: str):
    """
    Fetch current revision of a Wikipedia article and return its introduction.

    Args:
        title: Wikipedia article title

    Returns:
        Tuple of (introduction, timestamp)

    Raises:
        gr.Error: If the title is empty, the page cannot be found, or any
            other exception occurs while fetching the revision.
    """
    if not title or not title.strip():
        raise gr.Error("Please enter a Wikipedia page title.", print_exception=False)

    try:
        # Get current revision (revision 0)
        json_data = get_previous_revisions(title, revisions=0)
        revision_info = extract_revision_info(json_data, revnum=0)

        if not revision_info.get("revid"):
            error_msg = f"Error: Could not find Wikipedia page '{title}'. Please check the title."
            raise gr.Error(error_msg, print_exception=False)

        revid = revision_info["revid"]
        timestamp = revision_info["timestamp"]

        # Get introduction
        introduction = get_wikipedia_introduction(revid)
        if introduction is None:
            introduction = f"Error: Could not retrieve introduction for current revision (revid: {revid})"

        # Format timestamp for display
        timestamp = f"**Timestamp:** {timestamp}" if timestamp else ""

        # Return introduction text and timestamp
        return introduction, timestamp

    except gr.Error:
        # Already a user-facing error raised above; don't wrap it a second time
        raise
    except Exception as e:
        error_msg = f"Error occurred: {str(e)}"
        raise gr.Error(error_msg, print_exception=False) from e


@logfire.instrument("Step 2: Fetch previous revision")
def fetch_previous_revision(title: str, unit: str, number: int, new_revision: str):
    """
    Fetch previous revision of a Wikipedia article and return its introduction.

    Args:
        title: Wikipedia article title
        unit: "revisions" or "days"
        number: Number of revisions or days behind
        new_revision: Text of the current revision; when empty, this step is
            skipped and (None, None) is returned

    Returns:
        Tuple of (introduction, timestamp)

    Raises:
        gr.Error: If the requested revision cannot be found or any other
            exception occurs while fetching it.
    """
    # If we get here with an empty new revision, then an error should have been raised
    # in fetch_current_revision, so just return empty values without raising another error
    if not new_revision:
        return None, None

    try:
        # Get previous revision based on unit
        if unit == "revisions":
            json_data = get_previous_revisions(title, revisions=number)
            revision_info = extract_revision_info(json_data, revnum=number)
        else:  # unit == "days"
            revision_info = get_revision_from_age(title, age_days=number)

        if not revision_info.get("revid"):
            error_msg = f"Error: Could not find revision {number} {'revisions' if unit == 'revisions' else 'days'} behind for '{title}'."
            raise gr.Error(error_msg, print_exception=False)

        revid = revision_info["revid"]
        timestamp = revision_info["timestamp"]

        # Get introduction
        introduction = get_wikipedia_introduction(revid)
        if introduction is None:
            introduction = f"Error: Could not retrieve introduction for previous revision (revid: {revid})"

        # Get revisions_behind
        if unit == "revisions":
            revisions_behind = revision_info["revnum"]
        else:
            revisions_behind = get_revisions_behind(title, revid)
            # For a negative number, replace the negative sign with ">"
            if revisions_behind < 0:
                revisions_behind = str(revisions_behind).replace("-", ">")

        # Format timestamp for display
        timestamp = (
            f"**Timestamp:** {timestamp}, {revisions_behind} revisions behind"
            if timestamp
            else ""
        )

        # Return introduction text and timestamp
        return introduction, timestamp

    except gr.Error:
        # Already a user-facing error raised above; don't wrap it a second time
        raise
    except Exception as e:
        error_msg = f"Error occurred: {str(e)}"
        raise gr.Error(error_msg, print_exception=False) from e


def run_classifier(old_revision: str, new_revision: str, prompt_style: str):
    """
    Run a classification model on the revisions.

    Args:
        old_revision: Old revision text
        new_revision: New revision text
        prompt_style: heuristic or few-shot

    Returns:
        Tuple of (noteworthy, rationale) (bool, str); (None, None) when either
        revision is empty

    Raises:
        gr.Error: If the classifier returns no result or raises an exception.
    """
    # Nothing to classify (an earlier step failed or has not run yet)
    if not old_revision or not new_revision:
        return None, None

    try:
        # Run classifier model
        result = classifier(old_revision, new_revision, prompt_style=prompt_style)
        if not result:
            error_msg = f"Error: Could not get {prompt_style} model result"
            raise gr.Error(error_msg, print_exception=False)
        noteworthy = result.get("noteworthy", None)
        rationale = result.get("rationale", "")
    except gr.Error:
        # Already a user-facing error raised above; don't wrap it a second time
        raise
    except Exception as e:
        error_msg = f"Error running model: {str(e)}"
        raise gr.Error(error_msg, print_exception=False) from e

    return noteworthy, rationale


@logfire.instrument("Step 3a: Run heuristic classifier")
def run_heuristic_classifier(old_revision: str, new_revision: str):
    """Run the classifier with the "heuristic" prompt style."""
    return run_classifier(old_revision, new_revision, prompt_style="heuristic")


@logfire.instrument("Step 3b: Run few-shot classifier")
def run_fewshot_classifier(old_revision: str, new_revision: str):
    """Run the classifier with the "few-shot" prompt style."""
    return run_classifier(old_revision, new_revision, prompt_style="few-shot")


def compute_confidence(
    heuristic_noteworthy,
    fewshot_noteworthy,
    judge_noteworthy,
    heuristic_rationale,
    fewshot_rationale,
    judge_reasoning,
):
    """
    Compute a confidence label using the noteworthy booleans.

    Args:
        heuristic_noteworthy: Heuristic model's noteworthy label
        fewshot_noteworthy: Few-shot model's noteworthy label
        judge_noteworthy: Judge's noteworthy label
        heuristic_rationale: Heuristic model's rationale
        fewshot_rationale: Few-shot model's rationale
        judge_reasoning: Judge's reasoning

    Returns:
        "High", "Moderate" or "Questionable"; None if any rationale or the
        judge's reasoning is missing
    """
    # Return None if any of the rationales or reasoning is missing.
    if not heuristic_rationale or not fewshot_rationale or not judge_reasoning:
        return None

    if heuristic_noteworthy == fewshot_noteworthy == judge_noteworthy:
        # Classifiers and judge all agree
        return "High"
    elif heuristic_noteworthy != fewshot_noteworthy:
        # Classifiers disagree, judge decides
        return "Moderate"
    else:
        # Classifiers agree, judge vetoes
        return "Questionable"


@logfire.instrument("Step 4: Run judge")
def run_judge(
    old_revision: str,
    new_revision: str,
    heuristic_noteworthy: bool,
    fewshot_noteworthy: bool,
    heuristic_rationale: str,
    fewshot_rationale: str,
    judge_mode: str,
):
    """
    Run judge on the revisions and classifiers' rationales.

    Args:
        old_revision: Old revision text
        new_revision: New revision text
        heuristic_noteworthy: Heuristic model's noteworthy label (used for confidence)
        fewshot_noteworthy: Few-shot model's noteworthy label (used for confidence)
        heuristic_rationale: Heuristic model's rationale
        fewshot_rationale: Few-shot model's rationale
        judge_mode: Mode for judge function ("unaligned", "aligned-fewshot", "aligned-heuristic")

    Returns:
        Tuple of (noteworthy, noteworthy_text, reasoning, confidence) (bool, str, str, str);
        all None when any revision or rationale is empty

    Raises:
        gr.Error: If the judge returns no result or raises an exception.
    """
    # Nothing to judge (an earlier step failed or has not run yet)
    if (
        not old_revision
        or not new_revision
        or not heuristic_rationale
        or not fewshot_rationale
    ):
        return None, None, None, None

    try:
        # Run judge
        result = judge(
            old_revision,
            new_revision,
            heuristic_rationale,
            fewshot_rationale,
            mode=judge_mode,
        )
        if not result:
            error_msg = "Error: Could not get judge's result"
            raise gr.Error(error_msg, print_exception=False)
        noteworthy = result.get("noteworthy", "")
        reasoning = result.get("reasoning", "")
    except gr.Error:
        # Already a user-facing error raised above; don't wrap it a second time
        raise
    except Exception as e:
        error_msg = f"Error running judge: {str(e)}"
        raise gr.Error(error_msg, print_exception=False) from e

    # Format noteworthy label (boolean) as text
    noteworthy_text = str(noteworthy) if reasoning else None

    # Get confidence score
    confidence = compute_confidence(
        heuristic_noteworthy,
        fewshot_noteworthy,
        noteworthy,
        heuristic_rationale,
        fewshot_rationale,
        reasoning,
    )

    return noteworthy, noteworthy_text, reasoning, confidence


# Create Gradio interface
# NOTE(review): the container nesting below was reconstructed from a
# whitespace-collapsed source; confirm the layout renders as intended.
with gr.Blocks(title="Noteworthy Differences") as demo:
    with gr.Row():
        gr.Markdown(
            """
        Compare current and old revisions of a Wikipedia article - you choose the number of revisions or days behind.
        Two classifier models (with heuristic and few-shot prompts) and a judge predict the noteworthiness of the differences.
        The judge was aligned with human preferences as described in the [GitHub repository](https://github.com/jedick/noteworthy-differences).
        """
        )

    with gr.Row():
        title_input = gr.Textbox(
            label="Wikipedia Page Title", placeholder="e.g., Albert Einstein", value=""
        )
        number_input = gr.Number(label="Number", value=50, minimum=0, precision=0)
        unit_dropdown = gr.Dropdown(
            choices=["revisions", "days"], value="revisions", label="Unit"
        )
        judge_mode_dropdown = gr.Dropdown(
            choices=["unaligned", "aligned-fewshot", "aligned-heuristic"],
            value="aligned-heuristic",
            label="Judge Mode",
        )
        with gr.Column():
            random_btn = gr.Button("Get Random Page Title")
            submit_btn = gr.Button("Fetch Revisions and Run Model", variant="primary")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Old Revision")
            old_timestamp = gr.Markdown("")
            old_revision = gr.Textbox(label="", lines=15, max_lines=30, container=False)
            gr.Markdown(
                """#### Query Instructions
            - Page title is case sensitive; use underscores or spaces
            - Specify any number of days or up to 499 revisions behind
            - The closest available revision is retrieved
            - Only article introductions are downloaded
            """
            )

        with gr.Column():
            gr.Markdown("### Current Revision")
            new_timestamp = gr.Markdown("")
            new_revision = gr.Textbox(label="", lines=15, max_lines=30, container=False)
            gr.Markdown(
                """#### Confidence Key
            - **High:** heuristic = few-shot, judge agrees
            - **Moderate:** heuristic ≠ few-shot, judge decides
            - **Questionable:** heuristic = few-shot, judge vetoes
            """
            )

        with gr.Column():
            gr.Markdown("### Model Output")
            heuristic_rationale = gr.Textbox(
                label="Heuristic Model's Rationale",
                lines=2,
                max_lines=7,
            )
            fewshot_rationale = gr.Textbox(
                label="Few-shot Model's Rationale",
                lines=2,
                max_lines=7,
            )
            judge_reasoning = gr.Textbox(
                label="Judge's Reasoning",
                lines=2,
                max_lines=7,
            )
            with gr.Row(variant="default"):
                noteworthy_text = gr.Textbox(
                    label="Noteworthy Differences",
                    lines=1,
                    interactive=False,
                )
                confidence = gr.Textbox(
                    label="Confidence",
                    lines=1,
                    interactive=False,
                )
            rerun_btn = gr.Button("Rerun Model")

    # States to store boolean values
    heuristic_noteworthy = gr.State()
    fewshot_noteworthy = gr.State()
    judge_noteworthy = gr.State()

    def _add_model_steps(event):
        """Chain the two classifier steps and the judge step onto `event`."""
        return (
            event.then(
                fn=run_heuristic_classifier,
                inputs=[old_revision, new_revision],
                outputs=[heuristic_noteworthy, heuristic_rationale],
                api_name=False,
            )
            .then(
                fn=run_fewshot_classifier,
                inputs=[old_revision, new_revision],
                outputs=[fewshot_noteworthy, fewshot_rationale],
                api_name=False,
            )
            .then(
                fn=run_judge,
                inputs=[
                    old_revision,
                    new_revision,
                    heuristic_noteworthy,
                    fewshot_noteworthy,
                    heuristic_rationale,
                    fewshot_rationale,
                    judge_mode_dropdown,
                ],
                outputs=[
                    judge_noteworthy,
                    noteworthy_text,
                    judge_reasoning,
                    confidence,
                ],
                api_name=False,
            )
        )

    random_btn.click(
        fn=get_random_wikipedia_title,
        inputs=None,
        outputs=[title_input],
    )

    fetch_event = (
        gr.on(
            # Press Enter in textbox or use button to submit
            triggers=[title_input.submit, submit_btn.click],
            # Clear the new_revision and new_timestamp values before proceeding.
            # The empty values will propagate to the other components (through function return values) if there is an error.
            fn=lambda: (gr.update(value=""), gr.update(value="")),
            inputs=None,
            outputs=[new_revision, new_timestamp],
            api_name=False,
        )
        .then(
            fn=fetch_current_revision,
            inputs=[title_input],
            outputs=[new_revision, new_timestamp],
            api_name=False,
        )
        .then(
            fn=fetch_previous_revision,
            inputs=[title_input, unit_dropdown, number_input, new_revision],
            outputs=[old_revision, old_timestamp],
            api_name=False,
        )
    )
    # Run both classifiers and the judge once the revisions have been fetched
    _add_model_steps(fetch_event)

    # Rerun model when rerun button is clicked: the first model step is the
    # trigger's own handler, so only the remaining steps are chained here
    gr.on(
        triggers=[rerun_btn.click],
        fn=run_heuristic_classifier,
        inputs=[old_revision, new_revision],
        outputs=[heuristic_noteworthy, heuristic_rationale],
        api_name=False,
    ).then(
        fn=run_fewshot_classifier,
        inputs=[old_revision, new_revision],
        outputs=[fewshot_noteworthy, fewshot_rationale],
        api_name=False,
    ).then(
        fn=run_judge,
        inputs=[
            old_revision,
            new_revision,
            heuristic_noteworthy,
            fewshot_noteworthy,
            heuristic_rationale,
            fewshot_rationale,
            judge_mode_dropdown,
        ],
        outputs=[judge_noteworthy, noteworthy_text, judge_reasoning, confidence],
        api_name=False,
    )

if __name__ == "__main__":
    # Setup theme without background image
    theme = gr.Theme.from_hub("NoCrypt/miku")
    theme.set(body_background_fill="#FFFFFF", body_background_fill_dark="#000000")
    demo.launch(theme=theme)