"""Gradio app that plots yearly average OCRQA scores per newspaper.

At import time the module downloads the statistics JSON from DATA_SOURCE,
flattens its yearly entries into a DataFrame, reads alias-to-name lookups
from ``ALL-ALIAS.jsonl`` (next to this file), builds the UI and launches it.
"""

import json
import os
import random
from urllib.request import urlopen

import gradio as gr
import pandas as pd
import plotly.graph_objects as go

# -------------------------------------------------------------------
# Load data
# -------------------------------------------------------------------
DATA_SOURCE = "https://os.zhdk.cloud.switch.ch/115-canonical-processed-final/langident/langident-lid-ensemble_multilingual_v2-0-2/langid-ocrqa_v2-0-0.json"

with urlopen(DATA_SOURCE) as response:
    data = json.load(response)

# -------------------------------------------------------------------
# Flatten yearly OCRQA data
# -------------------------------------------------------------------
rows = []
for media in data.get("media_list", []):
    provider = media.get("data_provider")
    newspaper = media.get("media_title")
    for stats in media.get("media_statistics", []):
        if stats.get("granularity") != "year":
            continue
        try:
            # The year is whatever follows the last "-" in "element".
            year = int(stats["element"].rsplit("-", 1)[-1])
        except (KeyError, AttributeError, TypeError, ValueError):
            # Missing or malformed "element": skip this entry.
            continue
        media_stats = stats.get("media_stats", {})
        avg_ocrqa = media_stats.get("avg_ocrqa")
        if avg_ocrqa is None:
            continue
        rows.append(
            {
                "provider": provider,
                "newspaper": newspaper,
                "year": year,
                "avg_ocrqa": avg_ocrqa,
                "issues": media_stats.get("issues"),
                "content_items_out": media_stats.get("content_items_out"),
            }
        )

# Check before sorting: a DataFrame built from an empty list has no columns,
# so sort_values() would raise KeyError instead of this clearer error.
if not rows:
    raise ValueError("No yearly OCRQA data found.")

df = pd.DataFrame(rows).sort_values(["provider", "newspaper", "year"])

# -------------------------------------------------------------------
# Alias lookups (ALL-ALIAS.jsonl)
# -------------------------------------------------------------------
media_title_map: dict[str, str] = {}  # media_alias → full title
provider_name_map: dict[str, str] = {}  # provider_alias → full name

_alias_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "ALL-ALIAS.jsonl"
)
with open(_alias_path, encoding="utf-8") as _f:
    for _line in _f:
        _line = _line.strip()
        if not _line:
            continue
        _entry = json.loads(_line)
        media_title_map[_entry["media_alias"].strip()] = _entry["media_title"]
        _pa = _entry["provider_alias"].strip()
        # The first name seen for a provider alias wins.
        if _pa not in provider_name_map:
            provider_name_map[_pa] = _entry["provider_name"]


def newspaper_label(alias: str) -> str:
    """Return ``"<title> [<alias>]"``, or just the alias if no title is known."""
    alias = alias.strip()
    title = media_title_map.get(alias, alias)
    return f"{title} [{alias}]" if title != alias else alias


def provider_label(alias: str) -> str:
    """Return ``"<name> [<alias>]"`` for a provider alias.

    Falls back to the alias itself as the name when it is not in
    ``provider_name_map``.
    """
    alias = alias.strip()
    name = provider_name_map.get(alias, alias)
    # Strip a trailing "(ALIAS)" already embedded in provider_name
    suffix = f"({alias})"
    if name.endswith(suffix):
        name = name[: -len(suffix)].strip()
    return f"{name} [{alias}]"


provider_options = [("All", "All")] + sorted(
    [(provider_label(p), p) for p in df["provider"].dropna().unique()],
    key=lambda x: x[0],
)

# -------------------------------------------------------------------
# Rankings
# -------------------------------------------------------------------
ranking_by_provider = (
    df.groupby(["provider", "newspaper"], as_index=False)["avg_ocrqa"]
    .mean()
    .rename(columns={"avg_ocrqa": "mean_ocrqa"})
)

ranking_global = (
    df.groupby("newspaper", as_index=False)["avg_ocrqa"]
    .mean()
    .rename(columns={"avg_ocrqa": "mean_ocrqa"})
)


def get_ranked_df(provider="All", query="") -> pd.DataFrame:
    """Return newspapers ranked by mean OCRQA, optionally filtered.

    Args:
        provider: Provider alias, or "All" to rank across all providers.
        query: Case-sensitive substring; a newspaper is kept when the
            stripped query occurs in its alias or in its full title.

    Returns:
        DataFrame with "newspaper" and "mean_ocrqa" columns, sorted by
        descending mean_ocrqa (ties broken by newspaper, ascending), with
        a fresh 0-based index.
    """
    if provider == "All":
        ranked = ranking_global.copy()
    else:
        ranked = ranking_by_provider.loc[
            ranking_by_provider["provider"] == provider,
            ["newspaper", "mean_ocrqa"],
        ].copy()

    ranked = ranked.sort_values(
        ["mean_ocrqa", "newspaper"], ascending=[False, True]
    ).reset_index(drop=True)

    if query:
        q = query.strip()

        def _matches(alias: str) -> bool:
            if q in alias:
                return True
            return q in media_title_map.get(alias.strip(), "")

        # astype(bool) keeps the mask boolean even when there are no rows.
        mask = ranked["newspaper"].apply(_matches).astype(bool)
        ranked = ranked[mask].reset_index(drop=True)

    return ranked


def choose_newspapers(ranked, n_best, n_worst, n_random, seed=13):
    """Pick the default selection from a ranked newspaper table.

    Args:
        ranked: DataFrame with a "newspaper" column, best first.
        n_best: Number of names to take from the top of the ranking.
        n_worst: Number of names to take from the bottom of the ranking.
        n_random: Number of additional names sampled from the rest.
        seed: Seed for the sampler, so the random pick is reproducible.

    Returns:
        ``(choices, selected)``: all ranked names in ranking order, and the
        de-duplicated selection in best, worst, random order.
    """
    ranked_names = ranked["newspaper"].tolist()

    # Convert before testing: for a fractional 0 < n_worst < 1 the slice
    # ranked_names[-0:] would otherwise select the entire list.
    n_best = max(int(n_best), 0)
    n_worst = max(int(n_worst), 0)

    best = ranked_names[:n_best]
    worst = ranked_names[-n_worst:] if n_worst else []

    taken = set(best) | set(worst)
    remaining_for_random = [n for n in ranked_names if n not in taken]

    rng = random.Random(seed)
    n_random = min(int(n_random), len(remaining_for_random))
    random_pick = rng.sample(remaining_for_random, n_random) if n_random > 0 else []

    # Deduplicate while preserving order (best and worst can overlap).
    selected = list(dict.fromkeys(best + worst + random_pick))

    # Choices should remain OCRQA-ranked, not in selection order
    choices = ranked_names
    return choices, selected


def update_newspapers(provider, query, n_best, n_worst, n_random):
    """Recompute the newspaper dropdown's choices and selected values."""
    ranked = get_ranked_df(provider, query)
    choices, selected = choose_newspapers(ranked, n_best, n_worst, n_random)
    labeled_choices = [(newspaper_label(n), n) for n in choices]
    return gr.update(choices=labeled_choices, value=selected)


def _empty_figure(title: str) -> go.Figure:
    """Return a trace-less figure with the standard axes and the given title."""
    fig = go.Figure()
    fig.update_layout(
        title=title,
        xaxis_title="Year",
        yaxis_title="Average OCRQA",
        yaxis=dict(range=[0, 1.05]),
        template="plotly_white",
        height=650,
    )
    return fig


def make_plot(provider, selected_newspapers) -> go.Figure:
    """Build the scatter plot of yearly average OCRQA per selected newspaper.

    Args:
        provider: Provider alias, or "All" for no provider restriction.
        selected_newspapers: Newspaper aliases to plot; may be empty or None.

    Returns:
        A figure with one marker trace per newspaper, in ranking order, or
        an empty titled figure when nothing is selected or nothing matches.
    """
    if not selected_newspapers:
        return _empty_figure("Select one or more newspapers")

    subset = df if provider == "All" else df[df["provider"] == provider]
    subset = subset[subset["newspaper"].isin(selected_newspapers)]

    if subset.empty:
        return _empty_figure("No data for the current selection")

    # Preserve ranking order in legend/traces
    selected = set(selected_newspapers)
    ranked_order = [
        n for n in get_ranked_df(provider, "")["newspaper"].tolist() if n in selected
    ]

    fig = go.Figure()
    for newspaper in ranked_order:
        dfn = subset[subset["newspaper"] == newspaper].sort_values("year")
        if dfn.empty:
            continue
        fig.add_trace(
            go.Scatter(
                x=dfn["year"],
                y=dfn["avg_ocrqa"],
                mode="markers",
                name=newspaper_label(newspaper),
                customdata=dfn[["issues", "content_items_out"]].values,
                hovertemplate=(
                    "%{fullData.name}<br>"
                    "Year: %{x}<br>"
                    "Average OCRQA: %{y:.3f}<br>"
                    "Issues: %{customdata[0]}<br>"
                    "Content items: %{customdata[1]}"
                    # The trace name is already on the first line, so hide
                    # Plotly's secondary hover box.
                    "<extra></extra>"
                ),
            )
        )

    year_min = subset["year"].min()
    year_max = subset["year"].max()
    # Widen narrow spans to a window of about ten years around the midpoint.
    if year_max - year_min < 10:
        mid = (year_min + year_max) / 2
        year_min = int(mid - 5)
        year_max = int(mid + 5)

    provider_display = provider if provider == "All" else provider_label(provider)
    fig.update_layout(
        title=f"OCRQA by newspaper — provider: {provider_display}",
        xaxis_title="Year",
        xaxis=dict(range=[year_min - 1, year_max + 1]),
        yaxis_title="Average OCRQA",
        yaxis=dict(range=[0, 1.05]),
        template="plotly_white",
        height=650,
    )
    return fig


# -------------------------------------------------------------------
# Initial state
# -------------------------------------------------------------------
initial_provider = "All"
initial_query = ""
initial_best = 10
initial_worst = 0
initial_random = 0

initial_ranked = get_ranked_df(initial_provider, initial_query)
initial_choices, initial_selected = choose_newspapers(
    initial_ranked, initial_best, initial_worst, initial_random
)

# -------------------------------------------------------------------
# UI
# -------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## OCR Quality Assessment exploration")
    # NOTE(review): "OCRQA demo" reads like it was a hyperlink whose markup
    # was lost; confirm the intended target URL and restore the link.
    gr.Markdown(
        "For details on how OCRQA scores are computed, see the OCRQA demo."
    )

    with gr.Row():
        provider = gr.Dropdown(
            choices=provider_options,
            value=initial_provider,
            label="Provider",
        )
        query = gr.Textbox(
            value=initial_query,
            label="Filter newspapers (case-sensitive)",
            placeholder="Type a newspaper title",
        )

    with gr.Row():
        n_best = gr.Slider(
            minimum=0,
            maximum=400,
            value=initial_best,
            step=1,
            label="Best OCRQA",
        )
        n_worst = gr.Slider(
            minimum=0,
            maximum=400,
            value=initial_worst,
            step=1,
            label="Worst OCRQA",
        )
        n_random = gr.Slider(
            minimum=0,
            maximum=400,
            value=initial_random,
            step=1,
            label="Random OCRQA",
        )

    newspaper = gr.Dropdown(
        choices=[(newspaper_label(n), n) for n in initial_choices],
        value=initial_selected,
        multiselect=True,
        label="Newspapers (filtered and ranked)",
    )

    plot = gr.Plot()

    selector_inputs = [provider, query, n_best, n_worst, n_random]

    for trigger in selector_inputs:
        # Redraw only after the dropdown has been refreshed, so make_plot
        # reads the new selection rather than the previous one.
        trigger.change(
            fn=update_newspapers,
            inputs=selector_inputs,
            outputs=newspaper,
        ).then(
            fn=make_plot,
            inputs=[provider, newspaper],
            outputs=plot,
        )

    newspaper.change(
        fn=make_plot,
        inputs=[provider, newspaper],
        outputs=plot,
    )

    demo.load(
        fn=make_plot,
        inputs=[provider, newspaper],
        outputs=plot,
    )

demo.launch(ssr_mode=False)