"""ILSA-LLM-Extractor-Dataset — Clickable Source Viewer.

A small Gradio app that loads a subset/split of the Hugging Face dataset
``REPO_ID`` into a table and prepends a ``Source`` column containing a
Markdown link: a DOI link when DOI information is present in the row,
otherwise a Google Scholar search built from the row's title and venue.
"""
import urllib.parse

import gradio as gr
import pandas as pd
from datasets import get_dataset_config_names, get_dataset_split_names, load_dataset

REPO_ID = "dedemerve/ILSA-LLM-Extractor-Dataset"
# Cells longer than this are cut and suffixed with an ellipsis.
MAX_CELL_CHARS = 300
# (subset, split) -> DataFrame; avoids re-loading a split on every request.
_CACHE = {}


def _normalize(name: str) -> str:
    """Return *name* stripped of surrounding whitespace and lower-cased."""
    return name.strip().lower()


def _find_col(df: pd.DataFrame, candidates):
    """Return the first column of *df* matching one of *candidates*.

    Matching ignores case and surrounding whitespace on both sides.
    Candidates are tried in order; ``None`` is returned when none match.
    """
    norm_map = {_normalize(c): c for c in df.columns}
    for cand in candidates:
        # Normalise the candidate too, so the lookup does not silently
        # depend on callers passing already lower-cased names.
        key = _normalize(cand)
        if key in norm_map:
            return norm_map[key]
    return None


def _is_blank(val) -> bool:
    """Return True when *val* is missing or a textual placeholder for missing."""
    if val is None:
        return True
    try:
        if pd.isna(val):
            return True
    except (TypeError, ValueError):
        # pd.isna on list-like values yields an array whose truth value
        # may be ambiguous; fall through to the string check instead.
        pass
    return str(val).strip().lower() in ("", "none", "null", "nan", "n/a")


def _make_source_link(row, doi_url_col, doi_col, title_col, venue_col) -> str:
    """Build the Markdown link shown in the ``Source`` column for *row*.

    Resolution order:
      1. the ``doi_url`` column, used verbatim;
      2. the ``doi`` column, turned into a ``https://doi.org/`` URL unless
         it already starts with ``http``;
      3. a Google Scholar search on title and venue.

    Any ``*_col`` argument may be ``None`` when the table has no such
    column. Returns an empty string when nothing usable is available.
    """
    # 1. If a doi_url column exists, use its value directly.
    if doi_url_col:
        url = row.get(doi_url_col)
        if not _is_blank(url):
            url = str(url).strip()
            label = "Open via DOI" if "doi.org" in url else "Search Google Scholar"
            return f"[{label}]({url})"

    # 2. No doi_url: build the link from the doi column.
    if doi_col:
        doi = row.get(doi_col)
        if not _is_blank(doi):
            doi = str(doi).strip()
            url = doi if doi.startswith("http") else f"https://doi.org/{doi}"
            return f"[Open via DOI]({url})"

    # 3. No DOI at all: fall back to a Google Scholar search.
    title = row.get(title_col) if title_col else None
    venue = row.get(venue_col) if venue_col else None
    parts = [str(v).strip() for v in (title, venue) if not _is_blank(v)]
    q = " ".join(parts).strip()
    if q:
        return (
            "[Search Google Scholar]"
            f"(https://scholar.google.com/scholar?q={urllib.parse.quote_plus(q)})"
        )
    return ""


def _truncate(val) -> str:
    """Return *val* as text, blank values as ``""``, cut to MAX_CELL_CHARS."""
    if _is_blank(val):
        return ""
    s = str(val)
    return s if len(s) <= MAX_CELL_CHARS else s[:MAX_CELL_CHARS] + "…"


# Substrings of a (lower-cased) column name that select a wide or narrow column.
_LONGTEXT_HINTS = (
    "interpretation",
    "summary",
    "description",
    "definition",
    "finding",
    "confounder",
    "notes",
    "abstract",
    "text",
)
_SHORT_HINTS = ("year", "doi", "type", "category", "used", "id", "url")


def _column_width(col: str) -> str:
    """Return the CSS width used for column *col*, chosen from its name."""
    c = col.lower()
    if col == "Source":
        return "190px"
    if "title" in c or c in ("name", "venue"):
        return "260px"
    if any(h in c for h in _LONGTEXT_HINTS):
        return "320px"
    if any(h in c for h in _SHORT_HINTS):
        return "110px"
    return "160px"


def get_subsets():
    """Return the sorted config names of REPO_ID.

    On failure a single ``"ERROR: ..."`` entry is returned so the problem
    is visible in the dropdown instead of raising while the UI is built.
    """
    try:
        return sorted(get_dataset_config_names(REPO_ID))
    except Exception as e:
        return [f"ERROR: {e}"]


def get_splits(subset: str):
    """Return the split names of *subset*, or ``["train"]`` if the lookup fails."""
    try:
        return get_dataset_split_names(REPO_ID, subset)
    except Exception:
        # Best effort: assume the conventional default split.
        return ["train"]


def _load_raw(subset: str, split: str) -> pd.DataFrame:
    """Load *subset*/*split* as a DataFrame, memoised in ``_CACHE``.

    The cached object itself is returned; callers must copy before mutating.
    """
    key = (subset, split)
    if key not in _CACHE:
        ds = load_dataset(REPO_ID, name=subset, split=split)
        _CACHE[key] = ds.to_pandas()
    return _CACHE[key]


def build_table(subset: str, split: str, search_text: str, max_rows: int):
    """Build the table update and status message for the current selection.

    Args:
        subset: Dataset config name.
        split: Split name within *subset*.
        search_text: Optional text matched literally and case-insensitively
            against the title column; ignored if there is no title column.
        max_rows: Maximum number of rows to display.

    Returns:
        A ``(gr.update(...), status_markdown)`` pair. When the selection is
        incomplete or loading fails, the table is left untouched and the
        message explains why.
    """
    if not subset or not split:
        return gr.update(), "Please select a subset and a split."

    try:
        df = _load_raw(subset, split).copy()
    except Exception as e:
        return gr.update(), f"Could not load data: {e}"

    total_rows = len(df)
    title_col = _find_col(df, ["title"])
    doi_url_col = _find_col(df, ["doi_url"])
    doi_col = _find_col(df, ["doi"])
    venue_col = _find_col(df, ["venue", "journal", "source"])

    query = (search_text or "").strip()
    if query and title_col:
        # regex=False: the box is a plain-text search. With the default
        # regex matching, input such as "(" raises re.error and characters
        # like "." or "?" silently change what is matched.
        mask = df[title_col].astype(str).str.contains(
            query, case=False, na=False, regex=False
        )
        df = df[mask]
    filtered_rows = len(df)

    # DataFrame.head slices positionally and rejects floats; coerce in case
    # the slider delivers a float value.
    df = df.head(int(max_rows))

    has_link_info = bool(doi_url_col or doi_col or title_col)

    # Hide the doi_url column: it is already represented by "Source".
    cols_to_drop = [c for c in [doi_url_col] if c and c in df.columns]

    if has_link_info:
        source_col = df.apply(
            lambda r: _make_source_link(r, doi_url_col, doi_col, title_col, venue_col),
            axis=1,
        )
        df = df.drop(columns=cols_to_drop)
        df.insert(0, "Source", source_col)

    # Every column except the Markdown link is rendered as truncated text.
    for col in df.columns:
        if col == "Source":
            continue
        df[col] = df[col].apply(_truncate)

    datatype = ["markdown" if col == "Source" else "str" for col in df.columns]
    column_widths = [_column_width(col) for col in df.columns]

    if has_link_info:
        info = (
            f"**{subset}/{split}** — {total_rows} rows total, "
            f"{filtered_rows} after filtering, showing {len(df)} rows. "
            "Rows with a DOI link directly to the source; rows without one link to a Google Scholar search."
        )
    else:
        info = (
            f"**{subset}/{split}** — {total_rows} rows total, showing {len(df)} rows. "
            "No title/doi column found; table shown as-is."
        )

    return gr.update(value=df, datatype=datatype, column_widths=column_widths), info


def on_subset_change(subset: str):
    """Repopulate the split dropdown for *subset*, selecting its first split."""
    splits = get_splits(subset)
    default_split = splits[0] if splits else None
    return gr.update(choices=splits, value=default_split)


with gr.Blocks(title="ILSA-LLM-Extractor — Clickable Sources") as demo:
    # The Markdown text is kept flush-left so no line can be interpreted as
    # an indented code block.
    gr.Markdown(
        f"""
# ILSA-LLM-Extractor-Dataset — Clickable Source Viewer

Hugging Face's standard viewer does not render DOI cells as clickable links.
Here, every row links to **doi.org** when a DOI is available, or to a **Google Scholar search** based on the title when it isn't.

Dataset: [`{REPO_ID}`](https://huggingface.co/datasets/{REPO_ID})
"""
    )

    with gr.Row():
        subset_dd = gr.Dropdown(choices=get_subsets(), label="Subset", value=None)
        split_dd = gr.Dropdown(choices=[], label="Split", value=None)

    with gr.Row():
        search_box = gr.Textbox(
            label="Search by title (optional)",
            placeholder="e.g. PISA, civic, ICCS...",
        )
        max_rows_box = gr.Slider(
            minimum=20, maximum=2000, value=200, step=20, label="Max rows to display"
        )

    load_btn = gr.Button("Load / Filter", variant="primary")
    status_md = gr.Markdown("Select a subset to get started.")
    table = gr.Dataframe(label="Results", wrap=True, datatype="str", max_height=650)

    # Choosing a subset refreshes the split list; a split change, the
    # button, and Enter in the search box all rebuild the table.
    subset_dd.change(on_subset_change, inputs=subset_dd, outputs=split_dd)
    split_dd.change(
        build_table,
        inputs=[subset_dd, split_dd, search_box, max_rows_box],
        outputs=[table, status_md],
    )
    load_btn.click(
        build_table,
        inputs=[subset_dd, split_dd, search_box, max_rows_box],
        outputs=[table, status_md],
    )
    search_box.submit(
        build_table,
        inputs=[subset_dd, split_dd, search_box, max_rows_box],
        outputs=[table, status_md],
    )

if __name__ == "__main__":
    demo.launch()