Spaces:
Sleeping
Sleeping
| """ | |
| ILSA-LLM-Extractor-Dataset — Clickable Source Viewer | |
| """ | |
| import gradio as gr | |
| import pandas as pd | |
| from datasets import get_dataset_config_names, get_dataset_split_names, load_dataset | |
| REPO_ID = "dedemerve/ILSA-LLM-Extractor-Dataset" | |
| MAX_CELL_CHARS = 300 | |
| _CACHE = {} | |
| def _normalize(name: str) -> str: | |
| return name.strip().lower() | |
| def _find_col(df: pd.DataFrame, candidates): | |
| norm_map = {_normalize(c): c for c in df.columns} | |
| for cand in candidates: | |
| if cand in norm_map: | |
| return norm_map[cand] | |
| return None | |
| def _is_blank(val) -> bool: | |
| if val is None: | |
| return True | |
| try: | |
| if pd.isna(val): | |
| return True | |
| except (TypeError, ValueError): | |
| pass | |
| return str(val).strip().lower() in ("", "none", "null", "nan", "n/a", "<na>") | |
def _make_source_link(row, doi_url_col, doi_col, title_col, venue_col) -> str:
    """Build the Markdown link shown in the "Source" cell for one row.

    The link target is chosen in this order:

    1. the ready-made URL in ``doi_url_col``;
    2. a ``https://doi.org/...`` URL built from the DOI in ``doi_col``;
    3. a Google Scholar search for the row's title and venue.

    Args:
        row: Object with a ``.get(column)`` method, such as a pandas row
            Series or a dict.
        doi_url_col: Name of the column holding a full URL, or ``None``.
        doi_col: Name of the column holding a DOI, or ``None``.
        title_col: Name of the title column, or ``None``.
        venue_col: Name of the venue/journal column, or ``None``.

    Returns:
        A ``[label](url)`` Markdown link, or ``""`` when the row offers
        nothing to link to.
    """
    # Kept local so the block does not rely on a module-level import.
    import urllib.parse

    def _md_link(label, url):
        # Spaces, parentheses and angle brackets end a Markdown link
        # destination early, so percent-encode them. "%" stays untouched so
        # URLs that are already encoded are not encoded twice.
        safe_url = urllib.parse.quote(url, safe=":/?#[]@!$&'*+,;=%~")
        return f"[{label}]({safe_url})"

    # 1. Use the doi_url column directly when it has a value.
    if doi_url_col:
        url = row.get(doi_url_col)
        if not _is_blank(url):
            url = str(url).strip()
            label = "Open via DOI" if "doi.org" in url else "Search Google Scholar"
            return _md_link(label, url)

    # 2. Otherwise build the URL from the doi column.
    if doi_col:
        doi = row.get(doi_col)
        if not _is_blank(doi):
            doi = str(doi).strip()
            lowered = doi.lower()
            if lowered.startswith(("http://", "https://")):
                return _md_link("Open via DOI", doi)
            # Accept "doi:10.x/y" and scheme-less "doi.org/10.x/y" spellings.
            for prefix in ("doi:", "dx.doi.org/", "doi.org/"):
                if lowered.startswith(prefix):
                    doi = doi[len(prefix):].strip()
                    break
            if doi:
                # Encode "#", "?", "<", ">" and similar so they stay part of
                # the DOI instead of being read as URL syntax.
                encoded = urllib.parse.quote(doi, safe="/:;%")
                return _md_link("Open via DOI", f"https://doi.org/{encoded}")

    # 3. No DOI at all: fall back to a Google Scholar search.
    title = row.get(title_col) if title_col else None
    venue = row.get(venue_col) if venue_col else None
    parts = [str(v).strip() for v in (title, venue) if not _is_blank(v)]
    q = " ".join(parts).strip()
    if q:
        return _md_link(
            "Search Google Scholar",
            f"https://scholar.google.com/scholar?q={urllib.parse.quote_plus(q)}",
        )
    return ""
def _truncate(val) -> str:
    """Render *val* as cell text, shortened to ``MAX_CELL_CHARS`` characters.

    Blank values become ``""``; text longer than the limit is cut and gets a
    trailing "…".
    """
    if _is_blank(val):
        return ""
    text = str(val)
    if len(text) > MAX_CELL_CHARS:
        return text[:MAX_CELL_CHARS] + "…"
    return text
| _LONGTEXT_HINTS = ("interpretation", "summary", "description", "definition", "finding", | |
| "confounder", "notes", "abstract", "text") | |
| _SHORT_HINTS = ("year", "doi", "type", "category", "used", "id", "url") | |
| def _column_width(col: str) -> str: | |
| c = col.lower() | |
| if col == "Source": | |
| return "190px" | |
| if "title" in c or c in ("name", "venue"): | |
| return "260px" | |
| if any(h in c for h in _LONGTEXT_HINTS): | |
| return "320px" | |
| if any(h in c for h in _SHORT_HINTS): | |
| return "110px" | |
| return "160px" | |
def get_subsets():
    """List the dataset's config (subset) names in sorted order.

    If the lookup fails, a one-item list holding an ``"ERROR: ..."`` message
    is returned instead, so the problem shows up in the dropdown.
    """
    try:
        names = sorted(get_dataset_config_names(REPO_ID))
    except Exception as exc:
        return [f"ERROR: {exc}"]
    else:
        return names
def get_splits(subset: str):
    """List the split names available for *subset*.

    Falls back to ``["train"]`` when the split names cannot be retrieved.
    """
    try:
        split_names = get_dataset_split_names(REPO_ID, subset)
    except Exception:
        # Best effort: offer "train" so the UI stays usable.
        split_names = ["train"]
    return split_names
def _load_raw(subset: str, split: str) -> pd.DataFrame:
    """Return the DataFrame for ``(subset, split)``, loading it on first use.

    Loaded frames are kept in ``_CACHE``, and the cached object itself is
    returned.
    """
    cache_key = (subset, split)
    frame = _CACHE.get(cache_key)
    if frame is None:
        dataset = load_dataset(REPO_ID, name=subset, split=split)
        frame = dataset.to_pandas()
        _CACHE[cache_key] = frame
    return frame
def build_table(subset: str, split: str, search_text: str, max_rows: int):
    """Load one subset/split and turn it into the table shown in the UI.

    The frame is filtered by title, cut to ``max_rows`` rows, given a leading
    "Source" column of Markdown links, and has every other cell truncated for
    display.

    Args:
        subset: Dataset config name.
        split: Split name within the subset.
        search_text: Text to look for in the title column, matched literally
            and case-insensitively. Empty or whitespace-only means no filter.
        max_rows: Maximum number of rows to display.

    Returns:
        A ``(table_update, status_markdown)`` pair. When nothing is selected
        or loading fails, the table update is a no-op and the status text
        explains why.
    """
    if not subset or not split:
        return gr.update(), "Please select a subset and a split."
    try:
        raw = _load_raw(subset, split)
    except Exception as e:
        return gr.update(), f"Could not load data: {e}"

    total_rows = len(raw)
    title_col = _find_col(raw, ["title"])
    doi_url_col = _find_col(raw, ["doi_url"])
    doi_col = _find_col(raw, ["doi"])
    venue_col = _find_col(raw, ["venue", "journal", "source"])

    df = raw
    query = (search_text or "").strip()
    if query and title_col:
        titles = df[title_col].astype(str)
        # regex=False: the box takes plain text, and input such as "C++" or
        # "(" is not a valid regular expression.
        df = df[titles.str.contains(query, case=False, na=False, regex=False)]
    filtered_rows = len(df)

    # The slider may deliver a float. Copying only the displayed slice keeps
    # the cached frame untouched without duplicating the whole dataset.
    df = df.head(int(max_rows)).copy()

    has_link_info = bool(doi_url_col or doi_col or title_col)
    if has_link_info:
        source_col = df.apply(
            lambda r: _make_source_link(r, doi_url_col, doi_col, title_col, venue_col), axis=1
        )
        # Hide the raw doi_url column; the Source column already shows it.
        if doi_url_col:
            df = df.drop(columns=[doi_url_col])
        # A dataset column already called "Source" would make insert() raise.
        if "Source" in df.columns:
            df = df.rename(columns={"Source": "Source (dataset)"})
        df.insert(0, "Source", source_col)

    for col in df.columns:
        if col == "Source":
            continue
        df[col] = df[col].apply(_truncate)

    datatype = ["markdown" if col == "Source" else "str" for col in df.columns]
    column_widths = [_column_width(col) for col in df.columns]

    if has_link_info:
        info = (
            f"**{subset}/{split}** — {total_rows} rows total, "
            f"{filtered_rows} after filtering, showing {len(df)} rows. "
            f"Rows with a DOI link directly to the source; rows without one link to a Google Scholar search."
        )
    else:
        info = (
            f"**{subset}/{split}** — {total_rows} rows total, showing {len(df)} rows. "
            f"No title/doi column found; table shown as-is."
        )
    return gr.update(value=df, datatype=datatype, column_widths=column_widths), info
def on_subset_change(subset: str):
    """Refresh the split dropdown after a new subset is picked.

    The dropdown receives the subset's split names and preselects the first
    one, or nothing when the list is empty.
    """
    choices = get_splits(subset)
    if choices:
        selected = choices[0]
    else:
        selected = None
    return gr.update(choices=choices, value=selected)
# Assemble the page: intro text, the subset/split pickers, the search controls,
# the status line and the results table, then wire up the events.
with gr.Blocks(title="ILSA-LLM-Extractor — Clickable Sources") as demo:
    gr.Markdown(
        f"""
# ILSA-LLM-Extractor-Dataset — Clickable Source Viewer
Hugging Face's standard viewer does not render DOI cells as clickable links.
Here, every row links to **doi.org** when a DOI is available, or to a
**Google Scholar search** based on the title when it isn't.
Dataset: [`{REPO_ID}`](https://huggingface.co/datasets/{REPO_ID})
"""
    )

    with gr.Row():
        subset_dd = gr.Dropdown(choices=get_subsets(), label="Subset", value=None)
        split_dd = gr.Dropdown(choices=[], label="Split", value=None)

    with gr.Row():
        search_box = gr.Textbox(label="Search by title (optional)", placeholder="e.g. PISA, civic, ICCS...")
        max_rows_box = gr.Slider(minimum=20, maximum=2000, value=200, step=20, label="Max rows to display")

    load_btn = gr.Button("Load / Filter", variant="primary")
    status_md = gr.Markdown("Select a subset to get started.")
    table = gr.Dataframe(label="Results", wrap=True, datatype="str", max_height=650)

    # Picking a subset repopulates the split dropdown.
    subset_dd.change(on_subset_change, inputs=subset_dd, outputs=split_dd)

    # Changing the split, pressing the button, or submitting the search box
    # all rebuild the table from the same inputs.
    _table_inputs = [subset_dd, split_dd, search_box, max_rows_box]
    _table_outputs = [table, status_md]
    for _register in (split_dd.change, load_btn.click, search_box.submit):
        _register(build_table, inputs=_table_inputs, outputs=_table_outputs)


if __name__ == "__main__":
    demo.launch()