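# The lines below appear to be Hugging Face file-page text captured with the
# source; they are not part of the program.
#   dedemerve's picture
#   Upload app.py with huggingface_hub
#   ba062e5 verified
#   Raw
#   History Blame Contribute Delete
#   6.88 kB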
"""
ILSA-LLM-Extractor-Dataset — Clickable Source Viewer
"""
import gradio as gr
import pandas as pd
from datasets import get_dataset_config_names, get_dataset_split_names, load_dataset
# Hugging Face Hub dataset repository that the loader calls in this file read from.
REPO_ID = "dedemerve/ILSA-LLM-Extractor-Dataset"
# Longest cell text shown in the table; longer values are cut and suffixed with "…".
MAX_CELL_CHARS = 300
# In-process cache of loaded DataFrames, keyed by (subset, split).
_CACHE = {}
def _normalize(name: str) -> str:
return name.strip().lower()
def _find_col(df: pd.DataFrame, candidates):
norm_map = {_normalize(c): c for c in df.columns}
for cand in candidates:
if cand in norm_map:
return norm_map[cand]
return None
def _is_blank(val) -> bool:
if val is None:
return True
try:
if pd.isna(val):
return True
except (TypeError, ValueError):
pass
return str(val).strip().lower() in ("", "none", "null", "nan", "n/a", "<na>")
def _make_source_link(row, doi_url_col, doi_col, title_col, venue_col) -> str:
    """Build the Markdown link shown in the ``Source`` column for one row.

    Resolution order:
      1. a ready-made URL taken from ``doi_url_col``;
      2. a DOI taken from ``doi_col`` (used as-is when it is already a URL,
         otherwise appended to ``https://doi.org/``);
      3. a Google Scholar search for the row's title and venue.

    URLs are percent-encoded before being embedded so that spaces, angle
    brackets or parentheses in the data cannot break the Markdown link.

    Args:
        row: Row object exposing ``.get(column)`` (e.g. a pandas Series or dict).
        doi_url_col: Name of the ready-made URL column, or a falsy value.
        doi_col: Name of the DOI column, or a falsy value.
        title_col: Name of the title column, or a falsy value.
        venue_col: Name of the venue column, or a falsy value.

    Returns:
        ``"[label](url)"`` Markdown, or ``""`` when the row offers nothing
        to link to.
    """
    import urllib.parse

    def _md_safe(url: str) -> str:
        # Keep URL delimiters and existing %-escapes; encode everything else
        # that could end or corrupt a Markdown link destination.
        return urllib.parse.quote(url, safe="/:?#[]@!$&'*+,;=%~")

    # 1. Use the doi_url column directly when it has a value.
    if doi_url_col:
        url = row.get(doi_url_col)
        if not _is_blank(url):
            url = str(url).strip()
            label = "Open via DOI" if "doi.org" in url.lower() else "Search Google Scholar"
            return f"[{label}]({_md_safe(url)})"

    # 2. Otherwise build the URL from the doi column.
    if doi_col:
        doi = row.get(doi_col)
        if not _is_blank(doi):
            doi = str(doi).strip()
            # Accept the common "doi:10.xxxx/..." spelling.
            if doi.lower().startswith("doi:"):
                doi = doi[4:].strip()
            if doi.lower().startswith(("http://", "https://")):
                return f"[Open via DOI]({_md_safe(doi)})"
            if doi:
                encoded = urllib.parse.quote(doi, safe="/:;")
                return f"[Open via DOI](https://doi.org/{encoded})"

    # 3. No DOI at all: fall back to a Google Scholar search.
    title = row.get(title_col) if title_col else None
    venue = row.get(venue_col) if venue_col else None
    parts = [str(v).strip() for v in (title, venue) if not _is_blank(v)]
    q = " ".join(parts).strip()
    if q:
        return f"[Search Google Scholar](https://scholar.google.com/scholar?q={urllib.parse.quote_plus(q)})"
    return ""
def _truncate(val) -> str:
    """Render *val* as display text capped at ``MAX_CELL_CHARS`` characters.

    Blank values (as judged by ``_is_blank``) become ``""``; text longer than
    the cap is cut at the cap and suffixed with an ellipsis character.
    """
    if _is_blank(val):
        return ""
    text = str(val)
    if len(text) > MAX_CELL_CHARS:
        return text[:MAX_CELL_CHARS] + "…"
    return text
_LONGTEXT_HINTS = ("interpretation", "summary", "description", "definition", "finding",
"confounder", "notes", "abstract", "text")
_SHORT_HINTS = ("year", "doi", "type", "category", "used", "id", "url")
def _column_width(col: str) -> str:
c = col.lower()
if col == "Source":
return "190px"
if "title" in c or c in ("name", "venue"):
return "260px"
if any(h in c for h in _LONGTEXT_HINTS):
return "320px"
if any(h in c for h in _SHORT_HINTS):
return "110px"
return "160px"
def get_subsets():
    """Return the dataset's config (subset) names in sorted order.

    Any failure is reported as a single ``"ERROR: ..."`` list entry instead
    of being raised.
    """
    try:
        names = get_dataset_config_names(REPO_ID)
        return sorted(names)
    except Exception as exc:
        return [f"ERROR: {exc}"]
def get_splits(subset: str):
    """Return the split names of *subset*, or ``["train"]`` if the lookup fails."""
    try:
        split_names = get_dataset_split_names(REPO_ID, subset)
    except Exception:
        # Best-effort fallback: any lookup failure yields a single "train" split.
        split_names = ["train"]
    return split_names
def _load_raw(subset: str, split: str) -> pd.DataFrame:
    """Return the data of ``subset``/``split`` as a DataFrame, loading it once.

    The first request for a (subset, split) pair loads the dataset and stores
    the frame in ``_CACHE``; later requests return that same stored object
    (not a copy).
    """
    cache_key = (subset, split)
    try:
        return _CACHE[cache_key]
    except KeyError:
        pass
    dataset = load_dataset(REPO_ID, name=subset, split=split)
    frame = dataset.to_pandas()
    _CACHE[cache_key] = frame
    return frame
def build_table(subset: str, split: str, search_text: str, max_rows: int):
    """Load one subset/split and render it with a clickable ``Source`` column.

    Args:
        subset: Dataset config name.
        split: Split name within *subset*.
        search_text: Optional text. When non-empty and a title column exists,
            only rows whose title contains it are kept (case-insensitive,
            matched literally rather than as a regular expression).
        max_rows: Maximum number of rows to display.

    Returns:
        A ``(table_update, status_markdown)`` pair: a ``gr.update`` for the
        results table and a Markdown status message. When nothing is selected
        or loading fails, the table update is empty and the message explains
        why.
    """
    if not subset or not split:
        return gr.update(), "Please select a subset and a split."
    try:
        # No copy here: the cached frame is only read until the display slice
        # is copied below.
        df = _load_raw(subset, split)
    except Exception as e:
        return gr.update(), f"Could not load data: {e}"

    total_rows = len(df)
    title_col = _find_col(df, ["title"])
    doi_url_col = _find_col(df, ["doi_url"])
    doi_col = _find_col(df, ["doi"])
    venue_col = _find_col(df, ["venue", "journal", "source"])

    query = (search_text or "").strip()
    if query and title_col:
        # regex=False: the box holds plain text, and input such as "C++" or
        # "(" would otherwise raise a regex compilation error.
        mask = df[title_col].astype(str).str.contains(
            query, case=False, na=False, regex=False
        )
        df = df[mask]
    filtered_rows = len(df)

    # The slider may deliver a float; head() needs an integer count. Copy the
    # (small) display slice so later column writes never touch the cache.
    df = df.head(int(max_rows)).copy()

    has_link_info = bool(doi_url_col or doi_col or title_col)
    # Hide the doi_url column; it is already shown through the Source column.
    cols_to_drop = [c for c in [doi_url_col] if c and c in df.columns]
    if has_link_info:
        # A plain list is always one-dimensional, even for an empty frame.
        links = [
            _make_source_link(row, doi_url_col, doi_col, title_col, venue_col)
            for _, row in df.iterrows()
        ]
        df = df.drop(columns=cols_to_drop)
        if "Source" in df.columns:
            # insert() refuses duplicate labels; keep the dataset's own
            # "Source" column under a distinct name.
            df = df.rename(columns={"Source": "Source (dataset)"})
        df.insert(0, "Source", links)

    # Every data column is rendered as truncated text; Source stays Markdown.
    for col in df.columns:
        if col == "Source":
            continue
        df[col] = df[col].apply(_truncate)

    datatype = ["markdown" if col == "Source" else "str" for col in df.columns]
    column_widths = [_column_width(col) for col in df.columns]

    if has_link_info:
        info = (
            f"**{subset}/{split}** — {total_rows} rows total, "
            f"{filtered_rows} after filtering, showing {len(df)} rows. "
            f"Rows with a DOI link directly to the source; rows without one link to a Google Scholar search."
        )
    else:
        info = (
            f"**{subset}/{split}** — {total_rows} rows total, showing {len(df)} rows. "
            f"No title/doi column found; table shown as-is."
        )
    return gr.update(value=df, datatype=datatype, column_widths=column_widths), info
def on_subset_change(subset: str):
    """Return a dropdown update listing the splits of the selected *subset*.

    The first split is preselected; when there are no splits the value is
    ``None``.
    """
    available = get_splits(subset)
    if available:
        first = available[0]
    else:
        first = None
    return gr.update(choices=available, value=first)
# Assemble the UI: header, selectors, search controls, status line, results table.
# NOTE(review): the nesting below is reconstructed because the original
# indentation was not available — confirm which components belong to each Row.
with gr.Blocks(title="ILSA-LLM-Extractor — Clickable Sources") as demo:
    # The header text is kept at column 0 so the rendered Markdown is unchanged.
    header_md = f"""
# ILSA-LLM-Extractor-Dataset — Clickable Source Viewer
Hugging Face's standard viewer does not render DOI cells as clickable links.
Here, every row links to **doi.org** when a DOI is available, or to a
**Google Scholar search** based on the title when it isn't.
Dataset: [`{REPO_ID}`](https://huggingface.co/datasets/{REPO_ID})
"""
    gr.Markdown(header_md)

    with gr.Row():
        subset_dd = gr.Dropdown(label="Subset", choices=get_subsets(), value=None)
        split_dd = gr.Dropdown(label="Split", choices=[], value=None)

    with gr.Row():
        search_box = gr.Textbox(
            label="Search by title (optional)",
            placeholder="e.g. PISA, civic, ICCS...",
        )
        max_rows_box = gr.Slider(
            label="Max rows to display",
            minimum=20,
            maximum=2000,
            value=200,
            step=20,
        )

    load_btn = gr.Button("Load / Filter", variant="primary")
    status_md = gr.Markdown("Select a subset to get started.")
    table = gr.Dataframe(label="Results", wrap=True, datatype="str", max_height=650)

    # Choosing a subset refreshes the split list.
    subset_dd.change(on_subset_change, inputs=subset_dd, outputs=split_dd)

    # Changing the split, clicking the button, or submitting the search box
    # all rebuild the table from the same inputs.
    table_inputs = [subset_dd, split_dd, search_box, max_rows_box]
    table_outputs = [table, status_md]
    for register in (split_dd.change, load_btn.click, search_box.submit):
        register(build_table, inputs=table_inputs, outputs=table_outputs)

if __name__ == "__main__":
    demo.launch()