Spaces:

dedemerve
/

ILSA-LLM-Extractor-Viewer

Sleeping

App Files Files Community

ILSA-LLM-Extractor-Viewer / app.py

dedemerve

Upload app.py with huggingface_hub

ba062e5 verified 12 days ago

Raw

History Blame Contribute Delete

6.88 kB

	"""
	ILSA-LLM-Extractor-Dataset — Clickable Source Viewer
	"""

	import gradio as gr
	import pandas as pd
	from datasets import get_dataset_config_names, get_dataset_split_names, load_dataset

	# Hugging Face dataset repository id passed to every datasets-library call in this file.
	REPO_ID = "dedemerve/ILSA-LLM-Extractor-Dataset"
	# Longest cell text shown in the table; _truncate() cuts longer values and appends an ellipsis.
	MAX_CELL_CHARS = 300

	# In-memory cache of loaded DataFrames keyed by (subset, split); filled by _load_raw().
	_CACHE = {}


	def _normalize(name: str) -> str:
	return name.strip().lower()


	def _find_col(df: pd.DataFrame, candidates):
	    """Return the real name of the first column matching one of *candidates*.

	    Both the DataFrame's column names and the candidates are compared in
	    normalized form (stripped, lower-cased), so ``"Title"``, ``" title "``
	    and ``"title"`` are all treated as the same name.

	    Args:
	        df: Frame whose columns are searched.
	        candidates: Column names to look for, in priority order.

	    Returns:
	        The column name exactly as it appears in ``df.columns``, or ``None``
	        when no candidate matches. If several columns normalize to the same
	        key, the last such column is the one returned.
	    """
	    norm_map = {_normalize(c): c for c in df.columns}
	    for cand in candidates:
	        # Normalize the candidate too; otherwise a caller passing "Title"
	        # could never match the lower-cased keys built above.
	        key = _normalize(cand)
	        if key in norm_map:
	            return norm_map[key]
	    return None


	def _is_blank(val) -> bool:
	if val is None:
	return True
	try:
	if pd.isna(val):
	return True
	except (TypeError, ValueError):
	pass
	return str(val).strip().lower() in ("", "none", "null", "nan", "n/a", "<na>")


	def _make_source_link(row, doi_url_col, doi_col, title_col, venue_col) -> str:
	    """Build the Markdown link shown in the "Source" cell for one row.

	    Resolution order:
	      1. a ready-made URL stored in ``doi_url_col``;
	      2. a DOI stored in ``doi_col``, turned into a ``https://doi.org/`` URL;
	      3. a Google Scholar search for the title (plus the venue when present).

	    Args:
	        row: Record exposing ``.get(column)``; ``build_table`` passes the
	            rows produced by ``DataFrame.apply(..., axis=1)``.
	        doi_url_col: Name of the column holding a full URL, or ``None``.
	        doi_col: Name of the column holding a DOI, or ``None``.
	        title_col: Name of the title column, or ``None``.
	        venue_col: Name of the venue/journal column, or ``None``.

	    Returns:
	        A Markdown link string, or ``""`` when the row offers nothing to
	        link to.
	    """
	    import urllib.parse

	    # 1. Use the doi_url column directly when it has a value.
	    if doi_url_col:
	        url = row.get(doi_url_col)
	        if not _is_blank(url):
	            url = str(url).strip()
	            label = "Open via DOI" if "doi.org" in url else "Search Google Scholar"
	            return f"[{label}]({url})"

	    # 2. No doi_url: build the link from the doi column.
	    if doi_col:
	        doi = row.get(doi_col)
	        if not _is_blank(doi):
	            doi = str(doi).strip()
	            # Accept the common "doi:10.xxxx/..." spelling instead of
	            # producing a broken "https://doi.org/doi:..." link.
	            if doi.lower().startswith("doi:"):
	                doi = doi[4:].strip()
	            if doi.lower().startswith(("http://", "https://")):
	                url = doi
	            else:
	                # Percent-encode characters that would break the URL or the
	                # Markdown link (spaces, parentheses, <, >, #, ...). "%" is
	                # kept as-is so an already-encoded DOI is not encoded twice.
	                url = "https://doi.org/" + urllib.parse.quote(doi, safe="/%")
	            if doi:
	                return f"[Open via DOI]({url})"

	    # 3. No DOI at all: fall back to a Google Scholar search.
	    title = row.get(title_col) if title_col else None
	    venue = row.get(venue_col) if venue_col else None
	    parts = [str(v).strip() for v in (title, venue) if not _is_blank(v)]
	    q = " ".join(parts).strip()
	    if q:
	        return f"[Search Google Scholar](https://scholar.google.com/scholar?q={urllib.parse.quote_plus(q)})"
	    return ""


	def _truncate(val) -> str:
	    """Return the display text for a cell, shortened to MAX_CELL_CHARS.

	    Blank values (as judged by ``_is_blank``) become ``""``; text longer
	    than the limit is cut and suffixed with an ellipsis character.
	    """
	    if _is_blank(val):
	        return ""

	    text = str(val)
	    if len(text) > MAX_CELL_CHARS:
	        return text[:MAX_CELL_CHARS] + "…"
	    return text


	_LONGTEXT_HINTS = ("interpretation", "summary", "description", "definition", "finding",
	"confounder", "notes", "abstract", "text")
	_SHORT_HINTS = ("year", "doi", "type", "category", "used", "id", "url")


	def _column_width(col: str) -> str:
	c = col.lower()
	if col == "Source":
	return "190px"
	if "title" in c or c in ("name", "venue"):
	return "260px"
	if any(h in c for h in _LONGTEXT_HINTS):
	return "320px"
	if any(h in c for h in _SHORT_HINTS):
	return "110px"
	return "160px"


	def get_subsets():
	    """Return the dataset's config (subset) names in sorted order.

	    Any failure is reported as a single-item list whose only entry is the
	    error text, so the caller always receives a list of strings.
	    """
	    try:
	        names = get_dataset_config_names(REPO_ID)
	        return sorted(names)
	    except Exception as err:
	        return [f"ERROR: {err}"]


	def get_splits(subset: str):
	    """Return the split names of *subset*, or ``["train"]`` if the lookup fails."""
	    try:
	        split_names = get_dataset_split_names(REPO_ID, subset)
	    except Exception:
	        return ["train"]
	    return split_names


	def _load_raw(subset: str, split: str) -> pd.DataFrame:
	    """Return the DataFrame for ``(subset, split)``, loading it at most once.

	    The first request for a pair loads the dataset and stores its pandas
	    form in ``_CACHE``; later requests return that same stored object.
	    """
	    cache_key = (subset, split)
	    try:
	        return _CACHE[cache_key]
	    except KeyError:
	        pass

	    dataset = load_dataset(REPO_ID, name=subset, split=split)
	    frame = dataset.to_pandas()
	    _CACHE[cache_key] = frame
	    return frame


	def build_table(subset: str, split: str, search_text: str, max_rows: int):
	    """Build the table update and status line for the selected subset/split.

	    Steps: load the (cached) frame, optionally filter rows by a literal,
	    case-insensitive title search, keep the first ``max_rows`` rows, prepend
	    a Markdown link column built by ``_make_source_link``, and shorten every
	    other cell with ``_truncate``.

	    Args:
	        subset: Dataset config name; falsy means nothing is selected yet.
	        split: Split name; falsy means nothing is selected yet.
	        search_text: Plain text to look for in the title column (optional).
	        max_rows: Maximum number of rows to display.

	    Returns:
	        A ``(table_update, status_text)`` pair. On missing selection or load
	        failure the table update is an empty ``gr.update()`` and the status
	        text explains why.
	    """
	    if not subset or not split:
	        return gr.update(), "Please select a subset and a split."
	    try:
	        df = _load_raw(subset, split)
	    except Exception as e:
	        return gr.update(), f"Could not load data: {e}"

	    total_rows = len(df)
	    title_col = _find_col(df, ["title"])
	    doi_url_col = _find_col(df, ["doi_url"])
	    doi_col = _find_col(df, ["doi"])
	    venue_col = _find_col(df, ["venue", "journal", "source"])

	    query = (search_text or "").strip()
	    if query and title_col:
	        titles = df[title_col].astype(str)
	        # regex=False: the box holds plain text, so input such as "C++" or
	        # "(" must be matched literally instead of raising a regex error.
	        df = df[titles.str.contains(query, case=False, na=False, regex=False)]

	    filtered_rows = len(df)
	    # head() needs an integer, so cast defensively in case the slider value
	    # arrives as a float. Copy only the visible slice: the edits below must
	    # never touch the cached frame, and the full frame need not be copied.
	    df = df.head(int(max_rows)).copy()

	    has_link_info = bool(doi_url_col or doi_col or title_col)

	    link_col = None
	    if has_link_info:
	        source_col = df.apply(
	            lambda r: _make_source_link(r, doi_url_col, doi_col, title_col, venue_col),
	            axis=1,
	        )
	        # Hide the raw doi_url column; it is already exposed through the link.
	        cols_to_drop = [c for c in [doi_url_col] if c and c in df.columns]
	        df = df.drop(columns=cols_to_drop)
	        # The dataset may already own a "Source" column (it is even accepted
	        # as a venue column above); inserting a duplicate name would raise.
	        link_col = "Source"
	        while link_col in df.columns:
	            link_col += " link"
	        df.insert(0, link_col, source_col)

	    for col in df.columns:
	        if col == link_col:
	            continue
	        df[col] = df[col].apply(_truncate)

	    datatype = ["markdown" if col == link_col else "str" for col in df.columns]
	    column_widths = [
	        _column_width("Source" if col == link_col else col) for col in df.columns
	    ]

	    if has_link_info:
	        info = (
	            f"{subset}/{split} — {total_rows} rows total, "
	            f"{filtered_rows} after filtering, showing {len(df)} rows. "
	            f"Rows with a DOI link directly to the source; rows without one link to a Google Scholar search."
	        )
	    else:
	        info = (
	            f"{subset}/{split} — {total_rows} rows total, showing {len(df)} rows. "
	            f"No title/doi column found; table shown as-is."
	        )

	    return gr.update(value=df, datatype=datatype, column_widths=column_widths), info


	def on_subset_change(subset: str):
	    """Return a dropdown update listing the splits of the chosen subset.

	    The first split, when there is one, becomes the selected value;
	    otherwise the selection is cleared.
	    """
	    splits = get_splits(subset)
	    if splits:
	        first_split = splits[0]
	    else:
	        first_split = None
	    return gr.update(choices=splits, value=first_split)


	# UI definition: intro text, subset/split pickers, search and row-limit
	# controls, a status line and the results table.
	with gr.Blocks(title="ILSA-LLM-Extractor — Clickable Sources") as demo:
	gr.Markdown(
	f"""
	# ILSA-LLM-Extractor-Dataset — Clickable Source Viewer
	Hugging Face's standard viewer does not render DOI cells as clickable links.
	Here, every row links to doi.org when a DOI is available, or to a
	Google Scholar search based on the title when it isn't.

	Dataset: [`{REPO_ID}`](https://huggingface.co/datasets/{REPO_ID})
	"""
	)
	# Dataset pickers. Subset choices come from get_subsets(); the split list
	# starts empty and is filled by on_subset_change.
	with gr.Row():
	subset_dd = gr.Dropdown(choices=get_subsets(), label="Subset", value=None)
	split_dd = gr.Dropdown(choices=[], label="Split", value=None)
	# Title filter and row limit, both passed to build_table.
	with gr.Row():
	search_box = gr.Textbox(label="Search by title (optional)", placeholder="e.g. PISA, civic, ICCS...")
	max_rows_box = gr.Slider(minimum=20, maximum=2000, value=200, step=20, label="Max rows to display")
	load_btn = gr.Button("Load / Filter", variant="primary")

	# Outputs written by build_table: a status message and the results table.
	status_md = gr.Markdown("Select a subset to get started.")
	table = gr.Dataframe(label="Results", wrap=True, datatype="str", max_height=650)

	# Event wiring: picking a subset refreshes the split dropdown; changing the
	# split, clicking the button, or submitting the search box all call
	# build_table with the same inputs and outputs.
	subset_dd.change(on_subset_change, inputs=subset_dd, outputs=split_dd)
	split_dd.change(build_table, inputs=[subset_dd, split_dd, search_box, max_rows_box], outputs=[table, status_md])
	load_btn.click(build_table, inputs=[subset_dd, split_dd, search_box, max_rows_box], outputs=[table, status_md])
	search_box.submit(build_table, inputs=[subset_dd, split_dd, search_box, max_rows_box], outputs=[table, status_md])

	# Launch the app only when this file is run as a script.
	if __name__ == "__main__":
	demo.launch()