"""
ILSA-LLM-Extractor-Dataset — Clickable Source Viewer
"""

import gradio as gr
import pandas as pd
from datasets import get_dataset_config_names, get_dataset_split_names, load_dataset

# Hugging Face Hub dataset repository that every loader call in this file targets.
REPO_ID = "dedemerve/ILSA-LLM-Extractor-Dataset"
# Maximum number of characters kept per table cell; _truncate cuts longer
# values at this length and appends an ellipsis.
MAX_CELL_CHARS = 300

# In-memory cache of loaded DataFrames keyed by (subset, split); filled by _load_raw.
_CACHE = {}


def _normalize(name: str) -> str:
    return name.strip().lower()


def _find_col(df: pd.DataFrame, candidates):
    norm_map = {_normalize(c): c for c in df.columns}
    for cand in candidates:
        if cand in norm_map:
            return norm_map[cand]
    return None


def _is_blank(val) -> bool:
    if val is None:
        return True
    try:
        if pd.isna(val):
            return True
    except (TypeError, ValueError):
        pass
    return str(val).strip().lower() in ("", "none", "null", "nan", "n/a", "<na>")


def _make_source_link(row, doi_url_col, doi_col, title_col, venue_col) -> str:
    """Build the Markdown link shown in the "Source" cell for one row.

    Resolution order:
      1. A non-blank value in the ready-made URL column is used verbatim.
      2. Otherwise a non-blank DOI is turned into a ``https://doi.org/`` URL.
      3. Otherwise a Google Scholar search for the title (plus venue) is used.

    Args:
        row: Mapping-like row (anything with ``.get``), e.g. a pandas Series.
        doi_url_col: Label of the URL column, or a falsy value if absent.
        doi_col: Label of the DOI column, or a falsy value if absent.
        title_col: Label of the title column, or a falsy value if absent.
        venue_col: Label of the venue column, or a falsy value if absent.

    Returns:
        A Markdown link string, or ``""`` when the row has nothing to link to.
    """
    import urllib.parse

    # 1. Use the ready-made URL column directly when it has a value.
    if doi_url_col:
        url = row.get(doi_url_col)
        if not _is_blank(url):
            url = str(url).strip()
            # Case-insensitive so "https://DOI.org/..." is still labelled as a DOI link.
            label = "Open via DOI" if "doi.org" in url.lower() else "Search Google Scholar"
            return f"[{label}]({url})"

    # 2. No URL available: build one from the DOI column.
    if doi_col:
        doi = row.get(doi_col)
        if not _is_blank(doi):
            doi = str(doi).strip()
            if doi.lower().startswith(("http://", "https://")):
                # Already a full URL; keep it untouched.
                return f"[Open via DOI]({doi})"
            if doi.lower().startswith("doi:"):
                # Drop the "doi:" scheme prefix so it is not glued onto the resolver URL.
                doi = doi[4:].lstrip()
            if doi:
                # Percent-encode characters that would break the URL or the
                # Markdown link syntax (spaces, <, >, #, ?, parentheses, ...).
                # "%" stays safe so an already-encoded DOI is not double-encoded.
                encoded = urllib.parse.quote(doi, safe="/:;%")
                return f"[Open via DOI](https://doi.org/{encoded})"

    # 3. No DOI at all: fall back to a Google Scholar search.
    title = row.get(title_col) if title_col else None
    venue = row.get(venue_col) if venue_col else None
    parts = [str(v).strip() for v in (title, venue) if not _is_blank(v)]
    q = " ".join(parts).strip()
    if q:
        return f"[Search Google Scholar](https://scholar.google.com/scholar?q={urllib.parse.quote_plus(q)})"
    return ""


def _truncate(val) -> str:
    """Render *val* as display text capped at ``MAX_CELL_CHARS`` characters.

    Blank values (as judged by ``_is_blank``) become an empty string; values
    longer than the cap are cut and suffixed with an ellipsis.
    """
    if _is_blank(val):
        return ""

    text = str(val)
    if len(text) > MAX_CELL_CHARS:
        return text[:MAX_CELL_CHARS] + "…"
    return text


_LONGTEXT_HINTS = ("interpretation", "summary", "description", "definition", "finding",
                   "confounder", "notes", "abstract", "text")
_SHORT_HINTS = ("year", "doi", "type", "category", "used", "id", "url")


def _column_width(col: str) -> str:
    c = col.lower()
    if col == "Source":
        return "190px"
    if "title" in c or c in ("name", "venue"):
        return "260px"
    if any(h in c for h in _LONGTEXT_HINTS):
        return "320px"
    if any(h in c for h in _SHORT_HINTS):
        return "110px"
    return "160px"


def get_subsets():
    """Return the dataset's config (subset) names in sorted order.

    On any failure a single-item list holding the error text is returned
    instead, so the caller still receives a list of strings.
    """
    try:
        config_names = get_dataset_config_names(REPO_ID)
        return sorted(config_names)
    except Exception as exc:
        message = f"ERROR: {exc}"
        return [message]


def get_splits(subset: str):
    """Return the split names of *subset*, falling back to ``["train"]``.

    The fallback is used whenever the split lookup raises.
    """
    try:
        split_names = get_dataset_split_names(REPO_ID, subset)
    except Exception:
        split_names = ["train"]
    return split_names


def _load_raw(subset: str, split: str) -> pd.DataFrame:
    """Return the DataFrame for ``(subset, split)``, loading it at most once.

    The frame is stored in ``_CACHE`` and the same object is returned on
    every later call, so callers must copy it before mutating.
    """
    cache_key = (subset, split)
    try:
        return _CACHE[cache_key]
    except KeyError:
        pass

    frame = load_dataset(REPO_ID, name=subset, split=split).to_pandas()
    _CACHE[cache_key] = frame
    return frame


def build_table(subset: str, split: str, search_text: str, max_rows: int):
    """Build the Dataframe update and status text for the chosen subset/split.

    The data is optionally filtered by a literal, case-insensitive title
    search, limited to *max_rows* rows, given a leading "Source" column of
    Markdown links, and has every other cell truncated for display.

    Args:
        subset: Dataset config name; falsy means "nothing selected".
        split: Split name; falsy means "nothing selected".
        search_text: Text to look for in the title column (may be empty/None).
        max_rows: Maximum number of rows to show; floats are accepted and
            converted to ``int``.

    Returns:
        A ``(table_update, status_markdown)`` tuple. When nothing is selected
        or loading fails, the table update is a no-op and the status text
        explains why.
    """
    if not subset or not split:
        return gr.update(), "Please select a subset and a split."
    try:
        # No copy yet: the cached frame is only read until the (much smaller)
        # displayed slice is copied below.
        df = _load_raw(subset, split)
    except Exception as e:
        return gr.update(), f"Could not load data: {e}"

    total_rows = len(df)
    title_col = _find_col(df, ["title"])
    doi_url_col = _find_col(df, ["doi_url"])
    doi_col = _find_col(df, ["doi"])
    venue_col = _find_col(df, ["venue", "journal", "source"])

    query = (search_text or "").strip()
    if query and title_col:
        # regex=False: the box is a plain-text search, and input such as
        # "C++" or "(" would otherwise raise re.error inside pandas.
        mask = df[title_col].astype(str).str.contains(
            query, case=False, na=False, regex=False
        )
        df = df[mask]

    filtered_rows = len(df)
    # The slider value may arrive as a float, which head() rejects. The copy
    # keeps later column edits away from the cached frame and avoids
    # SettingWithCopyWarning on the filtered slice.
    df = df.head(int(max_rows)).copy()

    has_link_info = bool(doi_url_col or doi_col or title_col)

    # Hide the raw URL column; it is already represented by "Source".
    cols_to_drop = [c for c in [doi_url_col] if c and c in df.columns]
    if has_link_info:
        source_col = df.apply(
            lambda r: _make_source_link(r, doi_url_col, doi_col, title_col, venue_col), axis=1
        )
        df = df.drop(columns=cols_to_drop)
        if "Source" in df.columns:
            # The dataset may already have a "Source" column ("source" is a
            # venue candidate); insert() would raise on the duplicate label.
            df = df.rename(columns={"Source": "Source (dataset)"})
        df.insert(0, "Source", source_col)

    for col in df.columns:
        if col == "Source":
            continue
        df[col] = df[col].apply(_truncate)

    datatype = ["markdown" if col == "Source" else "str" for col in df.columns]
    column_widths = [_column_width(col) for col in df.columns]

    if has_link_info:
        info = (
            f"**{subset}/{split}** — {total_rows} rows total, "
            f"{filtered_rows} after filtering, showing {len(df)} rows. "
            f"Rows with a DOI link directly to the source; rows without one link to a Google Scholar search."
        )
    else:
        info = (
            f"**{subset}/{split}** — {total_rows} rows total, showing {len(df)} rows. "
            f"No title/doi column found; table shown as-is."
        )

    return gr.update(value=df, datatype=datatype, column_widths=column_widths), info


def on_subset_change(subset: str):
    """Return a dropdown update listing the splits of *subset*.

    The first available split is pre-selected; with no splits the selection
    is cleared.
    """
    available = get_splits(subset)
    selected = None
    if available:
        selected = available[0]
    return gr.update(choices=available, value=selected)


# UI layout and event wiring for the viewer.
with gr.Blocks(title="ILSA-LLM-Extractor — Clickable Sources") as demo:
    gr.Markdown(
        f"""
        # ILSA-LLM-Extractor-Dataset — Clickable Source Viewer
        Hugging Face's standard viewer does not render DOI cells as clickable links.
        Here, every row links to **doi.org** when a DOI is available, or to a
        **Google Scholar search** based on the title when it isn't.

        Dataset: [`{REPO_ID}`](https://huggingface.co/datasets/{REPO_ID})
        """
    )
    with gr.Row():
        subset_dd = gr.Dropdown(choices=get_subsets(), label="Subset", value=None)
        split_dd  = gr.Dropdown(choices=[], label="Split", value=None)
    with gr.Row():
        search_box   = gr.Textbox(label="Search by title (optional)", placeholder="e.g. PISA, civic, ICCS...")
        max_rows_box = gr.Slider(minimum=20, maximum=2000, value=200, step=20, label="Max rows to display")
        load_btn     = gr.Button("Load / Filter", variant="primary")

    status_md = gr.Markdown("Select a subset to get started.")
    table     = gr.Dataframe(label="Results", wrap=True, datatype="str", max_height=650)

    # Every table refresh reads the same controls and writes the same outputs.
    table_inputs = [subset_dd, split_dd, search_box, max_rows_box]
    table_outputs = [table, status_md]

    # Reload the table right after the split list is refreshed. Relying on
    # split_dd.change alone leaves a stale table when the new subset resolves
    # to the same split value (e.g. "train" -> "train"), because no change
    # event fires for an unchanged value.
    subset_dd.change(on_subset_change, inputs=subset_dd, outputs=split_dd).then(
        build_table, inputs=table_inputs, outputs=table_outputs
    )
    # .input fires only for user-driven changes, so the programmatic update
    # above does not trigger a second, redundant reload.
    split_dd.input(build_table, inputs=table_inputs, outputs=table_outputs)
    load_btn.click(build_table, inputs=table_inputs, outputs=table_outputs)
    search_box.submit(build_table, inputs=table_inputs, outputs=table_outputs)

if __name__ == "__main__":
    demo.launch()