import json
import os
import re
import shutil
import tempfile
import ast
import base64
import asyncio
from concurrent.futures import ThreadPoolExecutor
from string import Template
from datetime import datetime
from pathlib import Path
from types import SimpleNamespace
from typing import Any

import fitz  # PyMuPDF
import gradio as gr
import pandas as pd
import yaml
from openai import AzureOpenAI

# Number of per-column "description" and "extracted value" input slots the UI
# builders in this module generate (extra target columns are truncated).
MAX_DESCRIPTION_FIELDS = 30
MAX_EXTRACT_FIELDS = 30
# Root of all persisted app data; APP_STORAGE_DIR overrides the default.
DEFAULT_STORAGE_ROOT = Path("/data/llm_fulltextscreener") # Path("/tmp/llm_fulltextscreener")
APP_STORAGE_ROOT = Path(os.getenv("APP_STORAGE_DIR", str(DEFAULT_STORAGE_ROOT)))
# Each user gets a sub-directory below this folder (see get_user_session_dir).
USERS_ROOT_DIR = APP_STORAGE_ROOT / "users"
# Default number of "screened_*.xlsx" exports kept per user by _cleanup_old_exports.
MAX_EXPORTED_FILES = 20
# Largest export (8 MiB) that build_inline_download_html embeds as a data: URI.
MAX_INLINE_DOWNLOAD_BYTES = 8 * 1024 * 1024
# Accepted screening decision values; consumers are outside this part of the file.
VALID_DECISIONS = {"include", "exclude"}
# Prompt files live in a "prompts" folder next to this module.
PROMPTS_DIR = Path(__file__).resolve().parent / "prompts"
SYSTEM_PROMPT_PATH = PROMPTS_DIR / "system_prompt.txt"
USER_PROMPT_TEMPLATE_PATH = PROMPTS_DIR / "user_prompt_template.json"
SYSTEM_CRITERIA_PROMPT_PATH = PROMPTS_DIR / "system_criteria_prompt.txt"
USER_CRITERIA_TEMPLATE_PATH = PROMPTS_DIR / "user_criteria_template.json"
SYSTEM_LABELS_PROMPT_PATH = PROMPTS_DIR / "system_labels_prompt.txt"
USER_LABELS_TEMPLATE_PATH = PROMPTS_DIR / "user_labels_template.json"


def patch_asyncio_invalid_fd_cleanup() -> None:
    """
    Wrap ``asyncio.BaseEventLoop.__del__`` so one known teardown error is ignored.

    Works around a Python 3.11 selector-loop teardown race seen on some
    runtimes (including Spaces), where the loop's ``__del__`` may raise
    ``ValueError: Invalid file descriptor: -1``. Any other ``ValueError`` is
    re-raised. The wrapper is tagged so calling this function again is a no-op.
    """
    loop_cls = asyncio.BaseEventLoop
    wrapped = getattr(loop_cls, "__del__", None)
    if wrapped is None:
        return
    if getattr(wrapped, "_invalid_fd_guard", False):
        # Already patched by an earlier call.
        return

    def _guarded_del(self):
        try:
            wrapped(self)
        except ValueError as exc:
            if "Invalid file descriptor" in str(exc):
                return
            raise

    _guarded_del._invalid_fd_guard = True
    loop_cls.__del__ = _guarded_del


# Install the guard as soon as this module is imported.
patch_asyncio_invalid_fd_cleanup()


def is_debug_enabled() -> bool:
    """Return True when the APP_DEBUG environment variable is 1/true/yes/on (case-insensitive)."""
    flag = os.getenv("APP_DEBUG", "")
    normalized = flag.strip().lower()
    return normalized in {"1", "true", "yes", "on"}


def debug_log(*parts: Any) -> None:
    """Print ``parts`` prefixed with ``[DEBUG]``, but only when debug mode is on."""
    if not is_debug_enabled():
        return
    print("[DEBUG]", *parts)


def normalize_key(text: str) -> str:
    """Lower-case ``text`` and keep only its ASCII letters and digits."""
    lowered = str(text).strip().lower()
    kept_runs = re.findall(r"[a-z0-9]+", lowered)
    return "".join(kept_runs)


def sanitize_user_id(raw: str) -> str:
    """Turn an arbitrary user identifier into a safe single path component.

    Runs of characters outside ``[a-zA-Z0-9._-]`` are collapsed to ``_``.
    Identifiers made only of dots (``.``, ``..``, ...) are rewritten with
    underscores: used as a directory name they would resolve to the users root
    or its parent rather than to a per-user folder.

    Returns:
        The sanitized identifier, or ``"default"`` when nothing usable remains.
    """
    cleaned = re.sub(r"[^a-zA-Z0-9._-]+", "_", str(raw or "").strip())
    if cleaned and not cleaned.strip("."):
        # Dot-only names are path-traversal components, not folder names.
        cleaned = "_" * len(cleaned)
    return cleaned or "default"


def resolve_user_id(explicit_user_id: str | None = None, request: gr.Request | None = None) -> str:
    """Pick the user id: explicit value first, then ``request.username``, else ``"default"``.

    Whichever value is used is passed through ``sanitize_user_id``.
    """
    explicit_text = str(explicit_user_id).strip() if explicit_user_id else ""
    if explicit_text:
        return sanitize_user_id(explicit_user_id)

    # getattr with a default also covers request being None.
    username = getattr(request, "username", None)
    if not username or not str(username).strip():
        return "default"
    return sanitize_user_id(str(username))


def init_user_id(request: gr.Request | None = None) -> str:
    """Resolve the user id from the request alone (no explicit id supplied)."""
    user_id = resolve_user_id(explicit_user_id=None, request=request)
    return user_id


def get_user_session_dir(user_id: str) -> Path:
    """Return the user's directory under ``USERS_ROOT_DIR`` (id is sanitized first)."""
    safe_id = sanitize_user_id(user_id)
    return USERS_ROOT_DIR.joinpath(safe_id)


def get_user_session_meta_path(user_id: str) -> Path:
    """Return the path of the user's ``session.json`` metadata file."""
    session_dir = get_user_session_dir(user_id)
    return session_dir.joinpath("session.json")


def get_user_session_files_dir(user_id: str) -> Path:
    """Return the user's ``files`` directory (uploads and the working table)."""
    session_dir = get_user_session_dir(user_id)
    return session_dir.joinpath("files")


def get_user_exports_dir(user_id: str) -> Path:
    """Return the user's ``exports`` directory (downloadable spreadsheets)."""
    session_dir = get_user_session_dir(user_id)
    return session_dir.joinpath("exports")


def _ensure_session_dirs(user_id: str) -> None:
    """Create the user's ``files`` and ``exports`` directories if they are missing."""
    required_dirs = (
        get_user_session_files_dir(user_id),
        get_user_exports_dir(user_id),
    )
    for directory in required_dirs:
        directory.mkdir(parents=True, exist_ok=True)


def _setup_storage_paths() -> None:
    """Create the users root and route temp files into the default user's files folder.

    Sets both the ``TMPDIR`` environment variable and ``tempfile.tempdir``.
    """
    USERS_ROOT_DIR.mkdir(parents=True, exist_ok=True)
    default_files_dir = get_user_session_files_dir("default")
    default_files_dir.mkdir(parents=True, exist_ok=True)

    resolved_tmp = str(default_files_dir.resolve())
    os.environ["TMPDIR"] = resolved_tmp
    tempfile.tempdir = resolved_tmp


def load_session_meta(user_id: str) -> dict[str, Any]:
    """Load the user's persisted session metadata.

    Best-effort: a missing, unreadable, or malformed ``session.json`` yields an
    empty dict. A file whose top-level JSON value is not an object also yields
    an empty dict, so callers can always treat the result as a mapping
    (``save_session_meta`` calls ``.update`` on it).
    """
    session_meta_path = get_user_session_meta_path(user_id)
    try:
        loaded = json.loads(session_meta_path.read_text(encoding="utf-8"))
    except (OSError, ValueError, RecursionError):
        # OSError: missing/unreadable file; ValueError: bad encoding or JSON;
        # RecursionError: pathologically nested JSON.
        return {}
    return loaded if isinstance(loaded, dict) else {}


def save_session_meta(user_id: str, updates: dict[str, Any]) -> None:
    """Merge ``updates`` into the stored session metadata and write it back as JSON."""
    _ensure_session_dirs(user_id)
    merged = load_session_meta(user_id)
    merged.update(updates)
    serialized = json.dumps(merged, indent=2)
    get_user_session_meta_path(user_id).write_text(serialized, encoding="utf-8")


def persist_uploaded_file(user_id: str, file_obj, dest_name: str) -> str | None:
    """Copy an uploaded file into the user's ``files`` directory as ``dest_name``.

    Returns the resolved destination path, or None when there is no upload or
    the upload does not point at an existing regular file.
    """
    if file_obj is None:
        return None
    source = resolve_uploaded_path(file_obj)
    if not (source.exists() and source.is_file()):
        return None

    _ensure_session_dirs(user_id)
    target = get_user_session_files_dir(user_id) / dest_name

    # If the upload already is the destination file, skip the copy.
    try:
        resolved_target = target.resolve()
        already_in_place = source.resolve() == resolved_target
    except Exception:
        already_in_place = False
    if already_in_place:
        return str(resolved_target)

    shutil.copy2(source, target)
    return str(target.resolve())


def resolve_uploaded_path(file_obj) -> Path:
    """Extract a filesystem path from an upload value.

    Accepts a string or ``Path``, an object with a non-empty ``name``
    attribute, or a dict with a non-blank ``"name"`` entry. Anything else
    (including None) yields ``Path("")``.
    """
    no_path = Path("")
    if file_obj is None:
        return no_path
    if isinstance(file_obj, (str, Path)):
        return Path(file_obj)

    attr_name = getattr(file_obj, "name", "")
    if attr_name:
        return Path(attr_name)

    if not isinstance(file_obj, dict):
        return no_path
    dict_name = str(file_obj.get("name", "")).strip()
    return Path(dict_name) if dict_name else no_path


def persist_dataframe(user_id: str, df: pd.DataFrame) -> str:
    """Write ``df`` to the user's ``working_table.xlsx`` and return its resolved path."""
    _ensure_session_dirs(user_id)
    table_path = get_user_session_files_dir(user_id).joinpath("working_table.xlsx")
    df.to_excel(table_path, index=False)
    return str(table_path.resolve())


def _cleanup_old_exports(user_id: str, max_files: int = MAX_EXPORTED_FILES) -> None:
    """Delete all but the ``max_files`` newest ``screened_*.xlsx`` exports of a user.

    Best-effort: any error while listing, sorting, or deleting is ignored.
    """
    try:
        candidates = [
            path
            for path in get_user_exports_dir(user_id).glob("screened_*.xlsx")
            if path.is_file()
        ]
        newest_first = sorted(candidates, key=lambda path: path.stat().st_mtime, reverse=True)
        stale = newest_first[max_files:]
    except Exception:
        return

    for stale_path in stale:
        try:
            stale_path.unlink()
        except Exception:
            # A file that cannot be removed is skipped.
            pass


def persist_downloadable_dataframe(user_id: str, df: pd.DataFrame) -> str | None:
    """Write ``df`` to a fresh timestamped export file for download.

    The file is named ``screened_<UTC timestamp>.xlsx`` inside the user's
    exports directory; older exports are pruned afterwards.

    Returns:
        The resolved export path, or None when the export could not be written
        or came out empty (best-effort: errors are not propagated).
    """
    # Function-scope import: the module only imports ``datetime`` itself.
    from datetime import timezone

    _ensure_session_dirs(user_id)
    # Timezone-aware replacement for the deprecated datetime.utcnow();
    # the formatted timestamp is identical.
    stamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S_%f')
    export_path = get_user_exports_dir(user_id) / f"screened_{stamp}.xlsx"
    try:
        df.to_excel(export_path, index=False)
        if not export_path.exists():
            return None
        size_bytes = export_path.stat().st_size
        if size_bytes == 0:
            return None
        resolved = export_path.resolve()
        debug_log(f"Export ready: path={resolved} size={size_bytes} bytes")
        _cleanup_old_exports(user_id)
        return str(resolved)
    except Exception:
        # Export is optional for the caller; report failure as "no path".
        return None


def build_inline_download_html(path: str | None) -> str:
    if not path:
        return "<p>Download unavailable.</p>"
    candidate = Path(path)
    if not candidate.exists() or not candidate.is_file():
        return "<p>Download unavailable: exported file not found.</p>"
    try:
        raw = candidate.read_bytes()
    except Exception:
        return "<p>Download unavailable: could not read exported file.</p>"
    if len(raw) == 0:
        return "<p>Download unavailable: exported file is empty.</p>"
    if len(raw) > MAX_INLINE_DOWNLOAD_BYTES:
        size_mb = len(raw) / (1024 * 1024)
        return (
            f"<p>Inline download disabled for large files ({size_mb:.1f} MB). "
            "Reduce export size and try again.</p>"
        )
    b64 = base64.b64encode(raw).decode("ascii")
    filename = candidate.name
    href = (
        "data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,"
        f"{b64}"
    )
    return (
        "<p><strong>Download:</strong></p>"
        f'<a download="{filename}" href="{href}" '
        'style="display:inline-block;padding:8px 12px;border:1px solid #888;'
        'border-radius:6px;text-decoration:none;">Download Excel</a>'
    )


def empty_description_updates() -> list[dict[str, Any]]:
    """Return one hidden, blank update per description slot (``MAX_DESCRIPTION_FIELDS`` total)."""
    updates: list[dict[str, Any]] = []
    for slot_number in range(1, MAX_DESCRIPTION_FIELDS + 1):
        updates.append(
            gr.update(label=f"Description {slot_number}", value="", visible=False)
        )
    return updates


def empty_extracted_state(status: str, *, extracted_state: dict[str, Any] | None = None):
    """Build the "nothing extracted" output tuple ending with ``status``.

    Layout: the given state (or ``{}``), an empty list, one hidden update per
    extracted-field slot, seven blank/default values (``""``, ``0.0``,
    ``"include"`` and four empty strings), then the status message.
    """
    state = extracted_state if extracted_state else {}
    hidden_inputs = build_empty_extracted_input_updates()
    blank_values = ("", 0.0, "include", "", "", "", "")
    return (state, [], *hidden_inputs, *blank_values, status)


def is_missing(value: Any) -> bool:
    """Return True for NA-like values (per ``pd.isna``) and blank/whitespace-only strings."""
    blank_text = isinstance(value, str) and value.strip() == ""
    return bool(pd.isna(value)) or blank_text


def parse_csv_columns(raw_text: str, available_columns: list[str]) -> list[str]:
    """Split a comma-separated column list and keep the names found in ``available_columns``.

    Names are trimmed; order and duplicates from the input are preserved.
    """
    if not raw_text or not raw_text.strip():
        return []
    selected: list[str] = []
    for piece in raw_text.split(","):
        name = piece.strip()
        if name and name in available_columns:
            selected.append(name)
    return selected


def choose_url_column(df: pd.DataFrame, preferred: str | None = None) -> str:
    if preferred and preferred in df.columns:
        return preferred

    for col in df.columns:
        col_l = str(col).lower()
        if "url" in col_l or "link" in col_l:
            return col

    return str(df.columns[0])


def parse_criteria_file(file_obj) -> dict[str, Any] | None:
    """Read and validate an uploaded criteria YAML file.

    Args:
        file_obj: Upload value understood by ``resolve_uploaded_path``, or None.

    Returns:
        A dict with ``topic``, ``inclusion_criteria``, ``exclusion_criteria``
        and ``notes``; or None when no file (or no usable path) was supplied.

    Raises:
        ValueError: The file is missing, unreadable, not valid YAML, not a
            mapping, lacks a topic, or has non-list criteria.
    """
    if file_obj is None:
        return None
    path = resolve_uploaded_path(file_obj)
    # Path("") is normalized to Path(".") and stringifies as ".", so an
    # "empty string" test never fires; compare against the empty path instead.
    if path == Path(""):
        return None
    if not path.exists() or not path.is_file():
        raise ValueError("Criteria file not found.")

    try:
        raw = path.read_text(encoding="utf-8")
    except Exception as exc:
        raise ValueError(f"Failed reading criteria file: {exc}") from exc

    try:
        parsed = yaml.safe_load(raw)
    except Exception as exc:
        raise ValueError(f"Invalid YAML in criteria file: {exc}") from exc

    if not isinstance(parsed, dict):
        raise ValueError("criteria.yml must contain a top-level mapping/object.")

    def _clean_text(value: Any) -> str:
        # A YAML null (e.g. "topic:") must count as empty, not the text "None".
        return "" if value is None else str(value).strip()

    topic = _clean_text(parsed.get("topic"))
    notes = _clean_text(parsed.get("notes"))
    if not topic:
        raise ValueError("criteria.yml requires a non-empty 'topic'.")

    cleaned_lists: dict[str, list[str]] = {}
    for key in ("inclusion_criteria", "exclusion_criteria"):
        items = parsed.get(key)
        if items is None:
            # Key absent or left empty in the YAML: no criteria of this kind.
            items = []
        if not isinstance(items, list):
            raise ValueError(f"'{key}' must be a list of strings.")
        cleaned_lists[key] = [text for text in map(_clean_text, items) if text]

    return {
        "topic": topic,
        "inclusion_criteria": cleaned_lists["inclusion_criteria"],
        "exclusion_criteria": cleaned_lists["exclusion_criteria"],
        "notes": notes,
    }


def parse_labels_csv(raw: Any) -> list[str]:
    """Split a comma-separated label cell into trimmed, unique labels (first occurrence order).

    None, NA-like values, and blank text all yield an empty list.
    """
    if raw is None or pd.isna(raw):
        return []
    unique_labels: list[str] = []
    for piece in str(raw).strip().split(","):
        label = piece.strip()
        if label and label not in unique_labels:
            unique_labels.append(label)
    return unique_labels


def build_default_descriptions(columns: list[str]) -> dict[str, str]:
    """Map every column name to a generic extraction instruction."""
    descriptions: dict[str, str] = {}
    for col in columns:
        descriptions[col] = f"Extract the value for '{col}' from the article text."
    return descriptions


def build_description_values_from_inputs(
    description_columns: list[str],
    description_values: list[str],
    target_columns: list[str],
) -> dict[str, str]:
    """Combine default descriptions with the descriptions entered by the user.

    Args:
        description_columns: Column names matching ``description_values`` by position.
        description_values: Description texts entered for those columns.
        target_columns: Columns that need a description.

    Returns:
        A description per target column: the user's text when it is non-blank,
        otherwise the generic default.
    """
    descriptions = build_default_descriptions(target_columns)
    if not isinstance(description_columns, list):
        return descriptions

    # zip stops at the shorter sequence, so surplus columns or values are ignored.
    for col, raw_desc in zip(description_columns, description_values or []):
        if raw_desc is None:
            # An unset input must not replace the default with the text "None".
            continue
        col_name = str(col).strip()
        desc = str(raw_desc).strip()
        if col_name in descriptions and desc:
            descriptions[col_name] = desc
    return descriptions


def build_description_input_updates(
    target_columns: list[str],
    previous_description_columns: list[str],
    previous_description_values: list[str],
) -> tuple[list[str], list[dict[str, Any]]]:
    """Build updates for all description slots from the target columns.

    The first ``MAX_DESCRIPTION_FIELDS`` target columns get a visible slot
    pre-filled with the previous (or default) description; the remaining
    slots are hidden and blank.

    Returns:
        ``(active_columns, updates)`` with exactly ``MAX_DESCRIPTION_FIELDS`` updates.
    """
    description_map = build_description_values_from_inputs(
        previous_description_columns,
        previous_description_values,
        target_columns,
    )
    active_columns = target_columns[:MAX_DESCRIPTION_FIELDS]

    updates: list[dict[str, Any]] = [
        gr.update(
            label=f"Description: {col}",
            value=description_map.get(col, ""),
            visible=True,
        )
        for col in active_columns
    ]
    # Pad with hidden placeholders up to the fixed slot count.
    for idx in range(len(active_columns), MAX_DESCRIPTION_FIELDS):
        updates.append(
            gr.update(label=f"Description {idx + 1}", value="", visible=False)
        )
    return active_columns, updates


def build_empty_extracted_input_updates() -> list[dict[str, Any]]:
    """Return one hidden, blank update per extracted-field slot (``MAX_EXTRACT_FIELDS`` total)."""
    updates: list[dict[str, Any]] = []
    for slot_number in range(1, MAX_EXTRACT_FIELDS + 1):
        updates.append(
            gr.update(label=f"Extracted field {slot_number}", value="", visible=False)
        )
    return updates


def build_extracted_input_updates(
    target_columns: list[str],
    field_values: dict[str, Any],
) -> tuple[list[str], list[dict[str, Any]]]:
    """Build updates for all extracted-value slots.

    The first ``MAX_EXTRACT_FIELDS`` target columns get a visible slot showing
    ``str(field_values[col])`` (blank when absent); remaining slots are hidden.

    Returns:
        ``(active_columns, updates)`` with exactly ``MAX_EXTRACT_FIELDS`` updates.
    """
    active_columns = target_columns[:MAX_EXTRACT_FIELDS]

    updates: list[dict[str, Any]] = [
        gr.update(
            label=f"Extracted: {col}",
            value=str(field_values.get(col, "")),
            visible=True,
        )
        for col in active_columns
    ]
    # Pad with hidden placeholders up to the fixed slot count.
    for idx in range(len(active_columns), MAX_EXTRACT_FIELDS):
        updates.append(
            gr.update(label=f"Extracted field {idx + 1}", value="", visible=False)
        )
    return active_columns, updates


def build_extracted_values_from_inputs(
    extracted_columns: list[str],
    extracted_values: list[str],
) -> dict[str, str]:
    """Pair column names with the values entered for them, by position.

    Pairing stops at the shorter of the two sequences; None values become "".
    A non-list ``extracted_columns`` yields an empty dict.
    """
    if not isinstance(extracted_columns, list):
        return {}
    paired: dict[str, str] = {}
    for col, value in zip(extracted_columns, extracted_values):
        paired[str(col)] = "" if value is None else str(value)
    return paired


def coerce_fields_from_llm(parsed: dict[str, Any], column_names: list[str]) -> dict[str, str]:
    """Normalize the extracted field values from an LLM response.

    Lookup strategy, each step used only when the previous one found nothing:
      1. ``parsed["fields"]`` as a dict, or as a JSON string encoding a dict;
      2. requested columns present as top-level keys of ``parsed``;
      3. top-level keys matching a requested column after ``normalize_key``.

    Args:
        parsed: Decoded JSON object returned by the model.
        column_names: Columns that were requested.

    Returns:
        One string per requested column. Missing values and JSON ``null``
        become ``""`` (not the text "None").
    """
    raw_fields = parsed.get("fields", {})
    fields_dict: dict[str, Any] = {}

    if isinstance(raw_fields, dict):
        fields_dict = raw_fields
    elif isinstance(raw_fields, str):
        try:
            maybe_obj = json.loads(raw_fields)
        except json.JSONDecodeError:
            maybe_obj = None
        if isinstance(maybe_obj, dict):
            fields_dict = maybe_obj

    # Fallback: model may place extracted values at top level.
    if not fields_dict:
        fields_dict = {col: parsed[col] for col in column_names if col in parsed}

    # Fuzzy fallback: tolerate minor key format differences.
    if not fields_dict:
        reserved_keys = {"fields", "evidence", "confidence", "decision"}
        normalized_requested = {normalize_key(col): col for col in column_names}
        for key, value in parsed.items():
            if key in reserved_keys:
                continue
            requested_col = normalized_requested.get(normalize_key(str(key)))
            if requested_col is not None:
                fields_dict[requested_col] = value

    coerced: dict[str, str] = {}
    for col in column_names:
        value = fields_dict.get(col)
        # A null from the model means "not found"; keep the cell empty.
        coerced[col] = "" if value is None else str(value)
    return coerced


def _parse_structured_text(raw: str) -> Any:
    """Decode ``raw`` as JSON, falling back to a Python literal; None if neither works."""
    candidate = raw.strip()
    if not candidate:
        return None
    for parser in (json.loads, ast.literal_eval):
        try:
            return parser(candidate)
        except Exception:
            continue
    return None


def _coerce_evidence_items(raw: Any) -> list[dict[str, str]]:
    """Convert an evidence payload into ``{"field", "snippet"}`` dicts.

    Accepted shapes (a string is first decoded with ``_parse_structured_text``):
      * dict: ``{"FIELD": "..."}`` or ``{"FIELD": ["...", "..."]}``;
      * list of dicts with non-blank ``field`` and ``snippet`` entries.
    Blank snippets, bare strings inside a list, and other shapes are dropped.
    """
    if raw is None:
        return []
    if isinstance(raw, str):
        raw = _parse_structured_text(raw)
        if raw is None:
            return []

    collected: list[dict[str, str]] = []
    if isinstance(raw, dict):
        for field, value in raw.items():
            # Treat a single value like a one-element list.
            candidates = value if isinstance(value, list) else [value]
            for candidate in candidates:
                snippet = str(candidate).strip()
                if snippet:
                    collected.append({"field": str(field).strip(), "snippet": snippet})
    elif isinstance(raw, list):
        for entry in raw:
            if not isinstance(entry, dict):
                # Entries without an explicit field cannot be attributed.
                continue
            field = str(entry.get("field", "")).strip()
            snippet = str(entry.get("snippet", "")).strip()
            if field and snippet:
                collected.append({"field": field, "snippet": snippet})
    return collected


def normalize_evidence_snippets(parsed: dict[str, Any], column_names: list[str], fields: dict[str, str]) -> list[dict[str, str]]:
    """Collect evidence snippets from an LLM response, restricted to requested columns.

    Sources, in order, each consulted only while nothing has been found:
      1. structured ``parsed["evidence_snippets"]``;
      2. structured ``parsed["evidence"]``;
      3. ``parsed["evidence"]`` as plain text, attached to the first requested
         column with a non-blank value in ``fields`` (else the first requested
         column, else ``"unknown"``).
    Exact duplicates are removed while keeping order.
    """
    matched: list[dict[str, str]] = []
    for payload_key, fallback in (("evidence_snippets", []), ("evidence", "")):
        if matched:
            break
        for item in _coerce_evidence_items(parsed.get(payload_key, fallback)):
            if item["field"] in column_names and item["snippet"]:
                matched.append({"field": item["field"], "snippet": item["snippet"]})

    if not matched:
        legacy_text = str(parsed.get("evidence", "")).strip()
        if legacy_text:
            filled = [col for col in column_names if str(fields.get(col, "")).strip()]
            if filled:
                target_field = filled[0]
            elif column_names:
                target_field = column_names[0]
            else:
                target_field = "unknown"
            matched.append({"field": target_field, "snippet": legacy_text})

    unique: list[dict[str, str]] = []
    seen_pairs: set[tuple[str, str]] = set()
    for item in matched:
        pair = (item["field"], item["snippet"])
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            unique.append(item)
    return unique


def format_evidence_for_ui(snippets: list[dict[str, str]]) -> str:
    """Render snippets as one ``- field: snippet`` line each; "" when there are none."""
    if not snippets:
        return ""
    bullet_lines = (f"- {entry['field']}: {entry['snippet']}" for entry in snippets)
    return "\n".join(bullet_lines)


def detect_incomplete_rows(df: pd.DataFrame, target_columns: list[str]) -> list[int]:
    """Return the index labels (as ints) of rows missing a value in any target column."""
    flagged: list[int] = []
    for idx, row in df.iterrows():
        for col in target_columns:
            if is_missing(row.get(col)):
                flagged.append(int(idx))
                break
    return flagged


def get_missing_columns(df: pd.DataFrame, row_index: int, target_columns: list[str]) -> list[str]:
    """List the target columns whose value is missing in the row labelled ``row_index``."""
    row = df.loc[row_index]
    missing: list[str] = []
    for col in target_columns:
        if is_missing(row.get(col)):
            missing.append(col)
    return missing


def get_next_row(df: pd.DataFrame, incomplete_rows: list[int], position: int, target_columns: list[str]) -> tuple[int, int | None]:
    """Find the next row, from ``position`` on, that still has missing target values.

    Returns:
        ``(position, row_index)`` of the first such row, or
        ``(end_position, None)`` when every remaining row is complete.
    """
    total = len(incomplete_rows)
    for cursor in range(position, total):
        candidate = incomplete_rows[cursor]
        if get_missing_columns(df, candidate, target_columns):
            return cursor, candidate
    # Scan exhausted: the cursor ends at the list length, or stays where it
    # was if it started beyond the end.
    return max(position, total), None


def _find_first_column(df: pd.DataFrame, candidates: list[str]) -> str | None:
    normalized = {str(col).lower().strip(): str(col) for col in df.columns}
    for candidate in candidates:
        if candidate in normalized:
            return normalized[candidate]
    for col in df.columns:
        col_l = str(col).lower()
        for candidate in candidates:
            if candidate in col_l:
                return str(col)
    return None


def article_details_markdown(df: pd.DataFrame, row_index: int) -> str:
    """Render the row's title and author(s) as Markdown, using "Unknown" for gaps."""

    def _cell_text(candidates: list[str]) -> str:
        # Value of the first matching column, or "Unknown" when absent/blank.
        column = _find_first_column(df, candidates)
        if column is None:
            return "Unknown"
        raw = df.loc[row_index, column]
        text = "" if pd.isna(raw) else str(raw).strip()
        return text or "Unknown"

    title_value = _cell_text(["title", "article title", "paper title", "study title"])
    author_value = _cell_text(["author", "authors", "first author"])
    return f"**Title:** {title_value}\n\n**Author(s):** {author_value}"


def render_current_row(
    df: pd.DataFrame | None,
    incomplete_rows: list[int] | None,
    position: int,
    url_column: str,
    target_columns: list[str],
) -> tuple[int, int | None, str, str, str, str, str]:
    """Build the display values for the next row that still needs work.

    Args:
        df: Loaded table, or None when nothing is loaded.
        incomplete_rows: Index labels of rows that had missing target values.
        position: Current cursor into ``incomplete_rows``.
        url_column: Column holding the article URL.
        target_columns: Columns being filled in.

    Returns:
        ``(position, row_index, article_md, url_md, current_md, missing_md,
        counter)``. ``row_index`` is None when nothing is loaded or all rows
        are complete; ``current_md`` and ``missing_md`` are always empty here.
    """
    if df is None or incomplete_rows is None or len(incomplete_rows) == 0:
        return (position, None, "", "No rows loaded.", "", "", "")

    next_position, row_idx = get_next_row(df, incomplete_rows, position, target_columns)
    total = len(incomplete_rows)
    if row_idx is None:
        return (
            next_position,
            None,
            "",
            "All target rows are complete.",
            "",
            "",
            f"Processed {total} / {total} rows.",
        )

    article_md = article_details_markdown(df, row_idx)

    # A NaN/blank URL cell must read "not available" rather than link to "nan".
    url_md = "URL not available"
    if url_column in df.columns:
        raw_url = df.loc[row_idx, url_column]
        if not is_missing(raw_url):
            url_md = f"[Open article URL]({str(raw_url).strip()})"

    counter = f"Row {next_position + 1} of {total} (index: {row_idx})"
    return next_position, row_idx, article_md, url_md, "", "", counter


def _parse_target_columns_for_ui(
    target_columns_text: str,
    url_column_text: str,
    df: pd.DataFrame | None,
) -> list[str]:
    """Determine the target columns from the comma-separated UI text.

    Without a loaded table, the requested names are returned as typed
    (trimmed, de-duplicated). With a table, only names that exist as columns
    are kept, de-duplicated the same way so a repeated name does not produce
    repeated input slots. If none match, every column except the URL column
    is used.
    """
    if df is None or df.empty:
        requested = [item.strip() for item in (target_columns_text or "").split(",") if item.strip()]
        return list(dict.fromkeys(requested))

    available_columns = [str(c) for c in df.columns]
    selected = list(dict.fromkeys(parse_csv_columns(target_columns_text, available_columns)))
    if selected:
        return selected

    # Nothing usable requested: default to all non-URL columns.
    url_column = choose_url_column(df, url_column_text.strip() if url_column_text else None)
    return [name for name in available_columns if name != url_column]


def refresh_description_inputs(
    target_columns_text: str,
    url_column_text: str,
    df: pd.DataFrame | None,
    description_columns: list[str],
    *description_values: str,
):
    """Recompute the description slots after the target-column text changed.

    Returns a tuple: the active description columns followed by one update per
    description slot.
    """
    previous_columns = description_columns if isinstance(description_columns, list) else []
    target_columns = _parse_target_columns_for_ui(target_columns_text, url_column_text, df)
    active_columns, slot_updates = build_description_input_updates(
        target_columns,
        previous_columns,
        list(description_values),
    )
    return (active_columns, *slot_updates)


def load_excel(
    file_obj,
    criteria_file_obj,
    user_id_input: str,
    target_columns_text: str,
    url_column_text: str,
    description_columns: list[str],
    *description_values: str,
    request: gr.Request | None = None,
):
    """Load the uploaded workbook, persist session files, and build the initial UI state.

    Steps: resolve the user, read the Excel file, choose URL/target columns,
    find rows with missing target values, build description and extracted
    input updates, render the first incomplete row, persist the upload, the
    working table and a downloadable export, and save the session metadata.

    Returns:
        A long positional tuple. On failure it carries empty/default values
        with the error as the status message; on success it starts with the
        DataFrame and navigation state, followed by status and row display
        values, the description and extracted slot updates, blank result
        fields, the download HTML and a reset update. The components these
        positions feed are wired up outside this function.
    """
    user_id = resolve_user_id(explicit_user_id=user_id_input, request=request)

    def _failure(message: str):
        # Same positional layout as the success tuple, with empty values.
        download_html = build_inline_download_html(None)
        return (
            None,
            [],
            0,
            None,
            [],
            "",
            {},
            [],
            [],
            message,
            "",
            "",
            "",
            "",
            "",
            *empty_description_updates(),
            *build_empty_extracted_input_updates(),
            "",
            0.0,
            "include",
            "",
            "",
            "",
            "",
            download_html,
            gr.update(value=None),
        )

    if file_obj is None:
        return _failure("Please upload an Excel file.")

    try:
        excel_path = resolve_uploaded_path(file_obj)
        # Path("") is normalized to Path(".") and stringifies as ".", so an
        # "empty string" test never fires; compare against the empty path.
        if excel_path == Path(""):
            return _failure("Please upload an Excel file.")
        df = pd.read_excel(str(excel_path))
    except Exception as exc:
        return _failure(f"Failed to read Excel: {exc}")

    if df.empty:
        return _failure("Excel file is empty.")

    url_column = choose_url_column(df, url_column_text.strip() if url_column_text else None)
    selected_target_columns = _parse_target_columns_for_ui(target_columns_text, url_column_text, df)

    incomplete_rows = detect_incomplete_rows(df, selected_target_columns)
    normalized_description_columns, description_updates = build_description_input_updates(
        selected_target_columns,
        description_columns if isinstance(description_columns, list) else [],
        list(description_values),
    )
    extracted_columns_for_ui, extracted_updates = build_extracted_input_updates(selected_target_columns, {})

    position, row_idx, article_md, url_md, current_md, missing_md, counter = render_current_row(
        df,
        incomplete_rows,
        0,
        url_column,
        selected_target_columns,
    )

    status = (
        f"Loaded {len(df)} rows. Found {len(incomplete_rows)} rows with missing target values."
        if len(incomplete_rows) > 0
        else "Loaded file, but no incomplete rows were found for the selected target columns."
    )
    if len(selected_target_columns) > MAX_DESCRIPTION_FIELDS:
        status += (
            f" Showing description inputs for the first {MAX_DESCRIPTION_FIELDS} target columns."
        )
    if len(selected_target_columns) > MAX_EXTRACT_FIELDS:
        status += (
            f" Showing extracted inputs for the first {MAX_EXTRACT_FIELDS} target columns."
        )

    description_map = build_description_values_from_inputs(
        description_columns,
        list(description_values),
        selected_target_columns,
    )
    saved_description_values = [description_map.get(col, "") for col in normalized_description_columns]
    saved_excel_path = persist_uploaded_file(user_id, file_obj, "uploaded_excel.xlsx")
    working_df_path = persist_dataframe(user_id, df)
    downloadable_path = persist_downloadable_dataframe(user_id, df)
    download_html = build_inline_download_html(downloadable_path)
    if not downloadable_path:
        status += " Download export is currently unavailable; try again after processing a row."

    # Keep the previously stored criteria file when none is uploaded this time.
    criteria_path = (
        persist_uploaded_file(user_id, criteria_file_obj, "criteria.yml")
        or load_session_meta(user_id).get("criteria_path", "")
    )
    save_session_meta(
        user_id,
        {
            "target_columns_text": target_columns_text or "",
            "url_column_text": url_column_text or "",
            "description_columns": normalized_description_columns,
            "description_values": saved_description_values,
            "extracted_columns": extracted_columns_for_ui,
            "extracted_values": ["" for _ in extracted_columns_for_ui],
            "evidence": "",
            "confidence": 0.0,
            "decision": "include",
            "criteria_rationale": "",
            "labels_current": "",
            "labels_suggested": "",
            "labels_rationale": "",
            "excel_path": saved_excel_path or "",
            "criteria_path": criteria_path,
            "df_path": working_df_path,
            "download_path": downloadable_path or "",
        }
    )

    return (
        df,
        incomplete_rows,
        position,
        row_idx,
        selected_target_columns,
        url_column,
        {},
        normalized_description_columns,
        extracted_columns_for_ui,
        status,
        article_md,
        url_md,
        current_md,
        missing_md,
        counter,
        *description_updates,
        *extracted_updates,
        "",
        0.0,
        "include",
        "",
        "",
        "",
        "",
        download_html,
        gr.update(value=None),
    )


def parse_pdf(file_obj) -> str:
    """Extract the plain text of an uploaded PDF.

    Args:
        file_obj: Upload value understood by ``resolve_uploaded_path``.

    Returns:
        The text of all pages joined with newlines, stripped.

    Raises:
        ValueError: No file was supplied, the PDF cannot be opened or read,
            or it contains no extractable text.
    """
    if file_obj is None:
        raise ValueError("Please upload a PDF file.")
    path = resolve_uploaded_path(file_obj)
    # Path("") is normalized to Path(".") and stringifies as ".", so an
    # "empty string" test never fires; compare against the empty path instead.
    if path == Path(""):
        raise ValueError("Please upload a PDF file.")

    try:
        with fitz.open(str(path)) as doc:
            page_texts = [page.get_text("text") for page in doc]
    except Exception as exc:
        raise ValueError(f"Invalid or unreadable PDF: {exc}") from exc

    text = "\n".join(page_texts).strip()
    if not text:
        raise ValueError("No text extracted from PDF. OCR fallback is not implemented in this MVP.")

    return text


def load_prompt_file(path: Path) -> str:
    """Read a UTF-8 prompt file and return its content stripped of surrounding whitespace.

    Raises:
        RuntimeError: The file does not exist or cannot be read.
    """
    try:
        content = path.read_text(encoding="utf-8")
    except FileNotFoundError as exc:
        raise RuntimeError(f"Prompt file not found: {path}") from exc
    except Exception as exc:
        raise RuntimeError(f"Failed to load prompt file {path}: {exc}") from exc
    else:
        return content.strip()


def build_user_prompt(text: str, column_names: list[str], column_descriptions: dict[str, str]) -> dict[str, Any]:
    """Render the extraction prompt template into a dict.

    Every placeholder is filled with a JSON-encoded value, and the rendered
    template is then parsed as JSON.

    Raises:
        RuntimeError: The template cannot be loaded or renders invalid JSON.
    """
    description_block = {col: column_descriptions.get(col, "") for col in column_names}
    template = Template(load_prompt_file(USER_PROMPT_TEMPLATE_PATH))

    substitutions = {
        "fields_schema_json": json.dumps({col: "string" for col in column_names}),
        "fill_only_requested_fields_json": json.dumps(column_names),
        "column_descriptions_json": json.dumps(description_block),
        "article_text": json.dumps(text),
    }
    rendered = template.substitute(substitutions)

    try:
        prompt = json.loads(rendered)
    except json.JSONDecodeError as exc:
        raise RuntimeError(f"User prompt template rendered invalid JSON: {exc}") from exc
    return prompt


def build_criteria_user_prompt(text: str, criteria: dict[str, Any]) -> dict[str, Any]:
    """Render the criteria-screening prompt template into a dict.

    Placeholders receive the JSON-encoded topic, criteria lists, notes and
    article text; the rendered template is then parsed as JSON.

    Raises:
        RuntimeError: The template cannot be loaded or renders invalid JSON.
    """
    template = Template(load_prompt_file(USER_CRITERIA_TEMPLATE_PATH))
    substitutions = {
        "topic_json": json.dumps(criteria.get("topic", "")),
        "inclusion_criteria_json": json.dumps(criteria.get("inclusion_criteria", [])),
        "exclusion_criteria_json": json.dumps(criteria.get("exclusion_criteria", [])),
        "notes_json": json.dumps(criteria.get("notes", "")),
        "article_text": json.dumps(text),
    }
    rendered = template.substitute(substitutions)
    try:
        prompt = json.loads(rendered)
    except json.JSONDecodeError as exc:
        raise RuntimeError(f"Criteria user prompt rendered invalid JSON: {exc}") from exc
    return prompt


def build_labels_user_prompt(text: str, current_labels: list[str]) -> dict[str, Any]:
    """Render the labels prompt template into a dict.

    Placeholders receive the JSON-encoded current labels and article text;
    the rendered template is then parsed as JSON.

    Raises:
        RuntimeError: The template cannot be loaded or renders invalid JSON.
    """
    template = Template(load_prompt_file(USER_LABELS_TEMPLATE_PATH))
    substitutions = {
        "current_labels_json": json.dumps(current_labels),
        "article_text": json.dumps(text),
    }
    rendered = template.substitute(substitutions)
    try:
        prompt = json.loads(rendered)
    except json.JSONDecodeError as exc:
        raise RuntimeError(f"Labels user prompt rendered invalid JSON: {exc}") from exc
    return prompt


def _azure_client() -> AzureOpenAI:
    """Build an Azure OpenAI client from environment variables.

    Reads ``AZURE_OPENAI_ENDPOINT``, ``AZURE_OPENAI_API_KEY`` and
    ``AZURE_OPENAI_API_VERSION`` (default ``2024-08-01-preview``).

    Raises:
        RuntimeError: If the endpoint or the API key is missing or empty.
    """
    client_settings = {
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
        "api_version": os.getenv("AZURE_OPENAI_API_VERSION", "2024-08-01-preview"),
    }

    if not (client_settings["azure_endpoint"] and client_settings["api_key"]):
        raise RuntimeError("AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY must be set.")

    return AzureOpenAI(**client_settings)


def _call_llm_json(system_prompt: str, user_prompt: dict[str, Any]) -> dict[str, Any]:
    """Send one chat-completion request in JSON mode and return the parsed object.

    The deployment name comes from ``AZURE_OPENAI_DEPLOYMENT`` (default
    ``gpt-4.1-mini``) and the per-request timeout from
    ``AZURE_OPENAI_TIMEOUT_SECONDS`` (default 90 seconds).

    Args:
        system_prompt: Text of the system message.
        user_prompt: JSON-serializable payload sent as the user message.

    Returns:
        The JSON object returned by the model.

    Raises:
        RuntimeError: If the request fails, the reply is empty, the reply is
            not valid JSON, or the reply is valid JSON but not an object.
    """
    deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT", "gpt-4.1-mini")
    client = _azure_client()

    # A fresh client is created per call, so release its HTTP resources
    # deterministically instead of waiting for garbage collection.
    try:
        request_timeout = float(os.getenv("AZURE_OPENAI_TIMEOUT_SECONDS", "90"))
        try:
            response = client.chat.completions.create(
                model=deployment,
                temperature=0,
                response_format={"type": "json_object"},
                timeout=request_timeout,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": json.dumps(user_prompt)},
                ],
            )
        except Exception as exc:
            raise RuntimeError(f"Azure OpenAI request failed: {exc}") from exc
    finally:
        client.close()

    content = response.choices[0].message.content if response.choices else ""
    if not content:
        raise RuntimeError("LLM returned empty content.")

    try:
        parsed = json.loads(content)
    except json.JSONDecodeError as exc:
        raise RuntimeError(f"LLM output is not valid JSON: {exc}") from exc

    # Callers use ``.get`` on the result, so fail here with a clear message
    # rather than with an AttributeError further down.
    if not isinstance(parsed, dict):
        raise RuntimeError(
            f"LLM output is not a JSON object (got {type(parsed).__name__})."
        )
    return parsed


def extract_with_llm(text: str, column_names: list[str], column_descriptions: dict[str, str]) -> dict[str, Any]:
    """Run the field-extraction prompt on an article and normalize the reply.

    Args:
        text: Article text.
        column_names: Fields the model should fill.
        column_descriptions: Optional per-field descriptions for the prompt.

    Returns:
        A dict with keys ``fields``, ``evidence`` (UI-formatted text),
        ``evidence_snippets``, ``confidence`` (float in [0, 1]) and
        ``decision`` (one of ``VALID_DECISIONS``; defaults to ``"include"``).

    Raises:
        RuntimeError: Propagated from prompt rendering or the LLM call.
    """
    system_prompt = load_prompt_file(SYSTEM_PROMPT_PATH)
    user_prompt = build_user_prompt(text, column_names, column_descriptions)
    parsed = _call_llm_json(system_prompt, user_prompt)

    normalized_fields = coerce_fields_from_llm(parsed, column_names)
    normalized_evidence_snippets = normalize_evidence_snippets(parsed, column_names, normalized_fields)
    evidence_text = format_evidence_for_ui(normalized_evidence_snippets)

    if is_debug_enabled():
        print("[DEBUG] LLM parsed response:", parsed)
        print("[DEBUG] Parsed keys:", list(parsed.keys()))
        print("[DEBUG] Requested columns:", column_names)
        print("[DEBUG] Extracted fields:", normalized_fields)

    confidence_raw = parsed.get("confidence", 0)
    try:
        confidence = float(confidence_raw)
    except (TypeError, ValueError, OverflowError):
        # Missing, non-numeric or absurdly large values count as "no confidence".
        confidence = 0.0
    # Clamp to [0, 1]. The first test is written as ``not >=`` so that NaN
    # (which compares False to everything) is mapped to 0.0 instead of
    # slipping through min()/max() unchanged.
    if not confidence >= 0.0:
        confidence = 0.0
    elif confidence > 1.0:
        confidence = 1.0

    decision = str(parsed.get("decision", "include")).strip().lower()
    if decision not in VALID_DECISIONS:
        decision = "include"

    return {
        "fields": normalized_fields,
        "evidence": evidence_text,
        "evidence_snippets": normalized_evidence_snippets,
        "confidence": confidence,
        "decision": decision,
    }


def evaluate_with_criteria_llm(text: str, criteria: dict[str, Any]) -> dict[str, Any]:
    """Screen an article against inclusion/exclusion criteria with the LLM.

    Args:
        text: Article text.
        criteria: Parsed criteria mapping passed to the criteria prompt.

    Returns:
        A dict with ``decision`` (one of ``VALID_DECISIONS``; defaults to
        ``"include"``), ``confidence`` (float in [0, 1]) and ``rationale``.

    Raises:
        RuntimeError: Propagated from prompt rendering or the LLM call.
    """
    system_prompt = load_prompt_file(SYSTEM_CRITERIA_PROMPT_PATH)
    user_prompt = build_criteria_user_prompt(text, criteria)
    parsed = _call_llm_json(system_prompt, user_prompt)

    confidence_raw = parsed.get("confidence", 0)
    try:
        confidence = float(confidence_raw)
    except (TypeError, ValueError, OverflowError):
        # Missing, non-numeric or absurdly large values count as "no confidence".
        confidence = 0.0
    # Clamp to [0, 1]. The first test is written as ``not >=`` so that NaN
    # (which compares False to everything) is mapped to 0.0 instead of
    # slipping through min()/max() unchanged.
    if not confidence >= 0.0:
        confidence = 0.0
    elif confidence > 1.0:
        confidence = 1.0

    decision = str(parsed.get("decision", "include")).strip().lower()
    if decision not in VALID_DECISIONS:
        decision = "include"

    rationale = str(parsed.get("rationale", "")).strip()

    return {
        "decision": decision,
        "confidence": confidence,
        "rationale": rationale,
    }


def validate_rayyan_labels_llm(text: str, current_labels: list[str]) -> dict[str, Any]:
    """Ask the LLM to validate the article's labels and normalize its suggestion.

    The suggestion is accepted only as a like-for-like switch: when the
    article already has labels, a suggestion with a different label count is
    discarded and the current labels are kept. The rationale is blanked
    whenever the final suggestion equals the current labels.

    Returns:
        A dict with ``current_labels``, ``suggested_labels`` and ``rationale``.
    """
    system_prompt = load_prompt_file(SYSTEM_LABELS_PROMPT_PATH)
    user_prompt = build_labels_user_prompt(text, current_labels)
    parsed = _call_llm_json(system_prompt, user_prompt)

    raw_suggestions = parsed.get("suggested_labels", [])
    rationale = str(parsed.get("rationale", "")).strip()

    # Strip each entry, drop blanks, and de-duplicate while keeping first-seen order.
    unique_labels: dict[str, None] = {}
    if isinstance(raw_suggestions, list):
        for entry in raw_suggestions:
            cleaned = str(entry).strip()
            if cleaned:
                unique_labels.setdefault(cleaned, None)
    suggested_labels = list(unique_labels)

    # Switch-only behavior: with existing labels the count must stay the same.
    count_changed = bool(current_labels) and len(suggested_labels) != len(current_labels)
    if count_changed or not suggested_labels:
        suggested_labels = current_labels[:]

    # No change means there is nothing to justify.
    if suggested_labels == current_labels:
        rationale = ""

    return {
        "current_labels": current_labels,
        "suggested_labels": suggested_labels,
        "rationale": rationale,
    }


def labels_to_text(labels: list[str]) -> str:
    """Join labels with ``", "`` for display; empty or missing labels give ``""``."""
    return ", ".join(labels) if labels else ""


def update_row(df: pd.DataFrame, row_index: int, values: dict[str, Any]) -> pd.DataFrame:
    """Write ``values`` into one row of ``df`` in place and return ``df``.

    Keys that are not existing columns are ignored. If a value cannot be
    stored in a column because of the column's dtype, the column is converted
    to ``object`` and the assignment is retried.
    """
    for column, new_value in values.items():
        if column not in df.columns:
            continue
        try:
            df.at[row_index, column] = new_value
        except (TypeError, ValueError):
            # Mostly-empty Excel columns are often inferred as float64;
            # widen the column so extracted text can be stored.
            df[column] = df[column].astype("object")
            df.at[row_index, column] = new_value
    return df


def process_pdf_and_extract(
    pdf_file,
    criteria_file,
    user_id_input: str,
    df: pd.DataFrame,
    current_row_index: int | None,
    target_columns: list[str],
    description_columns: list[str],
    *description_values: str,
    progress=gr.Progress(),
    request: gr.Request | None = None,
):
    """Gradio handler for the "Process PDF" button.

    Parses the uploaded PDF, runs up to three LLM workflows in parallel
    (field extraction, optional criteria screening, label validation), saves
    the outcome to the user's session metadata and returns the values for the
    review widgets.

    Only the extraction workflow is mandatory: if it fails, the whole call
    fails. Failures of the criteria or labels workflows are downgraded to
    warnings appended to the status message.

    Returns:
        A tuple in the order of ``process_outputs``: extracted state dict,
        extracted column names, the extracted-field textbox updates, evidence,
        confidence, decision, criteria rationale, current labels, suggested
        labels, label rationale, status message. On any failure (or when there
        is nothing to do) the result of ``empty_extracted_state(message)`` is
        returned instead.
    """
    user_id = resolve_user_id(explicit_user_id=user_id_input, request=request)

    if df is None or current_row_index is None:
        return empty_extracted_state("Load Excel and start screening first.")

    try:
        debug_log("Process PDF started", {"row_index": current_row_index})
        progress(0.15, desc="Extracting text from PDF")
        text = parse_pdf(pdf_file)
        # Criteria are optional; without them no criteria workflow is started.
        criteria = parse_criteria_file(criteria_file) if criteria_file is not None else None

        missing_columns = get_missing_columns(df, current_row_index, target_columns)
        if len(missing_columns) == 0:
            return empty_extracted_state("Current row has no missing target fields.")

        descriptions = build_description_values_from_inputs(
            description_columns,
            list(description_values),
            missing_columns,
        )

        # Labels are read from the "RAYYAN_Labels" column when the sheet has one.
        labels_column = "RAYYAN_Labels" if "RAYYAN_Labels" in df.columns else None
        current_labels = parse_labels_csv(df.loc[current_row_index, labels_column]) if labels_column else []

        progress(0.50, desc="Running parallel LLM workflows")
        workflow_timeout = float(os.getenv("WORKFLOW_TIMEOUT_SECONDS", "120"))
        warnings: list[str] = []
        # NOTE: leaving this ``with`` block waits for every submitted task to
        # finish, so a workflow that hit ``workflow_timeout`` below still
        # delays the return until its thread completes.
        with ThreadPoolExecutor(max_workers=3) as executor:
            extraction_future = executor.submit(extract_with_llm, text, missing_columns, descriptions)
            criteria_future = (
                executor.submit(evaluate_with_criteria_llm, text, criteria)
                if criteria is not None
                else None
            )
            labels_future = executor.submit(validate_rayyan_labels_llm, text, current_labels)
            # Extraction is mandatory: its failure aborts the whole request.
            try:
                result = extraction_future.result(timeout=workflow_timeout)
            except Exception as exc:
                raise RuntimeError(f"Extraction workflow failed: {exc}") from exc

            # Criteria screening is best-effort: a failure becomes a warning.
            criteria_result = None
            if criteria_future is not None:
                try:
                    criteria_result = criteria_future.result(timeout=workflow_timeout)
                except Exception as exc:
                    warnings.append(f"Criteria workflow failed: {exc}")
                    debug_log("Criteria workflow failed", repr(exc))

            # Label validation is best-effort too; this default (labels
            # unchanged) is used when the workflow fails.
            labels_result = {
                "current_labels": current_labels,
                "suggested_labels": current_labels,
                "rationale": "",
            }
            try:
                labels_result = labels_future.result(timeout=workflow_timeout)
            except Exception as exc:
                warnings.append(f"RAYYAN labels workflow failed: {exc}")
                debug_log("Labels workflow failed", repr(exc))

        # When criteria screening succeeded, its decision/confidence replace
        # the ones produced by the extraction workflow.
        if criteria_result is not None:
            result["decision"] = criteria_result["decision"]
            result["confidence"] = criteria_result["confidence"]

        criteria_rationale_ui = ""
        if criteria_result is not None:
            criteria_rationale_ui = criteria_result.get("rationale", "") or ""

        labels_current_ui = labels_to_text(labels_result.get("current_labels", []))
        labels_suggested_ui = labels_to_text(labels_result.get("suggested_labels", []))
        labels_rationale_ui = str(labels_result.get("rationale", "")).strip()
        extracted_columns, extracted_updates = build_extracted_input_updates(
            missing_columns,
            result["fields"],
        )
        # Status message: later conditions override earlier ones; warnings are
        # appended to whichever message was selected.
        extraction_status = "Extraction completed. Review and Accept/Edit/Reject."
        if criteria is None:
            extraction_status = (
                "Extraction completed without criteria.yml; confidence/decision are based on extraction output. "
                "Upload criteria.yml to override them with criteria screening."
            )
        if extracted_columns and all(str(result["fields"].get(col, "")).strip() == "" for col in extracted_columns):
            extraction_status = (
                "Extraction completed, but all extracted fields are empty. "
                "Check column descriptions/PDF content. Enable APP_DEBUG=1 to inspect raw model output."
            )
        if warnings:
            extraction_status = f"{extraction_status} Warnings: {' | '.join(warnings)}"

        # Extend the extraction result with the UI-ready strings; this dict is
        # returned as the extracted state.
        final_evidence_text = result["evidence"]
        result["evidence"] = final_evidence_text
        result["labels_current"] = labels_current_ui
        result["labels_suggested"] = labels_suggested_ui
        result["labels_rationale"] = labels_rationale_ui
        result["criteria_rationale"] = criteria_rationale_ui

        description_values_list = list(description_values)
        saved_description_columns = description_columns if isinstance(description_columns, list) else []
        # Save the pending extraction to the session metadata; restore_saved_session
        # reads these same keys back. For the uploaded files, fall back to the
        # previously stored path when persist_uploaded_file returns a falsy value.
        save_session_meta(
            user_id,
            {
                "description_columns": saved_description_columns,
                "description_values": description_values_list[: len(saved_description_columns)],
                "extracted_columns": extracted_columns,
                "extracted_values": [str(result["fields"].get(col, "")) for col in extracted_columns],
                "evidence": final_evidence_text,
                "confidence": float(result["confidence"]),
                "decision": result["decision"],
                "criteria_rationale": criteria_rationale_ui,
                "labels_current": labels_current_ui,
                "labels_suggested": labels_suggested_ui,
                "labels_rationale": labels_rationale_ui,
                "criteria_path": persist_uploaded_file(user_id, criteria_file, "criteria.yml")
                or load_session_meta(user_id).get("criteria_path", ""),
                "pdf_path": persist_uploaded_file(user_id, pdf_file, "uploaded_pdf.pdf")
                or load_session_meta(user_id).get("pdf_path", ""),
            }
        )
        progress(1.0, desc="Done")
        debug_log("Process PDF completed", {"warnings": warnings, "decision": result["decision"]})

        # Order must match ``process_outputs`` in the UI wiring.
        return (
            result,
            extracted_columns,
            *extracted_updates,
            final_evidence_text,
            result["confidence"],
            result["decision"],
            criteria_rationale_ui,
            labels_current_ui,
            labels_suggested_ui,
            labels_rationale_ui,
            extraction_status,
        )
    except Exception as exc:
        # Top-level handler boundary: report the failure in the status box
        # instead of letting the exception reach Gradio.
        debug_log("Process PDF failed", repr(exc))
        return empty_extracted_state(f"Processing failed: {exc}")


def accept_extraction(
    extracted_columns: list[str],
    user_id_input: str,
    df: pd.DataFrame,
    current_row_index: int | None,
    incomplete_rows: list[int],
    position: int,
    url_column: str,
    target_columns: list[str],
    *extracted_values: str,
    request: gr.Request | None = None,
):
    """Gradio handler for "Accept": store the reviewed values and move to the next row.

    The (possibly edited) textbox values are written into the current row,
    the dataframe and its downloadable export are persisted, the pending
    extraction is cleared from the session metadata and the next row is
    rendered.

    Returns:
        A tuple in the order of ``accept_outputs``. When no dataset or row is
        active, the inputs are returned unchanged with the status
        "Nothing to accept." and cleared review widgets.
    """
    user_id = resolve_user_id(explicit_user_id=user_id_input, request=request)
    blank_field_updates = build_empty_extracted_input_updates()
    # Reset values for, in order: evidence, confidence, decision, criteria
    # rationale, current labels, suggested labels, label rationale.
    cleared_review_values = ("", 0.0, "include", "", "", "", "")

    if df is None or current_row_index is None:
        no_download_html = build_inline_download_html(None)
        return (
            df,
            position,
            current_row_index,
            {},
            "",
            "",
            "",
            "",
            "",
            "Nothing to accept.",
            [],
            *blank_field_updates,
            *cleared_review_values,
            no_download_html,
            gr.update(value=None),
        )

    accepted_fields = build_extracted_values_from_inputs(extracted_columns, list(extracted_values))
    df = update_row(df, current_row_index, accepted_fields)

    rendered_row = render_current_row(
        df,
        incomplete_rows,
        position + 1,
        url_column,
        target_columns,
    )
    next_position, next_row, article_md, url_md, current_md, missing_md, counter = rendered_row

    downloadable_path = persist_downloadable_dataframe(user_id, df)
    download_html = build_inline_download_html(downloadable_path)
    status = "Row updated and accepted."
    if not downloadable_path:
        status = f"{status} Download export could not be refreshed."

    # Persist the updated sheet and clear the pending extraction; keep the
    # previously stored download path when the export could not be refreshed.
    session_update = {
        "df_path": persist_dataframe(user_id, df),
        "extracted_columns": [],
        "extracted_values": [],
        "evidence": "",
        "confidence": 0.0,
        "decision": "include",
        "criteria_rationale": "",
        "labels_current": "",
        "labels_suggested": "",
        "labels_rationale": "",
        "position": next_position,
        "current_row_index": next_row,
        "download_path": downloadable_path or load_session_meta(user_id).get("download_path", ""),
    }
    save_session_meta(user_id, session_update)

    return (
        df,
        next_position,
        next_row,
        {},
        article_md,
        url_md,
        current_md,
        missing_md,
        counter,
        status,
        [],
        *blank_field_updates,
        *cleared_review_values,
        download_html,
        gr.update(value=None),
    )


def skip_row(
    user_id_input: str,
    df: pd.DataFrame,
    incomplete_rows: list[int],
    position: int,
    url_column: str,
    target_columns: list[str],
    request: gr.Request | None = None,
):
    """Gradio handler for "Skip": leave the current row untouched and show the next one.

    The dataframe and its export are re-persisted, the pending extraction is
    cleared from the session metadata and the next row is rendered.

    Returns:
        A tuple in the order of ``skip_outputs``. When no dataset is loaded
        the status is "No dataset loaded." and all review widgets are cleared.
    """
    user_id = resolve_user_id(explicit_user_id=user_id_input, request=request)
    blank_field_updates = build_empty_extracted_input_updates()
    # Reset values for, in order: evidence, confidence, decision, criteria
    # rationale, current labels, suggested labels, label rationale.
    cleared_review_values = ("", 0.0, "include", "", "", "", "")

    if df is None:
        no_download_html = build_inline_download_html(None)
        return (
            df,
            position,
            None,
            "",
            "",
            "",
            "",
            "",
            "No dataset loaded.",
            [],
            *blank_field_updates,
            *cleared_review_values,
            {},
            no_download_html,
            gr.update(value=None),
        )

    rendered_row = render_current_row(
        df,
        incomplete_rows,
        position + 1,
        url_column,
        target_columns,
    )
    next_position, next_row, article_md, url_md, current_md, missing_md, counter = rendered_row

    downloadable_path = persist_downloadable_dataframe(user_id, df)
    download_html = build_inline_download_html(downloadable_path)
    status = "Row skipped."
    if not downloadable_path:
        status = f"{status} Existing download may be stale."

    # Clear the pending extraction and record the new position; keep the
    # previously stored download path when the export could not be refreshed.
    session_update = {
        "df_path": persist_dataframe(user_id, df),
        "extracted_columns": [],
        "extracted_values": [],
        "evidence": "",
        "confidence": 0.0,
        "decision": "include",
        "criteria_rationale": "",
        "labels_current": "",
        "labels_suggested": "",
        "labels_rationale": "",
        "position": next_position,
        "current_row_index": next_row,
        "download_path": downloadable_path or load_session_meta(user_id).get("download_path", ""),
    }
    save_session_meta(user_id, session_update)

    return (
        df,
        next_position,
        next_row,
        article_md,
        url_md,
        current_md,
        missing_md,
        counter,
        status,
        [],
        *blank_field_updates,
        *cleared_review_values,
        {},
        download_html,
        gr.update(value=None),
    )


def reject_extraction(user_id_input: str, request: gr.Request | None = None):
    """Gradio handler for "Reject": discard the pending extraction for this user.

    Clears the extraction-related keys in the session metadata and returns
    the empty extracted state with a status message.
    """
    user_id = resolve_user_id(explicit_user_id=user_id_input, request=request)

    cleared_extraction = {
        "extracted_columns": [],
        "extracted_values": [],
        "evidence": "",
        "confidence": 0.0,
        "decision": "include",
    }
    # The remaining text fields are all reset to the empty string.
    cleared_extraction.update(
        dict.fromkeys(
            ("criteria_rationale", "labels_current", "labels_suggested", "labels_rationale"),
            "",
        )
    )
    save_session_meta(user_id, cleared_extraction)

    return empty_extracted_state("Extraction rejected. Upload another PDF or try again.")


def restore_saved_session(user_id_input: str, request: gr.Request | None = None):
    """Rebuild the UI state for a user from persisted session metadata.

    Runs on page load. The saved spreadsheet (if it still exists on disk) is
    reloaded through ``load_excel``; the resulting outputs are then patched
    with any pending extraction stored in the metadata (fields, evidence,
    confidence, decision, rationales, labels) and the saved download link.

    Returns:
        A tuple in the order of ``demo_load_outputs``: user id, updates for
        the Excel/PDF/criteria file widgets, the two column textboxes, the
        patched ``load_excel`` outputs, and a final PDF widget update.
    """
    user_id = resolve_user_id(explicit_user_id=user_id_input, request=request)
    meta = load_session_meta(user_id)
    target_columns_text = str(meta.get("target_columns_text", ""))
    url_column_text = str(meta.get("url_column_text", ""))
    description_columns = meta.get("description_columns", [])
    description_values = meta.get("description_values", [])
    if not isinstance(description_columns, list):
        description_columns = []
    if not isinstance(description_values, list):
        description_values = []

    # Prefer the persisted working copy over the originally uploaded file.
    excel_path = str(meta.get("df_path", "") or meta.get("excel_path", ""))
    pdf_path = str(meta.get("pdf_path", ""))
    criteria_path = str(meta.get("criteria_path", ""))
    download_path = str(meta.get("download_path", ""))
    excel_exists = bool(excel_path) and Path(excel_path).exists()
    pdf_exists = bool(pdf_path) and Path(pdf_path).exists()
    criteria_exists = bool(criteria_path) and Path(criteria_path).exists()
    download_exists = bool(download_path) and Path(download_path).exists()

    # load_excel expects upload-like objects exposing ``.name``; pass None for
    # files that no longer exist. Its last output (the PDF-clear update) is
    # dropped because the PDF widget is set explicitly in the return below.
    loaded = list(
        load_excel(
            SimpleNamespace(name=excel_path) if excel_exists else None,
            SimpleNamespace(name=criteria_path) if criteria_exists else None,
            user_id,
            target_columns_text,
            url_column_text,
            description_columns,
            *description_values,
            request=request,
        )
    )[:-1]
    # Index 9 is the status message (see ``base_row_outputs``).
    if excel_exists:
        loaded[9] = f"{loaded[9]} Restored saved session."
    else:
        loaded[9] = "No saved session found."

    # Positions inside ``loaded``: 15 base row outputs, then the description
    # textboxes, then the extracted-field textboxes, then the review widgets.
    base_extracted_start = 15 + MAX_DESCRIPTION_FIELDS
    default_evidence_idx = base_extracted_start + MAX_EXTRACT_FIELDS
    default_confidence_idx = default_evidence_idx + 1
    default_decision_idx = default_evidence_idx + 2
    default_criteria_rationale_idx = default_evidence_idx + 3
    default_labels_current_idx = default_evidence_idx + 4
    default_labels_suggested_idx = default_evidence_idx + 5
    default_labels_rationale_idx = default_evidence_idx + 6

    saved_extracted_columns = meta.get("extracted_columns", [])
    saved_extracted_values = meta.get("extracted_values", [])
    if not isinstance(saved_extracted_columns, list):
        saved_extracted_columns = []
    if not isinstance(saved_extracted_values, list):
        saved_extracted_values = []
    saved_fields = build_extracted_values_from_inputs(saved_extracted_columns, saved_extracted_values)
    restored_extracted_columns, restored_extracted_updates = build_extracted_input_updates(
        saved_extracted_columns,
        saved_fields,
    )
    if not restored_extracted_columns:
        restored_extracted_updates = build_empty_extracted_input_updates()

    # The metadata file is read from disk, so do not let a missing, malformed
    # or out-of-range confidence value break the whole restore.
    try:
        saved_confidence = float(meta.get("confidence", 0.0))
    except (TypeError, ValueError, OverflowError):
        saved_confidence = 0.0
    # ``not >=`` also catches NaN, which compares False to everything.
    if not saved_confidence >= 0.0:
        saved_confidence = 0.0
    elif saved_confidence > 1.0:
        saved_confidence = 1.0

    extracted_state = {
        "fields": {col: saved_fields.get(col, "") for col in restored_extracted_columns},
        "evidence": str(meta.get("evidence", "")),
        "confidence": saved_confidence,
        "decision": str(meta.get("decision", "include")),
        "criteria_rationale": str(meta.get("criteria_rationale", "")),
        "labels_current": str(meta.get("labels_current", "")),
        # "label_suggestions" is read as a fallback key for suggested labels.
        "labels_suggested": str(meta.get("labels_suggested", meta.get("label_suggestions", ""))),
        "labels_rationale": str(meta.get("labels_rationale", "")),
    }

    # Indices 6 and 8 are the extracted state and extracted column names.
    loaded[6] = extracted_state
    loaded[8] = restored_extracted_columns
    loaded[base_extracted_start : base_extracted_start + MAX_EXTRACT_FIELDS] = restored_extracted_updates
    loaded[default_evidence_idx] = extracted_state["evidence"]
    loaded[default_confidence_idx] = extracted_state["confidence"]
    loaded[default_decision_idx] = extracted_state["decision"] if extracted_state["decision"] in VALID_DECISIONS else "include"
    loaded[default_criteria_rationale_idx] = extracted_state["criteria_rationale"]
    loaded[default_labels_current_idx] = extracted_state["labels_current"]
    loaded[default_labels_suggested_idx] = extracted_state["labels_suggested"]
    loaded[default_labels_rationale_idx] = extracted_state["labels_rationale"]
    loaded[-1] = build_inline_download_html(download_path if download_exists else None)

    return (
        user_id,
        gr.update(value=excel_path if excel_exists else None),
        gr.update(value=pdf_path if pdf_exists else None),
        gr.update(value=criteria_path if criteria_exists else None),
        target_columns_text,
        url_column_text,
        *loaded,
        gr.update(value=pdf_path if pdf_exists else None),
    )


def get_auth_config() -> list[tuple[str, str]] | tuple[str, str] | None:
    """Build Gradio basic auth config from environment variables.

    Expected Space Secrets:
    - USER1, USER2, ... with value "(username,password)" or "username,password"
    - Legacy fallback:
      - SPACE_APP_PASSWORD (required to enable legacy auth)
      - SPACE_APP_USERNAME (optional, defaults to 'admin')
    """
    users: list[tuple[str, str]] = []
    for key in sorted(os.environ.keys()):
        if not re.fullmatch(r"USER\d+", key):
            continue
        raw = os.getenv(key, "").strip()
        if not raw:
            continue

        username = ""
        password = ""
        try:
            parsed = ast.literal_eval(raw)
            if isinstance(parsed, tuple) and len(parsed) == 2:
                username = str(parsed[0]).strip()
                password = str(parsed[1]).strip()
        except Exception:
            parts = [part.strip() for part in raw.split(",", 1)]
            if len(parts) == 2:
                username, password = parts[0], parts[1]

        if username and password:
            users.append((username, password))

    if users:
        return users

    password = os.getenv("SPACE_APP_PASSWORD", "").strip()
    if not password:
        return None

    username = os.getenv("SPACE_APP_USERNAME", "admin").strip() or "admin"
    return username, password


# Runs at import time, before the UI below is constructed.
# NOTE(review): presumably prepares the directories under APP_STORAGE_ROOT —
# confirm against the definition of _setup_storage_paths.
_setup_storage_paths()


# UI definition and event wiring. Handlers return positional tuples, so the
# order of every ``*_outputs`` list below must match the return order of the
# handler it is wired to.
with gr.Blocks(title="Scientific Article Screener") as demo:
    gr.Markdown("# Scientific Article Screener")
    gr.Markdown(
        "Upload an Excel file and process one incomplete row at a time with a PDF."
    )

    # Session state
    df_state = gr.State(None)
    incomplete_rows_state = gr.State([])
    position_state = gr.State(0)
    current_row_state = gr.State(None)
    target_columns_state = gr.State([])
    url_column_state = gr.State("")
    extracted_state = gr.State({})
    description_columns_state = gr.State([])
    extracted_columns_state = gr.State([])
    user_id_state = gr.State("default")

    with gr.Row():
        # LEFT PANEL
        with gr.Column(scale=1):
            excel_file = gr.File(label="Upload Excel (.xlsx)", file_types=[".xlsx"])
            criteria_file = gr.File(label="Upload criteria.yml (optional)", file_types=[".yml", ".yaml"])
            target_columns_input = gr.Textbox(
                label="Target columns (comma-separated)",
                placeholder="Leave empty to use all columns except URL column",
            )
            url_column_input = gr.Textbox(
                label="URL column name (optional)",
                placeholder="Leave empty to auto-detect",
            )
            gr.Markdown("### Field descriptions")
            # A fixed pool of hidden textboxes is created up front; handlers
            # return updates for all MAX_DESCRIPTION_FIELDS of them.
            description_inputs: list[gr.Textbox] = []
            for idx in range(MAX_DESCRIPTION_FIELDS):
                description_inputs.append(
                    gr.Textbox(
                        label=f"Description {idx + 1}",
                        lines=4,
                        visible=False,
                    )
                )
            start_btn = gr.Button("Start screening", variant="primary")
            download_links_md = gr.HTML("")

        # WORKSPACE (previous middle + right, wider)
        with gr.Column(scale=2):
            row_counter = gr.Markdown("No row selected.")
            article_url_md = gr.Markdown("")
            article_details_md = gr.Markdown("")
            current_values_md = gr.Markdown("", visible=False)
            missing_columns_md = gr.Markdown("", visible=False)
            pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
            process_pdf_btn = gr.Button("Process PDF", variant="primary")
            gr.Markdown("### Extracted fields")
            # Same fixed-pool approach as the description textboxes above.
            extracted_inputs: list[gr.Textbox] = []
            for idx in range(MAX_EXTRACT_FIELDS):
                extracted_inputs.append(
                    gr.Textbox(
                        label=f"Extracted field {idx + 1}",
                        lines=1,
                        visible=False,
                    )
                )
            evidence_box = gr.Textbox(label="Evidence snippet", lines=4)
            confidence_box = gr.Slider(label="Confidence", minimum=0.0, maximum=1.0, step=0.01, value=0.0)
            decision_box = gr.Radio(label="Include/Exclude decision", choices=["include", "exclude"], value="include")
            criteria_rationale_box = gr.Textbox(label="Criteria rationale", lines=4)
            labels_current_box = gr.Textbox(label="RAYYAN current labels", lines=2)
            labels_suggested_box = gr.Textbox(label="RAYYAN suggested labels", lines=2)
            labels_rationale_box = gr.Textbox(label="RAYYAN label-switch rationale", lines=4)

            with gr.Row():
                accept_btn = gr.Button("Accept", variant="primary")
                reject_btn = gr.Button("Reject")
                skip_btn = gr.Button("Skip")

    status_box = gr.Markdown("Ready.")

    # 15 outputs; restore_saved_session addresses them by position
    # (6 = extracted_state, 8 = extracted_columns_state, 9 = status_box).
    base_row_outputs = [
        df_state,
        incomplete_rows_state,
        position_state,
        current_row_state,
        target_columns_state,
        url_column_state,
        extracted_state,
        description_columns_state,
        extracted_columns_state,
        status_box,
        article_details_md,
        article_url_md,
        current_values_md,
        missing_columns_md,
        row_counter,
    ]
    # Extracted-field textboxes followed by the seven review widgets.
    extraction_outputs = [
        *extracted_inputs,
        evidence_box,
        confidence_box,
        decision_box,
        criteria_rationale_box,
        labels_current_box,
        labels_suggested_box,
        labels_rationale_box,
    ]
    download_outputs = [download_links_md, pdf_file]

    # Matches the return tuple of restore_saved_session. pdf_file is listed
    # twice; restore_saved_session returns the same update for both slots.
    demo_load_outputs = [
        user_id_state,
        excel_file,
        pdf_file,
        criteria_file,
        target_columns_input,
        url_column_input,
        *base_row_outputs,
        *description_inputs,
        *extracted_inputs,
        evidence_box,
        confidence_box,
        decision_box,
        criteria_rationale_box,
        labels_current_box,
        labels_suggested_box,
        labels_rationale_box,
        download_links_md,
        pdf_file,
    ]
    start_outputs = [*base_row_outputs, *description_inputs, *extraction_outputs, *download_outputs]
    # Matches the return tuple of process_pdf_and_extract.
    process_outputs = [extracted_state, extracted_columns_state, *extraction_outputs, status_box]
    # Matches the return tuple of accept_extraction.
    accept_outputs = [
        df_state,
        position_state,
        current_row_state,
        extracted_state,
        article_details_md,
        article_url_md,
        current_values_md,
        missing_columns_md,
        row_counter,
        status_box,
        extracted_columns_state,
        *extraction_outputs,
        *download_outputs,
    ]
    # Matches the return tuple of skip_row (extracted_state comes after the
    # review widgets here, unlike in accept_outputs).
    skip_outputs = [
        df_state,
        position_state,
        current_row_state,
        article_details_md,
        article_url_md,
        current_values_md,
        missing_columns_md,
        row_counter,
        status_box,
        extracted_columns_state,
        *extraction_outputs,
        extracted_state,
        *download_outputs,
    ]
    reject_outputs = [extracted_state, extracted_columns_state, *extraction_outputs, status_box]

    # On page load: resolve the user id first, then restore that user's session.
    demo.load(fn=init_user_id, inputs=[], outputs=[user_id_state]).then(
        fn=restore_saved_session,
        inputs=[user_id_state],
        outputs=demo_load_outputs,
    )

    # Changing either column textbox refreshes the description textboxes.
    target_columns_input.change(
        fn=refresh_description_inputs,
        inputs=[target_columns_input, url_column_input, df_state, description_columns_state, *description_inputs],
        outputs=[description_columns_state, *description_inputs],
    )

    url_column_input.change(
        fn=refresh_description_inputs,
        inputs=[target_columns_input, url_column_input, df_state, description_columns_state, *description_inputs],
        outputs=[description_columns_state, *description_inputs],
    )

    start_btn.click(
        fn=load_excel,
        inputs=[excel_file, criteria_file, user_id_state, target_columns_input, url_column_input, description_columns_state, *description_inputs],
        outputs=start_outputs,
    )

    process_pdf_btn.click(
        fn=process_pdf_and_extract,
        inputs=[pdf_file, criteria_file, user_id_state, df_state, current_row_state, target_columns_state, description_columns_state, *description_inputs],
        outputs=process_outputs,
    )

    # Input order mirrors the positional parameters of accept_extraction;
    # the extracted textboxes feed its *extracted_values.
    accept_btn.click(
        fn=accept_extraction,
        inputs=[
            extracted_columns_state,
            user_id_state,
            df_state,
            current_row_state,
            incomplete_rows_state,
            position_state,
            url_column_state,
            target_columns_state,
            *extracted_inputs,
        ],
        outputs=accept_outputs,
    )

    skip_btn.click(
        fn=skip_row,
        inputs=[user_id_state, df_state, incomplete_rows_state, position_state, url_column_state, target_columns_state],
        outputs=skip_outputs,
    )

    reject_btn.click(
        fn=reject_extraction,
        inputs=[user_id_state],
        outputs=reject_outputs,
    )

if __name__ == "__main__":
    # Launch the app with basic auth when credentials are configured, and
    # pass the storage root as an allowed path for file serving.
    demo.launch(
        auth=get_auth_config(),
        allowed_paths=[str(APP_STORAGE_ROOT.resolve())],
    )