"""Gradio v6 chat UI backed by the Hugging Face Inference API.

Streams model responses through ``HFInferenceBackend`` using the Space's
``HF_TOKEN``. Model repo, system prompt, and sampling defaults are read from
environment variables so they can be overridden via Space Variables/Secrets.
"""

import os
from typing import Any, Dict, Iterator, List, Tuple

import gradio as gr

from backend_hf_api import HFInferenceBackend, is_hf_api_available

# Defaults, each overridable via environment variables.
SYSTEM_PROMPT_DEFAULT = os.getenv("SYSTEM_PROMPT", "You are a helpful assistant. Be concise and accurate.")
DEFAULT_MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "512"))
DEFAULT_TEMPERATURE = float(os.getenv("TEMPERATURE", "0.7"))
# Use a valid Nemotron repo by default; override via Space Variables if you want another.
DEFAULT_HF_API_MODEL = os.getenv("HF_API_MODEL", "NVIDIA/Nemotron-3-8B-Instruct")


def _msg_content_to_text(content: Any) -> str:
    """Coerce a chat-message ``content`` field to plain text.

    Gradio v6 message content may be a string or a dict with a ``"text"``
    key; anything else is stringified (``None`` becomes ``""``).
    """
    if isinstance(content, str):
        return content
    if isinstance(content, dict) and isinstance(content.get("text"), str):
        return content["text"]
    return "" if content is None else str(content)


def _history_to_pairs(history: Any) -> List[Tuple[str, str]]:
    """Gradio v6 messages or legacy (user, assistant) pairs → (user, assistant) pairs."""
    pairs: List[Tuple[str, str]] = []
    if not history:
        return pairs
    if isinstance(history[0], dict):
        # v6 "messages" format: fold alternating role dicts into pairs.
        # Roles other than user/assistant (e.g. system) are skipped.
        pending_user: str | None = None
        for m in history:
            role = m.get("role")
            text = _msg_content_to_text(m.get("content"))
            if role == "user":
                if pending_user is not None:
                    # Two user turns in a row: emit the first with an empty reply.
                    pairs.append((pending_user, ""))
                pending_user = text
            elif role == "assistant":
                if pending_user is None:
                    # Assistant turn with no preceding user turn.
                    pairs.append(("", text))
                else:
                    pairs.append((pending_user, text))
                pending_user = None
        if pending_user is not None:
            # Trailing user turn awaiting a reply.
            pairs.append((pending_user, ""))
        return pairs
    if isinstance(history[0], (list, tuple)) and len(history[0]) == 2:
        # Legacy tuple format; normalize None entries to "".
        return [(str(u or ""), str(a or "")) for (u, a) in history]
    # Unknown shape: treat the whole thing as a single user turn.
    return [(str(history), "")]


def chat_fn(
    message: str,
    history: List[Dict[str, Any]] | List[Tuple[str, str]],
    model_name: str,
    system_prompt: str,
    temperature: float,
    max_new_tokens: int,
) -> Iterator[str]:
    """Stream a chat completion for ``message`` given the prior ``history``.

    Yields partial text chunks from the backend. Errors (missing token,
    backend failures) are surfaced to the UI as ``[error] ...`` messages
    rather than raised, so the Space never crashes mid-chat.
    """
    if not is_hf_api_available():
        yield "[error] HF_TOKEN not set. Add it in Spaces → Settings → Secrets and restart."
        return
    try:
        backend = HFInferenceBackend(model_name or DEFAULT_HF_API_MODEL)
        pairs_history = _history_to_pairs(history)
        yield from backend.generate_stream(
            system_prompt=(system_prompt or SYSTEM_PROMPT_DEFAULT).strip(),
            history=pairs_history,
            user_msg=message,
            temperature=float(temperature),
            max_new_tokens=int(max_new_tokens),
        )
    except Exception as e:
        # Report any backend failure inline instead of crashing the stream.
        yield f"[error] {type(e).__name__}: {e}"


with gr.Blocks() as demo:
    gr.Markdown("# 🤖 HF Inference API Chatbot (Gradio v6)\nUses your **HF_TOKEN**. Preflight checks model to prevent crashes.")
    model_name = gr.Textbox(
        value=DEFAULT_HF_API_MODEL,
        label="HF model repo",
        placeholder="e.g., NVIDIA/Nemotron-3-8B-Instruct",
    )
    with gr.Accordion("Advanced", open=False) as adv:
        system_prompt = gr.Textbox(value=SYSTEM_PROMPT_DEFAULT, label="System prompt", lines=3)
        temperature = gr.Slider(0.0, 1.5, value=DEFAULT_TEMPERATURE, step=0.05, label="Temperature")
        max_new_tokens = gr.Slider(16, 4096, value=DEFAULT_MAX_NEW_TOKENS, step=16, label="Max new tokens")
    gr.ChatInterface(
        fn=chat_fn,
        title="Chat",
        # Each example row supplies the message plus all additional inputs, in order.
        examples=[
            ["Summarize why the sky is blue in 3 sentences.", DEFAULT_HF_API_MODEL, SYSTEM_PROMPT_DEFAULT, DEFAULT_TEMPERATURE, DEFAULT_MAX_NEW_TOKENS],
            ["Draft a friendly product blurb for a coffee mug.", DEFAULT_HF_API_MODEL, SYSTEM_PROMPT_DEFAULT, DEFAULT_TEMPERATURE, DEFAULT_MAX_NEW_TOKENS],
            ["Explain binary search with a tiny Python example.", DEFAULT_HF_API_MODEL, SYSTEM_PROMPT_DEFAULT, DEFAULT_TEMPERATURE, DEFAULT_MAX_NEW_TOKENS],
        ],
        cache_examples=False,
        additional_inputs=[model_name, system_prompt, temperature, max_new_tokens],
        additional_inputs_accordion=adv,
        save_history=True,
        editable=True,
        autoscroll=True,
    )

if __name__ == "__main__":
    demo.queue().launch()