import os

import gradio as gr
from huggingface_hub import InferenceClient

# ============================================================
# COUNT FROGULA'S AI INTEGRATION STACK
# Full InferenceClient + OpenAI-compatible HF Router
# Models: Top LLMs, VLMs, Coders, TTS/ASR, Embeddings
# Router: https://router.huggingface.co/v1
# ============================================================

# Maps the label shown in the model dropdown to the Hub model id sent to the router.
MODELS = {
    "Qwen3.5-397B-A17B (VLM)": "Qwen/Qwen3.5-397B-A17B",
    "Kimi-K2.5 (VLM)": "moonshotai/Kimi-K2.5",
    "GLM-5 (754B)": "zai-org/GLM-5",
    "Qwen3-Coder-Next": "Qwen/Qwen3-Coder-Next",
    "Llama-3.3-70B": "meta-llama/Llama-3.3-70B-Instruct",
    "Llama-3.1-8B": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "DeepSeek-R1": "deepseek-ai/DeepSeek-R1",
    "Qwen3-8B": "Qwen/Qwen3-8B",
    "GPT-OSS-120B": "openai/gpt-oss-120b",
    "GPT-OSS-20B": "openai/gpt-oss-20b",
}

# Default text for the "System Message" textbox.
SYSTEM_DEFAULT = """You are COUNT FROGULA's AI Integration Assistant.
You have access to the full HuggingFace AI stack:
- Top LLMs & VLMs via InferenceClient
- OpenAI-compatible endpoint at https://router.huggingface.co/v1
- smolagents agentic framework
- MCP tools integration
Be helpful, precise, and maximize capability in every response."""


def respond(
    message,
    history: list[dict],
    model_choice,
    system_message,
    max_tokens,
    temperature,
    top_p,
    hf_token: gr.OAuthToken | None,
):
    """Stream a chat completion for ``message`` from the selected model.

    Args:
        message: The new user message.
        history: Earlier turns as ``{"role": ..., "content": ...}`` dicts; they
            are forwarded to the model between the system prompt and ``message``.
        model_choice: A key of ``MODELS``; unknown values fall back to
            ``"openai/gpt-oss-20b"``.
        system_message: Content of the leading system message.
        max_tokens: Forwarded as ``max_tokens`` to the completion request.
        temperature: Forwarded as ``temperature`` to the completion request.
        top_p: Forwarded as ``top_p`` to the completion request.
        hf_token: OAuth token of the logged-in user, or ``None``. Gradio fills
            this in from the type annotation; it is not a UI input.

    Yields:
        The response text accumulated so far, once per streamed chunk that
        carries choices.
    """
    # Prefer the logged-in user's token; otherwise fall back to the HF_TOKEN
    # environment variable (empty string when unset).
    token = hf_token.token if hf_token else os.environ.get("HF_TOKEN", "")
    model_id = MODELS.get(model_choice, "openai/gpt-oss-20b")

    # `model` and `base_url` are mutually exclusive on InferenceClient (passing
    # both raises ValueError), so the client is bound to the router URL only
    # and the model id is sent with each request instead.
    client = InferenceClient(
        token=token,
        base_url="https://router.huggingface.co/v1",
    )

    messages = [{"role": "system", "content": system_message}]
    messages.extend(history)
    messages.append({"role": "user", "content": message})

    response = ""
    for chunk in client.chat.completions.create(
        model=model_id,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    ):
        # A streamed chunk may carry no choices; indexing [0] on it would
        # raise IndexError and abort the reply mid-stream.
        if not chunk.choices:
            continue
        # `delta.content` can be None, hence the `or ""`.
        token_text = chunk.choices[0].delta.content or ""
        response += token_text
        yield response


with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="purple", secondary_hue="green"),
    title="COUNT FROGULA's AI Integration Stack",
) as demo:
    gr.Markdown(
        """
        # COUNT FROGULA's AI Integration Stack
        **Full HF InferenceClient + OpenAI-compatible Router**
        > Models: Top LLMs, VLMs, Coders | Router: `https://router.huggingface.co/v1`
        """
    )

    with gr.Row():
        model_dropdown = gr.Dropdown(
            choices=list(MODELS.keys()),
            value="GPT-OSS-20B",
            label="Model",
            scale=2,
        )
        gr.LoginButton(scale=1)

    # The additional inputs map positionally onto respond()'s parameters after
    # (message, history). `hf_token` is deliberately NOT listed here:
    # gr.OAuthToken is a token dataclass, not a component, and Gradio injects
    # it from the function's type annotation.
    chatbot = gr.ChatInterface(
        respond,
        type="messages",
        additional_inputs=[
            model_dropdown,
            gr.Textbox(value=SYSTEM_DEFAULT, label="System Message", lines=4),
            gr.Slider(minimum=1, maximum=8192, value=2048, step=1, label="Max Tokens"),
            gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.05, label="Temperature"),
            gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
        ],
        examples=[
            ["Explain the HuggingFace InferenceClient API and how to use the OpenAI-compatible router at router.huggingface.co/v1"],
            ["Write a Python script using smolagents with HuggingFace tools to build an autonomous coding agent"],
            ["Compare Qwen3.5-397B, Kimi-K2.5, and GLM-5 for enterprise AI integration tasks"],
            ["Generate a complete Next.js + Vercel deployment config for a HuggingFace Spaces integration"],
        ],
        cache_examples=False,
    )

    gr.Markdown(
        """
        ---
        **Integration Stack:** InferenceClient | smolagents | MCP Tools | OpenAI Router

        **Collection:** [COUNT FROGULA's AI Integration Stack](https://huggingface.co/collections/COUNTfrogula/count-frogulas-ai-integration-stack)
        """
    )


if __name__ == "__main__":
    demo.launch()