from __future__ import annotations

import os
from typing import Any

import gradio as gr
import requests
from huggingface_hub import InferenceClient

# Hugging Face collections API endpoint for the official Qwen3.5 collection.
COLLECTION_API = "https://huggingface.co/api/collections/Qwen/qwen35"


def fetch_qwen35_models() -> list[dict[str, Any]]:
    """Fetch the Qwen3.5 model list from the HF collections API.

    Returns:
        A list of dicts, one per model, with keys:
            ``id``: model repo id (e.g. ``"Qwen/Qwen3.5-35B-A3B"``).
            ``live_providers``: sorted, de-duplicated names of inference
            providers whose provider status AND model status are both
            ``"live"``; may be empty.

    If the API request fails or returns invalid JSON, a static fallback
    list is returned so the app can still start during transient HF
    outages (each fallback entry reports ``["unknown"]`` providers).
    """
    try:
        response = requests.get(COLLECTION_API, timeout=30)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError):
        # Narrow catch: RequestException covers connection/HTTP errors
        # (including raise_for_status), ValueError covers JSON decode
        # failures. Minimal fallback for resilience if the HF collection
        # API is transiently unavailable.
        return [
            {"id": "Qwen/Qwen3.5-35B-A3B", "live_providers": ["unknown"]},
            {"id": "Qwen/Qwen3.5-27B", "live_providers": ["unknown"]},
            {"id": "Qwen/Qwen3.5-9B", "live_providers": ["unknown"]},
            {"id": "Qwen/Qwen3.5-4B", "live_providers": ["unknown"]},
            {"id": "Qwen/Qwen3.5-2B", "live_providers": ["unknown"]},
            {"id": "Qwen/Qwen3.5-0.8B", "live_providers": ["unknown"]},
        ]

    models: list[dict[str, Any]] = []
    for item in payload.get("items", []):
        # The collection can also contain datasets/papers; keep models only.
        if item.get("type") != "model":
            continue
        model_id = item.get("id")
        if not model_id:
            continue
        providers = []
        for provider in item.get("availableInferenceProviders", []) or []:
            if provider.get("providerStatus") == "live" and provider.get("modelStatus") == "live":
                name = provider.get("provider")
                # Skip entries with no provider name rather than
                # recording the literal string "None".
                if name:
                    providers.append(str(name))
        models.append(
            {
                "id": model_id,
                "live_providers": sorted(set(providers)),
            }
        )
    return models


# Fetched once at import time; the Space restarts to pick up new models.
MODEL_INFO = fetch_qwen35_models()
MODEL_IDS = [x["id"] for x in MODEL_INFO]
DEFAULT_MODEL = MODEL_IDS[0] if MODEL_IDS else "Qwen/Qwen3.5-35B-A3B"
PROVIDER_LOOKUP = {x["id"]: x.get("live_providers", []) for x in MODEL_INFO}


def provider_note(model_id: str) -> str:
    """Return a one-line Markdown note listing live providers for *model_id*."""
    providers = PROVIDER_LOOKUP.get(model_id, [])
    if providers:
        return f"Live inference providers: {', '.join(providers)}"
    return "No live provider listed by HF for this model right now. Try another model."
def generate_reply(
    message: str,
    history: list[tuple[str, str]],
    model_id: str,
    system_prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
) -> str:
    """Run one chat-completion turn against the selected Qwen3.5 model.

    Converts Gradio's ``(user, assistant)`` tuple history into the
    OpenAI-style message list expected by
    ``InferenceClient.chat_completion``, prepending the system prompt
    when it is non-empty.

    Returns:
        The assistant reply text, or a human-readable error message if
        the inference call fails (e.g. no live provider for the model).
    """
    # HF_TOKEN is expected to be configured as a Space secret; provider
    # routing happens server-side based on the model id.
    token = os.getenv("HF_TOKEN")
    client = InferenceClient(token=token, timeout=120)

    messages = []
    system = system_prompt.strip()  # hoisted: strip once, use twice
    if system:
        messages.append({"role": "system", "content": system})
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    try:
        result = client.chat_completion(
            model=model_id,
            messages=messages,
            max_tokens=int(max_new_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
        )
        reply = result.choices[0].message.content
        if isinstance(reply, str):
            return reply
        # `content` may legitimately be None (empty completion); show an
        # empty reply rather than the literal string "None".
        if reply is None:
            return ""
        return str(reply)
    except Exception as exc:
        # Broad catch is deliberate at this UI boundary: surface the
        # failure as chat text instead of crashing the Space.
        return (
            f"Model call failed for `{model_id}`.\n\n"
            f"Details: {exc}\n\n"
            "Try another model from the dropdown. Some models may not currently have a live provider."
        )


with gr.Blocks(title="Qwen3.5 Chat") as demo:
    gr.Markdown("# Qwen3.5 Chat")
    gr.Markdown(
        "Select a model from the official Qwen3.5 collection and chat. "
        "This Space uses Hugging Face Inference providers via `HF_TOKEN`."
    )

    model_dd = gr.Dropdown(
        choices=MODEL_IDS,
        value=DEFAULT_MODEL,
        label="Qwen3.5 Model",
        allow_custom_value=False,
    )
    # Shows which providers currently serve the selected model.
    provider_md = gr.Markdown(provider_note(DEFAULT_MODEL))

    with gr.Accordion("Generation Settings", open=False):
        system_prompt = gr.Textbox(
            label="System prompt",
            value="You are a helpful assistant.",
            lines=2,
        )
        max_new_tokens = gr.Slider(
            label="Max new tokens",
            minimum=64,
            maximum=4096,
            step=32,
            value=1024,
        )
        temperature = gr.Slider(
            label="Temperature",
            minimum=0.0,
            maximum=2.0,
            step=0.05,
            value=0.7,
        )
        top_p = gr.Slider(
            label="Top-p",
            minimum=0.1,
            maximum=1.0,
            step=0.05,
            value=0.9,
        )

    # Refresh the provider-availability note whenever the model changes.
    model_dd.change(fn=provider_note, inputs=model_dd, outputs=provider_md)

    gr.ChatInterface(
        fn=generate_reply,
        additional_inputs=[model_dd, system_prompt, max_new_tokens, temperature, top_p],
    )

demo.queue(max_size=32).launch()