"""Tiny Aya — streaming multilingual chat, built for the Build Small Hackathon.

A gr.Server app: custom HTML/JS frontend (Cohere Labs + Build Small styling)
backed by Gradio's queue + ZeroGPU. The browser talks to the `/chat` route
through the Gradio JS client, so it streams token-by-token.

Deploy on Hugging Face Spaces:
  - sdk: gradio (in README.md frontmatter)
  - add HF_TOKEN as a Space secret (tiny-aya-global is a gated model)
  - upload the logo file alongside this app: Cohere Labs-LockUp-Blue-CMYK.png
"""

import os
import threading

import torch
import gradio as gr
from fastapi.responses import HTMLResponse
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

try:
    import spaces

    _HAS_SPACES = True
except ImportError:
    _HAS_SPACES = False

# --------------------------------------------------------------------------- #
# Model
# --------------------------------------------------------------------------- #
MODEL_ID = "CohereLabs/tiny-aya-global"
HF_TOKEN = os.environ.get("HF_TOKEN")  # gated repo -> needs a token

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto", token=HF_TOKEN)
model.to(device)  # module-level: ZeroGPU fast-restore


def _stream(messages: list):
    """Generate a reply for *messages* and yield it as it grows.

    The chat template is applied to *messages*, generation runs in a
    background thread, and the text pieces produced by the streamer are
    accumulated so that every yielded value is the full reply so far.

    Args:
        messages: Conversation passed unchanged to
            ``tokenizer.apply_chat_template`` (presumably a list of
            role/content dicts — confirm against the frontend caller).

    Yields:
        str: The accumulated reply text after each streamed piece.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here once the stream has been drained.
    """
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.3,
        streamer=streamer,
    )

    failures = []

    def _generate():
        # Thread boundary: an exception raised here would otherwise vanish and
        # leave the consumer loop below blocked on a stream that never ends
        # (the streamer is created without a timeout).
        try:
            model.generate(**gen_kwargs)
        except Exception as exc:
            failures.append(exc)
            streamer.end()  # queue the end-of-stream signal to unblock the reader

    thread = threading.Thread(target=_generate)
    thread.start()

    acc = ""
    for token in streamer:
        acc += token
        yield acc
    thread.join()

    if failures:
        # Surface the worker's error to the caller instead of ending silently.
        raise failures[0]


# @spaces.GPU only on ZeroGPU; cap duration at 120s (the ZeroGPU max).
if _HAS_SPACES: _stream = spaces.GPU(duration=120)(_stream) # --------------------------------------------------------------------------- # # Server # --------------------------------------------------------------------------- # server = gr.Server() @server.get("/", response_class=HTMLResponse) async def homepage() -> str: return FRONTEND_HTML @server.api(name="chat") def chat_api(messages: list) -> str: # generator -> annotate with the YIELDED type yield from _stream(messages) # --------------------------------------------------------------------------- # # Frontend # --------------------------------------------------------------------------- # BANNER_URL = "https://cdn-uploads.huggingface.co/production/uploads/60d2dc1007da9c17c72708f8/Z0dKQfn56SAMmjVQTEaA0.png" COHERE_LOGO_URL = "https://cdn-uploads.huggingface.co/production/uploads/60d2dc1007da9c17c72708f8/fnuLx-qT2qzlYmEp6cszN.png" FRONTEND_HTML = f"""
Chat with Tiny Aya, Cohere Labs' 3.35B multilingual model. Ask in any of 70+ languages.