""" Stack X Ultimate — Hugging Face Space Inference ================================================ A free HF Space that serves our model 24/7 on T4 GPU. Works after training completes — auto-loads LoRA adapter + base model. Run on: https://huggingface.co/spaces/my-ai-stack/Stack-X-Ultimate-Inference """ import os import torch from typing import Optional import gradio as gr from transformers import AutoTokenizer, AutoModelForCausalLM from peft import PeftModel # ─── Config ───────────────────────────────────────────────────────────────── BASE_MODEL = "Qwen/Qwen2.5-Coder-3B-Instruct" ADAPTER_REPO = "my-ai-stack/Stack-X-Ultimate" FALLBACK_ADAPTER = "my-ai-stack/Stack-4.0-Qwen-3B-Agentic" # ─── Model Loading ────────────────────────────────────────────────────────── def load_model(): """Load model with LoRA adapter.""" global model, tokenizer print("Loading tokenizer...") tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True) tokenizer.pad_token = tokenizer.eos_token tokenizer.padding_side = "right" print(f"Loading base: {BASE_MODEL}") base = AutoModelForCausalLM.from_pretrained( BASE_MODEL, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True, ) # Try to load adapter try: print(f"Loading adapter: {ADAPTER_REPO}") model = PeftModel.from_pretrained(base, ADAPTER_REPO) print(f"✅ Loaded {ADAPTER_REPO}") except Exception as e1: print(f"Failed to load {ADAPTER_REPO}: {e1}") try: print(f"Falling back to: {FALLBACK_ADAPTER}") model = PeftModel.from_pretrained(base, FALLBACK_ADAPTER) print(f"✅ Loaded {FALLBACK_ADAPTER}") except Exception as e2: print(f"Both adapters failed. Using base model. Error: {e2}") model = base model.eval() total = sum(p.numel() for p in model.parameters()) / 1e9 print(f"Model ready: {total:.1f}B parameters") # Load at startup print("Initializing Stack X Ultimate Space...") try: load_model() STATUS = "✅ Model loaded" except Exception as e: STATUS = f"⚠️ Load error: {e}" model = None tokenizer = None # ─── Inference Functions ───────────────────────────────────────────────────── def generate(prompt: str, max_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.9): """Generate text response.""" if model is None or tokenizer is None: return "Model not loaded yet. Please try again in a moment." if not prompt.strip(): return "" try: messages = [ {"role": "system", "content": "You are Stack X, a helpful AI coding assistant with tool-use capabilities."}, {"role": "user", "content": prompt}, ] text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) inputs = tokenizer(text, return_tensors="pt").to(model.device) with torch.no_grad(): out = model.generate( **inputs, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p, do_sample=temperature > 0, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id, repetition_penalty=1.1, ) response = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True) return response except Exception as e: return f"Error: {e}" def chat(messages: list, max_tokens: int = 512, temperature: float = 0.7): """Chat with message history.""" if model is None or tokenizer is None: return "Model not loaded yet." if not messages: return "" try: text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) inputs = tokenizer(text, return_tensors="pt").to(model.device) with torch.no_grad(): out = model.generate( **inputs, max_new_tokens=max_tokens, temperature=temperature, do_sample=temperature > 0, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id, ) response = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True) return response except Exception as e: return f"Error: {e}" # ─── Gradio Interface ───────────────────────────────────────────────────────── with gr.Blocks(title="Stack X Ultimate", theme=gr.themes.Default()) as demo: gr.Markdown("# 🚀 Stack X Ultimate Inference") gr.Markdown(f"**Status:** {STATUS}") gr.Markdown("Built on Qwen2.5-Coder-3B-Instruct + LoRA adapter trained on NVIDIA Nemotron + Stack-4.0 agentic data.") with gr.Tab("Generate"): prompt = gr.Textbox(label="Prompt", placeholder="Write a quicksort in Python...", lines=5) with gr.Row(): max_tok = gr.Slider(32, 1024, value=512, step=32, label="Max tokens") temp = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature") top_p = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="Top-p") generate_btn = gr.Button("Generate", variant="primary") output = gr.Textbox(label="Output", lines=10) generate_btn.click(fn=generate, inputs=[prompt, max_tok, temp, top_p], outputs=output) with gr.Tab("Chat"): chatbot = gr.Chatbot(label="Conversation") chat_msg = gr.Textbox(label="Your message", placeholder="Ask me anything...") chat_clear = gr.Button("Clear") chat_send = gr.Button("Send", variant="primary") def user_msg(msg, history): return "", history + [[msg, None]] def bot_resp(history): if not history: return history msgs = [{"role": "user" if i % 2 == 0 else "assistant", "content": c} for i, c in enumerate(sum(history, []))] # Build proper format formatted = [] for i, (role, content) in enumerate(zip(msgs[::2], msgs[1::2])): formatted.append({"role": role["role"], "content": content["content"]}) response = chat(formatted, max_tokens=512, temperature=0.7) history[-1][1] = response return history chat_msg.submit(user_msg, [chat_msg, chatbot], [chat_msg, chatbot], queue=False).then( bot_resp, [chatbot], [chatbot] ) chat_send.click(user_msg, [chat_msg, chatbot], [chat_msg, chatbot], queue=False).then( bot_resp, [chatbot], [chatbot] ) chat_clear.click(fn=None, inputs=None, outputs=chatbot) demo.launch(share=False)