"""
Stack X Ultimate — Hugging Face Space Inference
================================================
A free HF Space that serves our model 24/7 on a T4 GPU.
Works after training completes — auto-loads LoRA adapter + base model.

Run on: https://huggingface.co/spaces/my-ai-stack/Stack-X-Ultimate-Inference
"""
|
|
| import os |
| import torch |
| from typing import Optional |
|
|
| import gradio as gr |
| from transformers import AutoTokenizer, AutoModelForCausalLM |
| from peft import PeftModel |
|
|
| |
# Hugging Face Hub identifiers for the weights this Space serves.
# Base checkpoint that the LoRA adapters are applied on top of.
BASE_MODEL = "Qwen/Qwen2.5-Coder-3B-Instruct"
# Preferred adapter repository; load_model() tries this one first.
ADAPTER_REPO = "my-ai-stack/Stack-X-Ultimate"
# Adapter repository used when ADAPTER_REPO cannot be loaded.
FALLBACK_ADAPTER = "my-ai-stack/Stack-4.0-Qwen-3B-Agentic"
|
|
| |
|
|
def load_model():
    """Load the tokenizer and base model, then attach a LoRA adapter.

    Populates the module-level ``model`` and ``tokenizer`` globals. The
    adapter in ``ADAPTER_REPO`` is tried first, then ``FALLBACK_ADAPTER``;
    if neither can be loaded the plain base model is used instead.

    Raises:
        Exception: Whatever ``from_pretrained`` raises for the tokenizer or
            the base model. Adapter failures are caught and reported with
            ``print`` instead of being propagated.
    """
    global model, tokenizer

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    # Pad with the EOS token; pad_token_id is later handed to model.generate().
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    print(f"Loading base: {BASE_MODEL}")
    # NOTE(review): the module docstring targets a T4 GPU — confirm bfloat16
    # is the intended dtype for that hardware.
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )

    # Best-effort adapter loading: primary repo, then fallback, then no adapter.
    try:
        print(f"Loading adapter: {ADAPTER_REPO}")
        model = PeftModel.from_pretrained(base, ADAPTER_REPO)
        print(f"✅ Loaded {ADAPTER_REPO}")
    except Exception as e1:
        print(f"Failed to load {ADAPTER_REPO}: {e1}")
        try:
            print(f"Falling back to: {FALLBACK_ADAPTER}")
            model = PeftModel.from_pretrained(base, FALLBACK_ADAPTER)
            print(f"✅ Loaded {FALLBACK_ADAPTER}")
        except Exception as e2:
            print(f"Both adapters failed. Using base model. Error: {e2}")
            model = base

    model.eval()
    # Report the parameter count in billions.
    total = sum(p.numel() for p in model.parameters()) / 1e9
    print(f"Model ready: {total:.1f}B parameters")
|
|
|
|
| |
# Load the model once at import time; STATUS is shown in the UI header.
print("Initializing Stack X Ultimate Space...")
try:
    load_model()
    STATUS = "✅ Model loaded"
except Exception as e:
    # Keep the app importable and surface the failure in the status line;
    # generate() and chat() check for the None globals before running.
    STATUS = f"⚠️ Load error: {e}"
    model = None
    tokenizer = None
|
|
| |
|
|
def generate(prompt: str, max_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.9):
    """Generate a single-turn completion for ``prompt``.

    Args:
        prompt: User text; sent through the tokenizer's chat template together
            with a fixed system message.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature; values <= 0 select greedy decoding.
        top_p: Nucleus-sampling threshold, used only when sampling.

    Returns:
        The decoded completion without the prompt tokens, ``""`` for a missing
        or blank prompt, or a human-readable message when the model is not
        loaded or generation raises.
    """
    if model is None or tokenizer is None:
        return "Model not loaded yet. Please try again in a moment."

    # Treat None the same as a blank prompt instead of failing on .strip().
    if not prompt or not prompt.strip():
        return ""

    try:
        messages = [
            {"role": "system", "content": "You are Stack X, a helpful AI coding assistant with tool-use capabilities."},
            {"role": "user", "content": prompt},
        ]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(text, return_tensors="pt").to(model.device)

        gen_kwargs = {
            # UI sliders may deliver floats; max_new_tokens must be an integer.
            "max_new_tokens": int(max_tokens),
            "pad_token_id": tokenizer.pad_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "repetition_penalty": 1.1,
        }
        if temperature > 0:
            gen_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
        else:
            # Greedy decoding: do not forward sampling-only arguments.
            gen_kwargs["do_sample"] = False

        with torch.no_grad():
            out = model.generate(**inputs, **gen_kwargs)

        # Drop the prompt tokens so only the newly generated text is returned.
        prompt_len = inputs["input_ids"].shape[1]
        return tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)

    except Exception as e:
        # UI boundary: report the failure as text rather than raising.
        return f"Error: {e}"
|
|
|
|
def chat(messages: list, max_tokens: int = 512, temperature: float = 0.7):
    """Generate the next assistant reply for a conversation.

    Args:
        messages: Conversation so far as ``{"role": ..., "content": ...}``
            dicts, passed to the tokenizer's chat template.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature; values <= 0 select greedy decoding.

    Returns:
        The decoded reply without the prompt tokens, ``""`` when ``messages``
        is empty, or a human-readable message when the model is not loaded or
        generation raises.
    """
    if model is None or tokenizer is None:
        return "Model not loaded yet."

    if not messages:
        return ""

    try:
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(text, return_tensors="pt").to(model.device)

        gen_kwargs = {
            # Callers may pass a float; max_new_tokens must be an integer.
            "max_new_tokens": int(max_tokens),
            "pad_token_id": tokenizer.pad_token_id,
            "eos_token_id": tokenizer.eos_token_id,
        }
        if temperature > 0:
            gen_kwargs.update(do_sample=True, temperature=temperature)
        else:
            # Greedy decoding: do not forward sampling-only arguments.
            gen_kwargs["do_sample"] = False

        with torch.no_grad():
            out = model.generate(**inputs, **gen_kwargs)

        # Drop the prompt tokens so only the newly generated text is returned.
        prompt_len = inputs["input_ids"].shape[1]
        return tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)

    except Exception as e:
        # UI boundary: report the failure as text rather than raising.
        return f"Error: {e}"
|
|
|
|
| |
|
|
def _history_to_messages(history):
    """Convert Chatbot ``[user, bot]`` pairs into chat-template message dicts."""
    messages = []
    for user_text, bot_text in history or []:
        if user_text is not None:
            messages.append({"role": "user", "content": user_text})
        # The newest turn has no reply yet (None); it must not be sent to the model.
        if bot_text is not None:
            messages.append({"role": "assistant", "content": bot_text})
    return messages


# Gradio UI: a single-shot "Generate" tab and a multi-turn "Chat" tab.
with gr.Blocks(title="Stack X Ultimate", theme=gr.themes.Default()) as demo:
    gr.Markdown("# 🚀 Stack X Ultimate Inference")
    gr.Markdown(f"**Status:** {STATUS}")
    gr.Markdown("Built on Qwen2.5-Coder-3B-Instruct + LoRA adapter trained on NVIDIA Nemotron + Stack-4.0 agentic data.")

    with gr.Tab("Generate"):
        prompt = gr.Textbox(label="Prompt", placeholder="Write a quicksort in Python...", lines=5)
        with gr.Row():
            max_tok = gr.Slider(32, 1024, value=512, step=32, label="Max tokens")
            temp = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")
            top_p = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="Top-p")
        generate_btn = gr.Button("Generate", variant="primary")
        output = gr.Textbox(label="Output", lines=10)
        generate_btn.click(fn=generate, inputs=[prompt, max_tok, temp, top_p], outputs=output)

    with gr.Tab("Chat"):
        chatbot = gr.Chatbot(label="Conversation")
        chat_msg = gr.Textbox(label="Your message", placeholder="Ask me anything...")
        chat_clear = gr.Button("Clear")
        chat_send = gr.Button("Send", variant="primary")

        def user_msg(msg, history):
            """Append the user's message as a new open turn and clear the textbox."""
            # history can be None after the conversation has been cleared.
            return "", (history or []) + [[msg, None]]

        def bot_resp(history):
            """Fill in the assistant reply for the newest turn of ``history``."""
            if not history:
                return history
            response = chat(_history_to_messages(history), max_tokens=512, temperature=0.7)
            history[-1][1] = response
            return history

        chat_msg.submit(user_msg, [chat_msg, chatbot], [chat_msg, chatbot], queue=False).then(
            bot_resp, [chatbot], [chatbot]
        )
        chat_send.click(user_msg, [chat_msg, chatbot], [chat_msg, chatbot], queue=False).then(
            bot_resp, [chatbot], [chatbot]
        )
        # fn=None runs no Python function; return None so the chatbot is reset.
        chat_clear.click(fn=lambda: None, inputs=None, outputs=chatbot, queue=False)


demo.launch(share=False)