import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download

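# 4-bit GPTQ quantization of Qwen2.5-72B-Instruct, so the 72B weights can fit on a single GPU.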
MODEL_ID = "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4"

print(f"⚙️ Setting up environment for {MODEL_ID}...")

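# The model and tokenizer are loaded lazily on the first request (see load_model below).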
model = None
tokenizer = None

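# Pre-fetch the weights into the local Hub cache at startup so the GPU-allocated
# generate() call does not spend its time budget downloading files.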
def download_model_first():
    print("⏳ Starting preventive weight download (This will take time)...")
    try:
        snapshot_download(repo_id=MODEL_ID)
        print("✅ Download complete! Files are cached.")
    except Exception as e:
        print(f"⚠️ Warning: Download failed (weights may already be cached). Error: {e}")

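# Load the tokenizer and quantized model once and keep them in the module-level
# globals so later requests reuse the already-loaded objects.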
def load_model():
    global model, tokenizer
    if model is None:
        print("🔥 Loading model into VRAM...")
        try:
            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype=torch.float16
            )
            print("✅ Qwen 72B is ready!")
        except Exception as e:
            print(f"❌ Critical error loading the model: {e}")
            raise e
    return model, tokenizer

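# spaces.GPU attaches a ZeroGPU device to this call; duration=150 requests up to
# 150 seconds of GPU time per invocation.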
@spaces.GPU(duration=150)
def generate(message, history, system_prompt, temperature, max_tokens):
    model, tokenizer = load_model()

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    # history arrives as (user, assistant) pairs from gr.ChatInterface
    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})

    messages.append({"role": "user", "content": message})

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        repetition_penalty=1.1
    )

    # Decode only the newly generated tokens, skipping the prompt
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return response

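# Gradio UI: warning banner, settings accordion, and the chat interface.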
with gr.Blocks() as demo:
    gr.Markdown("# Qwen 72B ZeroGPU Test")

    gr.Markdown(
        """
        ### ⚠️ WARNING: Large Model Inference Test
        **This model (Qwen 72B) is extremely large.**
        * **Loading time:** There may be a massive delay during the first initialization.
        * **Test Environment:** This is a stress test for running Qwen 72B inference on a single ZeroGPU Space.
        """
    )

    with gr.Accordion("⚙️ Settings", open=False):
        sys_prompt = gr.Textbox(
            label="System Prompt",
            value="You are an expert AI assistant focused on complex coding solutions and software architecture.",
            lines=2
        )
        temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature")
        tokens = gr.Slider(minimum=256, maximum=8192, value=4096, label="Max Tokens")

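    # The additional inputs are passed to generate() after (message, history),
    # in the same order as listed here.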
    chat = gr.ChatInterface(
        fn=generate,
        additional_inputs=[sys_prompt, temp, tokens]
    )

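# Cache the weights at startup (no GPU needed for the download), then launch the UI.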
if __name__ == "__main__":
    download_model_first()
    demo.launch()