Bot detection triggered.
\"},\n", "]\n", "\n", "inputs = tokenizer.apply_chat_template(\n", " test_messages_2, tokenize=True, add_generation_prompt=True, return_tensors=\"pt\"\n", ").to(\"cuda\")\n", "\n", "outputs = model.generate(\n", " input_ids=inputs,\n", " max_new_tokens=512,\n", " temperature=0.3,\n", " do_sample=True,\n", ")\n", "\n", "response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)\n", "print(\"Model response (error recovery):\")\n", "print(response)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 9. Optional: Export to GGUF (for llama.cpp / Ollama)\n", "\n", "Uncomment to export for local deployment." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# # Export to GGUF Q4_K_M (smallest good quality)\n", "# model.save_pretrained_gguf(\n", "# \"webscrape-agent-gguf\",\n", "# tokenizer,\n", "# quantization_method=\"q4_k_m\",\n", "# )\n", "# \n", "# # Push GGUF to Hub\n", "# model.push_to_hub_gguf(\n", "# OUTPUT_MODEL + \"-GGUF\",\n", "# tokenizer,\n", "# quantization_method=\"q4_k_m\",\n", "# )" ] } ] }