# PASTE THIS INTO A COLAB CELL to run your Backend on a GPU!
import os
from google.colab import userdata

# 1. Install Dependencies
print("Installing dependencies...")
!pip install -q fastapi uvicorn pyngrok nest_asyncio "unsloth[colab-new]"
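
# Quick GPU check (an added safeguard, not part of the original flow): the server
# below hard-codes "cuda", so fail fast here if the runtime has no GPU attached
# (Runtime -> Change runtime type -> GPU).
import torch
assert torch.cuda.is_available(), "No GPU detected: switch the Colab runtime to a GPU."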

# 2. Setup Ngrok (you need a free token from https://dashboard.ngrok.com/get-started/your-authtoken)
# Add it to Colab Secrets (the name NGROK_TOKEN is just a convention here) or
# replace the fallback string below.
try:
    NGROK_TOKEN = userdata.get("NGROK_TOKEN")
except Exception:
    NGROK_TOKEN = "YOUR_NGROK_TOKEN_HERE"

from pyngrok import ngrok
ngrok.set_auth_token(NGROK_TOKEN)

# 3. Create the Server Code (written to a file inside Colab)
server_code = """
import os

import torch
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from peft import PeftModel
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

app = FastAPI()

# Allow the frontend (any origin) to call this API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct"
# Use the fine-tuned adapter if it was uploaded, else fall back to the base model.
ADAPTER_PATH = "/content/important/finetuning/models/ora_adapter"

model = None
tokenizer = None


class ChatRequest(BaseModel):
    message: str
    history: list = []


@app.on_event("startup")
async def load_model():
    global model, tokenizer
    print("Loading Model...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map="cuda",
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
        ),
    )
    if os.path.exists(ADAPTER_PATH):
        print(f"Loading Adapter from {ADAPTER_PATH}")
        model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
    else:
        print("Adapter not found at path, using Base!")
        model = base_model
    model.eval()
    print("Ready!")


@app.post("/api/chat")
async def chat(req: ChatRequest):
    messages = [{"role": "system", "content": "You are ORA, a spiritual assistant."}]
    messages.extend(req.history[-4:])  # keep only the last four turns of context
    messages.append({"role": "user", "content": req.message})
    input_ids = tokenizer.apply_chat_template(
        messages, return_tensors="pt", add_generation_prompt=True
    ).to("cuda")
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_new_tokens=256,
            do_sample=True,  # without this, temperature is silently ignored
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens, not the echoed prompt.
    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return {"response": response}


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
"""

with open("server.py", "w") as f:
    f.write(server_code)
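
# Optional sanity check (an extra step, not in the original flow): byte-compile the
# generated file so syntax errors surface here rather than in the server log below.
import py_compile
py_compile.compile("server.py", doraise=True)
print("server.py compiled cleanly")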

# 4. Run Server & Tunnel
import nest_asyncio
nest_asyncio.apply()

# Start Tunnel
public_url = ngrok.connect(8000).public_url
print(f"\n\nYOUR PUBLIC API URL IS: {public_url}\n\n")

# Run Server (this blocks the cell while the API is live)
!python server.py
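
# --- Example client call: a minimal sketch of how a frontend would use the API.
# Run it from a second Colab cell or your local machine while the server is up
# (this cell blocks on `!python server.py` above), which is why it is commented
# out here. PUBLIC_URL is a placeholder: use the URL printed above. The history
# format matches the ChatRequest model the server defines: a list of
# {"role": ..., "content": ...} dicts.
#
# import requests
#
# PUBLIC_URL = "https://xxxx.ngrok-free.app"  # replace with your printed URL
# resp = requests.post(
#     f"{PUBLIC_URL}/api/chat",
#     json={
#         "message": "Hello, ORA!",
#         "history": [
#             {"role": "user", "content": "Hi"},
#             {"role": "assistant", "content": "Hello! How can I help?"},
#         ],
#     },
# )
# print(resp.json()["response"])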