# PASTE THIS INTO A COLAB CELL to run your Backend on a GPU!
from google.colab import userdata

# 1. Install Dependencies
print("Installing dependencies...")
!pip install -q fastapi uvicorn pyngrok nest_asyncio "unsloth[colab-new]"

# 2. Setup Ngrok (you need a free token from https://dashboard.ngrok.com/get-started/your-authtoken)
# Store the token as a Colab Secret named NGROK_TOKEN, or replace the
# placeholder below; the Colab Secret takes precedence if it exists.
try:
    NGROK_TOKEN = userdata.get("NGROK_TOKEN")
except Exception:
    NGROK_TOKEN = "YOUR_NGROK_TOKEN_HERE"

from pyngrok import ngrok
ngrok.set_auth_token(NGROK_TOKEN)

# 3. Create the Server Code (written to a file inside Colab)
server_code = """
import os

import torch
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from peft import PeftModel
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct"
# If the fine-tuned adapter was uploaded, it is loaded on top of the base
# model; otherwise the base model is served on its own.
ADAPTER_PATH = "/content/important/finetuning/models/ora_adapter"

model = None
tokenizer = None


class ChatRequest(BaseModel):
    message: str
    history: list = []


@app.on_event("startup")
async def load_model():
    global model, tokenizer
    print("Loading Model...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map="cuda",
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
        ),
    )
    if os.path.exists(ADAPTER_PATH):
        print(f"Loading Adapter from {ADAPTER_PATH}")
        model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
    else:
        print("Adapter not found at path, using Base!")
        model = base_model
    print("Ready!")


@app.post("/api/chat")
async def chat(req: ChatRequest):
    messages = [{"role": "system", "content": "You are ORA, a spiritual assistant."}]
    messages.extend(req.history[-4:])  # keep only the last four turns of context
    messages.append({"role": "user", "content": req.message})

    input_ids = tokenizer.apply_chat_template(
        messages, return_tensors="pt", add_generation_prompt=True
    ).to("cuda")
    outputs = model.generate(
        input_ids,
        max_new_tokens=256,
        do_sample=True,  # sampling must be enabled for temperature to take effect
        temperature=0.7,
    )
    # Decode only the newly generated tokens, skipping the prompt.
    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return {"response": response}


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
"""

with open("server.py", "w") as f:
    f.write(server_code)

# 4. Run Server & Tunnel
import nest_asyncio
nest_asyncio.apply()

# Start the tunnel first so the public URL prints before the server blocks the cell
public_url = ngrok.connect(8000).public_url
print(f"\n\nYOUR PUBLIC API URL IS: {public_url}\n\n")

# Run the server (this call blocks while the API serves requests)
!python server.py
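
# 5. (Optional) Smoke-test the API from another machine. A minimal sketch using
# the `requests` library; the URL and message below are placeholders, so
# substitute the public URL printed above. It is left commented out because
# the `!python server.py` call above blocks this cell.
#
#   import requests
#
#   API_URL = "https://YOUR-SUBDOMAIN.ngrok-free.app"  # placeholder: use the printed URL
#   resp = requests.post(
#       f"{API_URL}/api/chat",
#       json={"message": "Hello, ORA!", "history": []},
#   )
#   print(resp.json()["response"])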