# PASTE THIS INTO A COLAB CELL to run your Backend on a GPU!
import os
from google.colab import userdata
# 1. Install Dependencies
print("Installing dependencies...")
!pip install -q fastapi uvicorn pyngrok nest_asyncio unsloth[colab-new]
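# Optional sanity check (not part of the original script): confirm a GPU
# runtime is actually attached before loading the model.
!nvidia-smi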
# 2. Set up ngrok (you need a free token from https://dashboard.ngrok.com/get-started/your-authtoken)
# Store it in Colab Secrets as NGROK_TOKEN, or replace the fallback string below.
try:
    NGROK_TOKEN = userdata.get("NGROK_TOKEN")
except Exception:
    NGROK_TOKEN = "YOUR_NGROK_TOKEN_HERE"
from pyngrok import ngrok
ngrok.set_auth_token(NGROK_TOKEN)
# 3. Create the Server Code (Writing to file in Colab)
server_code = """
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import uvicorn
import os
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct"
# Load the adapter if it exists among the uploaded files; otherwise fall back to the base model
ADAPTER_PATH = "/content/important/finetuning/models/ora_adapter"
model = None
tokenizer = None
class ChatRequest(BaseModel):
    message: str
    history: list = []
@app.on_event("startup")
async def load_model():
    global model, tokenizer
    print("Loading Model...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float16,
        device_map="cuda",
        load_in_4bit=True,  # 4-bit quantization requires bitsandbytes, which unsloth pulls in
    )
    if os.path.exists(ADAPTER_PATH):
        print(f"Loading Adapter from {ADAPTER_PATH}")
        model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
    else:
        print("Adapter not found at path, using Base!")
        model = base_model
    print("Ready!")
@app.post("/api/chat")
async def chat(req: ChatRequest):
    messages = [{"role": "system", "content": "You are ORA, a spiritual assistant."}]
    messages.extend(req.history[-4:])  # keep only the last four turns to bound the prompt
    messages.append({"role": "user", "content": req.message})
    input_ids = tokenizer.apply_chat_template(
        messages, return_tensors="pt", add_generation_prompt=True
    ).to("cuda")
    outputs = model.generate(
        input_ids,
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,  # temperature is ignored unless sampling is enabled
    )
    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return {"response": response}
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
"""
with open("server.py", "w") as f:
    f.write(server_code)
# 4. Run Server & Tunnel
import nest_asyncio
nest_asyncio.apply()
# Start Tunnel
public_url = ngrok.connect(8000).public_url
print(f"\n\nYOUR PUBLIC API URL IS: {public_url}\n\n")
# Run Server (this blocks the cell while the backend is live)
!python server.py
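# Example client call (a minimal sketch): run it from another cell or machine
# while the server is live. The endpoint and JSON shape match ChatRequest above:
#
#   import requests
#   resp = requests.post(f"{public_url}/api/chat",
#                        json={"message": "Hello ORA", "history": []})
#   print(resp.json()["response"])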