# PASTE THIS INTO A COLAB CELL to run your Backend on a GPU!

import os
from google.colab import userdata

# 1. Install Dependencies
print("Installing dependencies...")
!pip install -q fastapi uvicorn pyngrok nest_asyncio bitsandbytes "unsloth[colab-new]"

# 2. Setup ngrok (you need a free token from https://dashboard.ngrok.com/get-started/your-authtoken)
# Preferred: store the token in Colab Secrets under the key NGROK_TOKEN;
# otherwise replace the fallback value below.
try:
    NGROK_TOKEN = userdata.get("NGROK_TOKEN")
except Exception:
    NGROK_TOKEN = "YOUR_NGROK_TOKEN_HERE"
from pyngrok import ngrok
ngrok.set_auth_token(NGROK_TOKEN)

# 3. Create the server code (written to server.py on the Colab VM)
server_code = """
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import uvicorn
import os

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct"
# If a fine-tuned PEFT adapter exists at this path it is loaded on top of the
# base model; otherwise the base model is served on its own.
ADAPTER_PATH = "/content/important/finetuning/models/ora_adapter"

model = None
tokenizer = None

class ChatRequest(BaseModel):
    message: str
    history: list = []
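
# The frontend is expected to POST JSON matching ChatRequest, for example:
#   {"message": "Hello",
#    "history": [{"role": "user", "content": "Hi"},
#                {"role": "assistant", "content": "Hello there."}]}
# History entries are passed straight into the chat template below, so they
# must be role/content dicts.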

@app.on_event("startup")
async def load_model():
    global model, tokenizer
    print("Loading Model...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    # Quantize to 4-bit via bitsandbytes; passing load_in_4bit directly to
    # from_pretrained is deprecated in recent transformers releases.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        device_map="cuda",
    )
    
    if os.path.exists(ADAPTER_PATH):
        print(f"Loading Adapter from {ADAPTER_PATH}")
        model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
    else:
        print("Adapter not found at path, using Base!")
        model = base_model
    print("Ready!")

@app.post("/api/chat")
async def chat(req: ChatRequest):
    # Keep only the last 4 history messages (about 2 exchanges) to bound prompt length
    messages = [{"role": "system", "content": "You are ORA, a spiritual assistant."}]
    messages.extend(req.history[-4:])
    messages.append({"role": "user", "content": req.message})

    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to("cuda")
    
    outputs = model.generate(
        input_ids,
        max_new_tokens=256,
        do_sample=True,  # temperature is ignored unless sampling is enabled
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )
    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return {"response": response}
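
# Optional sketch (not part of the original script): a minimal health check so
# the frontend can confirm the tunnel and model are up before sending chats.
@app.get("/api/health")
async def health():
    return {"status": "ok", "model_loaded": model is not None}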

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
"""

with open("server.py", "w") as f:
    f.write(server_code)

# 4. Run Server & Tunnel
import nest_asyncio
nest_asyncio.apply()

# Start Tunnel
public_url = ngrok.connect(8000).public_url
print(f"\\n\\nYOUR PUBLIC API URL IS: {public_url}\\n\\n")

# Run Server
!python server.py