# Overflow-100B / app.py — Hugging Face Space by CooLLaMACEO
# Last change: "Update app.py", commit fa3e031 (verified)
# app.py
import os
import sys
import torch
import secrets
import time
from fastapi import FastAPI, HTTPException, Depends
from fastapi.security.api_key import APIKeyHeader
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoConfig, GPT2TokenizerFast
from starlette.status import HTTP_403_FORBIDDEN, HTTP_503_SERVICE_UNAVAILABLE
# --- 1. GLOBAL VARIABLES ---
# Populated by the loading block below. They stay None if loading fails,
# so the endpoints can answer 503 instead of crashing at import time.
tokenizer = None
model = None
# In-memory registry of self-issued API keys (key -> metadata).
# NOTE(review): unbounded growth and lost on restart — confirm acceptable.
generated_keys = {}
# --- 2. CONFIGURATION ---
MODEL_PATH = "/app/model" # folder with vocab.json, merges.txt, model weights
API_KEY_NAME = "X-API-Key"
# auto_error=True: FastAPI rejects requests missing the header before our code runs.
api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=True)
app = FastAPI(title="Overflow-111.7B API")
# --- 3. MODEL & TOKENIZER LOADING ---
# Executed at import time. Failures are printed and swallowed so the ASGI app
# still starts; `tokenizer`/`model` then remain None and /v1/generate returns 503.
print("Starting Engine: Initializing Self-Registration...")
try:
    # The model folder ships its own Python modules; make them importable.
    if MODEL_PATH not in sys.path:
        sys.path.insert(0, MODEL_PATH)
    # Import configuration and model
    import configuration_overflow
    import modeling_overflow
    # Register the custom architecture under model_type "overflow" so
    # AutoConfig / AutoModelForCausalLM can resolve it in from_pretrained.
    conf_class = configuration_overflow.OverflowConfig
    model_class = modeling_overflow.OverflowForCausalLM
    AutoConfig.register("overflow", conf_class)
    AutoModelForCausalLM.register(conf_class, model_class)
    print(f"Successfully registered model class {model_class.__name__}.")
    # Load GPT2-style tokenizer from vocab.json + merges.txt.
    # NOTE(review): unk/bos/eos are all the empty string — looks deliberate
    # for this vocab, but confirm against the model's training setup.
    tokenizer = GPT2TokenizerFast(
        vocab_file=f"{MODEL_PATH}/vocab.json",
        merges_file=f"{MODEL_PATH}/merges.txt",
        unk_token="",
        bos_token="",
        eos_token=""
    )
    print("Tokenizer loaded successfully.")
    # Load model weights on CPU in bfloat16; low_cpu_mem_usage streams the
    # checkpoint in to keep peak RSS down during load.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        trust_remote_code=True,
        device_map={"": "cpu"},
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True
    )
    print("Model loaded successfully. Engine Status: ONLINE")
except Exception as e:
    # Broad catch is deliberate: a failed load should leave the server up and
    # reporting "loading"/503 rather than kill the process.
    print(f"CRITICAL LOADING ERROR: {e}")
# --- 4. API SCHEMAS ---
class Query(BaseModel):
    """Request body for POST /v1/generate."""
    prompt: str  # text to complete
    max_tokens: int = 50  # forwarded as max_new_tokens to model.generate
    temperature: float = 0.7  # values <= 0 select greedy decoding (do_sample=False)
# --- 5. API KEY AUTHENTICATION ---
@app.get("/api/generate")
async def create_new_key():
    """Mint a fresh API key, record its creation time, and return it."""
    fresh_key = "of_sk-" + secrets.token_hex(12)
    generated_keys[fresh_key] = {"created_at": time.time()}
    return {"status": "success", "api_key": fresh_key}
async def verify_auth(api_key: str = Depends(api_key_header)):
    """Dependency: accept a self-issued key or the MASTER_API_KEY from the env.

    Raises:
        HTTPException 403: unknown key (a missing header is already rejected
        with 403 by APIKeyHeader(auto_error=True)).
    """
    if api_key in generated_keys:
        return api_key
    master = os.environ.get("MASTER_API_KEY")
    # compare_digest is constant-time: avoids leaking the master key one
    # character at a time through response-timing differences.
    if master is not None and secrets.compare_digest(api_key, master):
        return api_key
    raise HTTPException(status_code=HTTP_403_FORBIDDEN, detail="Invalid API Key")
# --- 6. CORE ENDPOINTS ---
@app.post("/v1/generate")
def generate(query: Query, auth: str = Depends(verify_auth)):
    """Generate a completion for ``query.prompt``.

    Declared as a plain ``def`` (not ``async def``): ``model.generate`` is a
    long, blocking CPU call, and FastAPI runs sync endpoints in a worker
    thread instead of stalling the event loop for every other request.

    Raises:
        HTTPException 503: model/tokenizer have not finished loading.
        HTTPException 500: any failure during tokenization or generation.
    """
    if tokenizer is None or model is None:
        raise HTTPException(
            status_code=HTTP_503_SERVICE_UNAVAILABLE,
            detail="Engine still loading. Please wait."
        )
    try:
        inputs = tokenizer(query.prompt, return_tensors="pt")
        do_sample = query.temperature > 0
        gen_kwargs = {"max_new_tokens": query.max_tokens, "do_sample": do_sample}
        # Only pass temperature when sampling: greedy decoding ignores it,
        # and recent transformers versions reject temperature == 0.
        if do_sample:
            gen_kwargs["temperature"] = query.temperature
        with torch.no_grad():
            output_tokens = model.generate(**inputs, **gen_kwargs)
        response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
        return {
            "model": "Overflow-111.7B",
            "choices": [{"text": response_text}]
        }
    except Exception as e:
        # Surface the underlying error message to the caller as a 500.
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/")
def health():
    """Liveness probe: reports whether the model has finished loading."""
    if model:
        return {"status": "active", "engine": "Overflow-111.7B"}
    return {"status": "loading", "engine": "Overflow-111.7B"}
# --- 7. RUN SERVER ---
if __name__ == "__main__":
    import uvicorn
    # Bind all interfaces on 7860, the conventional Hugging Face Spaces port.
    uvicorn.run(app, host="0.0.0.0", port=7860)