# Overflow-100B / app.py — Hugging Face Space by CooLLaMACEO
# Last change: "Update app.py", commit fa3e031 (verified)
# app.py
import os
import sys
import torch
import secrets
import time
from fastapi import FastAPI, HTTPException, Depends
from fastapi.security.api_key import APIKeyHeader
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoConfig, GPT2TokenizerFast
from starlette.status import HTTP_403_FORBIDDEN, HTTP_503_SERVICE_UNAVAILABLE
# --- 1. GLOBAL VARIABLES ---
# Populated by the loading block below. They stay None if loading fails,
# so the endpoints can answer 503 instead of crashing at import time.
tokenizer = None
model = None
# In-memory registry of self-issued API keys (key -> metadata).
# NOTE(review): unbounded growth and lost on restart — confirm acceptable.
generated_keys = {}
# --- 2. CONFIGURATION ---
MODEL_PATH = "/app/model" # folder with vocab.json, merges.txt, model weights
API_KEY_NAME = "X-API-Key"
# auto_error=True: FastAPI rejects requests missing the header before our code runs.
api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=True)
app = FastAPI(title="Overflow-111.7B API")
# --- 3. MODEL & TOKENIZER LOADING ---
# Executed at import time. Failures are printed and swallowed so the ASGI app
# still starts; `tokenizer`/`model` then remain None and /v1/generate returns 503.
print("Starting Engine: Initializing Self-Registration...")
try:
    # The model folder ships its own Python modules; make them importable.
    if MODEL_PATH not in sys.path:
        sys.path.insert(0, MODEL_PATH)
    # Import configuration and model
    import configuration_overflow
    import modeling_overflow
    # Register the custom architecture under model_type "overflow" so
    # AutoConfig / AutoModelForCausalLM can resolve it in from_pretrained.
    conf_class = configuration_overflow.OverflowConfig
    model_class = modeling_overflow.OverflowForCausalLM
    AutoConfig.register("overflow", conf_class)
    AutoModelForCausalLM.register(conf_class, model_class)
    print(f"Successfully registered model class {model_class.__name__}.")
    # Load GPT2-style tokenizer from vocab.json + merges.txt.
    # NOTE(review): unk/bos/eos are all the empty string — looks deliberate
    # for this vocab, but confirm against the model's training setup.
    tokenizer = GPT2TokenizerFast(
        vocab_file=f"{MODEL_PATH}/vocab.json",
        merges_file=f"{MODEL_PATH}/merges.txt",
        unk_token="",
        bos_token="",
        eos_token=""
    )
    print("Tokenizer loaded successfully.")
    # Load model weights on CPU in bfloat16; low_cpu_mem_usage streams the
    # checkpoint in to keep peak RSS down during load.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        trust_remote_code=True,
        device_map={"": "cpu"},
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True
    )
    print("Model loaded successfully. Engine Status: ONLINE")
except Exception as e:
    # Broad catch is deliberate: a failed load should leave the server up and
    # reporting "loading"/503 rather than kill the process.
    print(f"CRITICAL LOADING ERROR: {e}")
# --- 4. API SCHEMAS ---
class Query(BaseModel):
    """Request body for POST /v1/generate."""
    prompt: str  # text to complete
    max_tokens: int = 50  # forwarded as max_new_tokens to model.generate
    temperature: float = 0.7  # values <= 0 select greedy decoding (do_sample=False)
# --- 5. API KEY AUTHENTICATION ---
@app.get("/api/generate")
async def create_new_key():
    """Mint a fresh API key, record its creation time, and return it."""
    fresh_key = "of_sk-" + secrets.token_hex(12)
    generated_keys[fresh_key] = {"created_at": time.time()}
    return {"status": "success", "api_key": fresh_key}
async def verify_auth(api_key: str = Depends(api_key_header)):
    """Dependency: accept a self-issued key or the MASTER_API_KEY from the env.

    Raises:
        HTTPException 403: unknown key (a missing header is already rejected
        with 403 by APIKeyHeader(auto_error=True)).
    """
    if api_key in generated_keys:
        return api_key
    master = os.environ.get("MASTER_API_KEY")
    # compare_digest is constant-time: avoids leaking the master key one
    # character at a time through response-timing differences.
    if master is not None and secrets.compare_digest(api_key, master):
        return api_key
    raise HTTPException(status_code=HTTP_403_FORBIDDEN, detail="Invalid API Key")
# --- 6. CORE ENDPOINTS ---
@app.post("/v1/generate")
def generate(query: Query, auth: str = Depends(verify_auth)):
    """Generate a completion for ``query.prompt``.

    Declared as a plain ``def`` (not ``async def``): ``model.generate`` is a
    long, blocking CPU call, and FastAPI runs sync endpoints in a worker
    thread instead of stalling the event loop for every other request.

    Raises:
        HTTPException 503: model/tokenizer have not finished loading.
        HTTPException 500: any failure during tokenization or generation.
    """
    if tokenizer is None or model is None:
        raise HTTPException(
            status_code=HTTP_503_SERVICE_UNAVAILABLE,
            detail="Engine still loading. Please wait."
        )
    try:
        inputs = tokenizer(query.prompt, return_tensors="pt")
        do_sample = query.temperature > 0
        gen_kwargs = {"max_new_tokens": query.max_tokens, "do_sample": do_sample}
        # Only pass temperature when sampling: greedy decoding ignores it,
        # and recent transformers versions reject temperature == 0.
        if do_sample:
            gen_kwargs["temperature"] = query.temperature
        with torch.no_grad():
            output_tokens = model.generate(**inputs, **gen_kwargs)
        response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
        return {
            "model": "Overflow-111.7B",
            "choices": [{"text": response_text}]
        }
    except Exception as e:
        # Surface the underlying error message to the caller as a 500.
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/")
def health():
    """Liveness probe: reports whether the model has finished loading."""
    if model:
        return {"status": "active", "engine": "Overflow-111.7B"}
    return {"status": "loading", "engine": "Overflow-111.7B"}
# --- 7. RUN SERVER ---
if __name__ == "__main__":
    import uvicorn
    # Bind all interfaces on 7860, the conventional Hugging Face Spaces port.
    uvicorn.run(app, host="0.0.0.0", port=7860)