# app.py import os import sys import torch import secrets import time from fastapi import FastAPI, HTTPException, Depends from fastapi.security.api_key import APIKeyHeader from pydantic import BaseModel from transformers import AutoModelForCausalLM, AutoConfig, GPT2TokenizerFast from starlette.status import HTTP_403_FORBIDDEN, HTTP_503_SERVICE_UNAVAILABLE # --- 1. GLOBAL VARIABLES --- tokenizer = None model = None generated_keys = {} # --- 2. CONFIGURATION --- MODEL_PATH = "/app/model" # folder with vocab.json, merges.txt, model weights API_KEY_NAME = "X-API-Key" api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=True) app = FastAPI(title="Overflow-111.7B API") # --- 3. MODEL & TOKENIZER LOADING --- print("Starting Engine: Initializing Self-Registration...") try: if MODEL_PATH not in sys.path: sys.path.insert(0, MODEL_PATH) # Import configuration and model import configuration_overflow import modeling_overflow # Register config and model conf_class = configuration_overflow.OverflowConfig model_class = modeling_overflow.OverflowForCausalLM AutoConfig.register("overflow", conf_class) AutoModelForCausalLM.register(conf_class, model_class) print(f"Successfully registered model class {model_class.__name__}.") # Load GPT2-style tokenizer from vocab.json + merges.txt tokenizer = GPT2TokenizerFast( vocab_file=f"{MODEL_PATH}/vocab.json", merges_file=f"{MODEL_PATH}/merges.txt", unk_token="", bos_token="", eos_token="" ) print("Tokenizer loaded successfully.") # Load model weights model = AutoModelForCausalLM.from_pretrained( MODEL_PATH, trust_remote_code=True, device_map={"": "cpu"}, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True ) print("Model loaded successfully. Engine Status: ONLINE") except Exception as e: print(f"CRITICAL LOADING ERROR: {e}") # --- 4. API SCHEMAS --- class Query(BaseModel): prompt: str max_tokens: int = 50 temperature: float = 0.7 # --- 5. 
# --- 5. API KEY AUTHENTICATION ---
@app.get("/api/generate")
async def create_new_key():
    """Mint a new API key and register it in the in-memory key store.

    Returns a JSON object with the freshly generated key.

    NOTE(review): this endpoint is unauthenticated, so anyone who can reach
    the server can mint a valid key — which makes the auth on /v1/generate
    advisory only. Confirm whether self-serve keys are intentional.
    """
    new_key = f"of_sk-{secrets.token_hex(12)}"
    generated_keys[new_key] = {"created_at": time.time()}
    return {"status": "success", "api_key": new_key}


async def verify_auth(api_key: str = Depends(api_key_header)):
    """Validate the X-API-Key header value.

    Accepts any key previously minted via /api/generate, or the MASTER_API_KEY
    environment variable. Returns the key on success; raises 403 otherwise.
    """
    if api_key in generated_keys:
        return api_key
    # Compare against the master key in constant time so response timing
    # cannot leak matching prefixes; skip the comparison entirely when the
    # env var is unset/empty so "no master key configured" can never match.
    master = os.environ.get("MASTER_API_KEY")
    if master and secrets.compare_digest(api_key, master):
        return api_key
    raise HTTPException(status_code=HTTP_403_FORBIDDEN, detail="Invalid API Key")


# --- 6. CORE ENDPOINTS ---
@app.post("/v1/generate")
async def generate(query: Query, auth: str = Depends(verify_auth)):
    """Run text generation for an authenticated request.

    Returns a 503 while the model/tokenizer are still loading, a 500 on any
    generation failure, and an OpenAI-style choices payload on success.
    """
    if tokenizer is None or model is None:
        raise HTTPException(
            status_code=HTTP_503_SERVICE_UNAVAILABLE,
            detail="Engine still loading. Please wait."
        )
    try:
        inputs = tokenizer(query.prompt, return_tensors="pt")
        # Only forward sampling parameters when actually sampling:
        # transformers rejects/warns on temperature=0, and ignores (with a
        # warning) a temperature passed alongside do_sample=False.
        gen_kwargs = {"max_new_tokens": query.max_tokens}
        if query.temperature > 0:
            gen_kwargs["do_sample"] = True
            gen_kwargs["temperature"] = query.temperature
        else:
            gen_kwargs["do_sample"] = False  # greedy decoding
        with torch.no_grad():
            output_tokens = model.generate(**inputs, **gen_kwargs)
        response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
        return {
            "model": "Overflow-111.7B",
            "choices": [{"text": response_text}]
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/")
def health():
    """Liveness probe: reports whether the model has finished loading."""
    # Explicit None check — don't rely on truthiness of a model object.
    state = "active" if model is not None else "loading"
    return {"status": state, "engine": "Overflow-111.7B"}


# --- 7. RUN SERVER ---
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)