Spaces:
Sleeping
Sleeping
# app.py
import os
import sys
import torch
import secrets
import time
from fastapi import FastAPI, HTTPException, Depends
from fastapi.security.api_key import APIKeyHeader
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoConfig, GPT2TokenizerFast
from starlette.status import HTTP_403_FORBIDDEN, HTTP_503_SERVICE_UNAVAILABLE

# --- 1. GLOBAL VARIABLES ---
# Populated by the loading section below; the endpoints treat None as
# "engine still loading" and answer 503 until both are set.
tokenizer = None
model = None
# In-memory API key registry: key string -> metadata dict.
# NOTE(review): not persisted — every issued key is lost on restart; confirm
# that is acceptable for this deployment.
generated_keys = {}

# --- 2. CONFIGURATION ---
MODEL_PATH = "/app/model"  # folder with vocab.json, merges.txt, model weights
API_KEY_NAME = "X-API-Key"
# auto_error=True makes FastAPI reject requests missing the header before the
# handler runs, so verify_auth always receives a string.
api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=True)

app = FastAPI(title="Overflow-111.7B API")
# --- 3. MODEL & TOKENIZER LOADING ---
# Runs once at import time. Order matters: the model folder must be on
# sys.path before the custom modules are imported, and the custom classes
# must be registered with the Auto* factories before from_pretrained is
# called so it can resolve model_type "overflow".
print("Starting Engine: Initializing Self-Registration...")
try:
    # Make the model folder importable so its bundled custom code loads.
    if MODEL_PATH not in sys.path:
        sys.path.insert(0, MODEL_PATH)
    # Import configuration and model (custom code shipped with the weights)
    import configuration_overflow
    import modeling_overflow
    # Register config and model with the transformers Auto* factories
    conf_class = configuration_overflow.OverflowConfig
    model_class = modeling_overflow.OverflowForCausalLM
    AutoConfig.register("overflow", conf_class)
    AutoModelForCausalLM.register(conf_class, model_class)
    print(f"Successfully registered model class {model_class.__name__}.")
    # Load GPT2-style tokenizer from vocab.json + merges.txt.
    # NOTE(review): unk/bos/eos are all set to the empty string — confirm
    # this is intentional and matches how the model was trained.
    tokenizer = GPT2TokenizerFast(
        vocab_file=f"{MODEL_PATH}/vocab.json",
        merges_file=f"{MODEL_PATH}/merges.txt",
        unk_token="",
        bos_token="",
        eos_token=""
    )
    print("Tokenizer loaded successfully.")
    # Load model weights on CPU in bfloat16; low_cpu_mem_usage streams the
    # checkpoint to keep peak RSS down.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        trust_remote_code=True,
        device_map={"": "cpu"},
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True
    )
    print("Model loaded successfully. Engine Status: ONLINE")
except Exception as e:
    # Deliberate best-effort: a failed load leaves tokenizer/model as None,
    # so the endpoints return 503 instead of crashing the server at startup.
    print(f"CRITICAL LOADING ERROR: {e}")
# --- 4. API SCHEMAS ---
class Query(BaseModel):
    """Request body for the text-generation endpoint."""
    prompt: str  # input text fed to the model
    max_tokens: int = 50  # maximum number of new tokens to generate
    temperature: float = 0.7  # sampling temperature; generate() treats <= 0 as greedy
# --- 5. API KEY AUTHENTICATION ---
# FIX(review): the original function had no route decorator and no caller, so
# keys could never be issued and only MASTER_API_KEY could authenticate.
# "/keys" is a reviewer-chosen path — TODO confirm the intended route.
@app.post("/keys")
async def create_new_key():
    """Generate, register, and return a unique API key.

    The key is stored in the in-memory ``generated_keys`` registry so that
    ``verify_auth`` will accept it on subsequent requests.

    Returns:
        dict: ``{"status": "success", "api_key": "<of_sk-...>"}``.

    NOTE(review): this endpoint is unauthenticated — anyone who can reach the
    server can mint a valid key. Confirm that is intended.
    """
    # secrets (not random) for a cryptographically strong token.
    new_key = f"of_sk-{secrets.token_hex(12)}"
    generated_keys[new_key] = {"created_at": time.time()}
    return {"status": "success", "api_key": new_key}
async def verify_auth(api_key: str = Depends(api_key_header)):
    """FastAPI dependency: accept a previously issued key or the master key.

    Args:
        api_key: value of the X-API-Key header (guaranteed present because
            the header scheme uses auto_error=True).

    Returns:
        str: the validated API key.

    Raises:
        HTTPException: 403 when the key is neither issued nor the master key.
    """
    if api_key in generated_keys:
        return api_key
    master = os.environ.get("MASTER_API_KEY")
    # FIX: constant-time comparison instead of `==` so response timing does
    # not leak the master key; explicit guard avoids matching when the env
    # var is unset.
    if master is not None and secrets.compare_digest(api_key, master):
        return api_key
    raise HTTPException(status_code=HTTP_403_FORBIDDEN, detail="Invalid API Key")
# --- 6. CORE ENDPOINTS ---
# FIX(review): the original handler had no route decorator, so the API
# exposed no generation endpoint at all.
@app.post("/generate")
async def generate(query: Query, auth: str = Depends(verify_auth)):
    """Generate a completion for ``query.prompt``.

    Args:
        query: prompt plus generation parameters (max_tokens, temperature).
        auth: validated API key, injected by the verify_auth dependency.

    Returns:
        dict: ``{"model": ..., "choices": [{"text": ...}]}``.

    Raises:
        HTTPException: 503 while the engine is still loading, 500 on any
            generation failure.
    """
    if tokenizer is None or model is None:
        raise HTTPException(
            status_code=HTTP_503_SERVICE_UNAVAILABLE,
            detail="Engine still loading. Please wait."
        )
    try:
        inputs = tokenizer(query.prompt, return_tensors="pt")
        do_sample = query.temperature > 0
        gen_kwargs = {"max_new_tokens": query.max_tokens, "do_sample": do_sample}
        # FIX: only forward temperature when sampling — transformers warns
        # (and newer versions reject) a temperature combined with
        # do_sample=False.
        if do_sample:
            gen_kwargs["temperature"] = query.temperature
        with torch.no_grad():
            output_tokens = model.generate(**inputs, **gen_kwargs)
        # NOTE(review): decoding the full sequence echoes the prompt back in
        # front of the completion — confirm clients expect that.
        response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
        return {
            "model": "Overflow-111.7B",
            "choices": [{"text": response_text}]
        }
    except Exception as e:
        # Surface unexpected generation failures as a 500 with the message.
        raise HTTPException(status_code=500, detail=str(e))
# FIX(review): the original function had no route decorator, so the health
# check was unreachable — "/health" is the conventional path; confirm.
@app.get("/health")
def health():
    """Liveness/readiness probe: 'active' once the model finished loading."""
    # FIX: explicit None check — we mean "has loading completed", not the
    # truthiness of a torch module object.
    state = "active" if model is not None else "loading"
    return {"status": state, "engine": "Overflow-111.7B"}
# --- 7. RUN SERVER ---
if __name__ == "__main__":
    # Local import: uvicorn is only needed when running this file directly.
    import uvicorn
    # Bind on all interfaces; NOTE(review): 7860 suggests a Hugging Face
    # Spaces deployment — confirm the target platform expects this port.
    uvicorn.run(app, host="0.0.0.0", port=7860)