File size: 3,796 Bytes
bf00a76
0187888
7a93b93
0187888
5ec5b09
 
7a93b93
5ec5b09
 
992127b
f895e5d
0187888
60fb399
f895e5d
 
bf00a76
0187888
6a82d80
992127b
6a82d80
fa3e031
6a82d80
992127b
0187888
60fb399
01cf8d5
0187888
b4391b2
7a93b93
d0c2ac1
7a93b93
60fb399
01cf8d5
60fb399
 
992127b
01cf8d5
992127b
60fb399
992127b
60fb399
 
01cf8d5
992127b
 
 
 
 
 
 
 
 
b4391b2
992127b
b4391b2
 
 
 
 
 
 
992127b
d0c2ac1
b4391b2
 
 
6a82d80
5ec5b09
 
 
b4391b2
5ec5b09
60fb399
5ec5b09
 
60fb399
b4391b2
5ec5b09
d0c2ac1
5ec5b09
7a93b93
d0c2ac1
7a93b93
d0c2ac1
5ec5b09
6a82d80
5ec5b09
7a93b93
f895e5d
 
bf00a76
60fb399
f895e5d
b4391b2
 
 
 
bf00a76
b4391b2
 
60fb399
b4391b2
 
6a82d80
bf00a76
6a82d80
 
b4391b2
 
0187888
5ec5b09
d0c2ac1
f895e5d
6a82d80
 
bf00a76
6a82d80
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# app.py
import os
import sys
import torch
import secrets
import time
from fastapi import FastAPI, HTTPException, Depends
from fastapi.security.api_key import APIKeyHeader
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoConfig, GPT2TokenizerFast
from starlette.status import HTTP_403_FORBIDDEN, HTTP_503_SERVICE_UNAVAILABLE

# --- 1. GLOBAL VARIABLES ---
# Populated by the loading block below; both stay None if loading fails,
# and the endpoints answer 503 / "loading" until they are set.
tokenizer = None
model = None
# In-memory registry of keys minted via /api/generate (lost on restart).
generated_keys = {}

# --- 2. CONFIGURATION ---
MODEL_PATH = "/app/model"  # folder with vocab.json, merges.txt, model weights
API_KEY_NAME = "X-API-Key"
# auto_error=True -> requests missing the header are rejected automatically.
api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=True)

app = FastAPI(title="Overflow-111.7B API")

# --- 3. MODEL & TOKENIZER LOADING ---
# Runs at import time: registers the custom "overflow" architecture with
# transformers, then loads tokenizer and weights. On any failure the
# globals stay None and the API keeps serving 503 / "loading".
print("Starting Engine: Initializing Self-Registration...")

try:
    # Make the model folder importable so its bundled Python modules load.
    if MODEL_PATH not in sys.path:
        sys.path.insert(0, MODEL_PATH)

    # Import configuration and model
    import configuration_overflow
    import modeling_overflow

    # Register config and model classes under the "overflow" model_type so
    # AutoModelForCausalLM.from_pretrained can resolve the checkpoint.
    conf_class = configuration_overflow.OverflowConfig
    model_class = modeling_overflow.OverflowForCausalLM

    AutoConfig.register("overflow", conf_class)
    AutoModelForCausalLM.register(conf_class, model_class)
    print(f"Successfully registered model class {model_class.__name__}.")

    # Load GPT2-style tokenizer from vocab.json + merges.txt
    # NOTE(review): unk/bos/eos are all set to the empty string — confirm
    # this matches how the checkpoint was trained.
    tokenizer = GPT2TokenizerFast(
        vocab_file=f"{MODEL_PATH}/vocab.json",
        merges_file=f"{MODEL_PATH}/merges.txt",
        unk_token="",
        bos_token="",
        eos_token=""
    )
    print("Tokenizer loaded successfully.")

    # Load model weights on CPU in bfloat16; low_cpu_mem_usage streams the
    # checkpoint in to keep peak RAM down during load.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        trust_remote_code=True,
        device_map={"": "cpu"},
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True
    )
    print("Model loaded successfully. Engine Status: ONLINE")

except Exception as e:
    # Deliberate best-effort catch: keep the process alive so the health
    # endpoint can report state instead of crashing at import.
    print(f"CRITICAL LOADING ERROR: {e}")

# --- 4. API SCHEMAS ---
class Query(BaseModel):
    """Request body for POST /v1/generate."""

    prompt: str  # input text to continue
    max_tokens: int = 50  # maximum number of NEW tokens to generate
    temperature: float = 0.7  # values > 0 enable sampling; 0 means greedy

# --- 5. API KEY AUTHENTICATION ---
@app.get("/api/generate")
async def create_new_key():
    """Mint a fresh API key and record when it was created.

    NOTE(review): this endpoint requires no authentication, so anyone can
    mint a working key, and `generated_keys` grows without bound — confirm
    this open-registration behavior is intentional.
    """
    fresh_key = "of_sk-" + secrets.token_hex(12)
    generated_keys[fresh_key] = {"created_at": time.time()}
    return {"status": "success", "api_key": fresh_key}

async def verify_auth(api_key: str = Depends(api_key_header)):
    """Validate the X-API-Key header value.

    Accepts any key previously minted via /api/generate, or the value of
    the MASTER_API_KEY environment variable. Raises HTTP 403 otherwise.
    """
    if api_key in generated_keys:
        return api_key
    master_key = os.environ.get("MASTER_API_KEY")
    # compare_digest: constant-time comparison so the master key cannot be
    # probed via a timing side channel (plain == short-circuits at the
    # first differing character). Guarded so an unset env var never matches.
    if master_key is not None and secrets.compare_digest(api_key, master_key):
        return api_key
    raise HTTPException(status_code=HTTP_403_FORBIDDEN, detail="Invalid API Key")

# --- 6. CORE ENDPOINTS ---
@app.post("/v1/generate")
async def generate(query: Query, auth: str = Depends(verify_auth)):
    """Generate a completion for ``query.prompt``.

    Returns 503 while the model is still loading and 500 on generation
    failure; otherwise an OpenAI-style ``choices`` payload with the decoded
    output sequence.
    """
    if tokenizer is None or model is None:
        raise HTTPException(
            status_code=HTTP_503_SERVICE_UNAVAILABLE,
            detail="Engine still loading. Please wait."
        )
    try:
        inputs = tokenizer(query.prompt, return_tensors="pt")
        # Only pass sampling parameters when sampling is enabled:
        # transformers rejects temperature=0.0 as invalid for sampling, and
        # temperature is meaningless under greedy decoding.
        gen_kwargs = {"max_new_tokens": query.max_tokens}
        if query.temperature > 0:
            gen_kwargs["do_sample"] = True
            gen_kwargs["temperature"] = query.temperature
        else:
            gen_kwargs["do_sample"] = False
        with torch.no_grad():  # inference only — no autograd bookkeeping
            output_tokens = model.generate(**inputs, **gen_kwargs)
        response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
        return {
            "model": "Overflow-111.7B",
            "choices": [{"text": response_text}]
        }
    except Exception as e:
        # Boundary handler: surface any generation failure as a 500 with
        # the underlying message rather than a bare traceback.
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/")
def health():
    """Liveness probe: report whether the model has finished loading."""
    # Explicit None check — `model` is a None-or-loaded sentinel, and
    # relying on object truthiness would misreport any model type that
    # defines its own __bool__/__len__.
    state = "active" if model is not None else "loading"
    return {"status": state, "engine": "Overflow-111.7B"}

# --- 7. RUN SERVER ---
if __name__ == "__main__":
    # Local import: uvicorn is only needed when running as a script, not
    # when the app is served by an external ASGI runner.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)