Spaces:

CooLLaMACEO
/

Overflow-100B

Sleeping

App Files Files Community

CooLLaMACEO committed on Mar 13

Commit

6a82d80

verified ·

1 Parent(s): fdb020b

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -18

app.py CHANGED Viewed

@@ -10,49 +10,54 @@ from pydantic import BaseModel
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 from starlette.status import HTTP_403_FORBIDDEN, HTTP_503_SERVICE_UNAVAILABLE
-# --- CONFIGURATION ---
-MODEL_PATH = "/app/model"
-API_KEY_NAME = "X-API-Key"
-api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False)
-# 1. Initialize at top level
 tokenizer = None
 model = None
 generated_keys = {}
 app = FastAPI(title="Overflow-111.7B Self-Registering API")
-# --- ENGINE LOADING ---
 print("Starting Engine: Initializing Self-Registration...")
 try:
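-    # 2. NOTE(review): `global` is only meaningful inside a function. At module
-    #    level these names are already global, and because `tokenizer`/`model` are
-    #    assigned above, the declaration below is a SyntaxError ("assigned to before
-    #    global declaration") - it should be removed.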
     global tokenizer, model
     if MODEL_PATH not in sys.path:
         sys.path.insert(0, MODEL_PATH)
-    # Force-Register Config
     import configuration_overflow
     conf_class = configuration_overflow.OverflowConfig
     AutoConfig.register("overflow", conf_class)
-    print(f"Successfully registered 'overflow' config.")
-    # Force-Register Model
     import modeling_overflow
     model_classes = [c for c in dir(modeling_overflow) if 'ForCausalLM' in c]
     if model_classes:
         model_class = getattr(modeling_overflow, model_classes[0])
         AutoModelForCausalLM.register(conf_class, model_class)
         print(f"Successfully registered {model_classes[0]} to AutoModel.")
-    # 3. Now load them into the global variables
     print("Loading Tokenizer...")
     tokenizer = AutoTokenizer.from_pretrained(
         MODEL_PATH,
         trust_remote_code=True
     )
     print("Loading Model Weights (111.7B Parameters - 1-Bit)...")
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_PATH,
@@ -66,31 +71,34 @@ try:
 except Exception as e:
     print(f"CRITICAL LOADING ERROR: {e}")
-# --- SCHEMAS ---
 class Query(BaseModel):
     """Request body accepted by the ``POST /v1/generate`` endpoint."""

     # Prompt text to generate from (required; no default).
     prompt: str
     # Requested token budget; defaults to 50.
     # NOTE(review): no upper bound is enforced here - confirm the generate call caps it.
     max_tokens: int = 50
     # Sampling temperature; defaults to 0.7. The endpoint samples only when this is > 0.
     temperature: float = 0.7
-# --- AUTH ---
 @app.get("/api/generate")
 async def create_new_key():
     """Mint a new ``of_sk-`` API key, remember it in ``generated_keys``, and return it.

     Returns:
         dict: ``{"status": "success", "api_key": <new key>}``.
     """
     # NOTE(review): this route has no auth dependency, so any caller can mint a
     # key, and `generated_keys` is an in-memory dict that is only ever added to.
     token = "of_sk-" + secrets.token_hex(12)
     record = {"created_at": time.time()}
     generated_keys[token] = record
     payload = {"status": "success", "api_key": token}
     return payload
 async def verify_auth(api_key: str = Depends(api_key_header)):
     """Validate the ``X-API-Key`` header and return the accepted key.

     Args:
         api_key: Header value extracted by ``api_key_header``. It is ``None`` when
             the header is absent, because the scheme is created with
             ``auto_error=False``.

     Returns:
         The caller's key when it is a generated key or equals ``MASTER_API_KEY``.

     Raises:
         HTTPException: 403 when the key is missing or not recognised.
     """
     denied = HTTPException(status_code=HTTP_403_FORBIDDEN, detail="Invalid API Key")

     # Reject a missing/empty header up front. Without this, an unset
     # MASTER_API_KEY makes `None == os.environ.get(...)` evaluate to True and
     # every unauthenticated request is let through.
     if not api_key:
         raise denied

     if api_key in generated_keys:
         return api_key

     master_key = os.environ.get("MASTER_API_KEY")
     # Only compare when a master key is actually configured. Compare as bytes in
     # constant time: compare_digest rejects non-ASCII str, and a plain `==`
     # leaks timing information about the secret.
     if master_key and secrets.compare_digest(
         api_key.encode("utf-8"), master_key.encode("utf-8")
     ):
         return api_key

     raise denied
-# --- ENDPOINTS ---
 @app.post("/v1/generate")
 async def generate(query: Query, auth: str = Depends(verify_auth)):
     if tokenizer is None or model is None:
         raise HTTPException(
             status_code=HTTP_503_SERVICE_UNAVAILABLE,
-            detail="Engine is still booting up. Please wait."
         )
     try:
@@ -103,11 +111,18 @@ async def generate(query: Query, auth: str = Depends(verify_auth)):
                 do_sample=True if query.temperature > 0 else False
             )
         response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
-        return {"model": "Overflow-111.7B", "choices": [{"text": response_text}]}
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 @app.get("/")
 def health():
     """Report whether the model object has been loaded.

     Returns:
         dict: ``status`` is ``"active"`` when the global ``model`` is set and
         ``"loading"`` while it is still ``None``; ``engine`` is the model name.
     """
     # Identity check instead of truthiness: matches the `model is None` guard in
     # `generate` and never invokes `__bool__`/`__len__` on the model object.
     state = "active" if model is not None else "loading"
     return {"status": state, "engine": "Overflow-111.7B"}

 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 from starlette.status import HTTP_403_FORBIDDEN, HTTP_503_SERVICE_UNAVAILABLE
+# --- 1. GLOBAL INITIALIZATION ---
+# We define these at the top level so they exist when the app starts.
 tokenizer = None
 model = None
 generated_keys = {}
+# --- 2. CONFIGURATION ---
+MODEL_PATH = "/app/model"
+API_KEY_NAME = "X-API-Key"
+api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False)
 app = FastAPI(title="Overflow-111.7B Self-Registering API")
+# --- 3. ENGINE LOADING & SELF-REGISTRATION ---
 print("Starting Engine: Initializing Self-Registration...")
 try:
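+    # NOTE(review): `global` is only meaningful inside a function. At module level
+    # `tokenizer` and `model` are already global, and since both are assigned above,
+    # the declaration below is a SyntaxError ("assigned to before global
+    # declaration") - it should be removed.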
     global tokenizer, model
+    # Add model path to system so Python finds configuration_overflow.py
     if MODEL_PATH not in sys.path:
         sys.path.insert(0, MODEL_PATH)
+    # Force-Register the Custom Configuration
     import configuration_overflow
     conf_class = configuration_overflow.OverflowConfig
     AutoConfig.register("overflow", conf_class)
+    print("Successfully registered 'overflow' config.")
+    # Force-Register the Custom Model Architecture
     import modeling_overflow
+    # Dynamically find the CausalLM class (usually OverflowForCausalLM)
     model_classes = [c for c in dir(modeling_overflow) if 'ForCausalLM' in c]
     if model_classes:
         model_class = getattr(modeling_overflow, model_classes[0])
         AutoModelForCausalLM.register(conf_class, model_class)
         print(f"Successfully registered {model_classes[0]} to AutoModel.")
+    # Load Tokenizer
     print("Loading Tokenizer...")
     tokenizer = AutoTokenizer.from_pretrained(
         MODEL_PATH,
         trust_remote_code=True
     )
+    # Load Model Weights
+    # Optimized for CPU usage with bfloat16 and low memory footprint
     print("Loading Model Weights (111.7B Parameters - 1-Bit)...")
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_PATH,
 except Exception as e:
     print(f"CRITICAL LOADING ERROR: {e}")
+# --- 4. API SCHEMAS ---
 class Query(BaseModel):
     """Request schema for the ``POST /v1/generate`` endpoint."""

     # Prompt text supplied by the caller (required; no default).
     prompt: str
     # Requested token budget; defaults to 50.
     # NOTE(review): unbounded here - confirm the generate call enforces a ceiling.
     max_tokens: int = 50
     # Sampling temperature; defaults to 0.7. Sampling is enabled only when this is > 0.
     temperature: float = 0.7
+# --- 5. AUTHENTICATION ---
 @app.get("/api/generate")
 async def create_new_key():
     """Generate a unique ``of_sk-`` key, store it in ``generated_keys``, and return it.

     Returns:
         dict: ``{"status": "success", "api_key": <new key>}``.
     """
     # NOTE(review): no auth dependency guards this route, so anyone can obtain a
     # key; `generated_keys` lives in process memory and is never pruned.
     token = "of_sk-" + secrets.token_hex(12)
     record = {"created_at": time.time()}
     generated_keys[token] = record
     payload = {"status": "success", "api_key": token}
     return payload
 async def verify_auth(api_key: str = Depends(api_key_header)):
     """Validate the ``X-API-Key`` header and return the accepted key.

     Args:
         api_key: Header value extracted by ``api_key_header``. It is ``None`` when
             the header is absent, because the scheme is created with
             ``auto_error=False``.

     Returns:
         The caller's key when it is a generated key or equals ``MASTER_API_KEY``.

     Raises:
         HTTPException: 403 when the key is missing or not recognised.
     """
     denied = HTTPException(status_code=HTTP_403_FORBIDDEN, detail="Invalid API Key")

     # A request without the header arrives here as None. It must be refused
     # explicitly: if MASTER_API_KEY is not set, `None == os.environ.get(...)` is
     # True and the original check would authenticate everyone.
     if not api_key:
         raise denied

     if api_key in generated_keys:
         return api_key

     master_key = os.environ.get("MASTER_API_KEY")
     # Compare only when a master key is configured, as bytes and in constant
     # time (compare_digest rejects non-ASCII str; `==` leaks timing).
     if master_key and secrets.compare_digest(
         api_key.encode("utf-8"), master_key.encode("utf-8")
     ):
         return api_key

     raise denied
+# --- 6. CORE ENDPOINTS ---
 @app.post("/v1/generate")
 async def generate(query: Query, auth: str = Depends(verify_auth)):
+    # If a user pings the API before the 111.7B weights are in RAM
     if tokenizer is None or model is None:
         raise HTTPException(
             status_code=HTTP_503_SERVICE_UNAVAILABLE,
+            detail="Engine is still booting up (111.7B parameters take time). Please wait."
         )
     try:
                 do_sample=True if query.temperature > 0 else False
             )
         response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
+        return {
+            "model": "Overflow-111.7B",
+            "choices": [{"text": response_text}]
+        }
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 @app.get("/")
 def health():
     """Report whether the model object has been loaded.

     Returns:
         dict: ``status`` is ``"active"`` when the global ``model`` is set and
         ``"loading"`` while it is still ``None``; ``engine`` is the model name.
     """
     # Use an identity check, like the `model is None` guard in `generate`, so the
     # model object's `__bool__`/`__len__` is never consulted.
     state = "active" if model is not None else "loading"
     return {"status": state, "engine": "Overflow-111.7B"}
+if __name__ == "__main__":
+    # Direct execution only: serve `app` with uvicorn on all interfaces, port 7860.
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)