CooLLaMACEO committed on
Commit
bf00a76
·
verified ·
1 Parent(s): 6a82d80

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -23
app.py CHANGED
@@ -1,9 +1,9 @@
 
1
  import os
2
  import sys
3
  import torch
4
  import secrets
5
  import time
6
- import importlib.util
7
  from fastapi import FastAPI, HTTPException, Depends
8
  from fastapi.security.api_key import APIKeyHeader
9
  from pydantic import BaseModel
@@ -11,10 +11,9 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
11
  from starlette.status import HTTP_403_FORBIDDEN, HTTP_503_SERVICE_UNAVAILABLE
12
 
13
  # --- 1. GLOBAL INITIALIZATION ---
14
- # We define these at the top level so they exist when the app starts.
15
  tokenizer = None
16
  model = None
17
- generated_keys = {}
18
 
19
  # --- 2. CONFIGURATION ---
20
  MODEL_PATH = "/app/model"
@@ -27,37 +26,29 @@ app = FastAPI(title="Overflow-111.7B Self-Registering API")
27
  print("Starting Engine: Initializing Self-Registration...")
28
 
29
  try:
30
- # IMPORTANT: Global declaration must come BEFORE any usage in this block
31
- global tokenizer, model
32
-
33
- # Add model path to system so Python finds configuration_overflow.py
34
  if MODEL_PATH not in sys.path:
35
  sys.path.insert(0, MODEL_PATH)
36
 
37
- # Force-Register the Custom Configuration
38
  import configuration_overflow
39
  conf_class = configuration_overflow.OverflowConfig
40
  AutoConfig.register("overflow", conf_class)
41
  print("Successfully registered 'overflow' config.")
42
 
43
- # Force-Register the Custom Model Architecture
44
  import modeling_overflow
45
- # Dynamically find the CausalLM class (usually OverflowForCausalLM)
46
  model_classes = [c for c in dir(modeling_overflow) if 'ForCausalLM' in c]
47
  if model_classes:
48
  model_class = getattr(modeling_overflow, model_classes[0])
49
  AutoModelForCausalLM.register(conf_class, model_class)
50
  print(f"Successfully registered {model_classes[0]} to AutoModel.")
51
 
52
- # Load Tokenizer
53
  print("Loading Tokenizer...")
54
- tokenizer = AutoTokenizer.from_pretrained(
55
- MODEL_PATH,
56
- trust_remote_code=True
57
- )
58
 
59
- # Load Model Weights
60
- # Optimized for CPU usage with bfloat16 and low memory footprint
61
  print("Loading Model Weights (111.7B Parameters - 1-Bit)...")
62
  model = AutoModelForCausalLM.from_pretrained(
63
  MODEL_PATH,
@@ -80,7 +71,7 @@ class Query(BaseModel):
80
  # --- 5. AUTHENTICATION ---
81
  @app.get("/api/generate")
82
  async def create_new_key():
83
- """Generates a unique of_sk- key for the session."""
84
  new_key = f"of_sk-{secrets.token_hex(12)}"
85
  generated_keys[new_key] = {"created_at": time.time()}
86
  return {"status": "success", "api_key": new_key}
@@ -94,25 +85,25 @@ async def verify_auth(api_key: str = Depends(api_key_header)):
94
  # --- 6. CORE ENDPOINTS ---
95
  @app.post("/v1/generate")
96
  async def generate(query: Query, auth: str = Depends(verify_auth)):
97
- # If a user pings the API before the 111.7B weights are in RAM
98
  if tokenizer is None or model is None:
99
  raise HTTPException(
100
- status_code=HTTP_503_SERVICE_UNAVAILABLE,
101
- detail="Engine is still booting up (111.7B parameters take time). Please wait."
102
  )
103
 
104
  try:
105
  inputs = tokenizer(query.prompt, return_tensors="pt")
106
  with torch.no_grad():
107
  output_tokens = model.generate(
108
- **inputs,
109
  max_new_tokens=query.max_tokens,
110
  temperature=query.temperature,
111
  do_sample=True if query.temperature > 0 else False
112
  )
113
  response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
114
  return {
115
- "model": "Overflow-111.7B",
116
  "choices": [{"text": response_text}]
117
  }
118
  except Exception as e:
@@ -123,6 +114,7 @@ def health():
123
  state = "active" if model else "loading"
124
  return {"status": state, "engine": "Overflow-111.7B"}
125
 
 
126
  if __name__ == "__main__":
127
  import uvicorn
128
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ # app.py
2
  import os
3
  import sys
4
  import torch
5
  import secrets
6
  import time
 
7
  from fastapi import FastAPI, HTTPException, Depends
8
  from fastapi.security.api_key import APIKeyHeader
9
  from pydantic import BaseModel
 
11
  from starlette.status import HTTP_403_FORBIDDEN, HTTP_503_SERVICE_UNAVAILABLE
12
 
13
  # --- 1. GLOBAL INITIALIZATION ---
 
14
  tokenizer = None
15
  model = None
16
+ generated_keys = {}
17
 
18
  # --- 2. CONFIGURATION ---
19
  MODEL_PATH = "/app/model"
 
26
  print("Starting Engine: Initializing Self-Registration...")
27
 
28
  try:
29
+ # Ensure model path is in sys.path
 
 
 
30
  if MODEL_PATH not in sys.path:
31
  sys.path.insert(0, MODEL_PATH)
32
 
33
+ # Register custom config
34
  import configuration_overflow
35
  conf_class = configuration_overflow.OverflowConfig
36
  AutoConfig.register("overflow", conf_class)
37
  print("Successfully registered 'overflow' config.")
38
 
39
+ # Register custom model architecture
40
  import modeling_overflow
 
41
  model_classes = [c for c in dir(modeling_overflow) if 'ForCausalLM' in c]
42
  if model_classes:
43
  model_class = getattr(modeling_overflow, model_classes[0])
44
  AutoModelForCausalLM.register(conf_class, model_class)
45
  print(f"Successfully registered {model_classes[0]} to AutoModel.")
46
 
47
+ # Load tokenizer
48
  print("Loading Tokenizer...")
49
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
 
 
 
50
 
51
+ # Load model weights
 
52
  print("Loading Model Weights (111.7B Parameters - 1-Bit)...")
53
  model = AutoModelForCausalLM.from_pretrained(
54
  MODEL_PATH,
 
71
  # --- 5. AUTHENTICATION ---
72
  @app.get("/api/generate")
73
  async def create_new_key():
74
+ """Generates a unique API key for the session."""
75
  new_key = f"of_sk-{secrets.token_hex(12)}"
76
  generated_keys[new_key] = {"created_at": time.time()}
77
  return {"status": "success", "api_key": new_key}
 
85
  # --- 6. CORE ENDPOINTS ---
86
  @app.post("/v1/generate")
87
  async def generate(query: Query, auth: str = Depends(verify_auth)):
88
+ # Ensure the model is loaded
89
  if tokenizer is None or model is None:
90
  raise HTTPException(
91
+ status_code=HTTP_503_SERVICE_UNAVAILABLE,
92
+ detail="Engine is still booting up (111.7B parameters). Please wait."
93
  )
94
 
95
  try:
96
  inputs = tokenizer(query.prompt, return_tensors="pt")
97
  with torch.no_grad():
98
  output_tokens = model.generate(
99
+ **inputs,
100
  max_new_tokens=query.max_tokens,
101
  temperature=query.temperature,
102
  do_sample=True if query.temperature > 0 else False
103
  )
104
  response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
105
  return {
106
+ "model": "Overflow-111.7B",
107
  "choices": [{"text": response_text}]
108
  }
109
  except Exception as e:
 
114
  state = "active" if model else "loading"
115
  return {"status": state, "engine": "Overflow-111.7B"}
116
 
117
+ # --- 7. RUN SERVER ---
118
  if __name__ == "__main__":
119
  import uvicorn
120
  uvicorn.run(app, host="0.0.0.0", port=7860)