CooLLaMACEO committed on
Commit
bf00a76
·
verified ·
1 Parent(s): 6a82d80

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -23
app.py CHANGED
@@ -1,9 +1,9 @@
 
1
  import os
2
  import sys
3
  import torch
4
  import secrets
5
  import time
6
- import importlib.util
7
  from fastapi import FastAPI, HTTPException, Depends
8
  from fastapi.security.api_key import APIKeyHeader
9
  from pydantic import BaseModel
@@ -11,10 +11,9 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
11
  from starlette.status import HTTP_403_FORBIDDEN, HTTP_503_SERVICE_UNAVAILABLE
12
 
13
  # --- 1. GLOBAL INITIALIZATION ---
14
- # We define these at the top level so they exist when the app starts.
15
  tokenizer = None
16
  model = None
17
- generated_keys = {}
18
 
19
  # --- 2. CONFIGURATION ---
20
  MODEL_PATH = "/app/model"
@@ -27,37 +26,29 @@ app = FastAPI(title="Overflow-111.7B Self-Registering API")
27
  print("Starting Engine: Initializing Self-Registration...")
28
 
29
  try:
30
- # IMPORTANT: Global declaration must come BEFORE any usage in this block
31
- global tokenizer, model
32
-
33
- # Add model path to system so Python finds configuration_overflow.py
34
  if MODEL_PATH not in sys.path:
35
  sys.path.insert(0, MODEL_PATH)
36
 
37
- # Force-Register the Custom Configuration
38
  import configuration_overflow
39
  conf_class = configuration_overflow.OverflowConfig
40
  AutoConfig.register("overflow", conf_class)
41
  print("Successfully registered 'overflow' config.")
42
 
43
- # Force-Register the Custom Model Architecture
44
  import modeling_overflow
45
- # Dynamically find the CausalLM class (usually OverflowForCausalLM)
46
  model_classes = [c for c in dir(modeling_overflow) if 'ForCausalLM' in c]
47
  if model_classes:
48
  model_class = getattr(modeling_overflow, model_classes[0])
49
  AutoModelForCausalLM.register(conf_class, model_class)
50
  print(f"Successfully registered {model_classes[0]} to AutoModel.")
51
 
52
- # Load Tokenizer
53
  print("Loading Tokenizer...")
54
- tokenizer = AutoTokenizer.from_pretrained(
55
- MODEL_PATH,
56
- trust_remote_code=True
57
- )
58
 
59
- # Load Model Weights
60
- # Optimized for CPU usage with bfloat16 and low memory footprint
61
  print("Loading Model Weights (111.7B Parameters - 1-Bit)...")
62
  model = AutoModelForCausalLM.from_pretrained(
63
  MODEL_PATH,
@@ -80,7 +71,7 @@ class Query(BaseModel):
80
  # --- 5. AUTHENTICATION ---
81
  @app.get("/api/generate")
82
  async def create_new_key():
83
- """Generates a unique of_sk- key for the session."""
84
  new_key = f"of_sk-{secrets.token_hex(12)}"
85
  generated_keys[new_key] = {"created_at": time.time()}
86
  return {"status": "success", "api_key": new_key}
@@ -94,25 +85,25 @@ async def verify_auth(api_key: str = Depends(api_key_header)):
94
  # --- 6. CORE ENDPOINTS ---
95
  @app.post("/v1/generate")
96
  async def generate(query: Query, auth: str = Depends(verify_auth)):
97
- # If a user pings the API before the 111.7B weights are in RAM
98
  if tokenizer is None or model is None:
99
  raise HTTPException(
100
- status_code=HTTP_503_SERVICE_UNAVAILABLE,
101
- detail="Engine is still booting up (111.7B parameters take time). Please wait."
102
  )
103
 
104
  try:
105
  inputs = tokenizer(query.prompt, return_tensors="pt")
106
  with torch.no_grad():
107
  output_tokens = model.generate(
108
- **inputs,
109
  max_new_tokens=query.max_tokens,
110
  temperature=query.temperature,
111
  do_sample=True if query.temperature > 0 else False
112
  )
113
  response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
114
  return {
115
- "model": "Overflow-111.7B",
116
  "choices": [{"text": response_text}]
117
  }
118
  except Exception as e:
@@ -123,6 +114,7 @@ def health():
123
  state = "active" if model else "loading"
124
  return {"status": state, "engine": "Overflow-111.7B"}
125
 
 
126
  if __name__ == "__main__":
127
  import uvicorn
128
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ # app.py
2
  import os
3
  import sys
4
  import torch
5
  import secrets
6
  import time
 
7
  from fastapi import FastAPI, HTTPException, Depends
8
  from fastapi.security.api_key import APIKeyHeader
9
  from pydantic import BaseModel
 
11
  from starlette.status import HTTP_403_FORBIDDEN, HTTP_503_SERVICE_UNAVAILABLE
12
 
13
  # --- 1. GLOBAL INITIALIZATION ---
 
14
  tokenizer = None
15
  model = None
16
+ generated_keys = {}
17
 
18
  # --- 2. CONFIGURATION ---
19
  MODEL_PATH = "/app/model"
 
26
  print("Starting Engine: Initializing Self-Registration...")
27
 
28
  try:
29
+ # Ensure model path is in sys.path
 
 
 
30
  if MODEL_PATH not in sys.path:
31
  sys.path.insert(0, MODEL_PATH)
32
 
33
+ # Register custom config
34
  import configuration_overflow
35
  conf_class = configuration_overflow.OverflowConfig
36
  AutoConfig.register("overflow", conf_class)
37
  print("Successfully registered 'overflow' config.")
38
 
39
+ # Register custom model architecture
40
  import modeling_overflow
 
41
  model_classes = [c for c in dir(modeling_overflow) if 'ForCausalLM' in c]
42
  if model_classes:
43
  model_class = getattr(modeling_overflow, model_classes[0])
44
  AutoModelForCausalLM.register(conf_class, model_class)
45
  print(f"Successfully registered {model_classes[0]} to AutoModel.")
46
 
47
+ # Load tokenizer
48
  print("Loading Tokenizer...")
49
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
 
 
 
50
 
51
+ # Load model weights
 
52
  print("Loading Model Weights (111.7B Parameters - 1-Bit)...")
53
  model = AutoModelForCausalLM.from_pretrained(
54
  MODEL_PATH,
 
71
  # --- 5. AUTHENTICATION ---
72
  @app.get("/api/generate")
73
  async def create_new_key():
74
+ """Generates a unique API key for the session."""
75
  new_key = f"of_sk-{secrets.token_hex(12)}"
76
  generated_keys[new_key] = {"created_at": time.time()}
77
  return {"status": "success", "api_key": new_key}
 
85
  # --- 6. CORE ENDPOINTS ---
86
  @app.post("/v1/generate")
87
  async def generate(query: Query, auth: str = Depends(verify_auth)):
88
+ # Ensure the model is loaded
89
  if tokenizer is None or model is None:
90
  raise HTTPException(
91
+ status_code=HTTP_503_SERVICE_UNAVAILABLE,
92
+ detail="Engine is still booting up (111.7B parameters). Please wait."
93
  )
94
 
95
  try:
96
  inputs = tokenizer(query.prompt, return_tensors="pt")
97
  with torch.no_grad():
98
  output_tokens = model.generate(
99
+ **inputs,
100
  max_new_tokens=query.max_tokens,
101
  temperature=query.temperature,
102
  do_sample=True if query.temperature > 0 else False
103
  )
104
  response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
105
  return {
106
+ "model": "Overflow-111.7B",
107
  "choices": [{"text": response_text}]
108
  }
109
  except Exception as e:
 
114
  state = "active" if model else "loading"
115
  return {"status": state, "engine": "Overflow-111.7B"}
116
 
117
+ # --- 7. RUN SERVER ---
118
  if __name__ == "__main__":
119
  import uvicorn
120
  uvicorn.run(app, host="0.0.0.0", port=7860)