Yermek68 committed on
Commit
e7fcc3e
·
verified ·
1 Parent(s): 5aabf12

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +128 -60
app.py CHANGED
@@ -1,112 +1,180 @@
1
  """
2
- Eroha v6.5-ML Stable + Latency Patch (RC1)
3
- Production-grade build: FastAPI + Gradio + Prometheus + Lazy Model
 
4
  """
5
 
6
  import asyncio
7
- import time
8
  import psutil
 
9
  from fastapi import FastAPI, Request
10
- from slowapi import Limiter
11
  from slowapi.util import get_remote_address
 
12
  from prometheus_client import make_asgi_app, Counter, Gauge, Histogram
13
  import gradio as gr
14
  from gradio.routes import mount_gradio_app
 
15
  from transformers import pipeline
 
16
 
17
- # ─────────────────────────────
18
- # 1️⃣ Метрики и лимитер
19
- # ─────────────────────────────
20
- REQ_COUNT = Counter("api_requests_total", "Total API requests", ["endpoint"])
21
- SYS_USAGE = Gauge("system_usage_percent", "System resource usage", ["resource"])
22
  INF_LATENCY = Histogram(
23
  "inference_latency_seconds",
24
  "Time spent in model inference",
25
- buckets=[0.1, 0.5, 1, 2, 5, 10, float("inf")]
26
  )
27
- state = {"cpu": 0, "ram": 0, "model_ready": False}
28
 
 
 
 
 
 
29
  def get_real_ip(request: Request):
30
  forwarded = request.headers.get("x-forwarded-for")
31
  return forwarded.split(",")[0] if forwarded else request.client.host
32
 
33
  limiter = Limiter(key_func=get_real_ip)
34
 
35
- # ─────────────────────────────
36
- # 2️⃣ Модель (Lazy Singleton)
37
- # ─────────────────────────────
38
  class ErohaModel:
39
  pipe = None
40
-
41
  @classmethod
42
  def get_pipe(cls):
43
  if cls.pipe is None:
44
- cls.pipe = pipeline("text-generation", model="gpt2")
 
45
  state["model_ready"] = True
 
46
  return cls.pipe
47
 
48
- # ─────────────────────────────
49
- # 3️⃣ Мониторинг (фон)
50
- # ─────────────────────────────
51
- async def monitor():
52
- while True:
53
- state["cpu"] = psutil.cpu_percent()
54
- state["ram"] = psutil.virtual_memory().percent
55
- SYS_USAGE.labels(resource="cpu").set(state["cpu"])
56
- SYS_USAGE.labels(resource="ram").set(state["ram"])
57
- await asyncio.sleep(15)
58
-
59
- app = FastAPI(on_startup=[lambda: asyncio.create_task(monitor())])
60
- app.mount("/metrics/prom", make_asgi_app())
61
-
62
- # ─────────────────────────────
63
- # 4️⃣ Эндпоинты API
64
- # ─────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  @app.post("/inference")
66
  @limiter.limit("5/minute")
67
  async def inference(request: Request):
68
  REQ_COUNT.labels(endpoint="/inference").inc()
69
  data = await request.json()
70
  prompt = data.get("prompt", "")
71
-
72
  loop = asyncio.get_event_loop()
73
 
74
- start_time = time.perf_counter()
75
  with INF_LATENCY.time():
76
  res = await loop.run_in_executor(
77
- None, lambda: ErohaModel.get_pipe()(prompt, max_length=50)
 
78
  )
79
- latency = time.perf_counter() - start_time
80
 
81
  return {
82
- "result": res[0]['generated_text'],
83
  "latency_sec": round(latency, 3),
84
  "cpu": state["cpu"],
85
  "ram": state["ram"]
86
  }
87
 
88
- @app.get("/health")
89
- async def health():
90
- return {"status": "ok", "model_loaded": state["model_ready"]}
91
-
92
- # ─────────────────────────────
93
- # 5️⃣ Интерфейс (Gradio)
94
- # ─────────────────────────────
95
- with gr.Blocks(title="Eroha v6.5-ML Stable") as demo:
96
- gr.Markdown("# βš™οΈ Eroha v6.5-ML Stable")
97
- prompt = gr.Textbox(label="Input Prompt", placeholder="Type something...")
98
- output = gr.Textbox(label="Model Output")
99
- latency_box = gr.Number(label="Latency (sec)")
100
-
101
- def run_inference(text):
102
- start = time.perf_counter()
103
- res = ErohaModel.get_pipe()(text, max_length=50)[0]['generated_text']
104
- latency = time.perf_counter() - start
105
- return res, round(latency, 3)
106
-
107
- btn = gr.Button("Generate")
108
- btn.click(run_inference, inputs=prompt, outputs=[output, latency_box])
109
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  app = mount_gradio_app(app, demo, path="/")
111
 
112
  if __name__ == "__main__":
 
1
  """
2
+ Eroha v6.5-CoolDown Stable
3
+ --------------------------
4
+ Оптимизировано под низкую нагрузку и стабильную работу.
5
  """
6
 
7
import asyncio
import threading
import time
from contextlib import asynccontextmanager

import gradio as gr
import httpx
import psutil
from fastapi import FastAPI, HTTPException, Request
from gradio.routes import mount_gradio_app
from prometheus_client import make_asgi_app, Counter, Gauge, Histogram
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
from slowapi.util import get_remote_address
from transformers import pipeline
20
 
21
+ # ───────────────────────────────
22
+ # 1️⃣ Метрики и состояние
23
+ # ───────────────────────────────
24
+ REQ_COUNT = Counter("api_requests_total", "Total requests", ["endpoint"])
25
+ SYS_USAGE = Gauge("system_usage_percent", "System metrics", ["resource"])
26
  INF_LATENCY = Histogram(
27
  "inference_latency_seconds",
28
  "Time spent in model inference",
29
+ buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, float("inf")]
30
  )
 
31
 
32
+ state = {"cpu": 0.0, "ram": 0.0, "timestamp": 0.0, "model_ready": False}
33
+
34
# ───────────────────────────────
# 2️⃣ Rate limiter with proxy support
# ───────────────────────────────
def get_real_ip(request: "Request") -> str:
    """Return the client address used as the rate-limit key.

    The first entry of the ``X-Forwarded-For`` header is preferred; when the
    header is missing or its first entry is blank, the address of the direct
    peer (``request.client.host``) is used instead.

    Args:
        request: Incoming request exposing ``headers`` and ``client``.

    Returns:
        The address string, or ``"127.0.0.1"`` when no peer address is
        available (``request.client`` is ``None`` or has an empty host).
    """
    # NOTE(review): X-Forwarded-For is supplied by the caller. Unless a
    # trusted proxy overwrites it, a client can vary the header to obtain a
    # fresh rate-limit bucket on every request.
    forwarded = request.headers.get("x-forwarded-for")
    if forwarded:
        # Entries are comma separated and usually followed by a space, so the
        # first hop must be stripped to yield a stable key.
        first_hop = forwarded.split(",", 1)[0].strip()
        if first_hop:
            return first_hop

    client = request.client
    if client is None or not client.host:
        return "127.0.0.1"
    return client.host
40
 
41
  limiter = Limiter(key_func=get_real_ip)
42
 
43
# ───────────────────────────────
# 3️⃣ Model (lightweight GPT-2 variant)
# ───────────────────────────────
class ErohaModel:
    """Holder for a lazily created, shared text-generation pipeline."""

    # Shared pipeline instance; stays None until the first get_pipe() call.
    pipe = None
    # Serialises the first load: get_pipe() is called from executor threads,
    # so two early requests could otherwise both build the pipeline.
    _load_lock = threading.Lock()

    @classmethod
    def get_pipe(cls):
        """Return the shared pipeline, creating it on first use.

        The first call builds the ``distilgpt2`` text-generation pipeline and
        sets ``state["model_ready"]`` to ``True``; later calls return the
        cached instance.

        Returns:
            The object produced by ``pipeline("text-generation", ...)``.
        """
        if cls.pipe is None:
            with cls._load_lock:
                # Re-check under the lock: another thread may have finished
                # loading while this one was waiting.
                if cls.pipe is None:
                    print("[ErohaCore] 🧠 Loading distilgpt2 model (lightweight)...")
                    cls.pipe = pipeline("text-generation", model="distilgpt2")
                    state["model_ready"] = True
                    print("[ErohaCore] ✅ Model ready.")
        return cls.pipe
56
 
57
+ # ───────────────────────────────
58
+ # 4️⃣ Lifespan и фоновая задача мониторинга
59
+ # ───────────────────────────────
60
@asynccontextmanager
async def lifespan(app: "FastAPI"):
    """Run the background resource monitor for the lifetime of the app.

    On entry a task is started that periodically samples CPU and RAM usage
    with ``psutil``, stores the values in ``state`` and publishes them through
    the ``SYS_USAGE`` gauge. On exit the task is asked to stop and awaited.

    Args:
        app: The application being started (not used by the monitor itself).

    Yields:
        Nothing; control returns to the server while the monitor runs.
    """
    stop_event = asyncio.Event()
    print("[ErohaCore] 🟢 CoolDown mode active — smart resource control enabled")

    async def _pause(seconds):
        """Wait up to *seconds*, returning early once shutdown is requested."""
        try:
            await asyncio.wait_for(stop_event.wait(), timeout=seconds)
        except asyncio.TimeoutError:
            # Normal case: the full interval elapsed without a stop request.
            pass

    async def background_metrics():
        """Collect metrics in the background, slowing down under high load."""
        while not stop_event.is_set():
            delay = 60
            try:
                cpu = psutil.cpu_percent()
                ram = psutil.virtual_memory().percent
                state["cpu"], state["ram"] = cpu, ram
                state["timestamp"] = asyncio.get_running_loop().time()
                SYS_USAGE.labels(resource="cpu").set(cpu)
                SYS_USAGE.labels(resource="ram").set(ram)

                # Smart CoolDown: sample less often while the host is busy.
                overloaded = cpu > 85 or ram > 90
                delay = 300 if overloaded else 60

                # Report the current readings and the real next interval.
                print(f"[Monitor] CPU: {cpu:.1f}% | RAM: {ram:.1f}% | Next check in {delay}s")
                if overloaded:
                    print("[ErohaCore] ⚠️ High load detected → pausing background tasks for 5 min")
            except Exception as exc:
                # Boundary handler: one failed sample must not end monitoring.
                print(f"[Metrics Error] {exc}")

            # Interruptible wait so shutdown is not held up by a long sleep.
            await _pause(delay)

    task = asyncio.create_task(background_metrics())
    try:
        yield
    finally:
        # Always stop the monitor, even if the application body raised.
        stop_event.set()
        await asyncio.gather(task, return_exceptions=True)
94
+
95
+ # ───────────────────────────────
96
+ # 5️⃣ FastAPI-ядро
97
+ # ───────────────────────────────
98
+ app = FastAPI(title="Eroha v6.5-CoolDown", lifespan=lifespan)
99
+ app.state.limiter = limiter
100
+ app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
101
+
102
+ # Prometheus endpoint
103
+ metrics_app = make_asgi_app()
104
+ app.mount("/metrics/prom", metrics_app)
105
+
106
# Monotonic reference taken when the module is loaded, so that /health can
# report elapsed time instead of a raw clock reading.
_STARTED_AT = time.monotonic()


@app.get("/health")
async def health():
    """Liveness probe.

    Returns:
        A dict with ``status`` (always ``"ok"``), ``uptime`` in seconds since
        this module was loaded, and ``model_loaded`` mirroring
        ``state["model_ready"]``.
    """
    return {
        "status": "ok",
        "uptime": round(time.monotonic() - _STARTED_AT, 2),
        "model_loaded": state["model_ready"],
    }


@app.get("/metrics")
async def get_metrics():
    """Return the shared ``state`` dict as the response body."""
    return state
113
@app.post("/inference")
@limiter.limit("5/minute")
async def inference(request: Request):
    """Generate text for the ``prompt`` field of a JSON request body.

    The call is limited to 5 requests per minute per limiter key. Generation
    runs in the default executor so the event loop is not blocked, and its
    duration is recorded in the ``INF_LATENCY`` histogram.

    Args:
        request: Incoming request whose body is a JSON object; the optional
            ``prompt`` key defaults to an empty string.

    Returns:
        A dict with the generated text (``result``), the wall-clock latency
        in seconds (``latency_sec``) and the last sampled ``cpu`` / ``ram``
        percentages from ``state``.

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, or ``prompt`` is not a string.
    """
    REQ_COUNT.labels(endpoint="/inference").inc()

    try:
        data = await request.json()
    except ValueError as exc:
        # Malformed or undecodable JSON is a client error, not a server fault.
        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from exc
    if not isinstance(data, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    prompt = data.get("prompt", "")
    if not isinstance(prompt, str):
        raise HTTPException(status_code=400, detail="'prompt' must be a string")

    loop = asyncio.get_running_loop()

    start = time.perf_counter()
    with INF_LATENCY.time():
        res = await loop.run_in_executor(
            None,
            lambda: ErohaModel.get_pipe()(prompt, max_length=50),
        )
    latency = time.perf_counter() - start

    return {
        "result": res[0]["generated_text"],
        "latency_sec": round(latency, 3),
        "cpu": state["cpu"],
        "ram": state["ram"],
    }
136
 
137
+ # ───────────────────────────────
138
+ # 6️⃣ Gradio Dashboard
139
+ # ───────────────────────────────
140
async def check_health_ui():
    """Probe the local ``/health`` endpoint and return a status badge.

    Returns:
        An HTML snippet: the "API ONLINE" badge when the endpoint answers
        with status 200 within one second, otherwise the "API OFFLINE" badge.
    """
    online = "<div style='color:lime;font-size:18px;'>🟢 API ONLINE</div>"
    offline = "<div style='color:red;font-size:18px;'>🔴 API OFFLINE</div>"

    try:
        async with httpx.AsyncClient(timeout=1) as client:
            response = await client.get("http://localhost:7860/health")
    except httpx.HTTPError:
        # Connection failures and timeouts mean "offline". Anything else
        # (including task cancellation) is deliberately left to propagate.
        return offline

    return online if response.status_code == 200 else offline
149
+
150
+ with gr.Blocks(title="Eroha v6.5-CoolDown Dashboard", theme=gr.themes.Soft()) as demo:
151
+ gr.Markdown("# βš™οΈ Eroha v6.5-CoolDown Stable")
152
+
153
+ with gr.Row():
154
+ health_status = gr.HTML("<div style='font-size:18px;'>🟑 Checking...</div>")
155
+
156
+ with gr.Tabs():
157
+ with gr.TabItem("Inference"):
158
+ inp = gr.Textbox(label="Prompt", placeholder="Type here...")
159
+ out = gr.Textbox(label="Model Output")
160
+ btn = gr.Button("Generate", variant="primary")
161
+ btn.click(
162
+ lambda x: ErohaModel.get_pipe()(x, max_length=50)[0]["generated_text"],
163
+ inputs=inp, outputs=out
164
+ )
165
+
166
+ with gr.TabItem("System Monitor"):
167
+ cpu_box = gr.Number(label="CPU %")
168
+ ram_box = gr.Number(label="RAM %")
169
+ gr.Markdown("> Metrics also exported to `/metrics/prom`")
170
+
171
+ # Авто-обновление раз в 30 секунд
172
+ demo.load(check_health_ui, outputs=[health_status], every=30)
173
+ demo.load(lambda: (state["cpu"], state["ram"]), outputs=[cpu_box, ram_box], every=30)
174
+
175
+ # ───────────────────────────────
176
+ # 7️⃣ Запуск
177
+ # ───────────────────────────────
178
  app = mount_gradio_app(app, demo, path="/")
179
 
180
  if __name__ == "__main__":