Spaces:

Yermek68
/

eroha-agentapi

Sleeping

App Files Files Community

Yermek68 committed on 26 days ago

Commit

e7fcc3e

verified ·

1 Parent(s): 5aabf12

Update app.py

Browse files

Files changed (1) hide show

app.py +128 -60

app.py CHANGED Viewed

@@ -1,112 +1,180 @@
 """
-Eroha v6.5-ML Stable + Latency Patch (RC1)
-Production-grade build: FastAPI + Gradio + Prometheus + Lazy Model
 """
 import asyncio
-import time
 import psutil
 from fastapi import FastAPI, Request
-from slowapi import Limiter
 from slowapi.util import get_remote_address
 from prometheus_client import make_asgi_app, Counter, Gauge, Histogram
 import gradio as gr
 from gradio.routes import mount_gradio_app
 from transformers import pipeline
-# ─────────────────────────────
-# 1️⃣ Метрики и лимитер
-# ─────────────────────────────
-REQ_COUNT = Counter("api_requests_total", "Total API requests", ["endpoint"])
-SYS_USAGE = Gauge("system_usage_percent", "System resource usage", ["resource"])
 INF_LATENCY = Histogram(
     "inference_latency_seconds",
     "Time spent in model inference",
-    buckets=[0.1, 0.5, 1, 2, 5, 10, float("inf")]
 )
-state = {"cpu": 0, "ram": 0, "model_ready": False}
 def get_real_ip(request: Request):
     forwarded = request.headers.get("x-forwarded-for")
     return forwarded.split(",")[0] if forwarded else request.client.host
 limiter = Limiter(key_func=get_real_ip)
-# ─────────────────────────────
-# 2️⃣ Модель (Lazy Singleton)
-# ─────────────────────────────
 class ErohaModel:
     pipe = None
     @classmethod
     def get_pipe(cls):
         if cls.pipe is None:
-            cls.pipe = pipeline("text-generation", model="gpt2")
             state["model_ready"] = True
         return cls.pipe
-# ─────────────────────────────
-# 3️⃣ Мониторинг (фон)
-# ─────────────────────────────
-async def monitor():
-    while True:
-        state["cpu"] = psutil.cpu_percent()
-        state["ram"] = psutil.virtual_memory().percent
-        SYS_USAGE.labels(resource="cpu").set(state["cpu"])
-        SYS_USAGE.labels(resource="ram").set(state["ram"])
-        await asyncio.sleep(15)
-app = FastAPI(on_startup=[lambda: asyncio.create_task(monitor())])
-app.mount("/metrics/prom", make_asgi_app())
-# ─────────────────────────────
-# 4️⃣ Эндпоинты API
-# ─────────────────────────────
 @app.post("/inference")
 @limiter.limit("5/minute")
 async def inference(request: Request):
     REQ_COUNT.labels(endpoint="/inference").inc()
     data = await request.json()
     prompt = data.get("prompt", "")
     loop = asyncio.get_event_loop()
-    start_time = time.perf_counter()
     with INF_LATENCY.time():
         res = await loop.run_in_executor(
-            None, lambda: ErohaModel.get_pipe()(prompt, max_length=50)
         )
-    latency = time.perf_counter() - start_time
     return {
-        "result": res[0]['generated_text'],
         "latency_sec": round(latency, 3),
         "cpu": state["cpu"],
         "ram": state["ram"]
     }
-@app.get("/health")
-async def health():
-    return {"status": "ok", "model_loaded": state["model_ready"]}
-# ─────────────────────────────
-# 5️⃣ Интерфейс (Gradio)
-# ─────────────────────────────
-with gr.Blocks(title="Eroha v6.5-ML Stable") as demo:
-    gr.Markdown("# ⚙️ Eroha v6.5-ML Stable")
-    prompt = gr.Textbox(label="Input Prompt", placeholder="Type something...")
-    output = gr.Textbox(label="Model Output")
-    latency_box = gr.Number(label="Latency (sec)")
-    def run_inference(text):
-        start = time.perf_counter()
-        res = ErohaModel.get_pipe()(text, max_length=50)[0]['generated_text']
-        latency = time.perf_counter() - start
-        return res, round(latency, 3)
-    btn = gr.Button("Generate")
-    btn.click(run_inference, inputs=prompt, outputs=[output, latency_box])
 app = mount_gradio_app(app, demo, path="/")
 if __name__ == "__main__":

 """
+Eroha v6.5-CoolDown Stable
+--------------------------
+Оптимизировано под низкую нагрузку и стабильную работу.
 """
 import asyncio
+import threading
+import time
+from contextlib import asynccontextmanager
 import psutil
+from fastapi import FastAPI, HTTPException, Request
+from slowapi import Limiter, _rate_limit_exceeded_handler
 from slowapi.util import get_remote_address
+from slowapi.errors import RateLimitExceeded
 from prometheus_client import make_asgi_app, Counter, Gauge, Histogram
 import gradio as gr
 from gradio.routes import mount_gradio_app
 from transformers import pipeline
+import httpx
+# ───────────────────────────────
+# 1️⃣ Метрики и состояние
+# ───────────────────────────────
+REQ_COUNT = Counter("api_requests_total", "Total requests", ["endpoint"])
+SYS_USAGE = Gauge("system_usage_percent", "System metrics", ["resource"])
 INF_LATENCY = Histogram(
     "inference_latency_seconds",
     "Time spent in model inference",
+    buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, float("inf")]
 )
+state = {"cpu": 0.0, "ram": 0.0, "timestamp": 0.0, "model_ready": False}
+# ───────────────────────────────
+# 2️⃣ Лимитер с поддержкой прокси
+# ───────────────────────────────
 def get_real_ip(request: Request):
     """Return the client address used as the rate-limit key.
     Prefers the first entry of the ``X-Forwarded-For`` header and falls back
     to the address of the directly connected peer.
     NOTE(review): the header is supplied by the client, so it is only
     trustworthy when a proxy in front of the app overwrites it - confirm.
     """
     forwarded = request.headers.get("x-forwarded-for", "")
     # Entries are comma-separated and may be padded with spaces; an empty or
     # missing header leaves first_hop as "" so the fallback below is used.
     first_hop = forwarded.split(",")[0].strip()
     # get_remote_address() also covers the case where request.client is None.
     return first_hop or get_remote_address(request)
 limiter = Limiter(key_func=get_real_ip)
+# ───────────────────────────────
+# 3️⃣ Модель (лёгкая версия GPT-2)
+# ───────────────────────────────
 class ErohaModel:
     pipe = None
     @classmethod
     def get_pipe(cls):
         if cls.pipe is None:
+            print("[ErohaCore] 🧠 Loading distilgpt2 model (lightweight)...")
+            cls.pipe = pipeline("text-generation", model="distilgpt2")
             state["model_ready"] = True
+            print("[ErohaCore] ✅ Model ready.")
         return cls.pipe
+# ───────────────────────────────
+# 4️⃣ Lifespan и фоновая задача мониторинга
+# ───────────────────────────────
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    stop_event = asyncio.Event()
+    print("[ErohaCore] 🟢 CoolDown mode active — smart resource control enabled")
+    async def background_metrics():
+        """Фоновый сбор метрик с авто-регулировкой частоты"""
+        while not stop_event.is_set():
+            try:
+                cpu = psutil.cpu_percent()
+                ram = psutil.virtual_memory().percent
+                state["cpu"], state["ram"] = cpu, ram
+                state["timestamp"] = asyncio.get_event_loop().time()
+                SYS_USAGE.labels(resource="cpu").set(cpu)
+                SYS_USAGE.labels(resource="ram").set(ram)
+                # Печать состояния в консоль
+                print(f"[Monitor] CPU: {cpu:.1f}% | RAM: {ram:.1f}% | Next check in 60s")
+                # Smart CoolDown
+                if cpu > 85 or ram > 90:
+                    print(f"[ErohaCore] ⚠️ High load detected → pausing background tasks for 5 min")
+                    await asyncio.sleep(300)
+                else:
+                    await asyncio.sleep(60)
+            except Exception as e:
+                print(f"[Metrics Error] {e}")
+                await asyncio.sleep(60)
+    task = asyncio.create_task(background_metrics())
+    yield
+    stop_event.set()
+    await asyncio.gather(task, return_exceptions=True)
+# ───────────────────────────────
+# 5️⃣ FastAPI-ядро
+# ───────────────────────────────
+app = FastAPI(title="Eroha v6.5-CoolDown", lifespan=lifespan)
+app.state.limiter = limiter
+app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
+# Prometheus endpoint
+metrics_app = make_asgi_app()
+app.mount("/metrics/prom", metrics_app)
+@app.get("/health")
+async def health():
+    return {"status": "ok", "uptime": round(state["timestamp"], 2)}
+@app.get("/metrics")
+async def get_metrics():
+    """Return the shared ``state`` dict (cpu, ram, timestamp, model_ready).
+
+    The values are whatever the background sampler and the model loader last
+    wrote; nothing is measured at request time.
+    """
+    return state
 @app.post("/inference")
 @limiter.limit("5/minute")
 async def inference(request: Request):
     REQ_COUNT.labels(endpoint="/inference").inc()
     data = await request.json()
     prompt = data.get("prompt", "")
     loop = asyncio.get_event_loop()
+    start = time.perf_counter()
     with INF_LATENCY.time():
         res = await loop.run_in_executor(
+            None,
+            lambda: ErohaModel.get_pipe()(prompt, max_length=50)
         )
+    latency = time.perf_counter() - start
     return {
+        "result": res[0]["generated_text"],
         "latency_sec": round(latency, 3),
         "cpu": state["cpu"],
         "ram": state["ram"]
     }
+# ───────────────────────────────
+# 6️⃣ Gradio Dashboard
+# ───────────────────────────────
+async def check_health_ui():
+    try:
+        async with httpx.AsyncClient(timeout=1) as client:
+            r = await client.get("http://localhost:7860/health")
+            if r.status_code == 200:
+                return "<div style='color:lime;font-size:18px;'>🟢 API ONLINE</div>"
+    except:
+        pass
+    return "<div style='color:red;font-size:18px;'>🔴 API OFFLINE</div>"
+# Dashboard layout: a health badge on top, then two tabs (text generation and
+# system metrics).
+with gr.Blocks(title="Eroha v6.5-CoolDown Dashboard", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# ⚙️ Eroha v6.5-CoolDown Stable")
+    with gr.Row():
+        health_status = gr.HTML("<div style='font-size:18px;'>🟡 Checking...</div>")
+    with gr.Tabs():
+        with gr.TabItem("Inference"):
+            inp = gr.Textbox(label="Prompt", placeholder="Type here...")
+            out = gr.Textbox(label="Model Output")
+            btn = gr.Button("Generate", variant="primary")
+            # Calls the shared pipeline directly (not via the /inference
+            # route) and shows only the generated text.
+            btn.click(
+                lambda x: ErohaModel.get_pipe()(x, max_length=50)[0]["generated_text"],
+                inputs=inp, outputs=out
+            )
+        with gr.TabItem("System Monitor"):
+            cpu_box = gr.Number(label="CPU %")
+            ram_box = gr.Number(label="RAM %")
+            gr.Markdown("> Metrics also exported to `/metrics/prom`")
+    # Auto-refresh every 30 seconds
+    demo.load(check_health_ui, outputs=[health_status], every=30)
+    # Reads the values last written to `state`; nothing is measured here.
+    demo.load(lambda: (state["cpu"], state["ram"]), outputs=[cpu_box, ram_box], every=30)
+# ───────────────────────────────
+# 7️⃣ Запуск
+# ───────────────────────────────
 app = mount_gradio_app(app, demo, path="/")
 if __name__ == "__main__":