ruslanmv committed
Commit fd95484 · 1 Parent(s): 4740c16

First working version of chat

app/core/config.py CHANGED
@@ -1,14 +1,14 @@
 from __future__ import annotations
-import os
-import yaml
+import os, yaml
 from pydantic import BaseModel, AnyHttpUrl
 from typing import Optional
 
 class ModelCfg(BaseModel):
-    name: str = "meta-llama/Meta-Llama-3-8B-Instruct"
+    name: str = "HuggingFaceH4/zephyr-7b-beta"
     fallback: str = "mistralai/Mistral-7B-Instruct-v0.2"
     max_new_tokens: int = 256
     temperature: float = 0.2
+    provider: Optional[str] = None  # NEW
 
 class LimitsCfg(BaseModel):
     rate_per_min: int = 60
@@ -30,26 +30,24 @@ class Settings(BaseModel):
     rag: RagCfg = RagCfg()
     matrixhub: MatrixHubCfg = MatrixHubCfg()
     security: SecurityCfg = SecurityCfg()
+    chat_backend: str = "router"  # NEW (reserved)
+    chat_stream: bool = True  # NEW
 
     @staticmethod
     def load() -> Settings:
-        """Loads settings from YAML and overrides with environment variables."""
         path = os.getenv("SETTINGS_FILE", "configs/settings.yaml")
         data = {}
         if os.path.exists(path):
             with open(path, "r", encoding="utf-8") as f:
                 data = yaml.safe_load(f) or {}
-
         settings = Settings.model_validate(data)
 
-        # Environment variable overrides
-        if "MODEL_NAME" in os.environ:
-            settings.model.name = os.environ["MODEL_NAME"]
-        if "INDEX_DATASET" in os.environ:
-            settings.rag.index_dataset = os.environ["INDEX_DATASET"]
-        if "RATE_LIMITS" in os.environ:
-            settings.limits.rate_per_min = int(os.environ["RATE_LIMITS"])
-        if "ADMIN_TOKEN" in os.environ:
-            settings.security.admin_token = os.environ["ADMIN_TOKEN"]
-
+        # Env overrides
+        if "MODEL_NAME" in os.environ: settings.model.name = os.environ["MODEL_NAME"]
+        if "MODEL_FALLBACK" in os.environ: settings.model.fallback = os.environ["MODEL_FALLBACK"]
+        if "MODEL_PROVIDER" in os.environ: settings.model.provider = os.environ["MODEL_PROVIDER"]
+        if "ADMIN_TOKEN" in os.environ: settings.security.admin_token = os.environ["ADMIN_TOKEN"]
+        if "RATE_LIMITS" in os.environ: settings.limits.rate_per_min = int(os.environ["RATE_LIMITS"])
+        if "HF_CHAT_BACKEND" in os.environ: settings.chat_backend = os.environ["HF_CHAT_BACKEND"].strip().lower()
+        if "CHAT_STREAM" in os.environ: settings.chat_stream = os.environ["CHAT_STREAM"].lower() in ("1", "true", "yes", "on")
         return settings
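
For reference, the override logic above can be exercised like this (a minimal sketch; the `app.core.config` import path is assumed from the repo layout):

import os
from app.core.config import Settings  # assumed package path

os.environ["MODEL_NAME"] = "HuggingFaceH4/zephyr-7b-beta"
os.environ["CHAT_STREAM"] = "false"

settings = Settings.load()   # reads configs/settings.yaml, then applies env overrides
print(settings.model.name)   # HuggingFaceH4/zephyr-7b-beta
print(settings.chat_stream)  # False ("false" is not in the truthy set "1"/"true"/"yes"/"on")
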
app/core/inference/client.py CHANGED
@@ -1,94 +1,144 @@
-import os
-import logging
-import httpx
-from typing import Optional, Any, Union
-from tenacity import retry, stop_after_attempt, wait_exponential
-
-logger = logging.getLogger(__name__)
-
-class HFClient:
-    def __init__(self, model: str, fallback: Optional[str] = None, timeout: int = 20):
-        self.model = model
-        self.fallback = fallback
-        self.timeout = timeout
-
-        token = os.getenv("HF_TOKEN")
-        if not token:
-            raise ValueError("HF_TOKEN environment variable is not set. Put it in .env or export it before starting.")
-
-        self.headers = {
-            "Authorization": f"Bearer {token}",
-            "Accept": "application/json",
-        }
-        self.api_base = "https://api-inference.huggingface.co/models"
-
-    async def _post(self, model: str, payload: dict) -> Any:
-        url = f"{self.api_base}/{model}"
-        # wait_for_model=true is helpful if the container is cold
-        params = {"wait_for_model": "true"}
-        async with httpx.AsyncClient(timeout=self.timeout) as client:
-            r = await client.post(url, headers=self.headers, json=payload, params=params)
-            r.raise_for_status()
-            return r.json()
-
-    @staticmethod
-    def _extract_text(data: Union[dict, list, str]) -> str:
-        # HF can return list[{"generated_text": "..."}] or {"generated_text": "..."} or str
-        if isinstance(data, list) and data and isinstance(data[0], dict) and "generated_text" in data[0]:
-            return str(data[0]["generated_text"])
-        if isinstance(data, dict) and "generated_text" in data:
-            return str(data["generated_text"])
-        if isinstance(data, str):
-            return data
-        # Some serverless returns {"error": "..."} with 200—handle gently
-        if isinstance(data, dict) and "error" in data:
-            raise RuntimeError(f"Hugging Face error: {data['error']}")
-        raise RuntimeError(f"Unexpected HF response format: {data!r}")
-
-    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=8))
-    async def _generate_once(self, model: str, prompt: str, max_new_tokens: int, temperature: float) -> str:
-        payload = {
-            "inputs": prompt,
-            "parameters": {
-                "max_new_tokens": max(1, int(max_new_tokens)),
-                "temperature": float(max(temperature, 0.01)),
-                "return_full_text": False,
-            },
-        }
-        data = await self._post(model, payload)
-        return self._extract_text(data)
-
-    async def generate(self, prompt: str, max_new_tokens: int, temperature: float) -> str:
-        # Try primary
-        try:
-            return await self._generate_once(self.model, prompt, max_new_tokens, temperature)
-        except httpx.HTTPStatusError as e:
-            code = e.response.status_code
-            body = e.response.text
-            logger.error("HTTP error from HF API for model %s: %s", self.model, body)
-            # If not authorized / not found / gated, try fallback if defined
-            if code in (401, 403, 404) and self.fallback and self.fallback != self.model:
-                logger.warning("Falling back to model %s due to %s", self.fallback, code)
-                try:
-                    return await self._generate_once(self.fallback, prompt, max_new_tokens, temperature)
-                except Exception:
-                    # re-raise original meaningful error below
-                    pass
-            # Give a readable hint for common cause with Llama
-            if code in (401, 403, 404) and "meta-llama" in self.model.lower():
-                raise PermissionError(
-                    "Hugging Face returned 404/403 for a gated model. "
-                    "Make sure your HF account accepted the model license and your HF_TOKEN has access. "
-                    f"Model={self.model}"
-                ) from e
-            raise
-        except Exception as e:
-            logger.error("Failed to call HF API for model %s: %s", self.model, e)
-            # Try fallback for transient or parsing errors
-            if self.fallback and self.fallback != self.model:
-                try:
-                    logger.warning("Falling back to model %s due to generic failure", self.fallback)
-                    return await self._generate_once(self.fallback, prompt, max_new_tokens, temperature)
-                except Exception:
-                    pass
-            raise
+import os, json, time, logging
+from typing import Dict, List, Optional, Iterator, Tuple
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+ROUTER_URL = "https://router.huggingface.co/v1/chat/completions"
+
+def _require_token() -> str:
+    tok = os.getenv("HF_TOKEN")
+    if not tok:
+        raise ValueError("HF_TOKEN is not set. Put it in .env or export it before starting.")
+    return tok
+
+def _model_with_provider(model: str, provider: Optional[str]) -> str:
+    if provider and ":" not in model:
+        return f"{model}:{provider}"
+    return model
+
+def _mk_messages(system_prompt: Optional[str], user_text: str) -> List[Dict[str, str]]:
+    msgs: List[Dict[str, str]] = []
+    if system_prompt:
+        msgs.append({"role": "system", "content": system_prompt})
+    msgs.append({"role": "user", "content": user_text})
+    return msgs
+
+def _timeout_tuple(connect: float = 10.0, read: float = 60.0) -> Tuple[float, float]:
+    # requests timeout is (connect, read)
+    return (connect, read)
+
+class RouterRequestsClient:
+    """
+    Simple requests-only client for HF Router Chat Completions.
+    Supports non-streaming (returns str) and streaming (yields token strings).
+    """
+    def __init__(self, model: str, fallback: Optional[str] = None, provider: Optional[str] = None,
+                 max_retries: int = 2, connect_timeout: float = 10.0, read_timeout: float = 60.0):
+        self.model = model
+        self.fallback = fallback if fallback != model else None
+        self.provider = provider
+        self.headers = {"Authorization": f"Bearer {_require_token()}"}
+        self.max_retries = max(0, int(max_retries))
+        self.timeout = _timeout_tuple(connect_timeout, read_timeout)
+
+    # -------- Non-stream (single text) --------
+    def chat_nonstream(self, system_prompt: Optional[str], user_text: str,
+                       max_tokens: int, temperature: float) -> str:
+        payload = {
+            "model": _model_with_provider(self.model, self.provider),
+            "messages": _mk_messages(system_prompt, user_text),
+            "temperature": float(temperature),
+            "max_tokens": int(max_tokens),
+            "stream": False,
+        }
+        text, ok = self._try_once(payload)
+        if ok:
+            return text
+
+        # fallback (if configured)
+        if self.fallback:
+            payload["model"] = _model_with_provider(self.fallback, self.provider)
+            text, ok = self._try_once(payload)
+            if ok:
+                return text
+
+        raise RuntimeError(f"Chat non-stream failed: model={self.model} fallback={self.fallback}")
+
+    def _try_once(self, payload: dict) -> Tuple[str, bool]:
+        last_err = None
+        for attempt in range(self.max_retries + 1):
+            try:
+                r = requests.post(ROUTER_URL, headers=self.headers, json=payload, timeout=self.timeout)
+                if r.status_code >= 400:
+                    logger.error("Router error %s: %s", r.status_code, r.text)
+                    last_err = RuntimeError(f"{r.status_code}: {r.text}")
+                    # do not hard-spin; brief pause
+                    time.sleep(min(1.5 * (attempt + 1), 3.0))
+                    continue
+                data = r.json()
+                return data["choices"][0]["message"]["content"], True
+            except Exception as e:
+                logger.error("Router request failure: %s", e)
+                last_err = e
+                time.sleep(min(1.5 * (attempt + 1), 3.0))
+        if last_err:
+            logger.error("Router exhausted retries: %s", last_err)
+        return "", False
+
+    # -------- Streaming (yield token deltas) --------
+    def chat_stream(self, system_prompt: Optional[str], user_text: str,
+                    max_tokens: int, temperature: float) -> Iterator[str]:
+        payload = {
+            "model": _model_with_provider(self.model, self.provider),
+            "messages": _mk_messages(system_prompt, user_text),
+            "temperature": float(temperature),
+            "max_tokens": int(max_tokens),
+            "stream": True,
+        }
+        # primary
+        ok = False
+        for token in self._stream_once(payload):
+            ok = True
+            yield token
+        if ok:
+            return
+        # fallback stream if primary produced nothing (or died immediately)
+        if self.fallback:
+            payload["model"] = _model_with_provider(self.fallback, self.provider)
+            for token in self._stream_once(payload):
+                yield token
+
+    def _stream_once(self, payload: dict) -> Iterator[str]:
+        try:
+            with requests.post(ROUTER_URL, headers=self.headers, json=payload, stream=True, timeout=self.timeout) as r:
+                if r.status_code >= 400:
+                    logger.error("Router stream error %s: %s", r.status_code, r.text)
+                    return
+                for line in r.iter_lines(decode_unicode=True):
+                    if not line:
+                        continue
+                    if not line.startswith("data:"):
+                        continue
+                    data = line[len("data:"):].strip()
+                    if data == "[DONE]":
+                        return
+                    try:
+                        obj = json.loads(data)
+                        # OpenAI-style: delta tokens
+                        delta = obj["choices"][0]["delta"].get("content", "")
+                        if delta:
+                            yield delta
+                    except Exception as e:
+                        logger.warning("Stream JSON parse issue: %s | line=%r", e, line)
+                        continue
+        except Exception as e:
+            logger.error("Stream request failure: %s", e)
+            return
+
+    # -------- Planning (non-stream) --------
+    def plan_nonstream(self, system_prompt: str, user_text: str,
+                       max_tokens: int, temperature: float) -> str:
+        """Use same chat/completions but always non-stream for planning."""
+        return self.chat_nonstream(system_prompt, user_text, max_tokens, temperature)
 
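A quick usage sketch for the new client (assumes HF_TOKEN is exported; model and provider values mirror configs/settings.yaml below):

from app.core.inference.client import RouterRequestsClient  # assumed package path

client = RouterRequestsClient(
    model="HuggingFaceH4/zephyr-7b-beta",
    fallback="microsoft/Phi-3-mini-4k-instruct",
    provider="featherless-ai",
)

# Non-streaming: a single string comes back (or RuntimeError after retries + fallback).
print(client.chat_nonstream("You are terse.", "Say hi.", max_tokens=32, temperature=0.2))

# Streaming: consume OpenAI-style delta tokens as they arrive.
for delta in client.chat_stream("You are terse.", "Count to three.", max_tokens=32, temperature=0.2):
    print(delta, end="", flush=True)
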
app/services/chat_service.py CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 from ..core.config import Settings
-from ..core.inference.client import HFClient
+from ..core.inference.client import RouterRequestsClient
 
 SYSTEM_PROMPT = (
     "You are MATRIX-AI, a concise, helpful assistant for the Matrix EcoSystem. "
@@ -10,16 +10,27 @@ SYSTEM_PROMPT = (
 class ChatService:
     def __init__(self, settings: Settings):
         self.settings = settings
-        self.client = HFClient(
+        self.client = RouterRequestsClient(
             model=settings.model.name,
             fallback=settings.model.fallback,
+            provider=settings.model.provider,
+            max_retries=2,
+            connect_timeout=10.0,
+            read_timeout=60.0,
         )
 
     async def answer(self, query: str) -> str:
-        prompt = f"{SYSTEM_PROMPT}\n\nUser: {query}\nAssistant:"
-        text = await self.client.generate(
-            prompt=prompt,
-            max_new_tokens=self.settings.model.max_new_tokens,
+        # non-stream (compatible with current UI)
+        return self.client.chat_nonstream(
+            SYSTEM_PROMPT, query,
+            max_tokens=self.settings.model.max_new_tokens,
+            temperature=self.settings.model.temperature,
+        )
+
+    # Expose a generator for streaming endpoints
+    def stream_answer(self, query: str):
+        return self.client.chat_stream(
+            SYSTEM_PROMPT, query,
+            max_tokens=self.settings.model.max_new_tokens,
             temperature=self.settings.model.temperature,
         )
-        return (text or "").strip()
 
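`stream_answer` returns a plain token generator, so a streaming endpoint has to wrap it in a transport of its choice. A hypothetical FastAPI/SSE wiring (route name and imports are assumptions, not part of this commit):

from fastapi import APIRouter
from fastapi.responses import StreamingResponse

from app.core.config import Settings
from app.services.chat_service import ChatService

router = APIRouter()

@router.get("/v1/chat/stream")
def chat_stream_endpoint(q: str):
    service = ChatService(Settings.load())

    def sse():
        # Re-wrap raw token deltas as SSE "data:" lines for the browser.
        for delta in service.stream_answer(q):
            yield f"data: {delta}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(sse(), media_type="text/event-stream")
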
app/services/plan_service.py CHANGED
@@ -1,56 +1,195 @@
+from __future__ import annotations
+
+import asyncio
 import hashlib
 import json
 import logging
 from pathlib import Path
+from typing import Any, Dict, Optional
+
 from ..core.schema import PlanRequest, PlanResponse
 from ..core.config import Settings
-from ..core.inference.client import HFClient
 from ..core.redact import redact
+from ..core.inference.client import RouterRequestsClient
 
 logger = logging.getLogger(__name__)
-_PROMPT_TEMPLATE: str | None = None
 
+# ----------------------------
+# Prompts
+# ----------------------------
+SYSTEM_PLANNER = (
+    "You are MATRIX-AI Planner. Produce a short, safe JSON plan. "
+    "Bounded steps, minimal risk, and explain briefly."
+)
+
+_PROMPT_TEMPLATE_CACHE: Optional[str] = None
+
+
 def _get_prompt_template() -> str:
-    global _PROMPT_TEMPLATE
-    if _PROMPT_TEMPLATE is None:
-        try:
-            path = Path(__file__).parent.parent / "core/prompts/plan.txt"
-            _PROMPT_TEMPLATE = path.read_text(encoding="utf-8")
-        except FileNotFoundError:
-            logger.error("FATAL: core/prompts/plan.txt not found.")
-            _PROMPT_TEMPLATE = "Generate a JSON plan with keys: plan_id, steps, risk, explanation."
-    return _PROMPT_TEMPLATE
-
-def _create_final_prompt(req: PlanRequest) -> str:
+    """
+    Load core/prompts/plan.txt once (cached).
+    Fallback to a minimal instruction if missing.
+    """
+    global _PROMPT_TEMPLATE_CACHE
+    if _PROMPT_TEMPLATE_CACHE is not None:
+        return _PROMPT_TEMPLATE_CACHE
+
+    try:
+        path = Path(__file__).parent.parent / "core" / "prompts" / "plan.txt"
+        _PROMPT_TEMPLATE_CACHE = path.read_text(encoding="utf-8")
+    except FileNotFoundError:
+        logger.error("FATAL: core/prompts/plan.txt not found. Using fallback template.")
+        _PROMPT_TEMPLATE_CACHE = (
+            "Generate a JSON plan with keys: plan_id, steps, risk, explanation. "
+            "Keep steps short, safe, and auditable."
+        )
+    return _PROMPT_TEMPLATE_CACHE
+
+
+def _render_context(req: PlanRequest) -> str:
+    """
+    Render a compact context string from the request.
+    (Matches your earlier shape: app_id, symptoms, lkg, constraints.)
+    """
+    app_id = getattr(req.context, "app_id", None) or getattr(req.context, "entity_uid", "unknown")
+    symptoms = getattr(req.context, "symptoms", []) or []
+    lkg = getattr(req.context, "lkg", None) or getattr(req.context, "lkg_version", None) or "N/A"
+
+    max_steps = getattr(req.constraints, "max_steps", 3)
+    risk = getattr(req.constraints, "risk", "low")
+
+    return (
+        "Context:\n"
+        f"- app_id: {app_id}\n"
+        f"- symptoms: {', '.join(symptoms) if symptoms else 'none'}\n"
+        f"- lkg_version: {lkg}\n"
+        f"- constraints: max_steps={max_steps}, risk={risk}"
+    )
+
+
+def _build_prompt(req: PlanRequest) -> str:
+    """
+    Compose final prompt with system guidance + template + redacted context.
+    """
     template = _get_prompt_template()
-    context_str = f"Context:\n- app_id: {req.context.app_id}\n- symptoms: {', '.join(req.context.symptoms)}\n- lkg_version: {req.context.lkg or 'N/A'}\n- constraints: max_steps={req.constraints.max_steps}, risk={req.constraints.risk}"
+    context_str = _render_context(req)
     safe_context = redact(context_str)
-    return f"{template}\n\n{safe_context}\n\nJSON Response:"
+
+    # You can tweak ordering if desired; this is clear and stable.
+    return f"{SYSTEM_PLANNER}\n\n{template}\n\n{safe_context}\n\nJSON Response:"
 
-def _parse_llm_output(raw_output: str, context_str: str) -> dict:
+
+# ----------------------------
+# Output parsing
+# ----------------------------
+def _extract_json_block(text: str) -> Dict[str, Any]:
+    """
+    Try hard to recover a JSON object from LLM text.
+    Supports ```json fences and "first { ... last }".
+    Raises ValueError if no JSON object can be extracted.
+    """
+    s = text.strip()
+
+    # Fenced block: ```json ... ```
+    if "```" in s:
+        fence_start = s.find("```")
+        lang_tag = s.find("\n", fence_start + 3)
+        if lang_tag != -1:
+            fence_close = s.find("```", lang_tag + 1)
+            if fence_close != -1:
+                fenced = s[lang_tag + 1 : fence_close].strip()
+                return json.loads(fenced)
+
+    # Plain: first "{" to last "}"
+    first = s.find("{")
+    last = s.rfind("}")
+    if first != -1 and last != -1 and last > first:
+        candidate = s[first : last + 1]
+        return json.loads(candidate)
+
+    raise ValueError("No valid JSON object found in output.")
+
+
+def _safe_parse_or_fallback(raw_output: str, context_for_id: str) -> Dict[str, Any]:
+    """
+    Parse the model output into a dict, or return a safe fallback plan.
+    """
     try:
-        start = raw_output.find('{')
-        end = raw_output.rfind('}')
-        if start != -1 and end != -1 and end > start:
-            json_str = raw_output[start:end+1]
-            return json.loads(json_str)
-        raise ValueError("No valid JSON object found in output.")
-    except (json.JSONDecodeError, ValueError) as e:
-        logger.warning(f"LLM output parsing failed: {e}. Applying safe fallback plan.")
+        obj = _extract_json_block(raw_output)
+        if not isinstance(obj, dict):
+            raise ValueError("Top-level JSON is not an object.")
+
+        # Minimal normalization: ensure keys exist
+        if "plan_id" not in obj or not obj["plan_id"]:
+            obj["plan_id"] = hashlib.md5(context_for_id.encode()).hexdigest()[:12]
+        if "steps" not in obj or not obj["steps"]:
+            obj["steps"] = [
+                "Pin to the last-known-good (LKG) version and re-run health probes."
+            ]
+        if "risk" not in obj or not obj["risk"]:
+            obj["risk"] = "low"
+        if "explanation" not in obj or not obj["explanation"]:
+            obj["explanation"] = "Autofilled explanation."
+
+        return obj
+
+    except Exception as e:
+        logger.warning("LLM output parsing failed: %s. Applying fallback plan.", e)
         return {
-            "plan_id": hashlib.md5(context_str.encode()).hexdigest()[:12],
-            "steps": ["Pin to the last-known-good (LKG) version and re-run health probes."],
+            "plan_id": hashlib.md5(context_for_id.encode()).hexdigest()[:12],
+            "steps": [
+                "Pin to the last-known-good (LKG) version and re-run health probes."
+            ],
             "risk": "low",
-            "explanation": "Fallback plan: A safe default was applied due to a model output parsing error."
+            "explanation": (
+                "Fallback plan: A safe default was applied due to a model output parsing error."
+            ),
         }
 
+
+# ----------------------------
+# Service (requests-only, non-stream)
+# ----------------------------
+class PlanService:
+    """
+    Planner uses HF Router (requests-only). Always non-stream for plan generation.
+    """
+
+    def __init__(self, settings: Settings):
+        self.settings = settings
+        self.client = RouterRequestsClient(
+            model=settings.model.name,
+            fallback=settings.model.fallback,
+            provider=settings.model.provider,
+            max_retries=2,
+            connect_timeout=10.0,
+            read_timeout=60.0,
+        )
+
+    async def generate(self, req: PlanRequest) -> PlanResponse:
+        """
+        Build prompt -> call Router (non-stream) -> robustly parse -> PlanResponse.
+        """
+        final_prompt = _build_prompt(req)
+        # run the blocking requests call in a worker thread to avoid blocking the event loop
+        raw_text = await asyncio.to_thread(
+            self.client.plan_nonstream,
+            SYSTEM_PLANNER,
+            final_prompt,
+            self.settings.model.max_new_tokens,
+            self.settings.model.temperature,
+        )
+        parsed = _safe_parse_or_fallback(raw_text, final_prompt)
+        return PlanResponse.model_validate(parsed)
+
+
+# ----------------------------
+# Back-compat function (keeps existing imports working)
+# ----------------------------
 async def generate_plan(req: PlanRequest, settings: Settings) -> PlanResponse:
-    final_prompt = _create_final_prompt(req)
-    client = HFClient(model=settings.model.name)
-    raw_response = await client.generate(
-        prompt=final_prompt,
-        max_new_tokens=settings.model.max_new_tokens,
-        temperature=settings.model.temperature,
-    )
-    parsed_data = _parse_llm_output(raw_response, final_prompt)
-    return PlanResponse.model_validate(parsed_data)
+    """
+    Backward-compatible entry point:
+    previous code called services.plan.generate_plan(...)
+    """
+    service = PlanService(settings)
+    return await service.generate(req)
 
 
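The parsing path is easiest to see on two representative model outputs (a behavior sketch using the helper defined above; the import path is assumed from the repo layout):

from app.services.plan_service import _extract_json_block

fenced = 'Here is the plan:\n```json\n{"plan_id": "abc", "steps": ["s1"], "risk": "low", "explanation": "ok"}\n```'
plain = 'Sure! {"plan_id": "abc123", "steps": ["s1"], "risk": "low", "explanation": "ok"} Anything else?'

print(_extract_json_block(fenced)["plan_id"])  # abc    (recovered from the ```json fence)
print(_extract_json_block(plain)["plan_id"])   # abc123 (recovered from first "{" to last "}")
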
configs/settings.yaml CHANGED
@@ -1,21 +1,24 @@
 model:
-  name: "HuggingFaceH4/zephyr-7b-beta" # good balance of speed and capability
-  #name: "mistralai/Mistral-7B-Instruct-v0.2" # capable, open, but large
-  #fallback: "HuggingFaceH4/zephyr-7b-beta" # smaller, faster, but less capable
-  fallback: "microsoft/Phi-3-mini-4k-instruct" # smaller, faster, but less capable
+  name: "HuggingFaceH4/zephyr-7b-beta"
+  fallback: "microsoft/Phi-3-mini-4k-instruct"
+  provider: "featherless-ai" # NEW: makes "model:provider" for Router
   max_new_tokens: 256
   temperature: 0.2
 
+# Chat backend + mode (requests → Router only)
+chat_backend: "router" # reserved (future multi-backend)
+chat_stream: true      # default streaming behavior for /v1/chat/stream
+
 limits:
   rate_per_min: 60
   cache_size: 256
 
 rag:
-  index_dataset: "" # e.g., "your-username/matrix-ai-index"
+  index_dataset: ""
   top_k: 4
 
 matrixhub:
   base_url: "https://api.matrixhub.io"
 
 security:
-  admin_token: "" # Should be set via env var
+  admin_token: ""
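
The new `provider` key is what turns the model id into the "model:provider" form the Router expects; a quick check with the helper from app/core/inference/client.py:

from app.core.inference.client import _model_with_provider

print(_model_with_provider("HuggingFaceH4/zephyr-7b-beta", "featherless-ai"))
# HuggingFaceH4/zephyr-7b-beta:featherless-ai

print(_model_with_provider("HuggingFaceH4/zephyr-7b-beta:featherless-ai", "featherless-ai"))
# unchanged: the suffix is never added twice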