Spaces:

Peterase
/

rag-api-node-1

Running

Peterase committed on 23 days ago

Commit

583c3c6

1 Parent(s): ebdd2fb

feat(intent): add 4-provider fallback chain for intent classification

Fallback chain (in order):
1. Groq llama-3.1-8b-instant - 14,400 free RPD, ~50ms (PRIMARY)
2. Gemini Flash - 1,500 free RPD, ~200ms (FALLBACK 1)
3. OpenRouter auto router - free model pool, ~300ms (FALLBACK 2)
4. HuggingFace Inference API - ~300 RPH, ~2s (FALLBACK 3)
5. Default NEWS_GENERAL - always works, 0ms (SAFETY NET)

All providers use the same classification prompt and parsing logic.
OpenRouter uses openrouter/auto, which selects the best available free model.
HuggingFace uses Llama-3.2-3B-Instruct (fast, small, good for classification).
Added OPENROUTER_API_KEY to config.py and the .env template.

Files changed (3) hide show

.env +5 -0
src/core/config.py +3 -0
src/infrastructure/adapters/intent_classifier_v2.py +167 -81

.env CHANGED Viewed

@@ -108,3 +108,8 @@ SEARXNG_ENABLED=true
 SEARXNG_BASE_URL=http://searxng:8080
 SEARXNG_TIMEOUT=5.0
 SEARXNG_MAX_RESULTS=10

 SEARXNG_BASE_URL=http://searxng:8080
 SEARXNG_TIMEOUT=5.0
 SEARXNG_MAX_RESULTS=10
+# --- OpenRouter (FREE model pool — fallback for intent classification) ---
+# Get free key: https://openrouter.ai/keys (no credit card required)
+# Free models: Llama 4, Qwen 3, DeepSeek, Gemma 3 and more
+OPENROUTER_API_KEY=your-openrouter-api-key-here

src/core/config.py CHANGED Viewed

@@ -61,6 +61,9 @@ class Settings(BaseSettings):
     HF_TOKEN: str = os.getenv("HF_TOKEN", "")
     HF_MODEL: str = os.getenv("HF_MODEL", "meta-llama/Llama-3.1-8B-Instruct")
     # Ollama — local inference
     OLLAMA_HOST: str = os.getenv("OLLAMA_HOST", "http://localhost:11434")
     OLLAMA_MODEL: str = os.getenv("OLLAMA_MODEL", "llama3.2")

     HF_TOKEN: str = os.getenv("HF_TOKEN", "")
     HF_MODEL: str = os.getenv("HF_MODEL", "meta-llama/Llama-3.1-8B-Instruct")
+    # OpenRouter — free model pool | https://openrouter.ai/keys
+    OPENROUTER_API_KEY: str = os.getenv("OPENROUTER_API_KEY", "")
     # Ollama — local inference
     OLLAMA_HOST: str = os.getenv("OLLAMA_HOST", "http://localhost:11434")
     OLLAMA_MODEL: str = os.getenv("OLLAMA_MODEL", "llama3.2")

src/infrastructure/adapters/intent_classifier_v2.py CHANGED Viewed

@@ -1,24 +1,27 @@
 """
-Intent Classifier v4 — LLM-Powered (Hybrid)
 Architecture:
-  Layer 1: Instant safety net (0ms)   — 6 exact strings only
-  Layer 2: LLM classification (50ms)  — llama-3.1-8b-instant via Groq
-  Layer 3: Safe default (0ms)         — NEWS_GENERAL if LLM fails
 Why LLM instead of hard-coded rules:
   - 99%+ accuracy vs ~75% for keyword matching
-  - Handles any language naturally (Amharic, Arabic, Somali...)
   - Handles any topic (new conflicts, new places, new events)
-  - Zero maintenance — no keyword lists to update
   - Understands context ("Abiy's latest move" → NEWS_TEMPORAL)
-Model choice: llama-3.1-8b-instant on Groq
-  - 14,400 free requests/day (vs 1,000 for 70B)
-  - Intent is a simple 4-choice task — 8B is more than enough
-  - ~50ms latency
-  - Preserves 70B quota for actual RAG answer generation
-  - Fallback: Gemini Flash → default NEWS_GENERAL
 """
 import logging
@@ -44,7 +47,7 @@ _INSTANT_OTHER = {
 # ═══════════════════════════════════════════════════════════════════════════════
-# CLASSIFICATION PROMPT
 # ═══════════════════════════════════════════════════════════════════════════════
 _CLASSIFY_PROMPT = """You are an intent classifier for ARKI AI, a news assistant focused on Ethiopia and Africa.
@@ -78,7 +81,7 @@ Category:"""
 class IntentResult:
     intent: str            # NEWS_TEMPORAL | NEWS_HISTORICAL | NEWS_GENERAL | OTHER
     confidence: float      # 0.0 – 1.0
-    method: str            # instant | llm_groq | llm_gemini | default
     inference_time_ms: float
     query_complexity: str  # vague | simple | medium | complex
     sub_type: str          # general | conflict | humanitarian | identity | off_topic
@@ -106,23 +109,31 @@ class IntentResult:
 class IntentClassifierV2:
     """
-    LLM-powered intent classifier.
-    Uses llama-3.1-8b-instant (14,400 free RPD on Groq) for classification.
-    Falls back to Gemini Flash, then defaults to NEWS_GENERAL.
     """
-    # Groq endpoint — uses the fast 8B model, not the 70B used for answers
-    GROQ_BASE_URL = "https://api.groq.com/openai/v1/chat/completions"
-    CLASSIFICATION_MODEL = "llama-3.1-8b-instant"
     VALID_INTENTS = {"NEWS_TEMPORAL", "NEWS_HISTORICAL", "NEWS_GENERAL", "OTHER"}
     def __init__(self):
         self._groq_key: Optional[str] = None
         self._gemini_key: Optional[str] = None
         self._client = httpx.Client(timeout=5.0)
-        self._lock = threading.Lock()
         self._metrics = {
             "total": 0,
             "by_intent": {},
@@ -135,16 +146,37 @@ class IntentClassifierV2:
         """Load API keys from settings."""
         try:
             from src.core.config import settings
             key = settings.GROQ_API_KEY
             if key and key not in ("", "your-groq-api-key-here"):
                 self._groq_key = key
-                logger.info("✅ Intent classifier: Groq key loaded")
-            else:
-                logger.warning("Intent classifier: Groq key not set — will use fallback")
-            gem_key = settings.GEMINI_API_KEY
-            if gem_key and gem_key not in ("", "your-gemini-api-key-here"):
-                self._gemini_key = gem_key
         except Exception as e:
             logger.error(f"Intent classifier: failed to load keys: {e}")
@@ -160,80 +192,137 @@ class IntentClassifierV2:
         if ql in _INSTANT_OTHER:
             return self._result("OTHER", 1.0, "instant", t0, complexity, "identity")
-        # ── Layer 2: LLM classification ───────────────────────────────────────
-        # Try Groq first (fast 8B model, 14,400 RPD free)
         if self._groq_key:
-            intent = self._classify_with_groq(q)
             if intent:
                 return self._result(intent, 0.97, "llm_groq", t0, complexity,
                                     self._sub_type(q, intent))
-        # Try Gemini Flash as fallback
         if self._gemini_key:
-            intent = self._classify_with_gemini(q)
             if intent:
                 return self._result(intent, 0.95, "llm_gemini", t0, complexity,
                                     self._sub_type(q, intent))
-        # ── Layer 3: Safe default ─────────────────────────────────────────────
-        # Better to search and find nothing than to refuse
-        logger.warning(f"Intent classifier: all LLMs failed for '{q[:50]}' — defaulting to NEWS_GENERAL")
         return self._result("NEWS_GENERAL", 0.50, "default", t0, complexity, "general")
-    # ── LLM calls ─────────────────────────────────────────────────────────────
-    def _classify_with_groq(self, query: str) -> Optional[str]:
-        """Call Groq llama-3.1-8b-instant for intent classification."""
         try:
-            prompt = _CLASSIFY_PROMPT.format(query=query)
             response = self._client.post(
-                self.GROQ_BASE_URL,
-                headers={
-                    "Authorization": f"Bearer {self._groq_key}",
-                    "Content-Type": "application/json",
-                },
                 json={
-                    "model": self.CLASSIFICATION_MODEL,
-                    "messages": [{"role": "user", "content": prompt}],
-                    "max_tokens": 20,       # We only need 1 word
-                    "temperature": 0.0,     # Deterministic
-                    "stop": ["\n", " "],    # Stop after first word
                 },
-                timeout=4.0,
             )
             if response.status_code == 200:
-                content = response.json()["choices"][0]["message"]["content"].strip()
                 intent = self._parse_intent(content)
                 if intent:
-                    logger.debug(f"Groq classified '{query[:40]}' → {intent}")
                     return intent
-                logger.warning(f"Groq returned unexpected intent: '{content}'")
             elif response.status_code == 429:
-                logger.warning("Intent classifier: Groq rate limit hit")
             else:
-                logger.warning(f"Intent classifier: Groq returned {response.status_code}")
         except httpx.TimeoutException:
-            logger.warning("Intent classifier: Groq timeout (4s)")
         except Exception as e:
-            logger.error(f"Intent classifier: Groq error: {e}")
         return None
-    def _classify_with_gemini(self, query: str) -> Optional[str]:
-        """Call Gemini Flash as fallback classifier."""
         try:
-            prompt = _CLASSIFY_PROMPT.format(query=query)
-            url = (
-                f"https://generativelanguage.googleapis.com/v1beta/models/"
-                f"gemini-2.0-flash:generateContent?key={self._gemini_key}"
-            )
             response = self._client.post(
                 url,
                 json={
-                    "contents": [{"parts": [{"text": prompt}]}],
                     "generationConfig": {
                         "maxOutputTokens": 20,
                         "temperature": 0.0,
@@ -253,16 +342,18 @@ class IntentClassifierV2:
                 )
                 intent = self._parse_intent(content)
                 if intent:
-                    logger.debug(f"Gemini classified '{query[:40]}' → {intent}")
                     return intent
             elif response.status_code == 429:
-                logger.warning("Intent classifier: Gemini rate limit hit")
         except httpx.TimeoutException:
-            logger.warning("Intent classifier: Gemini timeout (4s)")
         except Exception as e:
-            logger.error(f"Intent classifier: Gemini error: {e}")
         return None
@@ -270,9 +361,8 @@ class IntentClassifierV2:
     def _parse_intent(self, raw: str) -> Optional[str]:
         """Parse LLM response to valid intent. Handles partial matches."""
-        cleaned = raw.strip().upper().replace(".", "").replace(":", "")
-        # Exact match
         if cleaned in self.VALID_INTENTS:
             return cleaned
@@ -294,22 +384,18 @@ class IntentClassifierV2:
             return "off_topic"
         ql = query.lower()
-        if any(w in ql for w in ("clash", "attack", "killed", "battle", "fano", "tplf", "military", "troops")):
             return "conflict"
-        if any(w in ql for w in ("displaced", "refugee", "aid", "humanitarian", "famine", "drought")):
             return "humanitarian"
         return "general"
     def _complexity(self, query: str) -> str:
         n = len(query.split())
-        if n == 0:
-            return "empty"
-        if n == 1:
-            return "vague"
-        if n <= 4:
-            return "simple"
-        if n <= 12:
-            return "medium"
         return "complex"
     def _result(

 """
+Intent Classifier v4 — LLM-Powered with 4-Provider Fallback Chain
 Architecture:
+  Layer 1: Instant safety net (0ms)       — 20 exact strings only
+  Layer 2: Groq llama-3.1-8b-instant      — 14,400 free RPD, ~50ms  (PRIMARY)
+  Layer 3: Gemini Flash fallback          — 1,500 free RPD,  ~200ms (FALLBACK 1)
+  Layer 4: OpenRouter free router         — free models pool, ~300ms (FALLBACK 2)
+  Layer 5: HuggingFace Inference API      — ~300 RPH,        ~2s    (FALLBACK 3)
+  Layer 6: Safe default                   — NEWS_GENERAL,    0ms    (ALWAYS WORKS)
 Why LLM instead of hard-coded rules:
   - 99%+ accuracy vs ~75% for keyword matching
+  - Handles any language naturally (Amharic, Arabic, Somali, French...)
   - Handles any topic (new conflicts, new places, new events)
+  - Zero maintenance — no keyword lists to update ever
   - Understands context ("Abiy's latest move" → NEWS_TEMPORAL)
+Provider selection rationale:
+  - Groq 8B:      14,400 RPD free — primary, fastest, cheapest
+  - Gemini Flash: 1,500 RPD free  — reliable fallback
+  - OpenRouter:   free model pool — auto-selects best available free model
+  - HuggingFace:  ~300 RPH free   — last resort (slower but always available)
+  - Default:      NEWS_GENERAL    — never fails, safe for user experience
 """
 import logging
 # ═══════════════════════════════════════════════════════════════════════════════
+# CLASSIFICATION PROMPT — same prompt used across all providers
 # ═══════════════════════════════════════════════════════════════════════════════
 _CLASSIFY_PROMPT = """You are an intent classifier for ARKI AI, a news assistant focused on Ethiopia and Africa.
 class IntentResult:
     intent: str            # NEWS_TEMPORAL | NEWS_HISTORICAL | NEWS_GENERAL | OTHER
     confidence: float      # 0.0 – 1.0
+    method: str            # instant | llm_groq | llm_gemini | llm_openrouter | llm_hf | default
     inference_time_ms: float
     query_complexity: str  # vague | simple | medium | complex
     sub_type: str          # general | conflict | humanitarian | identity | off_topic
 class IntentClassifierV2:
     """
+    LLM-powered intent classifier with 4-provider fallback chain.
+    Fallback order:
+      Groq 8B → Gemini Flash → OpenRouter Free → HuggingFace → Default
     """
+    # Provider endpoints
+    GROQ_URL = "https://api.groq.com/openai/v1/chat/completions"
+    GROQ_MODEL = "llama-3.1-8b-instant"   # 14,400 free RPD
+    GEMINI_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
+    OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
+    OPENROUTER_MODEL = "openrouter/auto"   # Auto-selects best available free model
+    HF_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-3B-Instruct/v1/chat/completions"
     VALID_INTENTS = {"NEWS_TEMPORAL", "NEWS_HISTORICAL", "NEWS_GENERAL", "OTHER"}
     def __init__(self):
         self._groq_key: Optional[str] = None
         self._gemini_key: Optional[str] = None
+        self._openrouter_key: Optional[str] = None
+        self._hf_token: Optional[str] = None
         self._client = httpx.Client(timeout=5.0)
         self._metrics = {
             "total": 0,
             "by_intent": {},
         """Load API keys from settings."""
         try:
             from src.core.config import settings
             key = settings.GROQ_API_KEY
             if key and key not in ("", "your-groq-api-key-here"):
                 self._groq_key = key
+            gem = settings.GEMINI_API_KEY
+            if gem and gem not in ("", "your-gemini-api-key-here"):
+                self._gemini_key = gem
+            # OpenRouter key (add OPENROUTER_API_KEY to .env)
+            try:
+                or_key = getattr(settings, "OPENROUTER_API_KEY", "")
+                if or_key and or_key not in ("", "your-openrouter-api-key-here"):
+                    self._openrouter_key = or_key
+            except Exception:
+                pass
+            # HuggingFace token
+            hf = settings.HF_TOKEN
+            if hf and hf not in ("", "your-hf-token-here"):
+                self._hf_token = hf
+            providers = []
+            if self._groq_key:     providers.append("Groq")
+            if self._gemini_key:   providers.append("Gemini")
+            if self._openrouter_key: providers.append("OpenRouter")
+            if self._hf_token:     providers.append("HuggingFace")
+            providers.append("Default")
+            logger.info(f"✅ Intent classifier providers: {' → '.join(providers)}")
         except Exception as e:
             logger.error(f"Intent classifier: failed to load keys: {e}")
         if ql in _INSTANT_OTHER:
             return self._result("OTHER", 1.0, "instant", t0, complexity, "identity")
+        # ── Layer 2: Groq llama-3.1-8b-instant (PRIMARY) ─────────────────────
         if self._groq_key:
+            intent = self._call_openai_compat(
+                url=self.GROQ_URL,
+                api_key=self._groq_key,
+                model=self.GROQ_MODEL,
+                query=q,
+                provider="groq",
+            )
             if intent:
                 return self._result(intent, 0.97, "llm_groq", t0, complexity,
                                     self._sub_type(q, intent))
+        # ── Layer 3: Gemini Flash (FALLBACK 1) ────────────────────────────────
         if self._gemini_key:
+            intent = self._call_gemini(q)
             if intent:
                 return self._result(intent, 0.95, "llm_gemini", t0, complexity,
                                     self._sub_type(q, intent))
+        # ── Layer 4: OpenRouter free router (FALLBACK 2) ─────────────────────
+        if self._openrouter_key:
+            intent = self._call_openai_compat(
+                url=self.OPENROUTER_URL,
+                api_key=self._openrouter_key,
+                model=self.OPENROUTER_MODEL,
+                query=q,
+                provider="openrouter",
+                extra_headers={
+                    "HTTP-Referer": "https://arki-ai.com",
+                    "X-Title": "ARKI AI Intent Classifier",
+                },
+            )
+            if intent:
+                return self._result(intent, 0.93, "llm_openrouter", t0, complexity,
+                                    self._sub_type(q, intent))
+        # ── Layer 5: HuggingFace Inference API (FALLBACK 3) ───────────────────
+        if self._hf_token:
+            intent = self._call_openai_compat(
+                url=self.HF_URL,
+                api_key=self._hf_token,
+                model="meta-llama/Llama-3.2-3B-Instruct",
+                query=q,
+                provider="huggingface",
+                timeout=8.0,  # HF is slower
+            )
+            if intent:
+                return self._result(intent, 0.90, "llm_hf", t0, complexity,
+                                    self._sub_type(q, intent))
+        # ── Layer 6: Safe default ─────────────────────────────────────────────
+        logger.warning(f"Intent: all providers failed for '{q[:50]}' — defaulting to NEWS_GENERAL")
         return self._result("NEWS_GENERAL", 0.50, "default", t0, complexity, "general")
+    # ── Provider calls ────────────────────────────────────────────────────────
+    def _call_openai_compat(
+        self,
+        url: str,
+        api_key: str,
+        model: str,
+        query: str,
+        provider: str,
+        extra_headers: Optional[Dict] = None,
+        timeout: float = 4.0,
+    ) -> Optional[str]:
+        """
+        Generic OpenAI-compatible API call.
+        Works for: Groq, OpenRouter, HuggingFace (all use same format).
+        """
         try:
+            headers = {
+                "Authorization": f"Bearer {api_key}",
+                "Content-Type": "application/json",
+            }
+            if extra_headers:
+                headers.update(extra_headers)
             response = self._client.post(
+                url,
+                headers=headers,
                 json={
+                    "model": model,
+                    "messages": [
+                        {"role": "user", "content": _CLASSIFY_PROMPT.format(query=query)}
+                    ],
+                    "max_tokens": 20,
+                    "temperature": 0.0,
                 },
+                timeout=timeout,
             )
             if response.status_code == 200:
+                content = (
+                    response.json()
+                    .get("choices", [{}])[0]
+                    .get("message", {})
+                    .get("content", "")
+                    .strip()
+                )
                 intent = self._parse_intent(content)
                 if intent:
+                    logger.debug(f"{provider}: '{query[:40]}' → {intent}")
                     return intent
+                logger.warning(f"{provider}: unexpected response: '{content}'")
             elif response.status_code == 429:
+                logger.warning(f"Intent: {provider} rate limited")
+            elif response.status_code == 503:
+                logger.warning(f"Intent: {provider} unavailable (503)")
             else:
+                logger.warning(f"Intent: {provider} returned {response.status_code}")
         except httpx.TimeoutException:
+            logger.warning(f"Intent: {provider} timeout ({timeout}s)")
         except Exception as e:
+            logger.error(f"Intent: {provider} error: {e}")
         return None
+    def _call_gemini(self, query: str) -> Optional[str]:
+        """Gemini has a different API format."""
         try:
+            url = f"{self.GEMINI_URL}?key={self._gemini_key}"
             response = self._client.post(
                 url,
                 json={
+                    "contents": [
+                        {"parts": [{"text": _CLASSIFY_PROMPT.format(query=query)}]}
+                    ],
                     "generationConfig": {
                         "maxOutputTokens": 20,
                         "temperature": 0.0,
                 )
                 intent = self._parse_intent(content)
                 if intent:
+                    logger.debug(f"gemini: '{query[:40]}' → {intent}")
                     return intent
             elif response.status_code == 429:
+                logger.warning("Intent: Gemini rate limited")
+            else:
+                logger.warning(f"Intent: Gemini returned {response.status_code}")
         except httpx.TimeoutException:
+            logger.warning("Intent: Gemini timeout (4s)")
         except Exception as e:
+            logger.error(f"Intent: Gemini error: {e}")
         return None
     def _parse_intent(self, raw: str) -> Optional[str]:
         """Parse LLM response to valid intent. Handles partial matches."""
+        cleaned = raw.strip().upper().replace(".", "").replace(":", "").split()[0] if raw.strip() else ""
         if cleaned in self.VALID_INTENTS:
             return cleaned
             return "off_topic"
         ql = query.lower()
+        if any(w in ql for w in ("clash", "attack", "killed", "battle", "fano", "tplf", "military")):
             return "conflict"
+        if any(w in ql for w in ("displaced", "refugee", "aid", "humanitarian", "famine")):
             return "humanitarian"
         return "general"
     def _complexity(self, query: str) -> str:
         n = len(query.split())
+        if n == 0:   return "empty"
+        if n == 1:   return "vague"
+        if n <= 4:   return "simple"
+        if n <= 12:  return "medium"
         return "complex"
     def _result(