CaffeinatedCoding committed on
Commit
a64025f
·
verified ·
1 Parent(s): f756c47

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. api/main.py +30 -9
  2. src/agent_v2.py +78 -19
  3. src/llm.py +103 -54
api/main.py CHANGED
@@ -37,10 +37,20 @@ def download_models():
37
 
38
  if not os.path.exists("models/ner_model"):
39
  logger.info("Downloading NER model...")
40
- snapshot_download(
41
- repo_id=repo_id, repo_type="model",
42
- allow_patterns="ner_model/*", local_dir="models", token=hf_token
43
- )
 
 
 
 
 
 
 
 
 
 
44
  logger.info("NER model downloaded")
45
  else:
46
  logger.info("NER model already exists")
@@ -48,10 +58,14 @@ def download_models():
48
  if not os.path.exists("models/faiss_index/index.faiss"):
49
  logger.info("Downloading FAISS index...")
50
  os.makedirs("models/faiss_index", exist_ok=True)
51
- hf_hub_download(repo_id=repo_id, filename="faiss_index/index.faiss",
52
- repo_type="model", local_dir="models", token=hf_token)
53
- hf_hub_download(repo_id=repo_id, filename="faiss_index/chunk_metadata.jsonl",
54
- repo_type="model", local_dir="models", token=hf_token)
 
 
 
 
55
  logger.info("FAISS index downloaded")
56
  else:
57
  logger.info("FAISS index already exists")
@@ -138,7 +152,14 @@ def serve_frontend():
138
 
139
  @app.get("/health")
140
  def health():
141
- return {"status": "ok", "service": "NyayaSetu", "version": "2.0.0", "agent": AGENT_VERSION}
 
 
 
 
 
 
 
142
 
143
 
144
  @app.post("/query", response_model=QueryResponse)
 
37
 
38
  if not os.path.exists("models/ner_model"):
39
  logger.info("Downloading NER model...")
40
+ os.makedirs("models/ner_model", exist_ok=True)
41
+ # NER model files — explicit downloads to avoid snapshot_download pattern bugs
42
+ ner_files = [
43
+ "config.json", "model.safetensors", "tokenizer.json",
44
+ "tokenizer_config.json", "training_args.bin", "training_results.json"
45
+ ]
46
+ for fname in ner_files:
47
+ try:
48
+ hf_hub_download(
49
+ repo_id=repo_id, filename=f"ner_model/{fname}",
50
+ repo_type="model", local_dir="models", token=hf_token
51
+ )
52
+ except Exception as e:
53
+ logger.warning(f"Could not download ner_model/{fname}: {e}")
54
  logger.info("NER model downloaded")
55
  else:
56
  logger.info("NER model already exists")
 
58
  if not os.path.exists("models/faiss_index/index.faiss"):
59
  logger.info("Downloading FAISS index...")
60
  os.makedirs("models/faiss_index", exist_ok=True)
61
+ # Download FAISS files explicitly to avoid snapshot_download pattern issues
62
+ faiss_files = ["index.faiss", "chunk_metadata.jsonl"]
63
+ for fname in faiss_files:
64
+ try:
65
+ hf_hub_download(repo_id=repo_id, filename=f"faiss_index/{fname}",
66
+ repo_type="model", local_dir="models", token=hf_token)
67
+ except Exception as fe:
68
+ logger.warning(f"Could not download faiss_index/{fname}: {fe}")
69
  logger.info("FAISS index downloaded")
70
  else:
71
  logger.info("FAISS index already exists")
 
152
 
153
@app.get("/health")
def health():
    """Liveness probe: service metadata plus the Groq circuit-breaker state."""
    # Imported here rather than at module top — presumably to avoid an import
    # cycle with src.agent_v2; confirm before hoisting.
    from src.agent_v2 import _circuit_breaker

    payload = {
        "status": "ok",
        "service": "NyayaSetu",
        "version": "2.0.0",
        "agent": AGENT_VERSION,
        "groq_circuit_breaker": _circuit_breaker.get_status(),
    }
    return payload
163
 
164
 
165
  @app.post("/query", response_model=QueryResponse)
src/agent_v2.py CHANGED
@@ -28,12 +28,57 @@ from src.ner import extract_entities, augment_query
28
 
29
  logger = logging.getLogger(__name__)
30
 
31
- from groq import Groq
32
- from tenacity import retry, stop_after_attempt, wait_exponential
33
  from dotenv import load_dotenv
 
 
 
34
 
35
  load_dotenv()
36
- _client = Groq(api_key=os.getenv("GROQ_API_KEY"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  # ── Session store ─────────────────────────────────────────
39
  sessions: Dict[str, Dict] = {}
@@ -116,8 +161,13 @@ def update_session(session_id: str, analysis: Dict, user_message: str, response:
116
 
117
 
118
  # ── Pass 1: Analyse ───────────────────────────────────────
119
- @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=4))
 
120
  def analyse(user_message: str, session: Dict) -> Dict:
 
 
 
 
121
  summary = session.get("summary", "")
122
  last_msgs = session.get("last_3_messages", [])
123
  cs = session["case_state"]
@@ -165,16 +215,14 @@ Rules:
165
  - Update hypothesis confidence based on new evidence
166
  - search_queries must be specific legal questions for vector search"""
167
 
168
- response = _client.chat.completions.create(
169
- model="llama-3.3-70b-versatile",
170
  messages=[
171
  {"role": "system", "content": ANALYSIS_PROMPT},
172
  {"role": "user", "content": user_content}
173
- ],
174
- temperature=0.1,
175
- max_tokens=900
176
  )
177
- raw = response.choices[0].message.content.strip()
 
178
  raw = raw.replace("```json", "").replace("```", "").strip()
179
 
180
  try:
@@ -229,8 +277,13 @@ def retrieve_parallel(search_queries: List[str], top_k: int = 5) -> List[Dict]:
229
 
230
 
231
  # ── Pass 3: Respond ───────────────────────────────────────
232
- @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=8))
 
233
  def respond(user_message: str, analysis: Dict, chunks: List[Dict], session: Dict) -> str:
 
 
 
 
234
  system_prompt = build_prompt(analysis)
235
  cs = session["case_state"]
236
  turn_count = cs.get("turn_count", 0)
@@ -325,16 +378,14 @@ Instructions:
325
  - Opposition war-gaming: if giving strategy, include what the other side will argue
326
  {radar_instruction}"""
327
 
328
- response = _client.chat.completions.create(
329
- model="llama-3.3-70b-versatile",
330
  messages=[
331
  {"role": "system", "content": system_prompt},
332
  {"role": "user", "content": user_content}
333
- ],
334
- temperature=0.3,
335
- max_tokens=1500
336
  )
337
- return response.choices[0].message.content
 
338
 
339
 
340
  # ── Main entry point ──────────────────────────────────────
@@ -346,7 +397,11 @@ def run_query_v2(user_message: str, session_id: str) -> Dict[str, Any]:
346
  try:
347
  analysis = analyse(user_message, session)
348
  except Exception as e:
349
- logger.error(f"Pass 1 failed: {e}")
 
 
 
 
350
  analysis = {
351
  "tone": "casual", "format_requested": "none",
352
  "subject": "legal query", "action_needed": "advice",
@@ -404,7 +459,11 @@ def run_query_v2(user_message: str, session_id: str) -> Dict[str, Any]:
404
  try:
405
  answer = respond(user_message, analysis, chunks, session)
406
  except Exception as e:
407
- logger.error(f"Pass 3 failed: {e}")
 
 
 
 
408
  if chunks:
409
  fallback = "\n\n".join(
410
  f"[{c.get('title', 'Source')}]\n{c.get('text', '')[:400]}"
 
28
 
29
  logger = logging.getLogger(__name__)
30
 
31
+ from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
 
32
  from dotenv import load_dotenv
33
+ import threading
34
+ import time
35
+ from src.llm import call_llm_raw
36
 
37
  load_dotenv()
38
+
39
# ── Circuit Breaker for Groq API ──────────────────────────
class CircuitBreaker:
    """Circuit breaker that stops hammering the Groq API while it is down.

    States:
      CLOSED    — normal operation; calls go through.
      OPEN      — `failure_threshold` consecutive failures seen; calls are
                  rejected until `recovery_timeout` seconds have passed.
      HALF-OPEN — timeout elapsed; trial calls are allowed. A success closes
                  the breaker, a failure re-opens it immediately.

    All state transitions are guarded by a lock so a single instance can be
    shared across request-handling threads.
    """

    def __init__(self, failure_threshold=5, recovery_timeout=60):
        self.failure_count = 0               # consecutive failures recorded
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout  # seconds before a trial is allowed
        self.last_failure_time = None        # time.time() of most recent failure
        self.is_open = False
        self.half_open = False               # True while trial requests are in flight
        self.lock = threading.Lock()

    def record_success(self):
        """Reset the breaker to CLOSED after a successful API call."""
        with self.lock:
            self.failure_count = 0
            self.is_open = False
            self.half_open = False

    def record_failure(self):
        """Count a failure; trip OPEN at the threshold or on a failed trial.

        BUGFIX: a failure while HALF-OPEN re-opens the breaker immediately.
        Previously `can_attempt` zeroed `failure_count` on recovery, so a
        still-down API got `failure_threshold` fresh failing calls per cycle.
        """
        with self.lock:
            self.failure_count += 1
            self.last_failure_time = time.time()
            if self.half_open or self.failure_count >= self.failure_threshold:
                self.is_open = True
                self.half_open = False
                logger.warning(f"Circuit breaker OPEN: {self.failure_count} failures detected")

    def can_attempt(self) -> bool:
        """Return True if a call may be attempted right now.

        When OPEN and the recovery timeout has elapsed, move to HALF-OPEN and
        allow a trial request WITHOUT resetting the failure count.
        """
        with self.lock:
            if not self.is_open:
                return True
            if time.time() - self.last_failure_time > self.recovery_timeout:
                logger.info("Circuit breaker attempting recovery...")
                self.is_open = False
                self.half_open = True
                return True
            return False

    def get_status(self) -> str:
        """Human-readable state string (surfaced by the /health endpoint)."""
        with self.lock:
            if self.is_open:
                return f"OPEN ({self.failure_count} failures)"
            return f"CLOSED ({self.failure_count} failures)"


_circuit_breaker = CircuitBreaker()
82
 
83
  # ── Session store ─────────────────────────────────────────
84
  sessions: Dict[str, Dict] = {}
 
161
 
162
 
163
  # ── Pass 1: Analyse ───────────────────────────────────────
164
+ # Retry up to 5 times with exponential backoff (1s to 16s) to handle transient failures
165
+ @retry(stop=stop_after_attempt(5), wait=wait_exponential(min=1, max=16, multiplier=1.5))
166
  def analyse(user_message: str, session: Dict) -> Dict:
167
+ if not _circuit_breaker.can_attempt():
168
+ logger.error(f"Circuit breaker OPEN - skipping Pass 1. Status: {_circuit_breaker.get_status()}")
169
+ raise Exception("Groq API circuit breaker is open - service unavailable")
170
+
171
  summary = session.get("summary", "")
172
  last_msgs = session.get("last_3_messages", [])
173
  cs = session["case_state"]
 
215
  - Update hypothesis confidence based on new evidence
216
  - search_queries must be specific legal questions for vector search"""
217
 
218
+ response = call_llm_raw(
 
219
  messages=[
220
  {"role": "system", "content": ANALYSIS_PROMPT},
221
  {"role": "user", "content": user_content}
222
+ ]
 
 
223
  )
224
+ _circuit_breaker.record_success() # API call succeeded
225
+ raw = response.strip()
226
  raw = raw.replace("```json", "").replace("```", "").strip()
227
 
228
  try:
 
277
 
278
 
279
  # ── Pass 3: Respond ───────────────────────────────────────
280
+ # Retry up to 5 times with exponential backoff (2s to 32s) — more aggressive than Pass 1
281
+ @retry(stop=stop_after_attempt(5), wait=wait_exponential(min=2, max=32, multiplier=1.5))
282
  def respond(user_message: str, analysis: Dict, chunks: List[Dict], session: Dict) -> str:
283
+ if not _circuit_breaker.can_attempt():
284
+ logger.error(f"Circuit breaker OPEN - skipping Pass 3. Status: {_circuit_breaker.get_status()}")
285
+ raise Exception("Groq API circuit breaker is open - service unavailable")
286
+
287
  system_prompt = build_prompt(analysis)
288
  cs = session["case_state"]
289
  turn_count = cs.get("turn_count", 0)
 
378
  - Opposition war-gaming: if giving strategy, include what the other side will argue
379
  {radar_instruction}"""
380
 
381
+ response = call_llm_raw(
 
382
  messages=[
383
  {"role": "system", "content": system_prompt},
384
  {"role": "user", "content": user_content}
385
+ ]
 
 
386
  )
387
+ _circuit_breaker.record_success() # API call succeeded
388
+ return response
389
 
390
 
391
  # ── Main entry point ──────────────────────────────────────
 
397
  try:
398
  analysis = analyse(user_message, session)
399
  except Exception as e:
400
+ error_type = type(e).__name__
401
+ logger.error(f"Pass 1 failed after retries: {error_type}: {e}. Circuit breaker: {_circuit_breaker.get_status()}")
402
+ # Record API failure if it was a connection error
403
+ if "APIConnectionError" in error_type or "RateLimitError" in error_type:
404
+ _circuit_breaker.record_failure()
405
  analysis = {
406
  "tone": "casual", "format_requested": "none",
407
  "subject": "legal query", "action_needed": "advice",
 
459
  try:
460
  answer = respond(user_message, analysis, chunks, session)
461
  except Exception as e:
462
+ error_type = type(e).__name__
463
+ logger.error(f"Pass 3 failed after retries: {error_type}: {e}. Circuit breaker: {_circuit_breaker.get_status()}")
464
+ # Record API failure if it was a connection error
465
+ if "APIConnectionError" in error_type or "RateLimitError" in error_type:
466
+ _circuit_breaker.record_failure()
467
  if chunks:
468
  fallback = "\n\n".join(
469
  f"[{c.get('title', 'Source')}]\n{c.get('text', '')[:400]}"
src/llm.py CHANGED
@@ -1,77 +1,126 @@
1
  """
2
- LLM module. Single Groq API call with tenacity retry.
 
 
3
 
4
- WHY Groq? Free tier, fastest inference (~500 tokens/sec).
5
- WHY temperature=0.1? Lower = more deterministic, less hallucination.
6
- WHY one call per query? Multi-step chains add latency and failure points.
 
7
  """
8
 
9
  import os
10
  import logging
11
- from groq import Groq
12
- from tenacity import retry, stop_after_attempt, wait_exponential
13
  from dotenv import load_dotenv
 
14
 
15
  load_dotenv()
16
-
17
  logger = logging.getLogger(__name__)
18
 
19
- api_key = os.getenv("GROQ_API_KEY")
20
- logger.info(f"GROQ_API_KEY loaded: {bool(api_key)} (length: {len(api_key) if api_key else 0})")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
- _client = Groq(
23
- api_key=api_key
24
- )
25
- logger.info("Groq client initialized successfully")
 
 
 
 
 
26
 
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def call_llm_raw(messages: list) -> str:
29
  """
30
- Call Groq with pre-built messages list.
31
  Used by V2 agent for Pass 1 and Pass 3.
32
  """
33
- try:
34
- response = _client.chat.completions.create(
35
- model="llama-3.3-70b-versatile",
36
- messages=messages,
37
- temperature=0.3,
38
- max_tokens=1500
39
- )
40
- return response.choices[0].message.content
41
- except Exception as e:
42
- logger.error(f"Groq API error in call_llm_raw: {type(e).__name__}: {str(e)}", exc_info=True)
43
- raise
44
 
45
 
46
- @retry(
47
- stop=stop_after_attempt(3),
48
- wait=wait_exponential(multiplier=1, min=2, max=8)
49
- )
50
  def call_llm(query: str, context: str) -> str:
51
  """
52
- Call Groq Llama-3. Used by V1 agent.
53
- Retries 3 times with exponential backoff.
54
  """
55
- try:
56
- user_message = f"""QUESTION: {query}
57
-
58
- SUPREME COURT JUDGMENT EXCERPTS:
59
- {context}
60
-
61
- Answer based only on the excerpts above. Cite judgment IDs.
62
- Use proper markdown formatting."""
63
-
64
- response = _client.chat.completions.create(
65
- model="llama-3.3-70b-versatile",
66
- messages=[
67
- {"role": "system", "content": "You are NyayaSetu, an Indian legal research assistant. Answer only from provided excerpts. Cite judgment IDs. End with: NOTE: This is not legal advice."},
68
- {"role": "user", "content": user_message}
69
- ],
70
- temperature=0.1,
71
- max_tokens=1500
72
- )
73
-
74
- return response.choices[0].message.content
75
- except Exception as e:
76
- logger.error(f"Groq API error in call_llm: {type(e).__name__}: {str(e)}", exc_info=True)
77
- raise
 
1
  """
2
+ LLM module. HuggingFace Inference API as primary.
3
+ Works natively from HF Spaces — same infrastructure.
4
+ Groq as local dev fallback.
5
 
6
+ WHY HF Inference API?
7
+ HF Spaces can always reach HuggingFace's own APIs.
8
+ No network routing issues. Uses existing HF_TOKEN.
9
+ Same Llama 3.3 70B model as Groq.
10
  """
11
 
12
  import os
13
  import logging
 
 
14
  from dotenv import load_dotenv
15
+ from tenacity import retry, stop_after_attempt, wait_exponential
16
 
17
  load_dotenv()
 
18
  logger = logging.getLogger(__name__)
19
 
20
# ── HuggingFace Inference API ─────────────────────────────
# Module-level client; stays None when HF_TOKEN is missing or init fails.
_hf_client = None

def _init_hf():
    """Initialise the HF Inference client. Returns True on success."""
    global _hf_client

    token = os.getenv("HF_TOKEN")
    if not token:
        logger.warning("HF_TOKEN not set — HF Inference API disabled")
        return False

    try:
        from huggingface_hub import InferenceClient
        _hf_client = InferenceClient(
            model="meta-llama/Llama-3.3-70B-Instruct",
            token=token,
        )
    except Exception as e:
        # Broad catch is deliberate: any init failure just disables this provider.
        logger.error(f"HF Inference API init failed: {e}")
        return False

    logger.info("HF Inference API ready (Llama-3.3-70B)")
    return True
40
+
41
# ── Groq fallback (works locally, may be blocked on HF Spaces) ──
# Module-level client; stays None when GROQ_API_KEY is missing or init fails.
_groq_client = None

def _init_groq():
    """Initialise Groq as the fallback provider. Returns True on success."""
    global _groq_client

    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        return False

    try:
        from groq import Groq
        _groq_client = Groq(api_key=api_key)
    except Exception as e:
        # Broad catch is deliberate: any init failure just disables this provider.
        logger.error(f"Groq init failed: {e}")
        return False

    logger.info("Groq ready as fallback")
    return True
57
+
58
+ _hf_ready = _init_hf()
59
+ _groq_ready = _init_groq()
60
+
61
 
62
def _call_hf(messages: list) -> str:
    """Send chat messages to the HF Inference API and return the reply text.

    `messages` is already an OpenAI-style chat list, which
    `InferenceClient.chat_completion` accepts directly — no conversion needed.
    """
    completion = _hf_client.chat_completion(
        messages=messages,
        max_tokens=1500,
        temperature=0.3,
    )
    return completion.choices[0].message.content
71
 
72
 
73
def _call_groq(messages: list) -> str:
    """Send chat messages to Groq (fallback provider) and return the reply text."""
    completion = _groq_client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=messages,
        temperature=0.3,
        max_tokens=1500,
    )
    return completion.choices[0].message.content
82
+
83
+
84
def _call_with_fallback(messages: list) -> str:
    """Try the HF Inference API first, then Groq; raise if both fail.

    Args:
        messages: OpenAI-style chat message list.

    Returns:
        The assistant reply text from whichever provider succeeded.

    Raises:
        RuntimeError: when no provider is configured or every provider
            errored — chained from the last underlying exception so the
            root cause survives in tracebacks. RuntimeError subclasses
            Exception, so existing `except Exception` callers still work.
    """
    last_error = None

    if _hf_ready and _hf_client:
        try:
            return _call_hf(messages)
        except Exception as e:
            last_error = e
            logger.warning(f"HF Inference failed: {e}, trying Groq")

    if _groq_ready and _groq_client:
        try:
            return _call_groq(messages)
        except Exception as e:
            last_error = e
            logger.error(f"Groq also failed: {e}")

    # BUGFIX: raise a specific exception type and preserve the causal chain
    # instead of a bare `Exception(...)` that discards the underlying error.
    raise RuntimeError("All LLM providers failed") from last_error
99
+
100
+
101
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=8))
def call_llm_raw(messages: list) -> str:
    """
    Call the LLM with a pre-built messages list, retrying up to 3 times
    with exponential backoff (2s–8s). Used by V2 agent for Pass 1 and Pass 3.
    """
    # Provider selection (HF first, Groq fallback) lives in _call_with_fallback.
    return _call_with_fallback(messages)
 
 
 
 
 
 
 
 
 
 
108
 
109
 
110
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=8))
def call_llm(query: str, context: str) -> str:
    """
    Answer `query` from the supplied judgment excerpts (`context`).
    Used by the V1 agent; retries with exponential backoff on provider errors.
    """
    system_msg = {
        "role": "system",
        "content": "You are NyayaSetu, an Indian legal research assistant. Answer only from provided excerpts. Cite judgment IDs. End with: NOTE: This is not legal advice.",
    }
    user_msg = {
        "role": "user",
        "content": f"QUESTION: {query}\n\nSOURCES:\n{context}\n\nAnswer based on sources. Cite judgment IDs.",
    }
    return _call_with_fallback([system_msg, user_msg])