Spaces:

RolandM
/

prisma-chatbot

Running

RolandM committed 3 days ago

Commit

7db5adc

1 Parent(s): ba58dc4

Add inference module with HF API wrapper

- src/inference.py: PrismaInferenceClient wraps huggingface_hub's
InferenceClient with forced JSON mode (required for Llama 3.3 70B)
- Typed errors: InferenceError for API issues, EvaluationParseError
bubbles up from src.evaluation for parse issues
- src/config.py: MODEL_ID, DEFAULT_TEMPERATURE, DEFAULT_MAX_TOKENS
- tests/test_inference.py: 10 tests with mocked InferenceClient,
no real API calls

Files changed (3) hide show

src/config.py +5 -0
src/inference.py +132 -6
tests/test_inference.py +123 -0

src/config.py CHANGED Viewed

@@ -11,6 +11,11 @@ MIN_SCORE: int = 1
 MAX_SCORE: int = 7
 SESSION_TURN_CAP: int = 12
 DEFAULT_ATTRIBUTES: list[str] = [
     "competent",
     "likeable",

 MAX_SCORE: int = 7
 SESSION_TURN_CAP: int = 12
+MODEL_ID: str = "meta-llama/Llama-3.3-70B-Instruct"  # default model for PrismaInferenceClient
+DEFAULT_TEMPERATURE: float = 0.7  # default sampling temperature for inference calls
+DEFAULT_MAX_TOKENS: int = 600  # default cap on tokens per model response
 DEFAULT_ATTRIBUTES: list[str] = [
     "competent",
     "likeable",

src/inference.py CHANGED Viewed

@@ -1,9 +1,135 @@
-"""Hugging Face Inference API client wrapper.
-Thin wrapper around `huggingface_hub`'s inference client that issues a
-single LLM call per turn and returns the raw model output. Keeps API
-concerns (auth, model selection, retries) isolated from prompt and
-evaluation logic.
-Implementation pending — scaffolding only.
 """

+"""HF Inference API client wrapper for Prisma.
+Provides PrismaInferenceClient, a small wrapper around huggingface_hub's
+InferenceClient that:
+- Forces JSON output via response_format={"type": "json_object"}.
+  This is required for reliable structured output with Llama 3.3 70B,
+  which otherwise produces conversational text before/instead of JSON.
+- Parses and validates the response via src.evaluation.
+- Raises typed errors for API failures (InferenceError) and parse
+  failures (EvaluationParseError, propagated from evaluation).
+The wrapper is initialized once per session with an HF token; each
+generate() call sends a full message history (system + conversation)
+and returns a validated ParsedTurn.
 """
+from __future__ import annotations
+from typing import Sequence
+from huggingface_hub import InferenceClient
+from huggingface_hub.utils import HfHubHTTPError
+from .config import DEFAULT_MAX_TOKENS, DEFAULT_TEMPERATURE, MODEL_ID
+from .evaluation import ParsedTurn, parse_model_output
+# A single chat message in OpenAI format: a dict carrying ``role`` and
+# ``content`` keys. Kept loose for v1; can tighten to a TypedDict later
+# if message shapes diversify.
+ChatMessage = dict[str, str]
+class InferenceError(Exception):
+    """Signal that the inference API call failed or returned malformed data.
+
+    Used for network errors, authentication failures, rate-limit errors,
+    and API responses that lack the expected fields. Failures to parse the
+    model's *content* are deliberately not wrapped in this type: they
+    surface as EvaluationParseError so the app layer can tell the two
+    failure classes apart.
+    """
+class PrismaInferenceClient:
+    """Prisma-specific facade over ``huggingface_hub.InferenceClient``.
+
+    One ``InferenceClient`` is created at construction time and reused by
+    every ``generate()`` call. ``generate()`` accepts the full message
+    history and hands back a validated ``ParsedTurn``.
+
+    Every request carries ``response_format={"type": "json_object"}``.
+    Llama 3.3 70B needs this for structured output, and it is harmless on
+    models that already follow prompt-level JSON instructions, so it is
+    applied uniformly for consistency across model families.
+
+    Args:
+        token: HuggingFace access token with inference permissions.
+        model_id: Model to call. Defaults to ``MODEL_ID`` from config.
+        temperature: Sampling temperature.
+        max_tokens: Maximum tokens per response.
+
+    Raises:
+        ValueError: If ``token`` is empty.
+    """
+
+    def __init__(
+        self,
+        token: str,
+        model_id: str = MODEL_ID,
+        temperature: float = DEFAULT_TEMPERATURE,
+        max_tokens: int = DEFAULT_MAX_TOKENS,
+    ) -> None:
+        # Fail fast: without a token no request can be authenticated.
+        if not token:
+            raise ValueError("token must be a non-empty string")
+        self._model_id = model_id
+        self._temperature = temperature
+        self._max_tokens = max_tokens
+        self._client = InferenceClient(token=token)
+
+    @property
+    def model_id(self) -> str:
+        """Return the model ID this client sends requests to."""
+        return self._model_id
+
+    def generate(self, messages: Sequence[ChatMessage]) -> ParsedTurn:
+        """Run one chat completion and return the parsed turn.
+
+        Args:
+            messages: Full chat history including the system message as the
+                first entry. Each message is a dict with ``role`` and
+                ``content`` keys (OpenAI format).
+
+        Returns:
+            A ``ParsedTurn`` with the response text and validated
+            evaluation scores.
+
+        Raises:
+            ValueError: If ``messages`` is empty.
+            InferenceError: If the API call itself fails (auth, rate limit,
+                network, malformed response envelope).
+            EvaluationParseError: If the model's content cannot be parsed
+                or validated against the expected attribute schema.
+        """
+        if not messages:
+            raise ValueError("messages must not be empty")
+
+        try:
+            request = {
+                "model": self._model_id,
+                "messages": list(messages),
+                "max_tokens": self._max_tokens,
+                "temperature": self._temperature,
+                # JSON mode is always on; see the class docstring.
+                "response_format": {"type": "json_object"},
+            }
+            reply = self._client.chat_completion(**request)
+        except Exception as exc:
+            # HTTP-level failures get their own message; anything else is
+            # reported as unexpected. Both surface as InferenceError.
+            if isinstance(exc, HfHubHTTPError):
+                reason = f"HF Inference API request failed: {exc}"
+            else:
+                reason = f"Unexpected error during inference call: {exc}"
+            raise InferenceError(reason) from exc
+
+        # Walk the response envelope; any missing level is an API-side fault.
+        try:
+            content = reply.choices[0].message.content
+        except (AttributeError, IndexError, TypeError) as exc:
+            raise InferenceError(
+                f"Inference response missing expected fields: {exc}"
+            ) from exc
+
+        if isinstance(content, str) and content.strip():
+            # Parse errors propagate unchanged from the evaluation module.
+            return parse_model_output(content)
+        raise InferenceError(
+            "Inference response content is empty or non-text"
+        )

tests/test_inference.py ADDED Viewed

	@@ -0,0 +1,123 @@

+"""Unit tests for src.inference."""
+from __future__ import annotations
+import json
+from unittest.mock import MagicMock, patch
+import pytest
+from src.evaluation import EvaluationParseError, ParsedTurn
+from src.inference import InferenceError, PrismaInferenceClient
+# Well-formed model output (JSON text) reused by the happy-path tests.
+VALID_PAYLOAD = json.dumps(
+    dict(
+        response="Hi there!",
+        evaluation=dict(
+            competent=5,
+            likeable=5,
+            considerate=5,
+            polite=5,
+            formal=5,
+            demanding=3,
+        ),
+    )
+)
+def _mock_completion(content: str) -> MagicMock:
+    """Return a MagicMock shaped like an HF ``chat_completion`` result.
+
+    The mock has exactly one entry in ``choices`` whose
+    ``message.content`` is ``content``.
+    """
+    choice = MagicMock()
+    choice.message.content = content
+    return MagicMock(choices=[choice])
+# ---- Construction ----
+def test_rejects_empty_token():
+    """Constructing the client with an empty token raises ValueError."""
+    empty_token = ""
+    with pytest.raises(ValueError, match="token"):
+        PrismaInferenceClient(token=empty_token)
+def test_exposes_model_id():
+    """The ``model_id`` property reports the value given at construction."""
+    expected = "some/model"
+    client = PrismaInferenceClient(token="hf_test", model_id=expected)
+    assert client.model_id == expected
+# ---- generate(): happy paths ----
+def test_generate_returns_parsed_turn():
+    """A valid payload comes back as a ParsedTurn with text and scores."""
+    client = PrismaInferenceClient(token="hf_test")
+    history = [{"role": "user", "content": "hi"}]
+    with patch.object(client, "_client") as stub:
+        stub.chat_completion.return_value = _mock_completion(VALID_PAYLOAD)
+        turn = client.generate(history)
+    assert isinstance(turn, ParsedTurn)
+    assert turn.response == "Hi there!"
+    assert turn.evaluation["competent"] == 5
+def test_generate_forces_json_response_format():
+    """generate() always sends response_format={'type': 'json_object'}."""
+    client = PrismaInferenceClient(token="hf_test")
+    history = [{"role": "user", "content": "hi"}]
+    with patch.object(client, "_client") as stub:
+        stub.chat_completion.return_value = _mock_completion(VALID_PAYLOAD)
+        client.generate(history)
+    # The recorded call stays inspectable after the patch is undone.
+    sent = stub.chat_completion.call_args.kwargs
+    assert sent["response_format"] == {"type": "json_object"}
+def test_generate_passes_messages_and_model():
+    """The configured model and the given history reach chat_completion."""
+    system_msg = {"role": "system", "content": "sys"}
+    user_msg = {"role": "user", "content": "hi"}
+    messages = [system_msg, user_msg]
+    client = PrismaInferenceClient(token="hf_test", model_id="custom/model")
+    with patch.object(client, "_client") as stub:
+        stub.chat_completion.return_value = _mock_completion(VALID_PAYLOAD)
+        client.generate(messages)
+    sent = stub.chat_completion.call_args.kwargs
+    assert sent["model"] == "custom/model"
+    assert sent["messages"] == messages
+# ---- generate(): error paths ----
+def test_generate_rejects_empty_messages():
+    """An empty message list is rejected with ValueError."""
+    client = PrismaInferenceClient(token="hf_test")
+    no_messages = []
+    with pytest.raises(ValueError, match="messages"):
+        client.generate(no_messages)
+def test_generate_wraps_unexpected_exception():
+    """A non-HTTP error from the inner client is wrapped in InferenceError."""
+    client = PrismaInferenceClient(token="hf_test")
+    history = [{"role": "user", "content": "hi"}]
+    with patch.object(client, "_client") as stub:
+        stub.chat_completion.side_effect = RuntimeError("boom")
+        with pytest.raises(InferenceError, match="boom"):
+            client.generate(history)
+def test_generate_rejects_empty_content():
+    """Empty message content is reported as InferenceError."""
+    client = PrismaInferenceClient(token="hf_test")
+    history = [{"role": "user", "content": "hi"}]
+    with patch.object(client, "_client") as stub:
+        stub.chat_completion.return_value = _mock_completion("")
+        with pytest.raises(InferenceError, match="empty"):
+            client.generate(history)
+def test_generate_rejects_missing_choices():
+    """A response with no choices is reported as InferenceError."""
+    client = PrismaInferenceClient(token="hf_test")
+    history = [{"role": "user", "content": "hi"}]
+    with patch.object(client, "_client") as stub:
+        stub.chat_completion.return_value = MagicMock(choices=[])
+        with pytest.raises(InferenceError, match="missing expected fields"):
+            client.generate(history)
+def test_generate_propagates_parse_errors():
+    """Unparseable content raises EvaluationParseError, not InferenceError."""
+    client = PrismaInferenceClient(token="hf_test")
+    history = [{"role": "user", "content": "hi"}]
+    with patch.object(client, "_client") as stub:
+        stub.chat_completion.return_value = _mock_completion("not json")
+        with pytest.raises(EvaluationParseError):
+            client.generate(history)