feat: add Groq backend as primary free inference provider
Browse files
- Add Groq (OpenAI-compatible) backend to LLMClient and config
- Auto-select Groq when GROQ_API_KEY secret is set, fall back to HF
- Update JobAgent to accept backend + api_key instead of HF-only token
- Raise LLMQuotaError (402) explicitly instead of silently scoring 0
- Switch model list to Groq models (llama-3.3-70b, llama-3.1-8b, gemma2, mixtral)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- agent/job_matcher.py +3 -1
- agent/llm_client.py +76 -2
- agent/pipeline.py +6 -6
- app.py +28 -9
- config.py +11 -1
agent/job_matcher.py
CHANGED
|
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
| 4 |
|
| 5 |
from typing import Any, TypedDict
|
| 6 |
|
| 7 |
-
from agent.llm_client import LLMClient
|
| 8 |
from agent.utils import parse_json, job_institution, job_description
|
| 9 |
|
| 10 |
|
|
@@ -101,6 +101,8 @@ class JobMatcher:
|
|
| 101 |
|
| 102 |
try:
|
| 103 |
raw = self.llm.generate(system=_SYSTEM, user=prompt, json_mode=True)
|
|
|
|
|
|
|
| 104 |
except RuntimeError as exc:
|
| 105 |
return _fallback(str(exc))
|
| 106 |
|
|
|
|
| 4 |
|
| 5 |
from typing import Any, TypedDict
|
| 6 |
|
| 7 |
+
from agent.llm_client import LLMClient, LLMQuotaError
|
| 8 |
from agent.utils import parse_json, job_institution, job_description
|
| 9 |
|
| 10 |
|
|
|
|
| 101 |
|
| 102 |
try:
|
| 103 |
raw = self.llm.generate(system=_SYSTEM, user=prompt, json_mode=True)
|
| 104 |
+
except LLMQuotaError:
|
| 105 |
+
raise # propagate — caller should surface this to the user
|
| 106 |
except RuntimeError as exc:
|
| 107 |
return _fallback(str(exc))
|
| 108 |
|
agent/llm_client.py
CHANGED
|
@@ -19,6 +19,10 @@ from typing import Iterator
|
|
| 19 |
from config import config
|
| 20 |
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
class LLMClient:
|
| 23 |
"""Unified LLM client supporting Ollama and HuggingFace backends.
|
| 24 |
|
|
@@ -42,6 +46,7 @@ class LLMClient:
|
|
| 42 |
# Lazily initialised clients
|
| 43 |
self._openai_client = None
|
| 44 |
self._hf_client = None
|
|
|
|
| 45 |
|
| 46 |
# ------------------------------------------------------------------
|
| 47 |
# Internal: backend initialisation
|
|
@@ -63,6 +68,23 @@ class LLMClient:
|
|
| 63 |
)
|
| 64 |
return self._openai_client
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
def _get_hf_client(self):
|
| 67 |
"""Return (and cache) a huggingface_hub.InferenceClient."""
|
| 68 |
if self._hf_client is None:
|
|
@@ -129,12 +151,14 @@ class LLMClient:
|
|
| 129 |
|
| 130 |
if self.backend == "ollama":
|
| 131 |
return self._generate_ollama(system, user, json_mode=json_mode)
|
|
|
|
|
|
|
| 132 |
elif self.backend == "huggingface":
|
| 133 |
return self._generate_hf(system, user)
|
| 134 |
else:
|
| 135 |
raise RuntimeError(
|
| 136 |
f"Unknown LLM backend: '{self.backend}'. "
|
| 137 |
-
"Set LLM_BACKEND=ollama or
|
| 138 |
)
|
| 139 |
|
| 140 |
def stream_generate(self, system: str, user: str) -> Iterator[str]:
|
|
@@ -149,6 +173,8 @@ class LLMClient:
|
|
| 149 |
"""
|
| 150 |
if self.backend == "ollama":
|
| 151 |
yield from self._stream_ollama(system, user)
|
|
|
|
|
|
|
| 152 |
elif self.backend == "huggingface":
|
| 153 |
yield from self._stream_hf(system, user)
|
| 154 |
else:
|
|
@@ -212,6 +238,47 @@ class LLMClient:
|
|
| 212 |
) from exc
|
| 213 |
raise RuntimeError(f"Ollama streaming failed: {exc}") from exc
|
| 214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
# ------------------------------------------------------------------
|
| 216 |
# HuggingFace implementation (via InferenceClient)
|
| 217 |
# ------------------------------------------------------------------
|
|
@@ -232,7 +299,14 @@ class LLMClient:
|
|
| 232 |
return result.choices[0].message.content or ""
|
| 233 |
except Exception as exc:
|
| 234 |
last_exc = exc
|
| 235 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
import time as _time
|
| 237 |
_time.sleep(2 ** attempt) # 1s, 2s, 4s
|
| 238 |
continue
|
|
|
|
| 19 |
from config import config
|
| 20 |
|
| 21 |
|
| 22 |
+
class LLMQuotaError(RuntimeError):
    """Signal that the HuggingFace free-tier quota has run out (HTTP 402).

    This is a ``RuntimeError`` subclass, so a generic ``except RuntimeError``
    handler still catches it. Code that wants to treat quota exhaustion
    differently must handle this type before the generic handler.
    """
|
| 24 |
+
|
| 25 |
+
|
| 26 |
class LLMClient:
|
| 27 |
"""Unified LLM client supporting Ollama and HuggingFace backends.
|
| 28 |
|
|
|
|
| 46 |
# Lazily initialised clients
|
| 47 |
self._openai_client = None
|
| 48 |
self._hf_client = None
|
| 49 |
+
self._groq_client = None
|
| 50 |
|
| 51 |
# ------------------------------------------------------------------
|
| 52 |
# Internal: backend initialisation
|
|
|
|
| 68 |
)
|
| 69 |
return self._openai_client
|
| 70 |
|
| 71 |
+
def _get_groq_client(self):
    """Return (and cache) an ``openai.OpenAI`` client pointed at Groq.

    The API key is ``self._token_override`` when it is set, otherwise
    ``config.groq_api_key``; the endpoint is ``config.groq_base_url``.
    The client is built on first use and reused on later calls.

    Raises:
        ImportError: if the ``openai`` package is not installed.
        RuntimeError: if neither source provides an API key.
    """
    cached = getattr(self, "_groq_client", None)
    if cached is not None:
        return cached

    # Imported lazily so the package is only required when Groq is used.
    try:
        from openai import OpenAI  # type: ignore
    except ImportError as exc:
        raise ImportError(
            "The 'openai' package is required for the Groq backend.\n"
            "Install it with: pip install openai>=1.0.0"
        ) from exc

    api_key = self._token_override or config.groq_api_key
    if not api_key:
        # Fail fast with an actionable message instead of deferring the
        # failure to the first API request.
        raise RuntimeError(
            "No Groq API key configured. "
            "Set GROQ_API_KEY or get a free key at https://console.groq.com/keys"
        )

    self._groq_client = OpenAI(
        base_url=config.groq_base_url,
        api_key=api_key,
    )
    return self._groq_client
|
| 87 |
+
|
| 88 |
def _get_hf_client(self):
|
| 89 |
"""Return (and cache) a huggingface_hub.InferenceClient."""
|
| 90 |
if self._hf_client is None:
|
|
|
|
| 151 |
|
| 152 |
if self.backend == "ollama":
|
| 153 |
return self._generate_ollama(system, user, json_mode=json_mode)
|
| 154 |
+
elif self.backend == "groq":
|
| 155 |
+
return self._generate_groq(system, user, json_mode=json_mode)
|
| 156 |
elif self.backend == "huggingface":
|
| 157 |
return self._generate_hf(system, user)
|
| 158 |
else:
|
| 159 |
raise RuntimeError(
|
| 160 |
f"Unknown LLM backend: '{self.backend}'. "
|
| 161 |
+
"Set LLM_BACKEND=ollama, groq, or huggingface in your .env."
|
| 162 |
)
|
| 163 |
|
| 164 |
def stream_generate(self, system: str, user: str) -> Iterator[str]:
|
|
|
|
| 173 |
"""
|
| 174 |
if self.backend == "ollama":
|
| 175 |
yield from self._stream_ollama(system, user)
|
| 176 |
+
elif self.backend == "groq":
|
| 177 |
+
yield from self._stream_groq(system, user)
|
| 178 |
elif self.backend == "huggingface":
|
| 179 |
yield from self._stream_hf(system, user)
|
| 180 |
else:
|
|
|
|
| 238 |
) from exc
|
| 239 |
raise RuntimeError(f"Ollama streaming failed: {exc}") from exc
|
| 240 |
|
| 241 |
+
# ------------------------------------------------------------------
|
| 242 |
+
# Groq implementation (OpenAI-compatible API)
|
| 243 |
+
# ------------------------------------------------------------------
|
| 244 |
+
|
| 245 |
+
def _generate_groq(self, system: str, user: str, json_mode: bool = False) -> str:
    """Run one non-streaming chat completion against Groq and return its text.

    Args:
        system: Content of the system message.
        user: Content of the user message.
        json_mode: When true, ask for a ``{"type": "json_object"}``
            response format.

    Returns:
        The content of the first choice, or ``""`` when that content is
        empty or ``None``.

    Raises:
        RuntimeError: wrapping any exception raised by the request or
            while reading the first choice from the response.
    """
    groq = self._get_groq_client()
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    request: dict = dict(
        model=self.model,
        messages=conversation,
        max_tokens=config.max_tokens,
    )
    if json_mode:
        request["response_format"] = {"type": "json_object"}

    try:
        completion = groq.chat.completions.create(**request)
        text = completion.choices[0].message.content
    except Exception as exc:
        raise RuntimeError(f"Groq inference failed: {exc}") from exc
    return text or ""
|
| 262 |
+
|
| 263 |
+
def _stream_groq(self, system: str, user: str) -> Iterator[str]:
    """Stream a Groq chat completion, yielding text fragments as they arrive.

    Args:
        system: Content of the system message.
        user: Content of the user message.

    Yields:
        Each non-empty ``delta.content`` string from the streamed chunks.

    Raises:
        RuntimeError: wrapping any exception raised while opening or
            consuming the stream.
    """
    client = self._get_groq_client()
    try:
        stream = client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            max_tokens=config.max_tokens,
            stream=True,
        )
        for chunk in stream:
            # OpenAI-compatible endpoints may emit chunks whose `choices`
            # list is empty (e.g. usage-only chunks). Skip them rather than
            # letting choices[0] abort the whole stream with an IndexError.
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta
            if delta and delta.content:
                yield delta.content
    except Exception as exc:
        raise RuntimeError(f"Groq streaming failed: {exc}") from exc
|
| 281 |
+
|
| 282 |
# ------------------------------------------------------------------
|
| 283 |
# HuggingFace implementation (via InferenceClient)
|
| 284 |
# ------------------------------------------------------------------
|
|
|
|
| 299 |
return result.choices[0].message.content or ""
|
| 300 |
except Exception as exc:
|
| 301 |
last_exc = exc
|
| 302 |
+
err_str = str(exc)
|
| 303 |
+
if "402" in err_str:
|
| 304 |
+
raise LLMQuotaError(
|
| 305 |
+
"HuggingFace quota exceeded (402 Payment Required).\n"
|
| 306 |
+
"Switch to a different model in the LLM Settings panel, "
|
| 307 |
+
"or wait for your free-tier quota to reset."
|
| 308 |
+
) from exc
|
| 309 |
+
if "503" in err_str or "502" in err_str or "529" in err_str:
|
| 310 |
import time as _time
|
| 311 |
_time.sleep(2 ** attempt) # 1s, 2s, 4s
|
| 312 |
continue
|
agent/pipeline.py
CHANGED
|
@@ -15,16 +15,16 @@ from agent.cover_letter import CoverLetterWriter
|
|
| 15 |
class JobAgent:
|
| 16 |
"""Orchestrates CV parsing, job search, scoring, and application generation.
|
| 17 |
|
| 18 |
-
Each instance holds its own LLM client
|
| 19 |
-
model — safe to instantiate per-request (no shared mutable state).
|
| 20 |
|
| 21 |
Args:
|
| 22 |
-
|
| 23 |
-
|
|
|
|
| 24 |
"""
|
| 25 |
|
| 26 |
-
def __init__(self,
|
| 27 |
-
self.llm = LLMClient(model=model, backend=
|
| 28 |
self.parser = CVParser(self.llm)
|
| 29 |
self.searcher = JobSearcher()
|
| 30 |
self.matcher = JobMatcher(self.llm)
|
|
|
|
| 15 |
class JobAgent:
|
| 16 |
"""Orchestrates CV parsing, job search, scoring, and application generation.
|
| 17 |
|
| 18 |
+
Each instance holds its own LLM client — safe to instantiate per-request.
|
|
|
|
| 19 |
|
| 20 |
Args:
|
| 21 |
+
model: Model ID for the selected backend.
|
| 22 |
+
backend: "groq" | "huggingface" | "ollama"
|
| 23 |
+
api_key: API key for the selected backend (not needed for Ollama).
|
| 24 |
"""
|
| 25 |
|
| 26 |
+
def __init__(self, model: str, backend: str = "groq", api_key: str = "") -> None:
|
| 27 |
+
self.llm = LLMClient(model=model, backend=backend, token=api_key or None)
|
| 28 |
self.parser = CVParser(self.llm)
|
| 29 |
self.searcher = JobSearcher()
|
| 30 |
self.matcher = JobMatcher(self.llm)
|
app.py
CHANGED
|
@@ -174,13 +174,13 @@ def run_search(
|
|
| 174 |
return _err("Please upload a CV file first.")
|
| 175 |
if not field or not field.strip():
|
| 176 |
return _err("Please enter a research field.")
|
| 177 |
-
if not
|
| 178 |
-
return _err("No
|
| 179 |
|
| 180 |
try:
|
| 181 |
from agent.pipeline import JobAgent
|
| 182 |
|
| 183 |
-
agent = JobAgent(
|
| 184 |
cv_path = cv_file if isinstance(cv_file, str) else cv_file.name
|
| 185 |
|
| 186 |
progress(0, desc="Parsing CV...")
|
|
@@ -246,7 +246,7 @@ def load_position(
|
|
| 246 |
match: dict = job.get("match") or {}
|
| 247 |
|
| 248 |
from agent.pipeline import JobAgent
|
| 249 |
-
agent = JobAgent(
|
| 250 |
|
| 251 |
progress(0.3, desc="Generating tailoring hints...")
|
| 252 |
hints, cover_letter = agent.prepare_application(job, profile_text)
|
|
@@ -270,7 +270,7 @@ def regenerate_letter(
|
|
| 270 |
return "*No position loaded.*"
|
| 271 |
try:
|
| 272 |
from agent.pipeline import JobAgent
|
| 273 |
-
agent = JobAgent(
|
| 274 |
progress(0.3, desc="Regenerating cover letter...")
|
| 275 |
result = agent.regenerate_letter(scored_jobs[current_idx], profile_text)
|
| 276 |
progress(1.0)
|
|
@@ -372,7 +372,14 @@ def export_zip(approved: list) -> tuple:
|
|
| 372 |
# Gradio Blocks layout
|
| 373 |
# ---------------------------------------------------------------------------
|
| 374 |
|
| 375 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
"Qwen/Qwen2.5-7B-Instruct",
|
| 377 |
"meta-llama/Llama-3.2-3B-Instruct",
|
| 378 |
"microsoft/Phi-3.5-mini-instruct",
|
|
@@ -401,8 +408,20 @@ LOCATIONS = [
|
|
| 401 |
"South Africa", "Israel",
|
| 402 |
]
|
| 403 |
|
| 404 |
-
#
|
| 405 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
|
| 407 |
with gr.Blocks(
|
| 408 |
theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"),
|
|
@@ -464,7 +483,7 @@ with gr.Blocks(
|
|
| 464 |
label="Model",
|
| 465 |
choices=MODELS,
|
| 466 |
value=MODELS[0],
|
| 467 |
-
info=
|
| 468 |
)
|
| 469 |
search_btn = gr.Button("Parse CV & Search Positions", variant="primary", size="lg")
|
| 470 |
search_status = gr.Markdown("*Ready. Fill in the form and click Search.*")
|
|
|
|
| 174 |
return _err("Please upload a CV file first.")
|
| 175 |
if not field or not field.strip():
|
| 176 |
return _err("Please enter a research field.")
|
| 177 |
+
if not _API_KEY and _BACKEND != "ollama":
|
| 178 |
+
return _err("No API key configured. Set GROQ_API_KEY (or HF_TOKEN) as a Space secret.")
|
| 179 |
|
| 180 |
try:
|
| 181 |
from agent.pipeline import JobAgent
|
| 182 |
|
| 183 |
+
agent = JobAgent(model=model_name, backend=_BACKEND, api_key=_API_KEY)
|
| 184 |
cv_path = cv_file if isinstance(cv_file, str) else cv_file.name
|
| 185 |
|
| 186 |
progress(0, desc="Parsing CV...")
|
|
|
|
| 246 |
match: dict = job.get("match") or {}
|
| 247 |
|
| 248 |
from agent.pipeline import JobAgent
|
| 249 |
+
agent = JobAgent(model=model_name, backend=_BACKEND, api_key=_API_KEY)
|
| 250 |
|
| 251 |
progress(0.3, desc="Generating tailoring hints...")
|
| 252 |
hints, cover_letter = agent.prepare_application(job, profile_text)
|
|
|
|
| 270 |
return "*No position loaded.*"
|
| 271 |
try:
|
| 272 |
from agent.pipeline import JobAgent
|
| 273 |
+
agent = JobAgent(model=model_name, backend=_BACKEND, api_key=_API_KEY)
|
| 274 |
progress(0.3, desc="Regenerating cover letter...")
|
| 275 |
result = agent.regenerate_letter(scored_jobs[current_idx], profile_text)
|
| 276 |
progress(1.0)
|
|
|
|
| 372 |
# Gradio Blocks layout
|
| 373 |
# ---------------------------------------------------------------------------
|
| 374 |
|
| 375 |
+
GROQ_MODELS = [
|
| 376 |
+
"llama-3.3-70b-versatile",
|
| 377 |
+
"llama-3.1-8b-instant",
|
| 378 |
+
"gemma2-9b-it",
|
| 379 |
+
"mixtral-8x7b-32768",
|
| 380 |
+
]
|
| 381 |
+
|
| 382 |
+
HF_MODELS = [
|
| 383 |
"Qwen/Qwen2.5-7B-Instruct",
|
| 384 |
"meta-llama/Llama-3.2-3B-Instruct",
|
| 385 |
"microsoft/Phi-3.5-mini-instruct",
|
|
|
|
| 408 |
"South Africa", "Israel",
|
| 409 |
]
|
| 410 |
|
| 411 |
+
# Backend selection: Groq takes priority over HuggingFace.
# Keys are stripped so that a secret saved with stray whitespace or a trailing
# newline is neither mistaken for a configured key nor passed on malformed.
_GROQ_KEY = os.environ.get("GROQ_API_KEY", "").strip()
_HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()

if _GROQ_KEY:
    _BACKEND = "groq"
    _API_KEY = _GROQ_KEY
    MODELS = GROQ_MODELS
    _MODEL_INFO = "Free via Groq — no user limits"
else:
    # No Groq key: fall back to HuggingFace. _API_KEY may be empty here.
    _BACKEND = "huggingface"
    _API_KEY = _HF_TOKEN
    MODELS = HF_MODELS
    _MODEL_INFO = "Free via HuggingFace Inference API"
|
| 425 |
|
| 426 |
with gr.Blocks(
|
| 427 |
theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"),
|
|
|
|
| 483 |
label="Model",
|
| 484 |
choices=MODELS,
|
| 485 |
value=MODELS[0],
|
| 486 |
+
info=_MODEL_INFO,
|
| 487 |
)
|
| 488 |
search_btn = gr.Button("Parse CV & Search Positions", variant="primary", size="lg")
|
| 489 |
search_status = gr.Markdown("*Ready. Fill in the form and click Search.*")
|
config.py
CHANGED
|
@@ -47,6 +47,10 @@ class AppConfig:
|
|
| 47 |
default_factory=lambda: os.getenv("HF_MODEL", "mistralai/Mistral-7B-Instruct-v0.3")
|
| 48 |
)
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
# Generation settings
|
| 51 |
max_tokens: int = 4096
|
| 52 |
|
|
@@ -95,10 +99,16 @@ class AppConfig:
|
|
| 95 |
"Free HuggingFace inference may be rate-limited or unavailable. "
|
| 96 |
"Get a free key at https://huggingface.co/settings/tokens"
|
| 97 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
else:
|
| 99 |
print(
|
| 100 |
f"[WARNING] Unknown LLM_BACKEND '{self.llm_backend}'. "
|
| 101 |
-
"Supported values: 'ollama', 'huggingface'."
|
| 102 |
)
|
| 103 |
|
| 104 |
|
|
|
|
| 47 |
default_factory=lambda: os.getenv("HF_MODEL", "mistralai/Mistral-7B-Instruct-v0.3")
|
| 48 |
)
|
| 49 |
|
| 50 |
+
# Groq settings (recommended free cloud backend)
|
| 51 |
+
groq_api_key: str = field(default_factory=lambda: os.getenv("GROQ_API_KEY", ""))
|
| 52 |
+
groq_base_url: str = "https://api.groq.com/openai/v1"
|
| 53 |
+
|
| 54 |
# Generation settings
|
| 55 |
max_tokens: int = 4096
|
| 56 |
|
|
|
|
| 99 |
"Free HuggingFace inference may be rate-limited or unavailable. "
|
| 100 |
"Get a free key at https://huggingface.co/settings/tokens"
|
| 101 |
)
|
| 102 |
+
elif self.llm_backend == "groq":
|
| 103 |
+
if not self.groq_api_key:
|
| 104 |
+
print(
|
| 105 |
+
"[WARNING] GROQ_API_KEY is not set. "
|
| 106 |
+
"Get a free key at https://console.groq.com/keys"
|
| 107 |
+
)
|
| 108 |
else:
|
| 109 |
print(
|
| 110 |
f"[WARNING] Unknown LLM_BACKEND '{self.llm_backend}'. "
|
| 111 |
+
"Supported values: 'ollama', 'huggingface', 'groq'."
|
| 112 |
)
|
| 113 |
|
| 114 |
|