HipFil98 Claude Sonnet 4.6 committed on
Commit
55df7c0
·
1 Parent(s): 51f8256

feat: add Groq backend as primary free inference provider

Browse files

- Add Groq (OpenAI-compatible) backend to LLMClient and config
- Auto-select Groq when GROQ_API_KEY secret is set, fall back to HF
- Update JobAgent to accept backend + api_key instead of HF-only token
- Raise LLMQuotaError (402) explicitly instead of silently scoring 0
- Switch model list to Groq models (llama-3.3-70b, llama-3.1-8b, gemma2, mixtral)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (5) hide show
  1. agent/job_matcher.py +3 -1
  2. agent/llm_client.py +76 -2
  3. agent/pipeline.py +6 -6
  4. app.py +28 -9
  5. config.py +11 -1
agent/job_matcher.py CHANGED
@@ -4,7 +4,7 @@ from __future__ import annotations
4
 
5
  from typing import Any, TypedDict
6
 
7
- from agent.llm_client import LLMClient
8
  from agent.utils import parse_json, job_institution, job_description
9
 
10
 
@@ -101,6 +101,8 @@ class JobMatcher:
101
 
102
  try:
103
  raw = self.llm.generate(system=_SYSTEM, user=prompt, json_mode=True)
 
 
104
  except RuntimeError as exc:
105
  return _fallback(str(exc))
106
 
 
4
 
5
  from typing import Any, TypedDict
6
 
7
+ from agent.llm_client import LLMClient, LLMQuotaError
8
  from agent.utils import parse_json, job_institution, job_description
9
 
10
 
 
101
 
102
  try:
103
  raw = self.llm.generate(system=_SYSTEM, user=prompt, json_mode=True)
104
+ except LLMQuotaError:
105
+ raise # propagate — caller should surface this to the user
106
  except RuntimeError as exc:
107
  return _fallback(str(exc))
108
 
agent/llm_client.py CHANGED
@@ -19,6 +19,10 @@ from typing import Iterator
19
  from config import config
20
 
21
 
 
 
 
 
22
  class LLMClient:
23
  """Unified LLM client supporting Ollama and HuggingFace backends.
24
 
@@ -42,6 +46,7 @@ class LLMClient:
42
  # Lazily initialised clients
43
  self._openai_client = None
44
  self._hf_client = None
 
45
 
46
  # ------------------------------------------------------------------
47
  # Internal: backend initialisation
@@ -63,6 +68,23 @@ class LLMClient:
63
  )
64
  return self._openai_client
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  def _get_hf_client(self):
67
  """Return (and cache) a huggingface_hub.InferenceClient."""
68
  if self._hf_client is None:
@@ -129,12 +151,14 @@ class LLMClient:
129
 
130
  if self.backend == "ollama":
131
  return self._generate_ollama(system, user, json_mode=json_mode)
 
 
132
  elif self.backend == "huggingface":
133
  return self._generate_hf(system, user)
134
  else:
135
  raise RuntimeError(
136
  f"Unknown LLM backend: '{self.backend}'. "
137
- "Set LLM_BACKEND=ollama or LLM_BACKEND=huggingface in your .env."
138
  )
139
 
140
  def stream_generate(self, system: str, user: str) -> Iterator[str]:
@@ -149,6 +173,8 @@ class LLMClient:
149
  """
150
  if self.backend == "ollama":
151
  yield from self._stream_ollama(system, user)
 
 
152
  elif self.backend == "huggingface":
153
  yield from self._stream_hf(system, user)
154
  else:
@@ -212,6 +238,47 @@ class LLMClient:
212
  ) from exc
213
  raise RuntimeError(f"Ollama streaming failed: {exc}") from exc
214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  # ------------------------------------------------------------------
216
  # HuggingFace implementation (via InferenceClient)
217
  # ------------------------------------------------------------------
@@ -232,7 +299,14 @@ class LLMClient:
232
  return result.choices[0].message.content or ""
233
  except Exception as exc:
234
  last_exc = exc
235
- if "503" in str(exc) or "502" in str(exc) or "529" in str(exc):
 
 
 
 
 
 
 
236
  import time as _time
237
  _time.sleep(2 ** attempt) # 1s, 2s, 4s
238
  continue
 
19
  from config import config
20
 
21
 
22
+ class LLMQuotaError(RuntimeError):
23
+ """Raised when the HuggingFace free-tier quota is exhausted (HTTP 402)."""
24
+
25
+
26
  class LLMClient:
27
  """Unified LLM client supporting Ollama and HuggingFace backends.
28
 
 
46
  # Lazily initialised clients
47
  self._openai_client = None
48
  self._hf_client = None
49
+ self._groq_client = None
50
 
51
  # ------------------------------------------------------------------
52
  # Internal: backend initialisation
 
68
  )
69
  return self._openai_client
70
 
71
+ def _get_groq_client(self):
72
+ """Return (and cache) an openai.OpenAI client pointed at Groq."""
73
+ if not hasattr(self, "_groq_client") or self._groq_client is None:
74
+ try:
75
+ from openai import OpenAI # type: ignore
76
+ except ImportError as exc:
77
+ raise ImportError(
78
+ "The 'openai' package is required for the Groq backend.\n"
79
+ "Install it with: pip install openai>=1.0.0"
80
+ ) from exc
81
+ api_key = self._token_override or config.groq_api_key
82
+ self._groq_client = OpenAI(
83
+ base_url=config.groq_base_url,
84
+ api_key=api_key,
85
+ )
86
+ return self._groq_client
87
+
88
  def _get_hf_client(self):
89
  """Return (and cache) a huggingface_hub.InferenceClient."""
90
  if self._hf_client is None:
 
151
 
152
  if self.backend == "ollama":
153
  return self._generate_ollama(system, user, json_mode=json_mode)
154
+ elif self.backend == "groq":
155
+ return self._generate_groq(system, user, json_mode=json_mode)
156
  elif self.backend == "huggingface":
157
  return self._generate_hf(system, user)
158
  else:
159
  raise RuntimeError(
160
  f"Unknown LLM backend: '{self.backend}'. "
161
+ "Set LLM_BACKEND=ollama, groq, or huggingface in your .env."
162
  )
163
 
164
  def stream_generate(self, system: str, user: str) -> Iterator[str]:
 
173
  """
174
  if self.backend == "ollama":
175
  yield from self._stream_ollama(system, user)
176
+ elif self.backend == "groq":
177
+ yield from self._stream_groq(system, user)
178
  elif self.backend == "huggingface":
179
  yield from self._stream_hf(system, user)
180
  else:
 
238
  ) from exc
239
  raise RuntimeError(f"Ollama streaming failed: {exc}") from exc
240
 
241
+ # ------------------------------------------------------------------
242
+ # Groq implementation (OpenAI-compatible API)
243
+ # ------------------------------------------------------------------
244
+
245
+ def _generate_groq(self, system: str, user: str, json_mode: bool = False) -> str:
246
+ client = self._get_groq_client()
247
+ kwargs: dict = {
248
+ "model": self.model,
249
+ "messages": [
250
+ {"role": "system", "content": system},
251
+ {"role": "user", "content": user},
252
+ ],
253
+ "max_tokens": config.max_tokens,
254
+ }
255
+ if json_mode:
256
+ kwargs["response_format"] = {"type": "json_object"}
257
+ try:
258
+ response = client.chat.completions.create(**kwargs)
259
+ return response.choices[0].message.content or ""
260
+ except Exception as exc:
261
+ raise RuntimeError(f"Groq inference failed: {exc}") from exc
262
+
263
+ def _stream_groq(self, system: str, user: str) -> Iterator[str]:
264
+ client = self._get_groq_client()
265
+ try:
266
+ stream = client.chat.completions.create(
267
+ model=self.model,
268
+ messages=[
269
+ {"role": "system", "content": system},
270
+ {"role": "user", "content": user},
271
+ ],
272
+ max_tokens=config.max_tokens,
273
+ stream=True,
274
+ )
275
+ for chunk in stream:
276
+ delta = chunk.choices[0].delta
277
+ if delta and delta.content:
278
+ yield delta.content
279
+ except Exception as exc:
280
+ raise RuntimeError(f"Groq streaming failed: {exc}") from exc
281
+
282
  # ------------------------------------------------------------------
283
  # HuggingFace implementation (via InferenceClient)
284
  # ------------------------------------------------------------------
 
299
  return result.choices[0].message.content or ""
300
  except Exception as exc:
301
  last_exc = exc
302
+ err_str = str(exc)
303
+ if "402" in err_str:
304
+ raise LLMQuotaError(
305
+ "HuggingFace quota exceeded (402 Payment Required).\n"
306
+ "Switch to a different model in the LLM Settings panel, "
307
+ "or wait for your free-tier quota to reset."
308
+ ) from exc
309
+ if "503" in err_str or "502" in err_str or "529" in err_str:
310
  import time as _time
311
  _time.sleep(2 ** attempt) # 1s, 2s, 4s
312
  continue
agent/pipeline.py CHANGED
@@ -15,16 +15,16 @@ from agent.cover_letter import CoverLetterWriter
15
  class JobAgent:
16
  """Orchestrates CV parsing, job search, scoring, and application generation.
17
 
18
- Each instance holds its own LLM client configured with the given token and
19
- model — safe to instantiate per-request (no shared mutable state).
20
 
21
  Args:
22
- token: HuggingFace API token.
23
- model: HuggingFace model ID (e.g. "mistralai/Mistral-7B-Instruct-v0.3").
 
24
  """
25
 
26
- def __init__(self, token: str, model: str) -> None:
27
- self.llm = LLMClient(model=model, backend="huggingface", token=token)
28
  self.parser = CVParser(self.llm)
29
  self.searcher = JobSearcher()
30
  self.matcher = JobMatcher(self.llm)
 
15
  class JobAgent:
16
  """Orchestrates CV parsing, job search, scoring, and application generation.
17
 
18
+ Each instance holds its own LLM client — safe to instantiate per-request.
 
19
 
20
  Args:
21
+ model: Model ID for the selected backend.
22
+ backend: "groq" | "huggingface" | "ollama"
23
+ api_key: API key for the selected backend (not needed for Ollama).
24
  """
25
 
26
+ def __init__(self, model: str, backend: str = "groq", api_key: str = "") -> None:
27
+ self.llm = LLMClient(model=model, backend=backend, token=api_key or None)
28
  self.parser = CVParser(self.llm)
29
  self.searcher = JobSearcher()
30
  self.matcher = JobMatcher(self.llm)
app.py CHANGED
@@ -174,13 +174,13 @@ def run_search(
174
  return _err("Please upload a CV file first.")
175
  if not field or not field.strip():
176
  return _err("Please enter a research field.")
177
- if not _SHARED_TOKEN:
178
- return _err("No HF_TOKEN configured. Set it as a Space secret.")
179
 
180
  try:
181
  from agent.pipeline import JobAgent
182
 
183
- agent = JobAgent(token=_SHARED_TOKEN, model=model_name)
184
  cv_path = cv_file if isinstance(cv_file, str) else cv_file.name
185
 
186
  progress(0, desc="Parsing CV...")
@@ -246,7 +246,7 @@ def load_position(
246
  match: dict = job.get("match") or {}
247
 
248
  from agent.pipeline import JobAgent
249
- agent = JobAgent(token=_SHARED_TOKEN, model=model_name)
250
 
251
  progress(0.3, desc="Generating tailoring hints...")
252
  hints, cover_letter = agent.prepare_application(job, profile_text)
@@ -270,7 +270,7 @@ def regenerate_letter(
270
  return "*No position loaded.*"
271
  try:
272
  from agent.pipeline import JobAgent
273
- agent = JobAgent(token=_SHARED_TOKEN, model=model_name)
274
  progress(0.3, desc="Regenerating cover letter...")
275
  result = agent.regenerate_letter(scored_jobs[current_idx], profile_text)
276
  progress(1.0)
@@ -372,7 +372,14 @@ def export_zip(approved: list) -> tuple:
372
  # Gradio Blocks layout
373
  # ---------------------------------------------------------------------------
374
 
375
- MODELS = [
 
 
 
 
 
 
 
376
  "Qwen/Qwen2.5-7B-Instruct",
377
  "meta-llama/Llama-3.2-3B-Instruct",
378
  "microsoft/Phi-3.5-mini-instruct",
@@ -401,8 +408,20 @@ LOCATIONS = [
401
  "South Africa", "Israel",
402
  ]
403
 
404
- # If a shared token is configured via Space Secrets, users don't need to provide one.
405
- _SHARED_TOKEN = os.environ.get("HF_TOKEN", "")
 
 
 
 
 
 
 
 
 
 
 
 
406
 
407
  with gr.Blocks(
408
  theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"),
@@ -464,7 +483,7 @@ with gr.Blocks(
464
  label="Model",
465
  choices=MODELS,
466
  value=MODELS[0],
467
- info="All are free via HF Inference API",
468
  )
469
  search_btn = gr.Button("Parse CV & Search Positions", variant="primary", size="lg")
470
  search_status = gr.Markdown("*Ready. Fill in the form and click Search.*")
 
174
  return _err("Please upload a CV file first.")
175
  if not field or not field.strip():
176
  return _err("Please enter a research field.")
177
+ if not _API_KEY and _BACKEND != "ollama":
178
+ return _err("No API key configured. Set GROQ_API_KEY (or HF_TOKEN) as a Space secret.")
179
 
180
  try:
181
  from agent.pipeline import JobAgent
182
 
183
+ agent = JobAgent(model=model_name, backend=_BACKEND, api_key=_API_KEY)
184
  cv_path = cv_file if isinstance(cv_file, str) else cv_file.name
185
 
186
  progress(0, desc="Parsing CV...")
 
246
  match: dict = job.get("match") or {}
247
 
248
  from agent.pipeline import JobAgent
249
+ agent = JobAgent(model=model_name, backend=_BACKEND, api_key=_API_KEY)
250
 
251
  progress(0.3, desc="Generating tailoring hints...")
252
  hints, cover_letter = agent.prepare_application(job, profile_text)
 
270
  return "*No position loaded.*"
271
  try:
272
  from agent.pipeline import JobAgent
273
+ agent = JobAgent(model=model_name, backend=_BACKEND, api_key=_API_KEY)
274
  progress(0.3, desc="Regenerating cover letter...")
275
  result = agent.regenerate_letter(scored_jobs[current_idx], profile_text)
276
  progress(1.0)
 
372
  # Gradio Blocks layout
373
  # ---------------------------------------------------------------------------
374
 
375
+ GROQ_MODELS = [
376
+ "llama-3.3-70b-versatile",
377
+ "llama-3.1-8b-instant",
378
+ "gemma2-9b-it",
379
+ "mixtral-8x7b-32768",
380
+ ]
381
+
382
+ HF_MODELS = [
383
  "Qwen/Qwen2.5-7B-Instruct",
384
  "meta-llama/Llama-3.2-3B-Instruct",
385
  "microsoft/Phi-3.5-mini-instruct",
 
408
  "South Africa", "Israel",
409
  ]
410
 
411
+ # Backend selection: Groq takes priority over HuggingFace
412
+ _GROQ_KEY = os.environ.get("GROQ_API_KEY", "")
413
+ _HF_TOKEN = os.environ.get("HF_TOKEN", "")
414
+
415
+ if _GROQ_KEY:
416
+ _BACKEND = "groq"
417
+ _API_KEY = _GROQ_KEY
418
+ MODELS = GROQ_MODELS
419
+ _MODEL_INFO = "Free via Groq — no user limits"
420
+ else:
421
+ _BACKEND = "huggingface"
422
+ _API_KEY = _HF_TOKEN
423
+ MODELS = HF_MODELS
424
+ _MODEL_INFO = "Free via HuggingFace Inference API"
425
 
426
  with gr.Blocks(
427
  theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"),
 
483
  label="Model",
484
  choices=MODELS,
485
  value=MODELS[0],
486
+ info=_MODEL_INFO,
487
  )
488
  search_btn = gr.Button("Parse CV & Search Positions", variant="primary", size="lg")
489
  search_status = gr.Markdown("*Ready. Fill in the form and click Search.*")
config.py CHANGED
@@ -47,6 +47,10 @@ class AppConfig:
47
  default_factory=lambda: os.getenv("HF_MODEL", "mistralai/Mistral-7B-Instruct-v0.3")
48
  )
49
 
 
 
 
 
50
  # Generation settings
51
  max_tokens: int = 4096
52
 
@@ -95,10 +99,16 @@ class AppConfig:
95
  "Free HuggingFace inference may be rate-limited or unavailable. "
96
  "Get a free key at https://huggingface.co/settings/tokens"
97
  )
 
 
 
 
 
 
98
  else:
99
  print(
100
  f"[WARNING] Unknown LLM_BACKEND '{self.llm_backend}'. "
101
- "Supported values: 'ollama', 'huggingface'."
102
  )
103
 
104
 
 
47
  default_factory=lambda: os.getenv("HF_MODEL", "mistralai/Mistral-7B-Instruct-v0.3")
48
  )
49
 
50
+ # Groq settings (recommended free cloud backend)
51
+ groq_api_key: str = field(default_factory=lambda: os.getenv("GROQ_API_KEY", ""))
52
+ groq_base_url: str = "https://api.groq.com/openai/v1"
53
+
54
  # Generation settings
55
  max_tokens: int = 4096
56
 
 
99
  "Free HuggingFace inference may be rate-limited or unavailable. "
100
  "Get a free key at https://huggingface.co/settings/tokens"
101
  )
102
+ elif self.llm_backend == "groq":
103
+ if not self.groq_api_key:
104
+ print(
105
+ "[WARNING] GROQ_API_KEY is not set. "
106
+ "Get a free key at https://console.groq.com/keys"
107
+ )
108
  else:
109
  print(
110
  f"[WARNING] Unknown LLM_BACKEND '{self.llm_backend}'. "
111
+ "Supported values: 'ollama', 'huggingface', 'groq'."
112
  )
113
 
114