Aramente Claude Opus 4 committed on
Commit
3da512e
·
1 Parent(s): 660b183

security round 2: payload caps, traceback strip, GDPR delete, path allowlist

Browse files

- H6+H8: strip exception details from chat/draft/generate/cover_letter/linkedin;
log internally via logging.exception, return generic 502 to client
- H1: cap snapshot payload at 500KB before allocating slug or hitting DB
- H2: BodySizeLimitMiddleware (1MB cap, PDF routes excepted with own 5MB caps);
ChatMessage.content max_length=8000; ImproveBulletRequest field caps
- H7: DELETE /api/auth/account — wipes user + FK children in one txn, returns counts
- M-tier frontend: cv_actions path allowlist in _apply_substitution (regex gates
to summary/experiences[N].(title|company|dates|bullets[M]|...)/education/skills);
ChatMessage urlTransform restricts links to http(s)/mailto/anchor/relative
- Add missing rate limit to /translate-cv
- Rip ?fix-lang=1 one-shot localStorage migration (served its purpose)
- Fix missing HTTPException import in auth.py from H7 patch

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>

app/main.py CHANGED
@@ -1,7 +1,9 @@
1
  import os
2
 
3
- from fastapi import FastAPI, Header, HTTPException
4
  from fastapi.middleware.cors import CORSMiddleware
 
 
5
  from starlette.middleware.sessions import SessionMiddleware
6
 
7
  from app.routers import linkedin, offer, chat, generate, auth, draft, projects, knowledge, cover_letter, transcribe, snapshots
@@ -17,6 +19,32 @@ if not SESSION_SECRET:
17
 
18
  app = FastAPI(title="Bored CV API", version="0.1.0")
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  app.add_middleware(SessionMiddleware, secret_key=SESSION_SECRET)
21
 
22
  app.add_middleware(
@@ -103,9 +131,8 @@ async def debug_parse(x_admin_secret: str = Header("")):
103
  """Debug: test full PDF parser pipeline. Protected by ADMIN_SECRET header."""
104
  if not ADMIN_SECRET or x_admin_secret != ADMIN_SECRET:
105
  raise HTTPException(status_code=403, detail="Forbidden")
106
- import traceback
107
  from mistralai.client import Mistral
108
- from app.services.pdf_parser import extract_pdf_text
109
  key = os.environ.get("MISTRAL_API_KEY", "")
110
  client = Mistral(api_key=key)
111
 
@@ -131,4 +158,6 @@ Return valid JSON with: name, title, email, phone, linkedin, location, summary,
131
  data = json.loads(r.choices[0].message.content)
132
  return {"ok": True, "name": data.get("name"), "experiences": len(data.get("experiences", [])), "provider": "mistral"}
133
  except Exception as e:
134
- return {"ok": False, "error": str(e), "traceback": traceback.format_exc(), "provider": "mistral"}
 
 
 
1
  import os
2
 
3
+ from fastapi import FastAPI, Header, HTTPException, Request
4
  from fastapi.middleware.cors import CORSMiddleware
5
+ from fastapi.responses import JSONResponse
6
+ from starlette.middleware.base import BaseHTTPMiddleware
7
  from starlette.middleware.sessions import SessionMiddleware
8
 
9
  from app.routers import linkedin, offer, chat, generate, auth, draft, projects, knowledge, cover_letter, transcribe, snapshots
 
19
 
20
  app = FastAPI(title="Bored CV API", version="0.1.0")
21
 
22
+
23
+ # Cap body size on every request. Prevents (a) memory DoS via gigabyte JSON
24
+ # bodies and (b) mega-payload prompt injection where an attacker stuffs many
25
+ # KB of "ignore previous instructions" into a free-text field. The PDF upload
26
+ # route enforces its own 5 MB limit; everything else is JSON and 1 MB is
27
+ # already 10× a real CV.
28
+ MAX_REQUEST_BYTES = 1_000_000
29
+ PDF_UPLOAD_PATHS = {"/api/parse-linkedin", "/api/debug-parse-pdf", "/api/transcribe"}
30
+
31
+
32
+ class BodySizeLimitMiddleware(BaseHTTPMiddleware):
33
+ async def dispatch(self, request: Request, call_next):
34
+ if request.url.path in PDF_UPLOAD_PATHS:
35
+ # Multipart upload routes enforce their own per-file caps; skip the
36
+ # JSON-tier limit so a 4 MB PDF isn't blocked here.
37
+ return await call_next(request)
38
+ cl = request.headers.get("content-length")
39
+ if cl and cl.isdigit() and int(cl) > MAX_REQUEST_BYTES:
40
+ return JSONResponse(
41
+ status_code=413,
42
+ content={"detail": "Request body too large"},
43
+ )
44
+ return await call_next(request)
45
+
46
+
47
+ app.add_middleware(BodySizeLimitMiddleware)
48
  app.add_middleware(SessionMiddleware, secret_key=SESSION_SECRET)
49
 
50
  app.add_middleware(
 
131
  """Debug: test full PDF parser pipeline. Protected by ADMIN_SECRET header."""
132
  if not ADMIN_SECRET or x_admin_secret != ADMIN_SECRET:
133
  raise HTTPException(status_code=403, detail="Forbidden")
134
+ import logging
135
  from mistralai.client import Mistral
 
136
  key = os.environ.get("MISTRAL_API_KEY", "")
137
  client = Mistral(api_key=key)
138
 
 
158
  data = json.loads(r.choices[0].message.content)
159
  return {"ok": True, "name": data.get("name"), "experiences": len(data.get("experiences", [])), "provider": "mistral"}
160
  except Exception as e:
161
+ # Log internally; return only the exception class name, with no traceback in the response.
162
+ logging.exception("debug_parse failed")
163
+ return {"ok": False, "error": type(e).__name__, "provider": "mistral"}
app/models.py CHANGED
@@ -76,7 +76,11 @@ class GapAnalysis(BaseModel):
76
 
77
  class ChatMessage(BaseModel):
78
  role: str
79
- content: str
 
 
 
 
80
 
81
 
82
  class ChatRequest(BaseModel):
@@ -178,10 +182,10 @@ class ToneSamples(BaseModel):
178
 
179
  class ImproveBulletRequest(BaseModel):
180
  """Per-bullet AI rewrite — Notion-style "improve wording" hover button."""
181
- text: str
182
- role: str = "" # job title for context
183
- company: str = "" # company name for context
184
- offer_title: str = "" # target job title (helps tilt rewrites toward what matters)
185
  ui_language: str = "en"
186
  tone: str = "startup"
187
 
 
76
 
77
  class ChatMessage(BaseModel):
78
  role: str
79
+ # Cap chat content. A realistic answer is a paragraph; 8 KB is already
80
+ # several pages. The cap is the structural defence against
81
+ # "ignore-previous-instructions"-style prompt injections that try to bury
82
+ # the override in a wall of text.
83
+ content: str = Field(default="", max_length=8000)
84
 
85
 
86
  class ChatRequest(BaseModel):
 
182
 
183
  class ImproveBulletRequest(BaseModel):
184
  """Per-bullet AI rewrite — Notion-style "improve wording" hover button."""
185
+ text: str = Field(default="", max_length=2000)
186
+ role: str = Field(default="", max_length=200)
187
+ company: str = Field(default="", max_length=200)
188
+ offer_title: str = Field(default="", max_length=200)
189
  ui_language: str = "en"
190
  tone: str = "startup"
191
 
app/routers/auth.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  from urllib.parse import urlencode
3
 
4
  from authlib.integrations.starlette_client import OAuth
5
- from fastapi import APIRouter, Header, Request
6
  from fastapi.responses import RedirectResponse
7
  from itsdangerous import URLSafeTimedSerializer
8
 
@@ -159,3 +159,42 @@ async def get_quota(authorization: str = Header("")):
159
  "daily_limit": 20 if is_auth else 10,
160
  "provider": user.get("provider") if user else None,
161
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from urllib.parse import urlencode
3
 
4
  from authlib.integrations.starlette_client import OAuth
5
+ from fastapi import APIRouter, Header, HTTPException, Request
6
  from fastapi.responses import RedirectResponse
7
  from itsdangerous import URLSafeTimedSerializer
8
 
 
159
  "daily_limit": 20 if is_auth else 10,
160
  "provider": user.get("provider") if user else None,
161
  }
162
+
163
+
164
+ @router.delete("/account")
165
+ async def delete_account(authorization: str = Header("")):
166
+ """GDPR delete-my-account. Wipes the user row and every FK child
167
+ (knowledge, projects, facts, snapshots) in one transaction. The Bearer
168
+ token remains valid until expiry — the client must drop it locally — but
169
+ every authenticated lookup will 401 because the user row is gone.
170
+
171
+ Idempotent: deleting an already-deleted account returns ok. Returns the
172
+ counts of each entity type removed so the client can show a summary.
173
+ """
174
+ user = get_user_from_request(authorization=authorization)
175
+ if not user or not user.get("user_id"):
176
+ raise HTTPException(status_code=401, detail="Sign in required")
177
+ user_id = user["user_id"]
178
+ from app.db import get_db
179
+ counts = {"projects": 0, "knowledge": 0, "facts": 0, "snapshots": 0}
180
+ with get_db() as conn:
181
+ # Order matters for FK: children first, then parent.
182
+ for table in ("facts", "knowledge", "projects"):
183
+ try:
184
+ row = conn.execute(
185
+ f"SELECT COUNT(*) as cnt FROM {table} WHERE user_id = ?", (user_id,)
186
+ ).fetchone()
187
+ counts[table] = row["cnt"] if row else 0
188
+ conn.execute(f"DELETE FROM {table} WHERE user_id = ?", (user_id,))
189
+ except Exception:
190
+ pass # table may not exist (snapshots is created lazily)
191
+ try:
192
+ row = conn.execute(
193
+ "SELECT COUNT(*) as cnt FROM snapshots WHERE user_id = ?", (user_id,)
194
+ ).fetchone()
195
+ counts["snapshots"] = row["cnt"] if row else 0
196
+ conn.execute("DELETE FROM snapshots WHERE user_id = ?", (user_id,))
197
+ except Exception:
198
+ pass
199
+ conn.execute("DELETE FROM users WHERE id = ?", (user_id,))
200
+ return {"status": "deleted", "removed": counts}
app/routers/chat.py CHANGED
@@ -19,8 +19,10 @@ async def analyze(req: AnalyzeRequest, request: Request, x_captcha_token: str =
19
  llm = get_llm()
20
  try:
21
  return llm.analyze(req.profile, req.offer, req.ui_language)
22
- except Exception as e:
23
- raise HTTPException(status_code=502, detail=f"AI service error: {e}")
 
 
24
 
25
 
26
  @router.post("/chat", response_model=ChatResponse)
@@ -35,5 +37,7 @@ async def chat(req: ChatRequest, request: Request, x_captcha_token: str = Header
35
  known_facts=req.known_facts, contradictions=req.contradictions,
36
  cv_draft=req.cv_draft,
37
  )
38
- except Exception as e:
39
- raise HTTPException(status_code=502, detail=f"AI service error: {e}")
 
 
 
19
  llm = get_llm()
20
  try:
21
  return llm.analyze(req.profile, req.offer, req.ui_language)
22
+ except Exception:
23
+ import logging
24
+ logging.exception("LLM call failed")
25
+ raise HTTPException(status_code=502, detail="AI service error")
26
 
27
 
28
  @router.post("/chat", response_model=ChatResponse)
 
37
  known_facts=req.known_facts, contradictions=req.contradictions,
38
  cv_draft=req.cv_draft,
39
  )
40
+ except Exception:
41
+ import logging
42
+ logging.exception("LLM call failed")
43
+ raise HTTPException(status_code=502, detail="AI service error")
app/routers/cover_letter.py CHANGED
@@ -19,5 +19,7 @@ async def generate_cover_letter(req: CoverLetterRequest, request: Request, x_cap
19
  llm = get_llm()
20
  try:
21
  return llm.generate_cover_letter(req.profile, req.offer, req.cv_data, req.messages, req.ui_language, req.tone, req.target_market)
22
- except Exception as e:
23
- raise HTTPException(status_code=502, detail=f"AI service error: {e}")
 
 
 
19
  llm = get_llm()
20
  try:
21
  return llm.generate_cover_letter(req.profile, req.offer, req.cv_data, req.messages, req.ui_language, req.tone, req.target_market)
22
+ except Exception:
23
+ import logging
24
+ logging.exception("LLM call failed")
25
+ raise HTTPException(status_code=502, detail="AI service error")
app/routers/draft.py CHANGED
@@ -23,8 +23,10 @@ async def draft_cv(req: GenerateRequest, request: Request, x_captcha_token: str
23
  llm = get_llm()
24
  try:
25
  return llm.draft_cv(req.profile, req.offer, req.gap_analysis, req.messages, req.ui_language, target_market=req.target_market)
26
- except Exception as e:
27
- raise HTTPException(status_code=502, detail=f"AI service error: {e}")
 
 
28
 
29
 
30
  @router.post("/improve-bullet", response_model=ImproveBulletResponse)
@@ -44,8 +46,10 @@ async def improve_bullet(req: ImproveBulletRequest, request: Request, x_captcha_
44
  tone=req.tone,
45
  )
46
  return ImproveBulletResponse(text=out)
47
- except Exception as e:
48
- raise HTTPException(status_code=502, detail=f"AI service error: {e}")
 
 
49
 
50
 
51
  @router.post("/audit-cv", response_model=AuditCvResponse)
@@ -58,8 +62,10 @@ async def audit_cv(req: AuditCvRequest, request: Request, x_captcha_token: str =
58
  try:
59
  out = llm.audit_cv(req.cv_data, req.offer, req.ui_language)
60
  return AuditCvResponse(**out)
61
- except Exception as e:
62
- raise HTTPException(status_code=502, detail=f"AI service error: {e}")
 
 
63
 
64
 
65
  @router.post("/apply-grammar-fixes", response_model=ApplyGrammarFixesResponse)
@@ -80,8 +86,10 @@ async def apply_grammar_fixes(req: ApplyGrammarFixesRequest, request: Request, x
80
  skipped=out.get("skipped", 0),
81
  skipped_indices=out.get("skipped_indices", []),
82
  )
83
- except Exception as e:
84
- raise HTTPException(status_code=502, detail=f"AI service error: {e}")
 
 
85
 
86
 
87
  @router.post("/tone-samples", response_model=ToneSamples)
@@ -95,5 +103,7 @@ async def tone_samples(req: ToneSamplesRequest, request: Request, x_captcha_toke
95
  llm = get_llm()
96
  try:
97
  return llm.tone_samples(req.profile, req.offer, req.ui_language)
98
- except Exception as e:
99
- raise HTTPException(status_code=502, detail=f"AI service error: {e}")
 
 
 
23
  llm = get_llm()
24
  try:
25
  return llm.draft_cv(req.profile, req.offer, req.gap_analysis, req.messages, req.ui_language, target_market=req.target_market)
26
+ except Exception:
27
+ import logging
28
+ logging.exception("LLM call failed")
29
+ raise HTTPException(status_code=502, detail="AI service error")
30
 
31
 
32
  @router.post("/improve-bullet", response_model=ImproveBulletResponse)
 
46
  tone=req.tone,
47
  )
48
  return ImproveBulletResponse(text=out)
49
+ except Exception:
50
+ import logging
51
+ logging.exception("LLM call failed")
52
+ raise HTTPException(status_code=502, detail="AI service error")
53
 
54
 
55
  @router.post("/audit-cv", response_model=AuditCvResponse)
 
62
  try:
63
  out = llm.audit_cv(req.cv_data, req.offer, req.ui_language)
64
  return AuditCvResponse(**out)
65
+ except Exception:
66
+ import logging
67
+ logging.exception("LLM call failed")
68
+ raise HTTPException(status_code=502, detail="AI service error")
69
 
70
 
71
  @router.post("/apply-grammar-fixes", response_model=ApplyGrammarFixesResponse)
 
86
  skipped=out.get("skipped", 0),
87
  skipped_indices=out.get("skipped_indices", []),
88
  )
89
+ except Exception:
90
+ import logging
91
+ logging.exception("LLM call failed")
92
+ raise HTTPException(status_code=502, detail="AI service error")
93
 
94
 
95
  @router.post("/tone-samples", response_model=ToneSamples)
 
103
  llm = get_llm()
104
  try:
105
  return llm.tone_samples(req.profile, req.offer, req.ui_language)
106
+ except Exception:
107
+ import logging
108
+ logging.exception("LLM call failed")
109
+ raise HTTPException(status_code=502, detail="AI service error")
app/routers/generate.py CHANGED
@@ -21,8 +21,10 @@ async def generate_cv(req: GenerateRequest, request: Request, x_captcha_token: s
21
  llm = get_llm()
22
  try:
23
  return llm.generate_cv(req.profile, req.offer, req.gap_analysis, req.messages, req.ui_language, req.tone, target_market=req.target_market)
24
- except Exception as e:
25
- raise HTTPException(status_code=502, detail=f"AI service error: {e}")
 
 
26
 
27
 
28
  class TranslateRequest(BaseModel):
@@ -34,8 +36,13 @@ class TranslateRequest(BaseModel):
34
  async def translate_cv(req: TranslateRequest, request: Request, x_captcha_token: str = Header("")):
35
  if not await verify_turnstile(x_captcha_token):
36
  raise HTTPException(status_code=403, detail="Captcha verification failed")
 
 
 
37
  llm = get_llm()
38
  try:
39
  return llm.translate_cv(req.cv_data, req.target_language)
40
- except Exception as e:
41
- raise HTTPException(status_code=502, detail=f"AI service error: {e}")
 
 
 
21
  llm = get_llm()
22
  try:
23
  return llm.generate_cv(req.profile, req.offer, req.gap_analysis, req.messages, req.ui_language, req.tone, target_market=req.target_market)
24
+ except Exception:
25
+ import logging
26
+ logging.exception("LLM call failed")
27
+ raise HTTPException(status_code=502, detail="AI service error")
28
 
29
 
30
  class TranslateRequest(BaseModel):
 
36
  async def translate_cv(req: TranslateRequest, request: Request, x_captcha_token: str = Header("")):
37
  if not await verify_turnstile(x_captcha_token):
38
  raise HTTPException(status_code=403, detail="Captcha verification failed")
39
+ # Was missing rate limit — translate-cv burns LLM tokens like every other
40
+ # endpoint in this module. 50/day anonymous, 500/day signed in.
41
+ check_rate_limit(request)
42
  llm = get_llm()
43
  try:
44
  return llm.translate_cv(req.cv_data, req.target_language)
45
+ except Exception:
46
+ import logging
47
+ logging.exception("LLM call failed")
48
+ raise HTTPException(status_code=502, detail="AI service error")
app/routers/linkedin.py CHANGED
@@ -1,6 +1,5 @@
1
  import json
2
  import os
3
- import traceback
4
 
5
  from fastapi import APIRouter, File, Header, Request, UploadFile, HTTPException
6
 
@@ -28,8 +27,12 @@ async def parse_linkedin(request: Request, file: UploadFile = File(...)):
28
 
29
  try:
30
  profile = parse_linkedin_pdf(contents)
31
- except Exception as e:
32
- raise HTTPException(status_code=422, detail=f"Could not parse PDF: {e}")
 
 
 
 
33
 
34
  if not profile.name:
35
  raise HTTPException(status_code=422, detail="Could not extract profile data from PDF")
@@ -48,6 +51,7 @@ async def debug_parse_pdf(file: UploadFile = File(...), x_admin_secret: str = He
48
  """Debug: show exactly what happens when parsing a PDF. Protected by ADMIN_SECRET header."""
49
  if not ADMIN_SECRET or x_admin_secret != ADMIN_SECRET:
50
  raise HTTPException(status_code=403, detail="Forbidden")
 
51
  contents = await file.read()
52
  raw_text = extract_pdf_text(contents)
53
 
@@ -81,4 +85,6 @@ Return valid JSON with: name, title, email, phone, linkedin, location, summary,
81
  "provider": "mistral",
82
  }
83
  except Exception as e:
84
- return {"ok": False, "error": str(e), "traceback": traceback.format_exc()[:1000], "text_length": len(raw_text)}
 
 
 
1
  import json
2
  import os
 
3
 
4
  from fastapi import APIRouter, File, Header, Request, UploadFile, HTTPException
5
 
 
27
 
28
  try:
29
  profile = parse_linkedin_pdf(contents)
30
+ except Exception:
31
+ # Don't echo internal exception text — could leak file path / library
32
+ # version / stack details. Log internally, return generic to caller.
33
+ import logging
34
+ logging.exception("parse_linkedin_pdf failed")
35
+ raise HTTPException(status_code=422, detail="Could not parse PDF")
36
 
37
  if not profile.name:
38
  raise HTTPException(status_code=422, detail="Could not extract profile data from PDF")
 
51
  """Debug: show exactly what happens when parsing a PDF. Protected by ADMIN_SECRET header."""
52
  if not ADMIN_SECRET or x_admin_secret != ADMIN_SECRET:
53
  raise HTTPException(status_code=403, detail="Forbidden")
54
+ import logging
55
  contents = await file.read()
56
  raw_text = extract_pdf_text(contents)
57
 
 
85
  "provider": "mistral",
86
  }
87
  except Exception as e:
88
+ # Log internally; return error class only — no traceback in HTTP response.
89
+ logging.exception("debug_parse_pdf failed")
90
+ return {"ok": False, "error": type(e).__name__, "text_length": len(raw_text)}
app/routers/snapshots.py CHANGED
@@ -28,6 +28,11 @@ router = APIRouter(prefix="/api/snapshots", tags=["snapshots"])
28
 
29
  SLUG_ALPHABET_BYTES = 12 # token_urlsafe(12) → ~16 chars
30
 
 
 
 
 
 
31
 
32
  def _ensure_schema() -> None:
33
  """Create the snapshots table on first use. Idempotent."""
@@ -84,9 +89,13 @@ async def create_snapshot(request: Request, payload: CreateSnapshotRequest):
84
  if not user_id:
85
  raise HTTPException(status_code=401, detail="Sign in to share a CV publicly")
86
 
87
- slug = secrets.token_urlsafe(SLUG_ALPHABET_BYTES)
88
  cv_json = json.dumps(payload.cv_data)
89
  colors_json = json.dumps(payload.brand_colors) if payload.brand_colors else ""
 
 
 
 
 
90
 
91
  with get_db() as conn:
92
  conn.execute(
 
28
 
29
  SLUG_ALPHABET_BYTES = 12 # token_urlsafe(12) → ~16 chars
30
 
31
+ # Cap on serialized snapshot payload (cv_data + brand_colors). A real CV is
32
+ # under 30 KB; a 500 KB ceiling lets data-URI photos through but blocks an
33
+ # attacker from filling Turso with multi-MB rows by repeatedly sharing.
34
+ MAX_SNAPSHOT_BYTES = 500_000
35
+
36
 
37
  def _ensure_schema() -> None:
38
  """Create the snapshots table on first use. Idempotent."""
 
89
  if not user_id:
90
  raise HTTPException(status_code=401, detail="Sign in to share a CV publicly")
91
 
 
92
  cv_json = json.dumps(payload.cv_data)
93
  colors_json = json.dumps(payload.brand_colors) if payload.brand_colors else ""
94
+ # Block oversized payloads before allocating a slug or hitting the DB.
95
+ if len(cv_json) + len(colors_json) > MAX_SNAPSHOT_BYTES:
96
+ raise HTTPException(status_code=413, detail="Snapshot payload too large")
97
+
98
+ slug = secrets.token_urlsafe(SLUG_ALPHABET_BYTES)
99
 
100
  with get_db() as conn:
101
  conn.execute(
app/services/llm.py CHANGED
@@ -12,6 +12,22 @@ from app.models import (
12
  MAX_TOKENS_PER_CALL = 8000 # Reduced from 16K — Flash spends most on thinking, not output
13
 
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def _apply_substitution(cv_dict: dict, path: str, old: str, new: str) -> bool:
16
  """Walk a dot-path like 'experiences.2.bullets.3' into cv_dict and do a
17
  literal `old` → `new` replace on the string at that path. Returns True if
@@ -19,7 +35,11 @@ def _apply_substitution(cv_dict: dict, path: str, old: str, new: str) -> bool:
19
 
20
  Used by LLMService.apply_grammar_fixes to apply audit substitutions safely:
21
  if the LLM hallucinates a path or an `old` substring that isn't actually
22
- there, the swap is skipped instead of corrupting the CV."""
 
 
 
 
23
  parts = path.split(".") if path else []
24
  if not parts:
25
  return False
 
12
  MAX_TOKENS_PER_CALL = 8000 # Reduced from 16K — Flash spends most on thinking, not output
13
 
14
 
15
+ # Whitelist of CV paths the LLM is allowed to substitute into. Anything off
16
+ # this list is rejected before traversal — a defence against an attacker
17
+ # convincing the LLM to write into arbitrary fields (e.g. `language` to flip
18
+ # locale, `match_score` to pin a perfect rating). The fields below are the only
19
+ # ones a grammar pass should ever touch.
20
+ _ALLOWED_SUBSTITUTION_PATH = re.compile(
21
+ r"^("
22
+ r"name|title|summary|location|"
23
+ r"experiences\.\d+\.(title|company|dates|exitReason|contractType|bullets\.\d+)|"
24
+ r"education\.\d+\.(degree|school|year)|"
25
+ r"skills\.\d+|languages\.\d+|"
26
+ r"strengths\.\d+|improvements\.\d+"
27
+ r")$"
28
+ )
29
+
30
+
31
  def _apply_substitution(cv_dict: dict, path: str, old: str, new: str) -> bool:
32
  """Walk a dot-path like 'experiences.2.bullets.3' into cv_dict and do a
33
  literal `old` → `new` replace on the string at that path. Returns True if
 
35
 
36
  Used by LLMService.apply_grammar_fixes to apply audit substitutions safely:
37
  if the LLM hallucinates a path or an `old` substring that isn't actually
38
+ there, the swap is skipped instead of corrupting the CV. Paths must match
39
+ the allowlist above — non-grammar fields (match_score, email, language…)
40
+ are off-limits even if the LLM tries to target them."""
41
+ if not path or not _ALLOWED_SUBSTITUTION_PATH.match(path):
42
+ return False
43
  parts = path.split(".") if path else []
44
  if not parts:
45
  return False