Spaces:

Aramente
/

bored-cv-api

Running

Aramente Claude Opus 4 committed on Apr 26

Commit

3da512e

1 Parent(s): 660b183

security round 2: payload caps, traceback strip, GDPR delete, path allowlist

- H6+H8: strip exception details from chat/draft/generate/cover_letter/linkedin;
log internally via logging.exception, return generic 502 to client
- H1: cap snapshot payload at 500KB before allocating slug or hitting DB
- H2: BodySizeLimitMiddleware (1MB cap, PDF routes excepted with own 5MB caps);
ChatMessage.content max_length=8000; ImproveBulletRequest field caps
- H7: DELETE /api/auth/account — wipes user + FK children in one txn, returns counts
- M-tier frontend: cv_actions path allowlist in _apply_substitution (regex gates
to summary/experiences[N].(title|company|dates|bullets[M]|...)/education/skills);
ChatMessage urlTransform restricts links to http(s)/mailto/anchor/relative
- Add missing rate limit to /translate-cv
- Rip out the ?fix-lang=1 one-shot localStorage migration (served its purpose)
- Fix missing HTTPException import in auth.py from H7 patch

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>

Files changed (10) hide show

app/main.py +33 -4
app/models.py +9 -5
app/routers/auth.py +40 -1
app/routers/chat.py +8 -4
app/routers/cover_letter.py +4 -2
app/routers/draft.py +20 -10
app/routers/generate.py +11 -4
app/routers/linkedin.py +10 -4
app/routers/snapshots.py +10 -1
app/services/llm.py +21 -1

app/main.py CHANGED Viewed

@@ -1,7 +1,9 @@
 import os
-from fastapi import FastAPI, Header, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from starlette.middleware.sessions import SessionMiddleware
 from app.routers import linkedin, offer, chat, generate, auth, draft, projects, knowledge, cover_letter, transcribe, snapshots
@@ -17,6 +19,32 @@ if not SESSION_SECRET:
 app = FastAPI(title="Bored CV API", version="0.1.0")
 app.add_middleware(SessionMiddleware, secret_key=SESSION_SECRET)
 app.add_middleware(
@@ -103,9 +131,8 @@ async def debug_parse(x_admin_secret: str = Header("")):
     """Debug: test full PDF parser pipeline. Protected by ADMIN_SECRET header."""
     if not ADMIN_SECRET or x_admin_secret != ADMIN_SECRET:
         raise HTTPException(status_code=403, detail="Forbidden")
-    import traceback
     from mistralai.client import Mistral
-    from app.services.pdf_parser import extract_pdf_text
     key = os.environ.get("MISTRAL_API_KEY", "")
     client = Mistral(api_key=key)
@@ -131,4 +158,6 @@ Return valid JSON with: name, title, email, phone, linkedin, location, summary,
         data = json.loads(r.choices[0].message.content)
         return {"ok": True, "name": data.get("name"), "experiences": len(data.get("experiences", [])), "provider": "mistral"}
     except Exception as e:
-        return {"ok": False, "error": str(e), "traceback": traceback.format_exc(), "provider": "mistral"}

 import os
+from fastapi import FastAPI, Header, HTTPException, Request
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+from starlette.middleware.base import BaseHTTPMiddleware
 from starlette.middleware.sessions import SessionMiddleware
 from app.routers import linkedin, offer, chat, generate, auth, draft, projects, knowledge, cover_letter, transcribe, snapshots
 app = FastAPI(title="Bored CV API", version="0.1.0")
+# Cap body size on every request. Prevents (a) memory DoS via gigabyte JSON
+# bodies and (b) mega-payload prompt injection where an attacker stuffs many
+# KB of "ignore previous instructions" into a free-text field. The upload
+# routes listed in PDF_UPLOAD_PATHS are exempt and must enforce their own
+# caps; everything else is JSON, and 1 MB is already 10× a real CV.
+MAX_REQUEST_BYTES = 1_000_000
+PDF_UPLOAD_PATHS = {"/api/parse-linkedin", "/api/debug-parse-pdf", "/api/transcribe"}
+# Middleware that rejects requests whose declared Content-Length exceeds
+# MAX_REQUEST_BYTES with a 413 JSON response, before the route handler runs.
+# Paths in PDF_UPLOAD_PATHS bypass the check entirely.
+class BodySizeLimitMiddleware(BaseHTTPMiddleware):
+    # Returns either the 413 JSONResponse or whatever call_next(request) yields.
+    async def dispatch(self, request: Request, call_next):
+        if request.url.path in PDF_UPLOAD_PATHS:
+            # Upload routes are expected to enforce their own per-file caps;
+            # skip the JSON-tier limit so a 4 MB PDF isn't blocked here.
+            # The match is an exact string comparison on the request path.
+            return await call_next(request)
+        # NOTE(review): only the *declared* length is inspected. A request with
+        # no Content-Length header (or a non-numeric one) falls through to
+        # call_next unchecked, so this is not a cap on bytes actually read —
+        # confirm the server/proxy in front rejects or bounds such bodies.
+        cl = request.headers.get("content-length")
+        if cl and cl.isdigit() and int(cl) > MAX_REQUEST_BYTES:
+            return JSONResponse(
+                status_code=413,
+                content={"detail": "Request body too large"},
+            )
+        return await call_next(request)
+app.add_middleware(BodySizeLimitMiddleware)
 app.add_middleware(SessionMiddleware, secret_key=SESSION_SECRET)
 app.add_middleware(
     """Debug: test full PDF parser pipeline. Protected by ADMIN_SECRET header."""
     if not ADMIN_SECRET or x_admin_secret != ADMIN_SECRET:
         raise HTTPException(status_code=403, detail="Forbidden")
+    import logging
     from mistralai.client import Mistral
     key = os.environ.get("MISTRAL_API_KEY", "")
     client = Mistral(api_key=key)
         data = json.loads(r.choices[0].message.content)
         return {"ok": True, "name": data.get("name"), "experiences": len(data.get("experiences", [])), "provider": "mistral"}
     except Exception as e:
+        # Log internally; return generic message — no traceback in response.
+        logging.exception("debug_parse failed")
+        return {"ok": False, "error": type(e).__name__, "provider": "mistral"}

app/models.py CHANGED Viewed

@@ -76,7 +76,11 @@ class GapAnalysis(BaseModel):
 class ChatMessage(BaseModel):
     role: str
-    content: str
 class ChatRequest(BaseModel):
@@ -178,10 +182,10 @@ class ToneSamples(BaseModel):
 class ImproveBulletRequest(BaseModel):
     """Per-bullet AI rewrite — Notion-style "improve wording" hover button."""
-    text: str
-    role: str = ""              # job title for context
-    company: str = ""           # company name for context
-    offer_title: str = ""       # target job title (helps tilt rewrites toward what matters)
     ui_language: str = "en"
     tone: str = "startup"

 class ChatMessage(BaseModel):
     role: str
+    # Cap chat content. A realistic answer is a paragraph; 8 KB is already
+    # several pages. The cap is the structural defence against
+    # "ignore-previous-instructions"-style prompt injections that try to bury
+    # the override in a wall of text.
+    content: str = Field(default="", max_length=8000)
 class ChatRequest(BaseModel):
 class ImproveBulletRequest(BaseModel):
     """Per-bullet AI rewrite — Notion-style "improve wording" hover button."""
+    text: str = Field(default="", max_length=2000)
+    role: str = Field(default="", max_length=200)
+    company: str = Field(default="", max_length=200)
+    offer_title: str = Field(default="", max_length=200)
     ui_language: str = "en"
     tone: str = "startup"

app/routers/auth.py CHANGED Viewed

@@ -2,7 +2,7 @@ import os
 from urllib.parse import urlencode
 from authlib.integrations.starlette_client import OAuth
-from fastapi import APIRouter, Header, Request
 from fastapi.responses import RedirectResponse
 from itsdangerous import URLSafeTimedSerializer
@@ -159,3 +159,42 @@ async def get_quota(authorization: str = Header("")):
         "daily_limit": 20 if is_auth else 10,
         "provider": user.get("provider") if user else None,
     }

 from urllib.parse import urlencode
 from authlib.integrations.starlette_client import OAuth
+from fastapi import APIRouter, Header, HTTPException, Request
 from fastapi.responses import RedirectResponse
 from itsdangerous import URLSafeTimedSerializer
         "daily_limit": 20 if is_auth else 10,
         "provider": user.get("provider") if user else None,
     }
+@router.delete("/account")
+async def delete_account(authorization: str = Header("")):
+    """GDPR delete-my-account: remove the caller's rows, then the user row.
+
+    Resolves the caller from the ``Authorization`` header, deletes their rows
+    from ``facts``, ``knowledge``, ``projects`` and ``snapshots`` (children
+    first), and finally deletes the ``users`` row, all on one ``get_db()``
+    connection.
+
+    Child-table cleanup is best-effort: a failure on one table (for example
+    a table that has not been created yet) is logged and skipped so the
+    remaining tables and the user row are still removed. A table's count is
+    recorded only after its DELETE succeeded, so ``removed`` never reports
+    rows that are still present.
+
+    NOTE(review): nothing here revokes the Bearer token; the client is
+    expected to drop it locally. Whether ``get_db()`` commits everything as a
+    single transaction depends on its implementation — confirm.
+
+    Returns:
+        ``{"status": "deleted", "removed": {...}}`` with per-table row counts.
+
+    Raises:
+        HTTPException: 401 when no signed-in user can be resolved.
+    """
+    user = get_user_from_request(authorization=authorization)
+    if not user or not user.get("user_id"):
+        raise HTTPException(status_code=401, detail="Sign in required")
+    user_id = user["user_id"]
+    import logging
+    from app.db import get_db
+    counts = {"projects": 0, "knowledge": 0, "facts": 0, "snapshots": 0}
+    with get_db() as conn:
+        # Order matters for FK: children first, then parent. Table names come
+        # from this fixed tuple only, so the f-string SQL is not injectable.
+        for table in ("facts", "knowledge", "projects", "snapshots"):
+            try:
+                row = conn.execute(
+                    f"SELECT COUNT(*) as cnt FROM {table} WHERE user_id = ?", (user_id,)
+                ).fetchone()
+                conn.execute(f"DELETE FROM {table} WHERE user_id = ?", (user_id,))
+            except Exception:
+                # Best-effort (the table may not exist yet), but never silent:
+                # a skipped purge must be visible in the logs. The user id is
+                # deliberately left out of the message.
+                logging.warning(
+                    "delete_account: could not purge table %s", table, exc_info=True
+                )
+                continue
+            # Record the count only once the DELETE has gone through.
+            counts[table] = row["cnt"] if row else 0
+        conn.execute("DELETE FROM users WHERE id = ?", (user_id,))
+    return {"status": "deleted", "removed": counts}

app/routers/chat.py CHANGED Viewed

@@ -19,8 +19,10 @@ async def analyze(req: AnalyzeRequest, request: Request, x_captcha_token: str =
     llm = get_llm()
     try:
         return llm.analyze(req.profile, req.offer, req.ui_language)
-    except Exception as e:
-        raise HTTPException(status_code=502, detail=f"AI service error: {e}")
 @router.post("/chat", response_model=ChatResponse)
@@ -35,5 +37,7 @@ async def chat(req: ChatRequest, request: Request, x_captcha_token: str = Header
             known_facts=req.known_facts, contradictions=req.contradictions,
             cv_draft=req.cv_draft,
         )
-    except Exception as e:
-        raise HTTPException(status_code=502, detail=f"AI service error: {e}")

     llm = get_llm()
     try:
         return llm.analyze(req.profile, req.offer, req.ui_language)
+    except Exception:
+        import logging
+        logging.exception("LLM call failed")
+        raise HTTPException(status_code=502, detail="AI service error")
 @router.post("/chat", response_model=ChatResponse)
             known_facts=req.known_facts, contradictions=req.contradictions,
             cv_draft=req.cv_draft,
         )
+    except Exception:
+        import logging
+        logging.exception("LLM call failed")
+        raise HTTPException(status_code=502, detail="AI service error")

app/routers/cover_letter.py CHANGED Viewed

@@ -19,5 +19,7 @@ async def generate_cover_letter(req: CoverLetterRequest, request: Request, x_cap
     llm = get_llm()
     try:
         return llm.generate_cover_letter(req.profile, req.offer, req.cv_data, req.messages, req.ui_language, req.tone, req.target_market)
-    except Exception as e:
-        raise HTTPException(status_code=502, detail=f"AI service error: {e}")

     llm = get_llm()
     try:
         return llm.generate_cover_letter(req.profile, req.offer, req.cv_data, req.messages, req.ui_language, req.tone, req.target_market)
+    except Exception:
+        import logging
+        logging.exception("LLM call failed")
+        raise HTTPException(status_code=502, detail="AI service error")

app/routers/draft.py CHANGED Viewed

@@ -23,8 +23,10 @@ async def draft_cv(req: GenerateRequest, request: Request, x_captcha_token: str
     llm = get_llm()
     try:
         return llm.draft_cv(req.profile, req.offer, req.gap_analysis, req.messages, req.ui_language, target_market=req.target_market)
-    except Exception as e:
-        raise HTTPException(status_code=502, detail=f"AI service error: {e}")
 @router.post("/improve-bullet", response_model=ImproveBulletResponse)
@@ -44,8 +46,10 @@ async def improve_bullet(req: ImproveBulletRequest, request: Request, x_captcha_
             tone=req.tone,
         )
         return ImproveBulletResponse(text=out)
-    except Exception as e:
-        raise HTTPException(status_code=502, detail=f"AI service error: {e}")
 @router.post("/audit-cv", response_model=AuditCvResponse)
@@ -58,8 +62,10 @@ async def audit_cv(req: AuditCvRequest, request: Request, x_captcha_token: str =
     try:
         out = llm.audit_cv(req.cv_data, req.offer, req.ui_language)
         return AuditCvResponse(**out)
-    except Exception as e:
-        raise HTTPException(status_code=502, detail=f"AI service error: {e}")
 @router.post("/apply-grammar-fixes", response_model=ApplyGrammarFixesResponse)
@@ -80,8 +86,10 @@ async def apply_grammar_fixes(req: ApplyGrammarFixesRequest, request: Request, x
             skipped=out.get("skipped", 0),
             skipped_indices=out.get("skipped_indices", []),
         )
-    except Exception as e:
-        raise HTTPException(status_code=502, detail=f"AI service error: {e}")
 @router.post("/tone-samples", response_model=ToneSamples)
@@ -95,5 +103,7 @@ async def tone_samples(req: ToneSamplesRequest, request: Request, x_captcha_toke
     llm = get_llm()
     try:
         return llm.tone_samples(req.profile, req.offer, req.ui_language)
-    except Exception as e:
-        raise HTTPException(status_code=502, detail=f"AI service error: {e}")

     llm = get_llm()
     try:
         return llm.draft_cv(req.profile, req.offer, req.gap_analysis, req.messages, req.ui_language, target_market=req.target_market)
+    except Exception:
+        import logging
+        logging.exception("LLM call failed")
+        raise HTTPException(status_code=502, detail="AI service error")
 @router.post("/improve-bullet", response_model=ImproveBulletResponse)
             tone=req.tone,
         )
         return ImproveBulletResponse(text=out)
+    except Exception:
+        import logging
+        logging.exception("LLM call failed")
+        raise HTTPException(status_code=502, detail="AI service error")
 @router.post("/audit-cv", response_model=AuditCvResponse)
     try:
         out = llm.audit_cv(req.cv_data, req.offer, req.ui_language)
         return AuditCvResponse(**out)
+    except Exception:
+        import logging
+        logging.exception("LLM call failed")
+        raise HTTPException(status_code=502, detail="AI service error")
 @router.post("/apply-grammar-fixes", response_model=ApplyGrammarFixesResponse)
             skipped=out.get("skipped", 0),
             skipped_indices=out.get("skipped_indices", []),
         )
+    except Exception:
+        import logging
+        logging.exception("LLM call failed")
+        raise HTTPException(status_code=502, detail="AI service error")
 @router.post("/tone-samples", response_model=ToneSamples)
     llm = get_llm()
     try:
         return llm.tone_samples(req.profile, req.offer, req.ui_language)
+    except Exception:
+        import logging
+        logging.exception("LLM call failed")
+        raise HTTPException(status_code=502, detail="AI service error")

app/routers/generate.py CHANGED Viewed

@@ -21,8 +21,10 @@ async def generate_cv(req: GenerateRequest, request: Request, x_captcha_token: s
     llm = get_llm()
     try:
         return llm.generate_cv(req.profile, req.offer, req.gap_analysis, req.messages, req.ui_language, req.tone, target_market=req.target_market)
-    except Exception as e:
-        raise HTTPException(status_code=502, detail=f"AI service error: {e}")
 class TranslateRequest(BaseModel):
@@ -34,8 +36,13 @@ class TranslateRequest(BaseModel):
 async def translate_cv(req: TranslateRequest, request: Request, x_captcha_token: str = Header("")):
     if not await verify_turnstile(x_captcha_token):
         raise HTTPException(status_code=403, detail="Captcha verification failed")
     llm = get_llm()
     try:
         return llm.translate_cv(req.cv_data, req.target_language)
-    except Exception as e:
-        raise HTTPException(status_code=502, detail=f"AI service error: {e}")

     llm = get_llm()
     try:
         return llm.generate_cv(req.profile, req.offer, req.gap_analysis, req.messages, req.ui_language, req.tone, target_market=req.target_market)
+    except Exception:
+        import logging
+        logging.exception("LLM call failed")
+        raise HTTPException(status_code=502, detail="AI service error")
 class TranslateRequest(BaseModel):
 async def translate_cv(req: TranslateRequest, request: Request, x_captcha_token: str = Header("")):
     if not await verify_turnstile(x_captcha_token):
         raise HTTPException(status_code=403, detail="Captcha verification failed")
+    # Was missing rate limit — translate-cv burns LLM tokens like every other
+    # endpoint in this module. 50/day anonymous, 500/day signed in.
+    check_rate_limit(request)
     llm = get_llm()
     try:
         return llm.translate_cv(req.cv_data, req.target_language)
+    except Exception:
+        import logging
+        logging.exception("LLM call failed")
+        raise HTTPException(status_code=502, detail="AI service error")

app/routers/linkedin.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import json
 import os
-import traceback
 from fastapi import APIRouter, File, Header, Request, UploadFile, HTTPException
@@ -28,8 +27,12 @@ async def parse_linkedin(request: Request, file: UploadFile = File(...)):
     try:
         profile = parse_linkedin_pdf(contents)
-    except Exception as e:
-        raise HTTPException(status_code=422, detail=f"Could not parse PDF: {e}")
     if not profile.name:
         raise HTTPException(status_code=422, detail="Could not extract profile data from PDF")
@@ -48,6 +51,7 @@ async def debug_parse_pdf(file: UploadFile = File(...), x_admin_secret: str = He
     """Debug: show exactly what happens when parsing a PDF. Protected by ADMIN_SECRET header."""
     if not ADMIN_SECRET or x_admin_secret != ADMIN_SECRET:
         raise HTTPException(status_code=403, detail="Forbidden")
     contents = await file.read()
     raw_text = extract_pdf_text(contents)
@@ -81,4 +85,6 @@ Return valid JSON with: name, title, email, phone, linkedin, location, summary,
             "provider": "mistral",
         }
     except Exception as e:
-        return {"ok": False, "error": str(e), "traceback": traceback.format_exc()[:1000], "text_length": len(raw_text)}

 import json
 import os
 from fastapi import APIRouter, File, Header, Request, UploadFile, HTTPException
     try:
         profile = parse_linkedin_pdf(contents)
+    except Exception:
+        # Don't echo internal exception text — could leak file path / library
+        # version / stack details. Log internally, return generic to caller.
+        import logging
+        logging.exception("parse_linkedin_pdf failed")
+        raise HTTPException(status_code=422, detail="Could not parse PDF")
     if not profile.name:
         raise HTTPException(status_code=422, detail="Could not extract profile data from PDF")
     """Debug: show exactly what happens when parsing a PDF. Protected by ADMIN_SECRET header."""
     if not ADMIN_SECRET or x_admin_secret != ADMIN_SECRET:
         raise HTTPException(status_code=403, detail="Forbidden")
+    import logging
     contents = await file.read()
     raw_text = extract_pdf_text(contents)
             "provider": "mistral",
         }
     except Exception as e:
+        # Log internally; return error class only — no traceback in HTTP response.
+        logging.exception("debug_parse_pdf failed")
+        return {"ok": False, "error": type(e).__name__, "text_length": len(raw_text)}

app/routers/snapshots.py CHANGED Viewed

@@ -28,6 +28,11 @@ router = APIRouter(prefix="/api/snapshots", tags=["snapshots"])
 SLUG_ALPHABET_BYTES = 12  # token_urlsafe(12) → ~16 chars
 def _ensure_schema() -> None:
     """Create the snapshots table on first use. Idempotent."""
@@ -84,9 +89,13 @@ async def create_snapshot(request: Request, payload: CreateSnapshotRequest):
     if not user_id:
         raise HTTPException(status_code=401, detail="Sign in to share a CV publicly")
-    slug = secrets.token_urlsafe(SLUG_ALPHABET_BYTES)
     cv_json = json.dumps(payload.cv_data)
     colors_json = json.dumps(payload.brand_colors) if payload.brand_colors else ""
     with get_db() as conn:
         conn.execute(

 SLUG_ALPHABET_BYTES = 12  # token_urlsafe(12) → ~16 chars
+# Cap on serialized snapshot payload (cv_data + brand_colors). A real CV is
+# under 30 KB; a 500 KB ceiling lets data-URI photos through but blocks an
+# attacker from filling Turso with multi-MB rows by repeatedly sharing.
+MAX_SNAPSHOT_BYTES = 500_000
 def _ensure_schema() -> None:
     """Create the snapshots table on first use. Idempotent."""
     if not user_id:
         raise HTTPException(status_code=401, detail="Sign in to share a CV publicly")
     cv_json = json.dumps(payload.cv_data)
     colors_json = json.dumps(payload.brand_colors) if payload.brand_colors else ""
+    # Block oversized payloads before allocating a slug or hitting the DB.
+    if len(cv_json) + len(colors_json) > MAX_SNAPSHOT_BYTES:
+        raise HTTPException(status_code=413, detail="Snapshot payload too large")
+    slug = secrets.token_urlsafe(SLUG_ALPHABET_BYTES)
     with get_db() as conn:
         conn.execute(

app/services/llm.py CHANGED Viewed

@@ -12,6 +12,22 @@ from app.models import (
 MAX_TOKENS_PER_CALL = 8000  # Reduced from 16K — Flash spends most on thinking, not output
 def _apply_substitution(cv_dict: dict, path: str, old: str, new: str) -> bool:
     """Walk a dot-path like 'experiences.2.bullets.3' into cv_dict and do a
     literal `old` → `new` replace on the string at that path. Returns True if
@@ -19,7 +35,11 @@ def _apply_substitution(cv_dict: dict, path: str, old: str, new: str) -> bool:
     Used by LLMService.apply_grammar_fixes to apply audit substitutions safely:
     if the LLM hallucinates a path or an `old` substring that isn't actually
-    there, the swap is skipped instead of corrupting the CV."""
     parts = path.split(".") if path else []
     if not parts:
         return False

 MAX_TOKENS_PER_CALL = 8000  # Reduced from 16K — Flash spends most on thinking, not output
+# Whitelist of CV paths the LLM is allowed to substitute into. Anything off
+# this list is rejected before traversal — a defence against an attacker
+# convincing the LLM to write into arbitrary fields (e.g. `language` to flip
+# locale, `match_score` to pin a perfect rating). The fields below are the only
+# ones a grammar pass should ever touch.
+_ALLOWED_SUBSTITUTION_PATH = re.compile(
+    r"^("
+    r"name|title|summary|location|"
+    r"experiences\.\d+\.(title|company|dates|exitReason|contractType|bullets\.\d+)|"
+    r"education\.\d+\.(degree|school|year)|"
+    r"skills\.\d+|languages\.\d+|"
+    r"strengths\.\d+|improvements\.\d+"
+    r")$"
+)
 def _apply_substitution(cv_dict: dict, path: str, old: str, new: str) -> bool:
     """Walk a dot-path like 'experiences.2.bullets.3' into cv_dict and do a
     literal `old` → `new` replace on the string at that path. Returns True if
     Used by LLMService.apply_grammar_fixes to apply audit substitutions safely:
     if the LLM hallucinates a path or an `old` substring that isn't actually
+    there, the swap is skipped instead of corrupting the CV. Paths must match
+    the allowlist above — non-grammar fields (match_score, email, language…)
+    are off-limits even if the LLM tries to target them."""
+    if not path or not _ALLOWED_SUBSTITUTION_PATH.match(path):
+        return False
     parts = path.split(".") if path else []
     if not parts:
         return False