Spaces:
Running
security round 2: payload caps, traceback strip, GDPR delete, path allowlist
Browse files
- H6+H8: strip exception details from chat/draft/generate/cover_letter/linkedin;
log internally via logging.exception, return generic 502 to client
- H1: cap snapshot payload at 500KB before allocating slug or hitting DB
- H2: BodySizeLimitMiddleware (1MB cap, PDF routes excepted with own 5MB caps);
ChatMessage.content max_length=8000; ImproveBulletRequest field caps
- H7: DELETE /api/auth/account — wipes user + FK children in one txn, returns counts
- M-tier frontend: cv_actions path allowlist in _apply_substitution (regex gates
to summary/experiences[N].(title|company|dates|bullets[M]|...)/education/skills);
ChatMessage urlTransform restricts links to http(s)/mailto/anchor/relative
- Add missing rate limit to /translate-cv
- Rip ?fix-lang=1 one-shot localStorage migration (served its purpose)
- Fix missing HTTPException import in auth.py from H7 patch
Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
- app/main.py +33 -4
- app/models.py +9 -5
- app/routers/auth.py +40 -1
- app/routers/chat.py +8 -4
- app/routers/cover_letter.py +4 -2
- app/routers/draft.py +20 -10
- app/routers/generate.py +11 -4
- app/routers/linkedin.py +10 -4
- app/routers/snapshots.py +10 -1
- app/services/llm.py +21 -1
|
@@ -1,7 +1,9 @@
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
-
from fastapi import FastAPI, Header, HTTPException
|
| 4 |
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
|
|
|
| 5 |
from starlette.middleware.sessions import SessionMiddleware
|
| 6 |
|
| 7 |
from app.routers import linkedin, offer, chat, generate, auth, draft, projects, knowledge, cover_letter, transcribe, snapshots
|
|
@@ -17,6 +19,32 @@ if not SESSION_SECRET:
|
|
| 17 |
|
| 18 |
app = FastAPI(title="Bored CV API", version="0.1.0")
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
app.add_middleware(SessionMiddleware, secret_key=SESSION_SECRET)
|
| 21 |
|
| 22 |
app.add_middleware(
|
|
@@ -103,9 +131,8 @@ async def debug_parse(x_admin_secret: str = Header("")):
|
|
| 103 |
"""Debug: test full PDF parser pipeline. Protected by ADMIN_SECRET header."""
|
| 104 |
if not ADMIN_SECRET or x_admin_secret != ADMIN_SECRET:
|
| 105 |
raise HTTPException(status_code=403, detail="Forbidden")
|
| 106 |
-
import
|
| 107 |
from mistralai.client import Mistral
|
| 108 |
-
from app.services.pdf_parser import extract_pdf_text
|
| 109 |
key = os.environ.get("MISTRAL_API_KEY", "")
|
| 110 |
client = Mistral(api_key=key)
|
| 111 |
|
|
@@ -131,4 +158,6 @@ Return valid JSON with: name, title, email, phone, linkedin, location, summary,
|
|
| 131 |
data = json.loads(r.choices[0].message.content)
|
| 132 |
return {"ok": True, "name": data.get("name"), "experiences": len(data.get("experiences", [])), "provider": "mistral"}
|
| 133 |
except Exception as e:
|
| 134 |
-
return
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
+
from fastapi import FastAPI, Header, HTTPException, Request
|
| 4 |
from fastapi.middleware.cors import CORSMiddleware
|
| 5 |
+
from fastapi.responses import JSONResponse
|
| 6 |
+
from starlette.middleware.base import BaseHTTPMiddleware
|
| 7 |
from starlette.middleware.sessions import SessionMiddleware
|
| 8 |
|
| 9 |
from app.routers import linkedin, offer, chat, generate, auth, draft, projects, knowledge, cover_letter, transcribe, snapshots
|
|
|
|
| 19 |
|
| 20 |
app = FastAPI(title="Bored CV API", version="0.1.0")
|
| 21 |
|
| 22 |
+
|
| 23 |
+
# Cap body size on every request. Prevents (a) memory DoS via gigabyte JSON
|
| 24 |
+
# bodies and (b) mega-payload prompt injection where an attacker stuffs many
|
| 25 |
+
# KB of "ignore previous instructions" into a free-text field. The PDF upload
|
| 26 |
+
# route enforces its own 5 MB limit; everything else is JSON and 1 MB is
|
| 27 |
+
# already 10× a real CV.
|
| 28 |
+
MAX_REQUEST_BYTES = 1_000_000
|
| 29 |
+
PDF_UPLOAD_PATHS = {"/api/parse-linkedin", "/api/debug-parse-pdf", "/api/transcribe"}
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class BodySizeLimitMiddleware(BaseHTTPMiddleware):
    """Reject requests whose declared body size exceeds MAX_REQUEST_BYTES.

    Paths listed in PDF_UPLOAD_PATHS are passed through untouched. For every
    other path the Content-Length header is inspected and a 413 JSON response
    is returned when it names more than MAX_REQUEST_BYTES bytes.
    """

    async def dispatch(self, request: Request, call_next):
        # Upload routes are exempt from the JSON-tier limit; only the
        # remaining paths have their declared size checked.
        if request.url.path not in PDF_UPLOAD_PATHS:
            declared = request.headers.get("content-length")
            # NOTE(review): only the declared length is checked. A request
            # with no (or a non-numeric) Content-Length header falls through
            # to the handler unchecked.
            too_big = (
                bool(declared)
                and declared.isdigit()
                and int(declared) > MAX_REQUEST_BYTES
            )
            if too_big:
                return JSONResponse(
                    status_code=413,
                    content={"detail": "Request body too large"},
                )
        return await call_next(request)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
app.add_middleware(BodySizeLimitMiddleware)
|
| 48 |
app.add_middleware(SessionMiddleware, secret_key=SESSION_SECRET)
|
| 49 |
|
| 50 |
app.add_middleware(
|
|
|
|
| 131 |
"""Debug: test full PDF parser pipeline. Protected by ADMIN_SECRET header."""
|
| 132 |
if not ADMIN_SECRET or x_admin_secret != ADMIN_SECRET:
|
| 133 |
raise HTTPException(status_code=403, detail="Forbidden")
|
| 134 |
+
import logging
|
| 135 |
from mistralai.client import Mistral
|
|
|
|
| 136 |
key = os.environ.get("MISTRAL_API_KEY", "")
|
| 137 |
client = Mistral(api_key=key)
|
| 138 |
|
|
|
|
| 158 |
data = json.loads(r.choices[0].message.content)
|
| 159 |
return {"ok": True, "name": data.get("name"), "experiences": len(data.get("experiences", [])), "provider": "mistral"}
|
| 160 |
except Exception as e:
|
| 161 |
+
# Log internally; return generic message — no traceback in response.
|
| 162 |
+
logging.exception("debug_parse failed")
|
| 163 |
+
return {"ok": False, "error": type(e).__name__, "provider": "mistral"}
|
|
@@ -76,7 +76,11 @@ class GapAnalysis(BaseModel):
|
|
| 76 |
|
| 77 |
class ChatMessage(BaseModel):
|
| 78 |
role: str
|
| 79 |
-
content
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
|
| 82 |
class ChatRequest(BaseModel):
|
|
@@ -178,10 +182,10 @@ class ToneSamples(BaseModel):
|
|
| 178 |
|
| 179 |
class ImproveBulletRequest(BaseModel):
|
| 180 |
"""Per-bullet AI rewrite — Notion-style "improve wording" hover button."""
|
| 181 |
-
text: str
|
| 182 |
-
role: str = ""
|
| 183 |
-
company: str = ""
|
| 184 |
-
offer_title: str = ""
|
| 185 |
ui_language: str = "en"
|
| 186 |
tone: str = "startup"
|
| 187 |
|
|
|
|
| 76 |
|
| 77 |
class ChatMessage(BaseModel):
|
| 78 |
role: str
|
| 79 |
+
# Cap chat content. A realistic answer is a paragraph; 8 KB is already
|
| 80 |
+
# several pages. The cap is the structural defence against
|
| 81 |
+
# "ignore-previous-instructions"-style prompt injections that try to bury
|
| 82 |
+
# the override in a wall of text.
|
| 83 |
+
content: str = Field(default="", max_length=8000)
|
| 84 |
|
| 85 |
|
| 86 |
class ChatRequest(BaseModel):
|
|
|
|
| 182 |
|
| 183 |
class ImproveBulletRequest(BaseModel):
|
| 184 |
"""Per-bullet AI rewrite — Notion-style "improve wording" hover button."""
|
| 185 |
+
text: str = Field(default="", max_length=2000)
|
| 186 |
+
role: str = Field(default="", max_length=200)
|
| 187 |
+
company: str = Field(default="", max_length=200)
|
| 188 |
+
offer_title: str = Field(default="", max_length=200)
|
| 189 |
ui_language: str = "en"
|
| 190 |
tone: str = "startup"
|
| 191 |
|
|
@@ -2,7 +2,7 @@ import os
|
|
| 2 |
from urllib.parse import urlencode
|
| 3 |
|
| 4 |
from authlib.integrations.starlette_client import OAuth
|
| 5 |
-
from fastapi import APIRouter, Header, Request
|
| 6 |
from fastapi.responses import RedirectResponse
|
| 7 |
from itsdangerous import URLSafeTimedSerializer
|
| 8 |
|
|
@@ -159,3 +159,42 @@ async def get_quota(authorization: str = Header("")):
|
|
| 159 |
"daily_limit": 20 if is_auth else 10,
|
| 160 |
"provider": user.get("provider") if user else None,
|
| 161 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from urllib.parse import urlencode
|
| 3 |
|
| 4 |
from authlib.integrations.starlette_client import OAuth
|
| 5 |
+
from fastapi import APIRouter, Header, HTTPException, Request
|
| 6 |
from fastapi.responses import RedirectResponse
|
| 7 |
from itsdangerous import URLSafeTimedSerializer
|
| 8 |
|
|
|
|
| 159 |
"daily_limit": 20 if is_auth else 10,
|
| 160 |
"provider": user.get("provider") if user else None,
|
| 161 |
}
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
@router.delete("/account")
async def delete_account(authorization: str = Header("")):
    """GDPR delete-my-account.

    Deletes the caller's rows from ``facts``, ``knowledge``, ``projects`` and
    ``snapshots`` (matched on ``user_id``) and then the ``users`` row itself,
    all inside one ``get_db()`` context (presumably a single transaction;
    confirm against ``app.db``).

    The Bearer token is not revoked here, so the client must drop it locally.

    Child-table purges are best-effort: a failure on one table (for example
    because the table has not been created yet) is logged and the remaining
    tables and the ``users`` row are still processed. A table whose purge
    failed is reported with a count of 0 rather than a misleading row count.

    Returns:
        ``{"status": "deleted", "removed": counts}`` where ``counts`` maps
        each child table name to the number of rows counted just before its
        DELETE succeeded.

    Raises:
        HTTPException: 401 when no user with a ``user_id`` can be resolved
            from the ``Authorization`` header.
    """
    import logging

    from app.db import get_db

    user = get_user_from_request(authorization=authorization)
    if not user or not user.get("user_id"):
        raise HTTPException(status_code=401, detail="Sign in required")
    user_id = user["user_id"]

    # Children first, parent (users) last, so FK constraints are satisfied.
    # Table names come from this fixed tuple, never from request data, so
    # interpolating them into the SQL text is safe.
    child_tables = ("facts", "knowledge", "projects", "snapshots")
    counts = {"projects": 0, "knowledge": 0, "facts": 0, "snapshots": 0}
    with get_db() as conn:
        for table in child_tables:
            try:
                row = conn.execute(
                    f"SELECT COUNT(*) as cnt FROM {table} WHERE user_id = ?", (user_id,)
                ).fetchone()
                pending = row["cnt"] if row else 0
                conn.execute(f"DELETE FROM {table} WHERE user_id = ?", (user_id,))
            except Exception:
                # Keep the purge best-effort, but leave a trace instead of
                # silently swallowing the failure.
                logging.exception("delete_account: could not purge table %s", table)
                continue
            # Record the count only once the DELETE has actually gone through.
            counts[table] = pending
        conn.execute("DELETE FROM users WHERE id = ?", (user_id,))
    return {"status": "deleted", "removed": counts}
|
|
@@ -19,8 +19,10 @@ async def analyze(req: AnalyzeRequest, request: Request, x_captcha_token: str =
|
|
| 19 |
llm = get_llm()
|
| 20 |
try:
|
| 21 |
return llm.analyze(req.profile, req.offer, req.ui_language)
|
| 22 |
-
except Exception
|
| 23 |
-
|
|
|
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
@router.post("/chat", response_model=ChatResponse)
|
|
@@ -35,5 +37,7 @@ async def chat(req: ChatRequest, request: Request, x_captcha_token: str = Header
|
|
| 35 |
known_facts=req.known_facts, contradictions=req.contradictions,
|
| 36 |
cv_draft=req.cv_draft,
|
| 37 |
)
|
| 38 |
-
except Exception
|
| 39 |
-
|
|
|
|
|
|
|
|
|
| 19 |
llm = get_llm()
|
| 20 |
try:
|
| 21 |
return llm.analyze(req.profile, req.offer, req.ui_language)
|
| 22 |
+
except Exception:
|
| 23 |
+
import logging
|
| 24 |
+
logging.exception("LLM call failed")
|
| 25 |
+
raise HTTPException(status_code=502, detail="AI service error")
|
| 26 |
|
| 27 |
|
| 28 |
@router.post("/chat", response_model=ChatResponse)
|
|
|
|
| 37 |
known_facts=req.known_facts, contradictions=req.contradictions,
|
| 38 |
cv_draft=req.cv_draft,
|
| 39 |
)
|
| 40 |
+
except Exception:
|
| 41 |
+
import logging
|
| 42 |
+
logging.exception("LLM call failed")
|
| 43 |
+
raise HTTPException(status_code=502, detail="AI service error")
|
|
@@ -19,5 +19,7 @@ async def generate_cover_letter(req: CoverLetterRequest, request: Request, x_cap
|
|
| 19 |
llm = get_llm()
|
| 20 |
try:
|
| 21 |
return llm.generate_cover_letter(req.profile, req.offer, req.cv_data, req.messages, req.ui_language, req.tone, req.target_market)
|
| 22 |
-
except Exception
|
| 23 |
-
|
|
|
|
|
|
|
|
|
| 19 |
llm = get_llm()
|
| 20 |
try:
|
| 21 |
return llm.generate_cover_letter(req.profile, req.offer, req.cv_data, req.messages, req.ui_language, req.tone, req.target_market)
|
| 22 |
+
except Exception:
|
| 23 |
+
import logging
|
| 24 |
+
logging.exception("LLM call failed")
|
| 25 |
+
raise HTTPException(status_code=502, detail="AI service error")
|
|
@@ -23,8 +23,10 @@ async def draft_cv(req: GenerateRequest, request: Request, x_captcha_token: str
|
|
| 23 |
llm = get_llm()
|
| 24 |
try:
|
| 25 |
return llm.draft_cv(req.profile, req.offer, req.gap_analysis, req.messages, req.ui_language, target_market=req.target_market)
|
| 26 |
-
except Exception
|
| 27 |
-
|
|
|
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
@router.post("/improve-bullet", response_model=ImproveBulletResponse)
|
|
@@ -44,8 +46,10 @@ async def improve_bullet(req: ImproveBulletRequest, request: Request, x_captcha_
|
|
| 44 |
tone=req.tone,
|
| 45 |
)
|
| 46 |
return ImproveBulletResponse(text=out)
|
| 47 |
-
except Exception
|
| 48 |
-
|
|
|
|
|
|
|
| 49 |
|
| 50 |
|
| 51 |
@router.post("/audit-cv", response_model=AuditCvResponse)
|
|
@@ -58,8 +62,10 @@ async def audit_cv(req: AuditCvRequest, request: Request, x_captcha_token: str =
|
|
| 58 |
try:
|
| 59 |
out = llm.audit_cv(req.cv_data, req.offer, req.ui_language)
|
| 60 |
return AuditCvResponse(**out)
|
| 61 |
-
except Exception
|
| 62 |
-
|
|
|
|
|
|
|
| 63 |
|
| 64 |
|
| 65 |
@router.post("/apply-grammar-fixes", response_model=ApplyGrammarFixesResponse)
|
|
@@ -80,8 +86,10 @@ async def apply_grammar_fixes(req: ApplyGrammarFixesRequest, request: Request, x
|
|
| 80 |
skipped=out.get("skipped", 0),
|
| 81 |
skipped_indices=out.get("skipped_indices", []),
|
| 82 |
)
|
| 83 |
-
except Exception
|
| 84 |
-
|
|
|
|
|
|
|
| 85 |
|
| 86 |
|
| 87 |
@router.post("/tone-samples", response_model=ToneSamples)
|
|
@@ -95,5 +103,7 @@ async def tone_samples(req: ToneSamplesRequest, request: Request, x_captcha_toke
|
|
| 95 |
llm = get_llm()
|
| 96 |
try:
|
| 97 |
return llm.tone_samples(req.profile, req.offer, req.ui_language)
|
| 98 |
-
except Exception
|
| 99 |
-
|
|
|
|
|
|
|
|
|
| 23 |
llm = get_llm()
|
| 24 |
try:
|
| 25 |
return llm.draft_cv(req.profile, req.offer, req.gap_analysis, req.messages, req.ui_language, target_market=req.target_market)
|
| 26 |
+
except Exception:
|
| 27 |
+
import logging
|
| 28 |
+
logging.exception("LLM call failed")
|
| 29 |
+
raise HTTPException(status_code=502, detail="AI service error")
|
| 30 |
|
| 31 |
|
| 32 |
@router.post("/improve-bullet", response_model=ImproveBulletResponse)
|
|
|
|
| 46 |
tone=req.tone,
|
| 47 |
)
|
| 48 |
return ImproveBulletResponse(text=out)
|
| 49 |
+
except Exception:
|
| 50 |
+
import logging
|
| 51 |
+
logging.exception("LLM call failed")
|
| 52 |
+
raise HTTPException(status_code=502, detail="AI service error")
|
| 53 |
|
| 54 |
|
| 55 |
@router.post("/audit-cv", response_model=AuditCvResponse)
|
|
|
|
| 62 |
try:
|
| 63 |
out = llm.audit_cv(req.cv_data, req.offer, req.ui_language)
|
| 64 |
return AuditCvResponse(**out)
|
| 65 |
+
except Exception:
|
| 66 |
+
import logging
|
| 67 |
+
logging.exception("LLM call failed")
|
| 68 |
+
raise HTTPException(status_code=502, detail="AI service error")
|
| 69 |
|
| 70 |
|
| 71 |
@router.post("/apply-grammar-fixes", response_model=ApplyGrammarFixesResponse)
|
|
|
|
| 86 |
skipped=out.get("skipped", 0),
|
| 87 |
skipped_indices=out.get("skipped_indices", []),
|
| 88 |
)
|
| 89 |
+
except Exception:
|
| 90 |
+
import logging
|
| 91 |
+
logging.exception("LLM call failed")
|
| 92 |
+
raise HTTPException(status_code=502, detail="AI service error")
|
| 93 |
|
| 94 |
|
| 95 |
@router.post("/tone-samples", response_model=ToneSamples)
|
|
|
|
| 103 |
llm = get_llm()
|
| 104 |
try:
|
| 105 |
return llm.tone_samples(req.profile, req.offer, req.ui_language)
|
| 106 |
+
except Exception:
|
| 107 |
+
import logging
|
| 108 |
+
logging.exception("LLM call failed")
|
| 109 |
+
raise HTTPException(status_code=502, detail="AI service error")
|
|
@@ -21,8 +21,10 @@ async def generate_cv(req: GenerateRequest, request: Request, x_captcha_token: s
|
|
| 21 |
llm = get_llm()
|
| 22 |
try:
|
| 23 |
return llm.generate_cv(req.profile, req.offer, req.gap_analysis, req.messages, req.ui_language, req.tone, target_market=req.target_market)
|
| 24 |
-
except Exception
|
| 25 |
-
|
|
|
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
class TranslateRequest(BaseModel):
|
|
@@ -34,8 +36,13 @@ class TranslateRequest(BaseModel):
|
|
| 34 |
async def translate_cv(req: TranslateRequest, request: Request, x_captcha_token: str = Header("")):
|
| 35 |
if not await verify_turnstile(x_captcha_token):
|
| 36 |
raise HTTPException(status_code=403, detail="Captcha verification failed")
|
|
|
|
|
|
|
|
|
|
| 37 |
llm = get_llm()
|
| 38 |
try:
|
| 39 |
return llm.translate_cv(req.cv_data, req.target_language)
|
| 40 |
-
except Exception
|
| 41 |
-
|
|
|
|
|
|
|
|
|
| 21 |
llm = get_llm()
|
| 22 |
try:
|
| 23 |
return llm.generate_cv(req.profile, req.offer, req.gap_analysis, req.messages, req.ui_language, req.tone, target_market=req.target_market)
|
| 24 |
+
except Exception:
|
| 25 |
+
import logging
|
| 26 |
+
logging.exception("LLM call failed")
|
| 27 |
+
raise HTTPException(status_code=502, detail="AI service error")
|
| 28 |
|
| 29 |
|
| 30 |
class TranslateRequest(BaseModel):
|
|
|
|
| 36 |
async def translate_cv(req: TranslateRequest, request: Request, x_captcha_token: str = Header("")):
|
| 37 |
if not await verify_turnstile(x_captcha_token):
|
| 38 |
raise HTTPException(status_code=403, detail="Captcha verification failed")
|
| 39 |
+
# Was missing rate limit — translate-cv burns LLM tokens like every other
|
| 40 |
+
# endpoint in this module. 50/day anonymous, 500/day signed in.
|
| 41 |
+
check_rate_limit(request)
|
| 42 |
llm = get_llm()
|
| 43 |
try:
|
| 44 |
return llm.translate_cv(req.cv_data, req.target_language)
|
| 45 |
+
except Exception:
|
| 46 |
+
import logging
|
| 47 |
+
logging.exception("LLM call failed")
|
| 48 |
+
raise HTTPException(status_code=502, detail="AI service error")
|
|
@@ -1,6 +1,5 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
-
import traceback
|
| 4 |
|
| 5 |
from fastapi import APIRouter, File, Header, Request, UploadFile, HTTPException
|
| 6 |
|
|
@@ -28,8 +27,12 @@ async def parse_linkedin(request: Request, file: UploadFile = File(...)):
|
|
| 28 |
|
| 29 |
try:
|
| 30 |
profile = parse_linkedin_pdf(contents)
|
| 31 |
-
except Exception
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
if not profile.name:
|
| 35 |
raise HTTPException(status_code=422, detail="Could not extract profile data from PDF")
|
|
@@ -48,6 +51,7 @@ async def debug_parse_pdf(file: UploadFile = File(...), x_admin_secret: str = He
|
|
| 48 |
"""Debug: show exactly what happens when parsing a PDF. Protected by ADMIN_SECRET header."""
|
| 49 |
if not ADMIN_SECRET or x_admin_secret != ADMIN_SECRET:
|
| 50 |
raise HTTPException(status_code=403, detail="Forbidden")
|
|
|
|
| 51 |
contents = await file.read()
|
| 52 |
raw_text = extract_pdf_text(contents)
|
| 53 |
|
|
@@ -81,4 +85,6 @@ Return valid JSON with: name, title, email, phone, linkedin, location, summary,
|
|
| 81 |
"provider": "mistral",
|
| 82 |
}
|
| 83 |
except Exception as e:
|
| 84 |
-
|
|
|
|
|
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
|
|
|
| 3 |
|
| 4 |
from fastapi import APIRouter, File, Header, Request, UploadFile, HTTPException
|
| 5 |
|
|
|
|
| 27 |
|
| 28 |
try:
|
| 29 |
profile = parse_linkedin_pdf(contents)
|
| 30 |
+
except Exception:
|
| 31 |
+
# Don't echo internal exception text — could leak file path / library
|
| 32 |
+
# version / stack details. Log internally, return generic to caller.
|
| 33 |
+
import logging
|
| 34 |
+
logging.exception("parse_linkedin_pdf failed")
|
| 35 |
+
raise HTTPException(status_code=422, detail="Could not parse PDF")
|
| 36 |
|
| 37 |
if not profile.name:
|
| 38 |
raise HTTPException(status_code=422, detail="Could not extract profile data from PDF")
|
|
|
|
| 51 |
"""Debug: show exactly what happens when parsing a PDF. Protected by ADMIN_SECRET header."""
|
| 52 |
if not ADMIN_SECRET or x_admin_secret != ADMIN_SECRET:
|
| 53 |
raise HTTPException(status_code=403, detail="Forbidden")
|
| 54 |
+
import logging
|
| 55 |
contents = await file.read()
|
| 56 |
raw_text = extract_pdf_text(contents)
|
| 57 |
|
|
|
|
| 85 |
"provider": "mistral",
|
| 86 |
}
|
| 87 |
except Exception as e:
|
| 88 |
+
# Log internally; return error class only — no traceback in HTTP response.
|
| 89 |
+
logging.exception("debug_parse_pdf failed")
|
| 90 |
+
return {"ok": False, "error": type(e).__name__, "text_length": len(raw_text)}
|
|
@@ -28,6 +28,11 @@ router = APIRouter(prefix="/api/snapshots", tags=["snapshots"])
|
|
| 28 |
|
| 29 |
SLUG_ALPHABET_BYTES = 12 # token_urlsafe(12) → ~16 chars
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
def _ensure_schema() -> None:
|
| 33 |
"""Create the snapshots table on first use. Idempotent."""
|
|
@@ -84,9 +89,13 @@ async def create_snapshot(request: Request, payload: CreateSnapshotRequest):
|
|
| 84 |
if not user_id:
|
| 85 |
raise HTTPException(status_code=401, detail="Sign in to share a CV publicly")
|
| 86 |
|
| 87 |
-
slug = secrets.token_urlsafe(SLUG_ALPHABET_BYTES)
|
| 88 |
cv_json = json.dumps(payload.cv_data)
|
| 89 |
colors_json = json.dumps(payload.brand_colors) if payload.brand_colors else ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
with get_db() as conn:
|
| 92 |
conn.execute(
|
|
|
|
| 28 |
|
| 29 |
SLUG_ALPHABET_BYTES = 12 # token_urlsafe(12) → ~16 chars
|
| 30 |
|
| 31 |
+
# Cap on serialized snapshot payload (cv_data + brand_colors). A real CV is
|
| 32 |
+
# under 30 KB; a 500 KB ceiling lets data-URI photos through but blocks an
|
| 33 |
+
# attacker from filling Turso with multi-MB rows by repeatedly sharing.
|
| 34 |
+
MAX_SNAPSHOT_BYTES = 500_000
|
| 35 |
+
|
| 36 |
|
| 37 |
def _ensure_schema() -> None:
|
| 38 |
"""Create the snapshots table on first use. Idempotent."""
|
|
|
|
| 89 |
if not user_id:
|
| 90 |
raise HTTPException(status_code=401, detail="Sign in to share a CV publicly")
|
| 91 |
|
|
|
|
| 92 |
cv_json = json.dumps(payload.cv_data)
|
| 93 |
colors_json = json.dumps(payload.brand_colors) if payload.brand_colors else ""
|
| 94 |
+
# Block oversized payloads before allocating a slug or hitting the DB.
|
| 95 |
+
if len(cv_json) + len(colors_json) > MAX_SNAPSHOT_BYTES:
|
| 96 |
+
raise HTTPException(status_code=413, detail="Snapshot payload too large")
|
| 97 |
+
|
| 98 |
+
slug = secrets.token_urlsafe(SLUG_ALPHABET_BYTES)
|
| 99 |
|
| 100 |
with get_db() as conn:
|
| 101 |
conn.execute(
|
|
@@ -12,6 +12,22 @@ from app.models import (
|
|
| 12 |
MAX_TOKENS_PER_CALL = 8000 # Reduced from 16K — Flash spends most on thinking, not output
|
| 13 |
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
def _apply_substitution(cv_dict: dict, path: str, old: str, new: str) -> bool:
|
| 16 |
"""Walk a dot-path like 'experiences.2.bullets.3' into cv_dict and do a
|
| 17 |
literal `old` → `new` replace on the string at that path. Returns True if
|
|
@@ -19,7 +35,11 @@ def _apply_substitution(cv_dict: dict, path: str, old: str, new: str) -> bool:
|
|
| 19 |
|
| 20 |
Used by LLMService.apply_grammar_fixes to apply audit substitutions safely:
|
| 21 |
if the LLM hallucinates a path or an `old` substring that isn't actually
|
| 22 |
-
there, the swap is skipped instead of corrupting the CV.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
parts = path.split(".") if path else []
|
| 24 |
if not parts:
|
| 25 |
return False
|
|
|
|
| 12 |
MAX_TOKENS_PER_CALL = 8000 # Reduced from 16K — Flash spends most on thinking, not output
|
| 13 |
|
| 14 |
|
| 15 |
+
# Whitelist of CV paths the LLM is allowed to substitute into. Anything off
|
| 16 |
+
# this list is rejected before traversal — a defence against an attacker
|
| 17 |
+
# convincing the LLM to write into arbitrary fields (e.g. `language` to flip
|
| 18 |
+
# locale, `match_score` to pin a perfect rating). The fields below are the only
|
| 19 |
+
# ones a grammar pass should ever touch.
|
| 20 |
+
# Anchored with \Z rather than $, because $ also matches just before a
# trailing newline and would let "summary\n" through. List indices are
# restricted to ASCII digits ([0-9]) because \d also matches other Unicode
# decimal digits in a str pattern.
_ALLOWED_SUBSTITUTION_PATH = re.compile(
    r"^("
    r"name|title|summary|location|"
    r"experiences\.[0-9]+\.(title|company|dates|exitReason|contractType|bullets\.[0-9]+)|"
    r"education\.[0-9]+\.(degree|school|year)|"
    r"skills\.[0-9]+|languages\.[0-9]+|"
    r"strengths\.[0-9]+|improvements\.[0-9]+"
    r")\Z"
)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
def _apply_substitution(cv_dict: dict, path: str, old: str, new: str) -> bool:
|
| 32 |
"""Walk a dot-path like 'experiences.2.bullets.3' into cv_dict and do a
|
| 33 |
literal `old` → `new` replace on the string at that path. Returns True if
|
|
|
|
| 35 |
|
| 36 |
Used by LLMService.apply_grammar_fixes to apply audit substitutions safely:
|
| 37 |
if the LLM hallucinates a path or an `old` substring that isn't actually
|
| 38 |
+
there, the swap is skipped instead of corrupting the CV. Paths must match
|
| 39 |
+
the allowlist above — non-grammar fields (match_score, email, language…)
|
| 40 |
+
are off-limits even if the LLM tries to target them."""
|
| 41 |
+
if not path or not _ALLOWED_SUBSTITUTION_PATH.match(path):
|
| 42 |
+
return False
|
| 43 |
parts = path.split(".") if path else []
|
| 44 |
if not parts:
|
| 45 |
return False
|