"""Generate synthetic UCP-student-shaped resume PDFs for Module 8 tests + demo.
Idempotent — re-running overwrites the same filenames. Output goes to
backend/tests/fixtures/resumes/.
What each fixture exercises (see fixtures/resumes/README.md for details):
resume_ds_strong.pdf — dense DS skills (Python/pandas/sklearn/...)
resume_fe_junior.pdf — junior FE profile (React/JS/HTML/CSS/Git)
resume_fullstack_mixed.pdf — composite Django+React+Postgres+Docker
resume_minimal.pdf — few literal skill names — forces SBERT/fuzzy
resume_scanned.pdf — text-free PDF (vector shapes only, standing in for a scanned CV) → triggers ResumeParseError
Dev-only deps (requirements-dev.txt): reportlab.
"""
from __future__ import annotations
from pathlib import Path
from reportlab.lib.pagesizes import LETTER
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer
from reportlab.pdfgen import canvas
# Directory two levels above this script; the module docstring indicates this
# is the backend root.
BASE_DIR = Path(__file__).resolve().parent.parent
# Every generated resume PDF is written into this directory.
FIXTURES_DIR = BASE_DIR.joinpath("tests", "fixtures", "resumes")
def _write_paragraphs(path: Path, paragraphs: list[str]) -> None:
    """Render ``paragraphs`` into a single-column, text-only PDF at ``path``.

    The first entry is laid out with the sample stylesheet's ``Heading2``
    style and every later entry with ``BodyText``. Each paragraph is followed
    by an 8-unit vertical spacer. The page is LETTER-sized with a margin of
    54 on all four sides.
    """
    margins = dict.fromkeys(
        ("leftMargin", "rightMargin", "topMargin", "bottomMargin"), 54
    )
    template = SimpleDocTemplate(str(path), pagesize=LETTER, **margins)

    sheet = getSampleStyleSheet()
    flowables = []
    for position, text in enumerate(paragraphs):
        # Only the very first paragraph is treated as the heading line.
        chosen = sheet["Heading2"] if position == 0 else sheet["BodyText"]
        flowables.extend((Paragraph(text, chosen), Spacer(1, 8)))

    template.build(flowables)
def resume_ds_strong() -> list[str]:
    """Return the paragraphs for the skill-dense Data Science resume.

    The list holds six entries in order: title line, summary, skills,
    projects, internship, and education.
    """
    title = "Ayesha Khan — Data Science BS (UCP, Semester 7)"
    summary = (
        "Summary: Final-year Data Science student with 2 internships "
        "in machine learning and dashboarding. "
        "Experienced in Python, SQL, Pandas, NumPy, Scikit-learn, "
        "TensorFlow, and PyTorch."
    )
    skills = (
        "Skills: Python (advanced, 3 years), SQL (advanced), "
        "Pandas, NumPy, Matplotlib, Scikit-learn, TensorFlow, PyTorch, "
        "Tableau, Statistical Hypothesis Testing, Git, Jupyter."
    )
    projects = (
        "Projects: Built an end-to-end churn-prediction pipeline "
        "using scikit-learn and XGBoost; "
        "deployed a neural-network image classifier with PyTorch "
        "and served it via FastAPI."
    )
    internship = (
        "Internship — Data Analyst @ TechCorp (2024): "
        "Designed SQL reports and Tableau dashboards "
        "for marketing funnel analysis."
    )
    education = (
        "Education: UCP Lahore, BS Data Science, CGPA 3.8/4.0. "
        "Relevant coursework: Machine Learning, Deep Learning, Statistics."
    )
    return [title, summary, skills, projects, internship, education]
def resume_fe_junior() -> list[str]:
    """Return the paragraphs for the junior frontend-developer resume.

    The list holds five entries in order: title line, summary, skills,
    projects, and education.
    """
    title = "Bilal Ahmed — Computer Science BS (UCP, Semester 5)"
    summary = (
        "Summary: Junior frontend developer with a focus on "
        "React, HTML, and CSS. "
        "Familiar with basic Git workflows and modern JavaScript (ES2020+)."
    )
    skills = (
        "Skills: React, JavaScript, HTML, CSS, Git, "
        "Figma (beginner), Bootstrap, TailwindCSS, Node.js (basic)."
    )
    projects = (
        "Projects: Personal portfolio site (React, Vite, Tailwind). "
        "Classroom attendance dashboard UI "
        "with React and Ant Design."
    )
    education = "Education: UCP Lahore, BS Computer Science, CGPA 3.2/4.0."
    return [title, summary, skills, projects, education]
def resume_fullstack_mixed() -> list[str]:
    """Return the paragraphs for the mixed full-stack resume.

    The list holds five entries in order: title line, summary, skills,
    projects, and education.
    """
    title = "Zainab Ali — Software Engineering BS (UCP, Semester 7)"
    summary = (
        "Summary: Full-stack developer with production experience "
        "in Django and React. "
        "Built and deployed containerised web apps "
        "using Docker Compose and PostgreSQL."
    )
    skills = (
        "Skills: Python, Django, Django REST Framework, "
        "JavaScript, React, PostgreSQL, Redis, Docker, "
        "Git, GitHub, Linux, Nginx, RESTful APIs, HTML, CSS."
    )
    projects = (
        "Projects: UCP club management portal "
        "(Django + React + Postgres, "
        "deployed via Docker on a DigitalOcean droplet). "
        "CI/CD with GitHub Actions."
    )
    education = "Education: UCP Lahore, BS Software Engineering, CGPA 3.7/4.0."
    return [title, summary, skills, projects, education]
def resume_minimal() -> list[str]:
    """Return the paragraphs for the deliberately vague, skill-light resume.

    The list holds four entries in order: title line, summary, skills, and
    coursework.
    """
    # The skills line sticks to paraphrases and near-names (data
    # visualisation, cloud computing, databases, version control) rather than
    # literal catalog names, so that lexical matching largely misses and the
    # SBERT/pgvector layer has to map the aliases to canonical entries.
    title = "Usman Shah — BS Applied Mathematics (UCP, Semester 4)"
    summary = (
        "Summary: Math undergraduate interested in data-driven work. "
        "Some exposure to programming "
        "and statistical analysis."
    )
    skills = (
        "Skills: programming in scripting languages, "
        "data visualisation, spreadsheet modelling, "
        "basic cloud computing exposure, version control, "
        "databases, statistical analysis."
    )
    coursework = "Coursework: Probability, Linear Algebra, Numerical Methods."
    return [title, summary, skills, coursework]
def generate_text_pdfs() -> None:
    """Write the four text-based resume PDFs into ``FIXTURES_DIR``.

    The directory is created first if it does not exist. Each file is
    rendered with ``_write_paragraphs`` and its size in whole KB is printed.
    """
    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)

    # Output filename paired with the function that supplies its paragraphs.
    fixtures = (
        ("resume_ds_strong.pdf", resume_ds_strong),
        ("resume_fe_junior.pdf", resume_fe_junior),
        ("resume_fullstack_mixed.pdf", resume_fullstack_mixed),
        ("resume_minimal.pdf", resume_minimal),
    )
    for name, build_paragraphs in fixtures:
        target = FIXTURES_DIR / name
        _write_paragraphs(target, build_paragraphs())
        size_kb = target.stat().st_size // 1024
        print(f" wrote {name} ({size_kb} KB)")
def generate_scanned_pdf() -> None:
    """Emit a PDF with *no* extractable text — just vector shapes.

    Exercises the ResumeParseError('No text could be extracted') branch in
    resume_parser._extract_text. A real scanned CV would be a rasterised
    image; reportlab can't easily make one without Pillow, but an empty-text
    PDF lands in the same code path because extract_text() returns ''.

    The fixtures directory is created if it is missing, so this function can
    be called on its own and does not rely on generate_text_pdfs() having
    run first.
    """
    # Without this, saving the canvas fails in a fresh checkout where the
    # fixtures directory does not exist yet.
    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)
    out = FIXTURES_DIR / "resume_scanned.pdf"
    c = canvas.Canvas(str(out), pagesize=LETTER)
    # Draw gray rectangles where "text" would be — no drawString calls, so
    # the page carries no text content.
    c.setFillColorRGB(0.85, 0.85, 0.85)
    c.rect(54, 600, 500, 120, fill=1, stroke=0)
    c.rect(54, 420, 500, 150, fill=1, stroke=0)
    c.showPage()
    c.save()
    print(f" wrote resume_scanned.pdf ({out.stat().st_size // 1024} KB)")
def main() -> None:
    """Regenerate every resume fixture, reporting progress on stdout."""
    print(f"Generating resume fixtures into {FIXTURES_DIR}")
    # Text-based resumes first, then the text-free "scanned" one.
    for step in (generate_text_pdfs, generate_scanned_pdf):
        step()
    print("Done.")


# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()