gapguide-api / scripts /generate_resume_fixtures.py
arifRB's picture
Deploy GapGuide backend (Docker)
ffd36e0 verified
Raw
History Blame Contribute Delete
6.43 kB
"""Generate synthetic UCP-student-shaped resume PDFs for Module 8 tests + demo.
Idempotent — re-running overwrites the same filenames. Output goes to
backend/tests/fixtures/resumes/.
What each fixture exercises (see fixtures/resumes/README.md for details):

    resume_ds_strong.pdf        — dense DS skills (Python/pandas/sklearn/...)
    resume_fe_junior.pdf        — junior FE profile (React/JS/HTML/CSS/Git)
    resume_fullstack_mixed.pdf  — composite Django+React+Postgres+Docker
    resume_minimal.pdf          — few literal skill names — forces SBERT/fuzzy
    resume_scanned.pdf          — image-only PDF → triggers ResumeParseError

Dev-only deps (requirements-dev.txt): reportlab.
"""
from __future__ import annotations
from pathlib import Path
from reportlab.lib.pagesizes import LETTER
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer
from reportlab.pdfgen import canvas
# Two levels up from this script: the directory that contains scripts/.
BASE_DIR = Path(__file__).resolve().parent.parent
# Every generated PDF is written into this directory.
FIXTURES_DIR = BASE_DIR.joinpath("tests", "fixtures", "resumes")
def _write_paragraphs(path: Path, paragraphs: list[str]) -> None:
    """Render *paragraphs* into a single-column, text-only PDF at *path*.

    The first paragraph is set in the sample stylesheet's ``Heading2`` style
    and every later one in ``BodyText``; each paragraph is followed by a
    small vertical spacer. Entries are passed to ``Paragraph`` unchanged, so
    inline markup such as ``<b>`` is interpreted by reportlab.
    """
    sheet = getSampleStyleSheet()
    heading_style = sheet["Heading2"]
    body_style = sheet["BodyText"]

    margin = 54  # same margin on all four sides
    template = SimpleDocTemplate(
        str(path),
        pagesize=LETTER,
        leftMargin=margin,
        rightMargin=margin,
        topMargin=margin,
        bottomMargin=margin,
    )

    flowables = []
    for position, markup in enumerate(paragraphs):
        # Only the very first entry is treated as the title line.
        chosen_style = body_style if position else heading_style
        flowables += [Paragraph(markup, chosen_style), Spacer(1, 8)]

    template.build(flowables)
def resume_ds_strong() -> list[str]:
    """Return the paragraph markup for the skill-dense data-science resume.

    The first entry is the title line (``_write_paragraphs`` renders index 0
    with the heading style); the remaining entries are body paragraphs whose
    section labels use ``<b>`` inline markup.

    The separators are real em dashes. The earlier literals held a
    mis-decoded UTF-8 byte sequence (mojibake) that would have been rendered
    verbatim into the PDF.
    """
    return [
        "Ayesha Khan — Data Science BS (UCP, Semester 7)",
        "<b>Summary:</b> Final-year Data Science student with 2 internships in "
        "machine learning and dashboarding. Experienced in Python, SQL, Pandas, "
        "NumPy, Scikit-learn, TensorFlow, and PyTorch.",
        "<b>Skills:</b> Python (advanced, 3 years), SQL (advanced), Pandas, "
        "NumPy, Matplotlib, Scikit-learn, TensorFlow, PyTorch, Tableau, "
        "Statistical Hypothesis Testing, Git, Jupyter.",
        "<b>Projects:</b> Built an end-to-end churn-prediction pipeline using "
        "scikit-learn and XGBoost; deployed a neural-network image classifier "
        "with PyTorch and served it via FastAPI.",
        "<b>Internship — Data Analyst @ TechCorp (2024):</b> Designed SQL "
        "reports and Tableau dashboards for marketing funnel analysis.",
        "<b>Education:</b> UCP Lahore, BS Data Science, CGPA 3.8/4.0. "
        "Relevant coursework: Machine Learning, Deep Learning, Statistics.",
    ]
def resume_fe_junior() -> list[str]:
    """Return the paragraph markup for the junior frontend resume.

    The first entry is the title line (``_write_paragraphs`` renders index 0
    with the heading style); the remaining entries are body paragraphs whose
    section labels use ``<b>`` inline markup.

    The title separator is a real em dash. The earlier literal held a
    mis-decoded UTF-8 byte sequence (mojibake) that would have been rendered
    verbatim into the PDF.
    """
    return [
        "Bilal Ahmed — Computer Science BS (UCP, Semester 5)",
        "<b>Summary:</b> Junior frontend developer with a focus on React, "
        "HTML, and CSS. Familiar with basic Git workflows and modern "
        "JavaScript (ES2020+).",
        "<b>Skills:</b> React, JavaScript, HTML, CSS, Git, Figma (beginner), "
        "Bootstrap, TailwindCSS, Node.js (basic).",
        "<b>Projects:</b> Personal portfolio site (React, Vite, Tailwind). "
        "Classroom attendance dashboard UI with React and Ant Design.",
        "<b>Education:</b> UCP Lahore, BS Computer Science, CGPA 3.2/4.0.",
    ]
def resume_fullstack_mixed() -> list[str]:
    """Return the paragraph markup for the mixed full-stack resume.

    The first entry is the title line (``_write_paragraphs`` renders index 0
    with the heading style); the remaining entries are body paragraphs whose
    section labels use ``<b>`` inline markup.

    The title separator is a real em dash. The earlier literal held a
    mis-decoded UTF-8 byte sequence (mojibake) that would have been rendered
    verbatim into the PDF.
    """
    return [
        "Zainab Ali — Software Engineering BS (UCP, Semester 7)",
        "<b>Summary:</b> Full-stack developer with production experience in "
        "Django and React. Built and deployed containerised web apps using "
        "Docker Compose and PostgreSQL.",
        "<b>Skills:</b> Python, Django, Django REST Framework, JavaScript, "
        "React, PostgreSQL, Redis, Docker, Git, GitHub, Linux, Nginx, "
        "RESTful APIs, HTML, CSS.",
        "<b>Projects:</b> UCP club management portal (Django + React + "
        "Postgres, deployed via Docker on a DigitalOcean droplet). "
        "CI/CD with GitHub Actions.",
        "<b>Education:</b> UCP Lahore, BS Software Engineering, CGPA 3.7/4.0.",
    ]
def resume_minimal() -> list[str]:
    """Return the paragraph markup for the minimal, paraphrase-heavy resume.

    The first entry is the title line (``_write_paragraphs`` renders index 0
    with the heading style); the remaining entries are body paragraphs whose
    section labels use ``<b>`` inline markup.

    The title separator is a real em dash. The earlier literal held a
    mis-decoded UTF-8 byte sequence (mojibake) that would have been rendered
    verbatim into the PDF.
    """
    # Uses paraphrased / near-names (data visualisation, cloud computing,
    # databases, version control) so the lexical layer largely misses and
    # SBERT/pgvector has to map the aliases to canonical catalog entries.
    return [
        "Usman Shah — BS Applied Mathematics (UCP, Semester 4)",
        "<b>Summary:</b> Math undergraduate interested in data-driven work. "
        "Some exposure to programming and statistical analysis.",
        "<b>Skills:</b> programming in scripting languages, data visualisation, "
        "spreadsheet modelling, basic cloud computing exposure, version "
        "control, databases, statistical analysis.",
        "<b>Coursework:</b> Probability, Linear Algebra, Numerical Methods.",
    ]
def generate_text_pdfs() -> None:
    """Write the four text-based resume fixtures into ``FIXTURES_DIR``.

    Creates the fixtures directory if needed, renders each resume through
    ``_write_paragraphs`` and prints one line per file with its size in
    whole kilobytes.
    """
    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)

    # All resume bodies are built up front, before any file is written.
    fixtures = (
        ("resume_ds_strong.pdf", resume_ds_strong()),
        ("resume_fe_junior.pdf", resume_fe_junior()),
        ("resume_fullstack_mixed.pdf", resume_fullstack_mixed()),
        ("resume_minimal.pdf", resume_minimal()),
    )

    for pdf_name, body in fixtures:
        target = FIXTURES_DIR / pdf_name
        _write_paragraphs(target, body)
        size_kb = target.stat().st_size // 1024
        print(f" wrote {pdf_name} ({size_kb} KB)")
def generate_scanned_pdf() -> None:
    """Emit a PDF with *no* extractable text — just vector shapes.

    Exercises the ResumeParseError('No text could be extracted') branch in
    resume_parser._extract_text. A real scanned CV would be a rasterised
    image; reportlab can't easily make one without Pillow, but an empty-text
    PDF lands in the same code path because extract_text() returns ''.

    The fixtures directory is created when missing, so this function also
    works when called on its own rather than only after
    ``generate_text_pdfs``.
    """
    # Same directory guarantee as generate_text_pdfs; without it, saving the
    # canvas fails when the fixtures directory does not exist yet.
    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)

    out = FIXTURES_DIR / "resume_scanned.pdf"
    c = canvas.Canvas(str(out), pagesize=LETTER)
    # Draw gray rectangles where "text" would be — no drawString calls.
    c.setFillColorRGB(0.85, 0.85, 0.85)
    c.rect(54, 600, 500, 120, fill=1, stroke=0)
    c.rect(54, 420, 500, 150, fill=1, stroke=0)
    c.showPage()
    c.save()
    print(f" wrote resume_scanned.pdf ({out.stat().st_size // 1024} KB)")
def main() -> None:
    """Generate every resume fixture, printing progress to stdout."""
    print(f"Generating resume fixtures into {FIXTURES_DIR}")
    # Text fixtures run first; that step also creates the output directory.
    for step in (generate_text_pdfs, generate_scanned_pdf):
        step()
    print("Done.")
# Run the generator only when executed as a script, never on import.
if __name__ == "__main__":
    main()