Spaces:

arifRB
/

gapguide-api

Sleeping

App Files Files Community

gapguide-api / scripts /generate_resume_fixtures.py

arifRB

Deploy GapGuide backend (Docker)

ffd36e0 verified 15 days ago

Raw

History Blame Contribute Delete

6.43 kB

	"""Generate synthetic UCP-student-shaped resume PDFs for Module 8 tests + demo.

	Idempotent — re-running overwrites the same filenames. Output goes to
	backend/tests/fixtures/resumes/.

	What each fixture exercises (see fixtures/resumes/README.md for details):
	resume_ds_strong.pdf — dense DS skills (Python/pandas/sklearn/...)
	resume_fe_junior.pdf — junior FE profile (React/JS/HTML/CSS/Git)
	resume_fullstack_mixed.pdf — composite Django+React+Postgres+Docker
	resume_minimal.pdf — few literal skill names — forces SBERT/fuzzy
	resume_scanned.pdf — text-free PDF (vector shapes only, stands in for a scanned CV) → triggers ResumeParseError

	Dev-only deps (requirements-dev.txt): reportlab.
	"""
	from __future__ import annotations

	from pathlib import Path

	from reportlab.lib.pagesizes import LETTER
	from reportlab.lib.styles import getSampleStyleSheet
	from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer
	from reportlab.pdfgen import canvas


	# Directory two levels above this file; every output path hangs off it.
	BASE_DIR = Path(__file__).resolve().parent.parent
	# Destination folder for all generated resume PDFs.
	FIXTURES_DIR = BASE_DIR.joinpath("tests", "fixtures", "resumes")


	def _write_paragraphs(path: Path, paragraphs: list[str]) -> None:
	    """Render *paragraphs* into a single-column, text-only PDF at *path*.

	    The first entry is typeset with the sample stylesheet's "Heading2"
	    style and every later entry with "BodyText". Each paragraph is followed
	    by a ``Spacer(1, 8)`` so the sections do not run together.

	    Args:
	        path: Destination file; it is passed to reportlab as a string.
	        paragraphs: Paragraph texts in output order. Each string is handed
	            to ``Paragraph`` unchanged, so inline markup such as ``<b>`` is
	            left for reportlab to interpret.
	    """
	    sheet = getSampleStyleSheet()
	    heading_style, body_style = sheet["Heading2"], sheet["BodyText"]

	    margin = 54  # same value on all four sides
	    template = SimpleDocTemplate(
	        str(path),
	        pagesize=LETTER,
	        leftMargin=margin,
	        rightMargin=margin,
	        topMargin=margin,
	        bottomMargin=margin,
	    )

	    flowables = []
	    for index, text in enumerate(paragraphs):
	        chosen = heading_style if index == 0 else body_style
	        flowables.extend((Paragraph(text, chosen), Spacer(1, 8)))
	    template.build(flowables)


	def resume_ds_strong() -> list[str]:
	    """Return the paragraphs of the skill-dense Data Science resume.

	    The first entry is the title line; the rest are body sections that
	    carry inline ``<b>`` markup. A fresh list is built on every call.
	    """
	    title = "Ayesha Khan — Data Science BS (UCP, Semester 7)"
	    summary = (
	        "<b>Summary:</b> Final-year Data Science student with 2 internships in "
	        "machine learning and dashboarding. Experienced in Python, SQL, Pandas, "
	        "NumPy, Scikit-learn, TensorFlow, and PyTorch."
	    )
	    skills = (
	        "<b>Skills:</b> Python (advanced, 3 years), SQL (advanced), Pandas, "
	        "NumPy, Matplotlib, Scikit-learn, TensorFlow, PyTorch, Tableau, "
	        "Statistical Hypothesis Testing, Git, Jupyter."
	    )
	    projects = (
	        "<b>Projects:</b> Built an end-to-end churn-prediction pipeline using "
	        "scikit-learn and XGBoost; deployed a neural-network image classifier "
	        "with PyTorch and served it via FastAPI."
	    )
	    internship = (
	        "<b>Internship — Data Analyst @ TechCorp (2024):</b> Designed SQL "
	        "reports and Tableau dashboards for marketing funnel analysis."
	    )
	    education = (
	        "<b>Education:</b> UCP Lahore, BS Data Science, CGPA 3.8/4.0. "
	        "Relevant coursework: Machine Learning, Deep Learning, Statistics."
	    )
	    return [title, summary, skills, projects, internship, education]


	def resume_fe_junior() -> list[str]:
	    """Return the paragraphs of the junior frontend-developer resume.

	    The first entry is the title line; the rest are body sections that
	    carry inline ``<b>`` markup. A fresh list is built on every call.
	    """
	    title = "Bilal Ahmed — Computer Science BS (UCP, Semester 5)"
	    summary = (
	        "<b>Summary:</b> Junior frontend developer with a focus on React, "
	        "HTML, and CSS. Familiar with basic Git workflows and modern "
	        "JavaScript (ES2020+)."
	    )
	    skills = (
	        "<b>Skills:</b> React, JavaScript, HTML, CSS, Git, Figma (beginner), "
	        "Bootstrap, TailwindCSS, Node.js (basic)."
	    )
	    projects = (
	        "<b>Projects:</b> Personal portfolio site (React, Vite, Tailwind). "
	        "Classroom attendance dashboard UI with React and Ant Design."
	    )
	    education = "<b>Education:</b> UCP Lahore, BS Computer Science, CGPA 3.2/4.0."
	    return [title, summary, skills, projects, education]


	def resume_fullstack_mixed() -> list[str]:
	    """Return the paragraphs of the mixed full-stack resume.

	    The first entry is the title line; the rest are body sections that
	    carry inline ``<b>`` markup. A fresh list is built on every call.
	    """
	    title = "Zainab Ali — Software Engineering BS (UCP, Semester 7)"
	    summary = (
	        "<b>Summary:</b> Full-stack developer with production experience in "
	        "Django and React. Built and deployed containerised web apps using "
	        "Docker Compose and PostgreSQL."
	    )
	    skills = (
	        "<b>Skills:</b> Python, Django, Django REST Framework, JavaScript, "
	        "React, PostgreSQL, Redis, Docker, Git, GitHub, Linux, Nginx, "
	        "RESTful APIs, HTML, CSS."
	    )
	    projects = (
	        "<b>Projects:</b> UCP club management portal (Django + React + "
	        "Postgres, deployed via Docker on a DigitalOcean droplet). "
	        "CI/CD with GitHub Actions."
	    )
	    education = "<b>Education:</b> UCP Lahore, BS Software Engineering, CGPA 3.7/4.0."
	    return [title, summary, skills, projects, education]


	def resume_minimal() -> list[str]:
	    """Return the paragraphs of the deliberately vague resume.

	    The skills are written as paraphrases / near-names (data visualisation,
	    cloud computing, databases, version control) instead of literal tool
	    names, so the lexical layer largely misses and SBERT/pgvector has to
	    map the aliases to canonical catalog entries.
	    """
	    title = "Usman Shah — BS Applied Mathematics (UCP, Semester 4)"
	    summary = (
	        "<b>Summary:</b> Math undergraduate interested in data-driven work. "
	        "Some exposure to programming and statistical analysis."
	    )
	    skills = (
	        "<b>Skills:</b> programming in scripting languages, data visualisation, "
	        "spreadsheet modelling, basic cloud computing exposure, version "
	        "control, databases, statistical analysis."
	    )
	    coursework = "<b>Coursework:</b> Probability, Linear Algebra, Numerical Methods."
	    return [title, summary, skills, coursework]


	def generate_text_pdfs() -> None:
	    """Write the four text-based resume fixtures into FIXTURES_DIR.

	    The directory is created first if it does not exist. Files are written
	    in a fixed order and any existing file of the same name is regenerated;
	    one progress line per file is printed with its size in whole KB.
	    """
	    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)

	    # All paragraph lists are built up front, before any file is written.
	    fixtures = [
	        ("resume_ds_strong.pdf", resume_ds_strong()),
	        ("resume_fe_junior.pdf", resume_fe_junior()),
	        ("resume_fullstack_mixed.pdf", resume_fullstack_mixed()),
	        ("resume_minimal.pdf", resume_minimal()),
	    ]
	    for filename, paragraphs in fixtures:
	        target = FIXTURES_DIR / filename
	        _write_paragraphs(target, paragraphs)
	        size_kb = target.stat().st_size // 1024
	        print(f" wrote {filename} ({size_kb} KB)")


	def generate_scanned_pdf() -> None:
	    """Emit a PDF with no extractable text — just vector shapes.

	    Exercises the ResumeParseError('No text could be extracted') branch in
	    resume_parser._extract_text. A real scanned CV would be a rasterised
	    image; reportlab can't easily make one without Pillow, but an empty-text
	    PDF lands in the same code path because extract_text() returns ''.

	    The fixtures directory is created when missing, so this function also
	    works when called on its own rather than after generate_text_pdfs().
	    The output file is resume_scanned.pdf inside FIXTURES_DIR; one progress
	    line with its size in whole KB is printed.
	    """
	    # Without this, saving fails on a fresh checkout where the directory
	    # has not been created yet.
	    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)

	    out = FIXTURES_DIR / "resume_scanned.pdf"
	    c = canvas.Canvas(str(out), pagesize=LETTER)
	    # Draw gray rectangles where "text" would be — no drawString calls, so
	    # the page carries no text.
	    c.setFillColorRGB(0.85, 0.85, 0.85)
	    c.rect(54, 600, 500, 120, fill=1, stroke=0)
	    c.rect(54, 420, 500, 150, fill=1, stroke=0)
	    c.showPage()
	    c.save()
	    print(f" wrote resume_scanned.pdf ({out.stat().st_size // 1024} KB)")


	def main() -> None:
	    """Regenerate every resume fixture, reporting progress on stdout.

	    The text-based PDFs are produced first, then the text-free PDF.
	    """
	    print(f"Generating resume fixtures into {FIXTURES_DIR}")
	    for step in (generate_text_pdfs, generate_scanned_pdf):
	        step()
	    print("Done.")


	# Generate the fixtures only when run as a script, never on import.
	if __name__ == "__main__":
	    main()