"""Generate synthetic UCP-student-shaped resume PDFs for Module 8 tests + demo. Idempotent — re-running overwrites the same filenames. Output goes to backend/tests/fixtures/resumes/. What each fixture exercises (see fixtures/resumes/README.md for details): resume_ds_strong.pdf — dense DS skills (Python/pandas/sklearn/...) resume_fe_junior.pdf — junior FE profile (React/JS/HTML/CSS/Git) resume_fullstack_mixed.pdf — composite Django+React+Postgres+Docker resume_minimal.pdf — few literal skill names — forces SBERT/fuzzy resume_scanned.pdf — image-only PDF → triggers ResumeParseError Dev-only deps (requirements-dev.txt): reportlab. """ from __future__ import annotations from pathlib import Path from reportlab.lib.pagesizes import LETTER from reportlab.lib.styles import getSampleStyleSheet from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer from reportlab.pdfgen import canvas BASE_DIR = Path(__file__).resolve().parent.parent FIXTURES_DIR = BASE_DIR / "tests" / "fixtures" / "resumes" def _write_paragraphs(path: Path, paragraphs: list[str]) -> None: """Emit a single-column text-only PDF from the given paragraphs.""" styles = getSampleStyleSheet() doc = SimpleDocTemplate( str(path), pagesize=LETTER, leftMargin=54, rightMargin=54, topMargin=54, bottomMargin=54, ) story = [] for i, text in enumerate(paragraphs): style = styles["Heading2"] if i == 0 else styles["BodyText"] story.append(Paragraph(text, style)) story.append(Spacer(1, 8)) doc.build(story) def resume_ds_strong() -> list[str]: return [ "Ayesha Khan — Data Science BS (UCP, Semester 7)", "Summary: Final-year Data Science student with 2 internships in " "machine learning and dashboarding. Experienced in Python, SQL, Pandas, " "NumPy, Scikit-learn, TensorFlow, and PyTorch.", "Skills: Python (advanced, 3 years), SQL (advanced), Pandas, " "NumPy, Matplotlib, Scikit-learn, TensorFlow, PyTorch, Tableau, " "Statistical Hypothesis Testing, Git, Jupyter.", "Projects: Built an end-to-end churn-prediction pipeline using " "scikit-learn and XGBoost; deployed a neural-network image classifier " "with PyTorch and served it via FastAPI.", "Internship — Data Analyst @ TechCorp (2024): Designed SQL " "reports and Tableau dashboards for marketing funnel analysis.", "Education: UCP Lahore, BS Data Science, CGPA 3.8/4.0. " "Relevant coursework: Machine Learning, Deep Learning, Statistics.", ] def resume_fe_junior() -> list[str]: return [ "Bilal Ahmed — Computer Science BS (UCP, Semester 5)", "Summary: Junior frontend developer with a focus on React, " "HTML, and CSS. Familiar with basic Git workflows and modern " "JavaScript (ES2020+).", "Skills: React, JavaScript, HTML, CSS, Git, Figma (beginner), " "Bootstrap, TailwindCSS, Node.js (basic).", "Projects: Personal portfolio site (React, Vite, Tailwind). " "Classroom attendance dashboard UI with React and Ant Design.", "Education: UCP Lahore, BS Computer Science, CGPA 3.2/4.0.", ] def resume_fullstack_mixed() -> list[str]: return [ "Zainab Ali — Software Engineering BS (UCP, Semester 7)", "Summary: Full-stack developer with production experience in " "Django and React. Built and deployed containerised web apps using " "Docker Compose and PostgreSQL.", "Skills: Python, Django, Django REST Framework, JavaScript, " "React, PostgreSQL, Redis, Docker, Git, GitHub, Linux, Nginx, " "RESTful APIs, HTML, CSS.", "Projects: UCP club management portal (Django + React + " "Postgres, deployed via Docker on a DigitalOcean droplet). " "CI/CD with GitHub Actions.", "Education: UCP Lahore, BS Software Engineering, CGPA 3.7/4.0.", ] def resume_minimal() -> list[str]: # Uses paraphrased / near-names (data visualisation, cloud computing, # databases, version control) so the lexical layer largely misses and # SBERT/pgvector has to map the aliases to canonical catalog entries. return [ "Usman Shah — BS Applied Mathematics (UCP, Semester 4)", "Summary: Math undergraduate interested in data-driven work. " "Some exposure to programming and statistical analysis.", "Skills: programming in scripting languages, data visualisation, " "spreadsheet modelling, basic cloud computing exposure, version " "control, databases, statistical analysis.", "Coursework: Probability, Linear Algebra, Numerical Methods.", ] def generate_text_pdfs() -> None: FIXTURES_DIR.mkdir(parents=True, exist_ok=True) jobs = { "resume_ds_strong.pdf": resume_ds_strong(), "resume_fe_junior.pdf": resume_fe_junior(), "resume_fullstack_mixed.pdf": resume_fullstack_mixed(), "resume_minimal.pdf": resume_minimal(), } for filename, paragraphs in jobs.items(): out = FIXTURES_DIR / filename _write_paragraphs(out, paragraphs) print(f" wrote {filename} ({out.stat().st_size // 1024} KB)") def generate_scanned_pdf() -> None: """Emit a PDF with *no* extractable text — just vector shapes. Exercises the ResumeParseError('No text could be extracted') branch in resume_parser._extract_text. A real scanned CV would be a rasterised image; reportlab can't easily make one without Pillow, but an empty-text PDF lands in the same code path because extract_text() returns ''. """ out = FIXTURES_DIR / "resume_scanned.pdf" c = canvas.Canvas(str(out), pagesize=LETTER) # Draw a gray rectangle where "text" would be — no drawString calls. c.setFillColorRGB(0.85, 0.85, 0.85) c.rect(54, 600, 500, 120, fill=1, stroke=0) c.rect(54, 420, 500, 150, fill=1, stroke=0) c.showPage() c.save() print(f" wrote resume_scanned.pdf ({out.stat().st_size // 1024} KB)") def main() -> None: print(f"Generating resume fixtures into {FIXTURES_DIR}") generate_text_pdfs() generate_scanned_pdf() print("Done.") if __name__ == "__main__": main()