Spaces:
Sleeping
Sleeping
| """Generate synthetic UCP-student-shaped resume PDFs for Module 8 tests + demo. | |
| Idempotent — re-running overwrites the same filenames. Output goes to | |
| backend/tests/fixtures/resumes/. | |
| What each fixture exercises (see fixtures/resumes/README.md for details): | |
| resume_ds_strong.pdf — dense DS skills (Python/pandas/sklearn/...) | |
| resume_fe_junior.pdf — junior FE profile (React/JS/HTML/CSS/Git) | |
| resume_fullstack_mixed.pdf — composite Django+React+Postgres+Docker | |
| resume_minimal.pdf — few literal skill names → forces SBERT/fuzzy | |
| resume_scanned.pdf — image-only PDF → triggers ResumeParseError | |
| Dev-only deps (requirements-dev.txt): reportlab. | |
| """ | |
| from __future__ import annotations | |
| from pathlib import Path | |
| from reportlab.lib.pagesizes import LETTER | |
| from reportlab.lib.styles import getSampleStyleSheet | |
| from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer | |
| from reportlab.pdfgen import canvas | |
# Directory one level above the folder that holds this script (the module
# docstring places the fixtures under backend/, so this is presumably backend/).
BASE_DIR = Path(__file__).resolve().parent.parent

# Every generated resume PDF is written into this directory.
FIXTURES_DIR = BASE_DIR.joinpath("tests", "fixtures", "resumes")
def _write_paragraphs(path: Path, paragraphs: list[str]) -> None:
    """Render *paragraphs* as a single-column, text-only PDF at *path*.

    The first paragraph is set in the sample stylesheet's ``Heading2`` style
    and every later one in ``BodyText``; each paragraph is followed by an
    8-point vertical spacer. The page is LETTER-sized with 54-point margins
    on all four sides.
    """
    margins = {
        "leftMargin": 54,
        "rightMargin": 54,
        "topMargin": 54,
        "bottomMargin": 54,
    }
    template = SimpleDocTemplate(str(path), pagesize=LETTER, **margins)

    sheet = getSampleStyleSheet()
    heading_style = sheet["Heading2"]
    body_style = sheet["BodyText"]

    flowables = []
    for index, markup in enumerate(paragraphs):
        # Only the very first entry is treated as the resume heading.
        chosen_style = body_style if index else heading_style
        flowables.extend((Paragraph(markup, chosen_style), Spacer(1, 8)))

    template.build(flowables)
def resume_ds_strong() -> list[str]:
    """Return the paragraphs for the skill-dense Data Science resume fixture.

    The first element is the heading line; the remaining elements are body
    paragraphs whose section labels use ``<b>`` inline markup. The separators
    in the heading and the internship label are em dashes (U+2014).

    Returns:
        Six paragraph strings, in the order they appear on the page.
    """
    return [
        "Ayesha Khan — Data Science BS (UCP, Semester 7)",
        "<b>Summary:</b> Final-year Data Science student with 2 internships in "
        "machine learning and dashboarding. Experienced in Python, SQL, Pandas, "
        "NumPy, Scikit-learn, TensorFlow, and PyTorch.",
        "<b>Skills:</b> Python (advanced, 3 years), SQL (advanced), Pandas, "
        "NumPy, Matplotlib, Scikit-learn, TensorFlow, PyTorch, Tableau, "
        "Statistical Hypothesis Testing, Git, Jupyter.",
        "<b>Projects:</b> Built an end-to-end churn-prediction pipeline using "
        "scikit-learn and XGBoost; deployed a neural-network image classifier "
        "with PyTorch and served it via FastAPI.",
        "<b>Internship — Data Analyst @ TechCorp (2024):</b> Designed SQL "
        "reports and Tableau dashboards for marketing funnel analysis.",
        "<b>Education:</b> UCP Lahore, BS Data Science, CGPA 3.8/4.0. "
        "Relevant coursework: Machine Learning, Deep Learning, Statistics.",
    ]
def resume_fe_junior() -> list[str]:
    """Return the paragraphs for the junior frontend-developer resume fixture.

    The first element is the heading line; the remaining elements are body
    paragraphs whose section labels use ``<b>`` inline markup. The separator
    in the heading is an em dash (U+2014).

    Returns:
        Five paragraph strings, in the order they appear on the page.
    """
    return [
        "Bilal Ahmed — Computer Science BS (UCP, Semester 5)",
        "<b>Summary:</b> Junior frontend developer with a focus on React, "
        "HTML, and CSS. Familiar with basic Git workflows and modern "
        "JavaScript (ES2020+).",
        "<b>Skills:</b> React, JavaScript, HTML, CSS, Git, Figma (beginner), "
        "Bootstrap, TailwindCSS, Node.js (basic).",
        "<b>Projects:</b> Personal portfolio site (React, Vite, Tailwind). "
        "Classroom attendance dashboard UI with React and Ant Design.",
        "<b>Education:</b> UCP Lahore, BS Computer Science, CGPA 3.2/4.0.",
    ]
def resume_fullstack_mixed() -> list[str]:
    """Return the paragraphs for the mixed full-stack resume fixture.

    The first element is the heading line; the remaining elements are body
    paragraphs whose section labels use ``<b>`` inline markup. The separator
    in the heading is an em dash (U+2014).

    Returns:
        Five paragraph strings, in the order they appear on the page.
    """
    return [
        "Zainab Ali — Software Engineering BS (UCP, Semester 7)",
        "<b>Summary:</b> Full-stack developer with production experience in "
        "Django and React. Built and deployed containerised web apps using "
        "Docker Compose and PostgreSQL.",
        "<b>Skills:</b> Python, Django, Django REST Framework, JavaScript, "
        "React, PostgreSQL, Redis, Docker, Git, GitHub, Linux, Nginx, "
        "RESTful APIs, HTML, CSS.",
        "<b>Projects:</b> UCP club management portal (Django + React + "
        "Postgres, deployed via Docker on a DigitalOcean droplet). "
        "CI/CD with GitHub Actions.",
        "<b>Education:</b> UCP Lahore, BS Software Engineering, CGPA 3.7/4.0.",
    ]
def resume_minimal() -> list[str]:
    """Return the paragraphs for the resume fixture with few literal skill names.

    The first element is the heading line; the remaining elements are body
    paragraphs whose section labels use ``<b>`` inline markup. The separator
    in the heading is an em dash (U+2014).

    Returns:
        Four paragraph strings, in the order they appear on the page.
    """
    # Skills are deliberately paraphrased / near-names (data visualisation,
    # cloud computing, databases, version control) so the lexical layer
    # largely misses and SBERT/pgvector has to map the aliases to canonical
    # catalog entries.
    return [
        "Usman Shah — BS Applied Mathematics (UCP, Semester 4)",
        "<b>Summary:</b> Math undergraduate interested in data-driven work. "
        "Some exposure to programming and statistical analysis.",
        "<b>Skills:</b> programming in scripting languages, data visualisation, "
        "spreadsheet modelling, basic cloud computing exposure, version "
        "control, databases, statistical analysis.",
        "<b>Coursework:</b> Probability, Linear Algebra, Numerical Methods.",
    ]
def generate_text_pdfs() -> None:
    """Write every text-based resume fixture into FIXTURES_DIR.

    Creates the fixtures directory when it is missing, renders one PDF per
    resume profile, and prints the name and size of each file written.
    """
    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)

    # (output filename, function returning that resume's paragraphs)
    fixtures = (
        ("resume_ds_strong.pdf", resume_ds_strong),
        ("resume_fe_junior.pdf", resume_fe_junior),
        ("resume_fullstack_mixed.pdf", resume_fullstack_mixed),
        ("resume_minimal.pdf", resume_minimal),
    )
    for pdf_name, build_paragraphs in fixtures:
        target = FIXTURES_DIR / pdf_name
        _write_paragraphs(target, build_paragraphs())
        size_kb = target.stat().st_size // 1024
        print(f" wrote {pdf_name} ({size_kb} KB)")
def generate_scanned_pdf() -> None:
    """Emit a PDF with *no* extractable text — just vector shapes.

    Exercises the ResumeParseError('No text could be extracted') branch in
    resume_parser._extract_text. A real scanned CV would be a rasterised
    image; reportlab can't easily make one without Pillow, but an empty-text
    PDF lands in the same code path because extract_text() returns ''.

    The output is written to FIXTURES_DIR / "resume_scanned.pdf"; the
    directory is created first if it does not exist, so this function can be
    called on its own.
    """
    # Do not rely on generate_text_pdfs() having created the directory.
    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)
    out = FIXTURES_DIR / "resume_scanned.pdf"
    c = canvas.Canvas(str(out), pagesize=LETTER)
    # Draw gray rectangles where "text" would be — no drawString calls, so
    # the page carries no text content.
    c.setFillColorRGB(0.85, 0.85, 0.85)
    c.rect(54, 600, 500, 120, fill=1, stroke=0)
    c.rect(54, 420, 500, 150, fill=1, stroke=0)
    c.showPage()
    c.save()
    print(f" wrote resume_scanned.pdf ({out.stat().st_size // 1024} KB)")
def main() -> None:
    """Generate all resume fixtures, reporting progress on stdout."""
    print(f"Generating resume fixtures into {FIXTURES_DIR}")
    # Text fixtures first, then the text-free "scanned" one.
    for generate in (generate_text_pdfs, generate_scanned_pdf):
        generate()
    print("Done.")


if __name__ == "__main__":
    main()