draftme / filters /structure.py
dokster's picture
Upload 105 files
7d2fea2 verified
Raw
History Blame Contribute Delete
3.3 kB
import re
import tempfile
from html.parser import HTMLParser
from core.renderer import _ensure_homebrew_library_path, _wrap_resume_html
from filters.base import Filter
from models.config import AppSettings
from models.cv import CVData
from models.filters import FilterResult
class _TextExtractor(HTMLParser):
def __init__(self) -> None:
super().__init__()
self.parts: list[str] = []
def handle_data(self, data: str) -> None:
self.parts.append(data)
@property
def text(self) -> str:
return " ".join(self.parts)
class StructureFilter(Filter):
    """Filter that checks a resume's sections, length, and rendered page count.

    The resume fails when a required section keyword is missing, placeholder
    text is present, the word count falls outside ``min_words``..``max_words``,
    or the HTML does not render to exactly one PDF page.
    """

    name = "structure"
    priority = 1
    # Word-count limits: fewer than ``min_words`` or more than ``max_words``
    # is a failure; fewer than ``ideal_min_words`` only produces a warning.
    min_words = 300
    ideal_min_words = 400
    max_words = 750

    def run(self, html: str, cv_data: CVData, jd_text: str, settings: AppSettings | None = None) -> FilterResult:
        """Evaluate ``html`` and return the structural verdict.

        Args:
            html: Resume markup to inspect and render.
            cv_data: Not used by this filter.
            jd_text: Not used by this filter.
            settings: Not used by this filter.

        Returns:
            A ``FilterResult`` whose ``passed`` is true only when no failure
            was recorded. ``feedback`` holds the failure messages, one per
            line; ``detail`` carries the word count, the page count (``None``
            if rendering failed) and any non-fatal warnings. The score is 0.0
            on failure, otherwise the word-count score.
        """
        parser = _TextExtractor()
        parser.feed(html)
        # Flush text the parser may still be buffering (for example after an
        # unterminated "&" near the end of the input) so it is counted too.
        parser.close()
        text = re.sub(r"\s+", " ", parser.text).strip()
        lowered = text.lower()

        failures = self._content_failures(lowered)

        word_count = len(re.findall(r"\b\w+\b", text))
        warnings: list[str] = []
        if word_count < self.min_words:
            failures.append(f"Word count is too low; expected at least {self.min_words}, got {word_count}.")
        elif word_count < self.ideal_min_words:
            warnings.append(f"Resume is concise ({word_count} words); ideal range starts around {self.ideal_min_words}.")
        elif word_count > self.max_words:
            failures.append(f"Word count is too high; expected at most {self.max_words}, got {word_count}.")

        page_count = None
        try:
            page_count = self._render_page_count(html)
        except Exception as exc:
            # Any import or rendering problem is reported as a filter failure
            # instead of propagating to the caller.
            failures.append(f"Could not render HTML to PDF: {exc}")
        else:
            if page_count != 1:
                failures.append(f"Rendered PDF must be exactly 1 page; got {page_count}.")

        if failures:
            score = 0.0
        else:
            score = _word_count_score(word_count, self.min_words, self.ideal_min_words, self.max_words)
        return FilterResult(
            filter_name=self.name,
            passed=not failures,
            score=score,
            feedback="\n".join(failures),
            detail={"word_count": word_count, "page_count": page_count, "warnings": warnings},
        )

    @staticmethod
    def _content_failures(lowered: str) -> list[str]:
        """Return failures for missing section keywords and placeholder text.

        ``lowered`` is the lower-cased visible text of the resume.
        """
        failures: list[str] = []
        # NOTE(review): all checks below are plain substring matches, so
        # "work" also matches "network" and "insert" also matches "inserted"
        # -- confirm that this is the intended behaviour.
        if not any(keyword in lowered for keyword in ("experience", "work")):
            failures.append("Missing Experience/Work section.")
        if "education" not in lowered:
            failures.append("Missing Education section.")
        if "skills" not in lowered:
            failures.append("Missing Skills section.")
        if any(marker in lowered for marker in ("[your name]", "lorem ipsum", "insert")):
            failures.append("Contains placeholder text.")
        return failures

    @staticmethod
    def _render_page_count(html: str) -> int:
        """Render ``html`` to a PDF in memory and return its number of pages.

        Raises whatever the import or rendering steps raise; ``run`` turns
        that into a failure message.
        """
        import io

        # Call order kept from the original: the library-path helper runs
        # before weasyprint is imported (presumably required -- TODO confirm).
        _ensure_homebrew_library_path()
        # Imported here, inside the caller's try block, so that an
        # ImportError is reported as a render failure.
        from pypdf import PdfReader
        from weasyprint import HTML

        # write_pdf() without a target returns the PDF as bytes, which avoids
        # re-opening a still-open NamedTemporaryFile by name (that fails on
        # Windows) and skips the disk round trip.
        pdf_bytes = HTML(string=_wrap_resume_html(html)).write_pdf()
        return len(PdfReader(io.BytesIO(pdf_bytes)).pages)
def _word_count_score(word_count: int, min_words: int, ideal_min_words: int, max_words: int) -> float:
if ideal_min_words <= word_count <= max_words:
return 1.0
if min_words <= word_count < ideal_min_words:
return max(0.75, word_count / ideal_min_words)
return 0.0