Spaces:
Sleeping
Sleeping
| # resume_parser.py | |
| import fitz # PyMuPDF | |
| import re | |
| from sumy.parsers.plaintext import PlaintextParser | |
| from sumy.nlp.tokenizers import Tokenizer | |
| from sumy.summarizers.lex_rank import LexRankSummarizer | |
def parse_resume(pdf_path):
    """Parse a PDF resume into its text, hyperlinks, and a short summary.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        A dict with three keys:
            "full_text": text of every page (each page followed by a
                newline), with leading/trailing whitespace stripped.
            "hyperlinks": unique link URIs starting with ``http://``,
                ``https://`` or ``mailto:``, in first-seen order.
            "summary": a LexRank summary (5 sentences requested), or, if
                summarization raises, the first 200 words of the text.
    """
    page_texts = []
    hyperlinks = []

    # The context manager closes the document even if extraction raises,
    # so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_texts.append(page.get_text())

            for link in page.get_links():
                uri = link.get('uri')
                if not uri:
                    # Link entries without a URI are skipped.
                    continue
                # Clean up common PDF hyperlink artifacts (stray whitespace
                # inside the URI).
                url = re.sub(r'\s+', '', uri)
                if url.startswith(('http://', 'https://', 'mailto:')):
                    hyperlinks.append(url)

    # Every page's text is followed by a newline, as before; join once
    # instead of growing a string inside the loop.
    full_text = "".join(text + "\n" for text in page_texts)

    # Remove duplicates while preserving order.
    hyperlinks = list(dict.fromkeys(hyperlinks))

    # Summarization is best-effort: any failure falls back to a plain
    # word-count truncation rather than aborting the whole parse.
    try:
        parser = PlaintextParser.from_string(full_text, Tokenizer("english"))
        summarizer = LexRankSummarizer()
        # Summarize to 5 sentences
        summary_sentences = summarizer(parser.document, 5)
        summary = " ".join(str(sentence) for sentence in summary_sentences)
    except Exception as e:
        print(f"Warning: Summarization failed ({e}). Using fallback summary.")
        words = full_text.split()
        summary = " ".join(words[:200])
        # Only mark the summary as truncated when words were actually cut.
        if len(words) > 200:
            summary += "..."

    return {
        "full_text": full_text.strip(),
        "hyperlinks": hyperlinks,
        "summary": summary.strip(),
    }