Student_Analyzer / resume_parser.py
akshit7093's picture
Changes before Firebase Studio auto-run
f4552a1
# resume_parser.py
import fitz # PyMuPDF
import re
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
def parse_resume(pdf_path):
    """
    Parse a PDF resume to extract its text, hyperlinks, and a short summary.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns: {
        "full_text": str,   # text of all pages, one newline after each page,
                            # with leading/trailing whitespace stripped
        "hyperlinks": list, # unique http/https/mailto URIs, first-seen order
        "summary": str      # LexRank summary (5 sentences requested), or the
                            # first 200 words plus "..." if summarization raised
    }
    """
    page_texts = []
    hyperlinks = []

    # Open the document as a context manager so the underlying file handle is
    # released even when text or link extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_texts.append(page.get_text())

            # Extract hyperlinks
            for link in page.get_links():
                uri = link.get('uri')
                if not uri:
                    # Link entries without a URI are skipped.
                    continue
                # Clean up common PDF hyperlink artifacts (stray whitespace
                # embedded in the URI).
                url = re.sub(r'\s+', '', uri)
                if url.startswith(('http://', 'https://', 'mailto:')):
                    hyperlinks.append(url)

    # Build the text in one pass; every page is followed by a newline.
    full_text = "".join(text + "\n" for text in page_texts)

    # Remove duplicates while preserving order
    hyperlinks = list(dict.fromkeys(hyperlinks))

    # Generate summary (fallback to first 200 words if summarization fails).
    # The broad except is deliberate: summarization is best-effort and must
    # never prevent the extracted text and links from being returned.
    try:
        parser = PlaintextParser.from_string(full_text, Tokenizer("english"))
        summarizer = LexRankSummarizer()
        # Summarize to 5 sentences
        summary_sentences = summarizer(parser.document, 5)
        summary = " ".join(str(sentence) for sentence in summary_sentences)
    except Exception as e:
        print(f"Warning: Summarization failed ({e}). Using fallback summary.")
        summary = " ".join(full_text.split()[:200]) + "..."

    return {
        "full_text": full_text.strip(),
        "hyperlinks": hyperlinks,
        "summary": summary.strip()
    }