# edutech / linkdin_job_data.py
# (Hugging Face page residue, commented out so the module parses:
#  RajatMalviya's picture / Update linkdin_job_data.py / d3cba69 verified)
import requests
import re
import json
from typing import List, Dict, Any
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.pagesizes import A4
from reportlab.lib.units import inch
from reportlab.lib.enums import TA_LEFT
import google.generativeai as genai
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import List, Dict, Optional
import pdfplumber
import fitz # PyMuPDF
import tempfile
import os
load_dotenv()
class JobCrawler:
    """Job description crawler that reads postings through Jina.ai's reader proxy."""

    def __init__(self):
        # One shared HTTP session (connection pooling) with a browser-like UA.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def fetch_markdown(self, url: str) -> str:
        """Fetch the job posting rendered as markdown via https://r.jina.ai/.

        Raises:
            Exception: if the HTTP request fails or returns an error status.
        """
        jina_url = f"https://r.jina.ai/{url}"
        try:
            response = self.session.get(jina_url, timeout=30)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            raise Exception(f"Failed to fetch job description: {str(e)}")

    def extract_text(self, markdown: str, patterns: List[str]) -> str:
        """Return the first match among ``patterns`` or "Not Specified".

        Group 1 is returned when the pattern defines a capture group,
        otherwise the whole match.
        """
        for pattern in patterns:
            match = re.search(pattern, markdown, re.MULTILINE | re.IGNORECASE)
            if match:
                return match.group(1).strip() if match.lastindex else match.group(0)
        return "Not Specified"

    def extract_list_items(self, markdown: str, section_name: str, max_items: int = 10) -> List[str]:
        """Extract up to ``max_items`` bullet/numbered items from a named section.

        ``section_name`` may be a regex alternation, e.g. ``r'skills?|tools?'``.
        """
        # BUG FIX: inside an f-string the braces of the regex quantifier must
        # be doubled. The original `#{1,3}` was evaluated as the tuple (1, 3),
        # producing the literal lookahead `#(1, 3)` which never matches a
        # markdown heading, so sections were never terminated at the next
        # heading and bullets leaked in from following sections.
        section_pattern = rf'(?:{section_name})[:\s]*\n([\s\S]*?)(?=\n#{{1,3}}\s|\n\*\*[A-Z]|$)'
        section_match = re.search(section_pattern, markdown, re.IGNORECASE)
        if not section_match:
            return []
        section_text = section_match.group(1)
        items = []
        # Prefer bullet markers; fall back to numbered lists.
        for pattern in [r'[-•*]\s*([^\n]+)', r'\d+\.\s*([^\n]+)']:
            matches = re.findall(pattern, section_text)
            if matches:
                items.extend([m.strip() for m in matches[:max_items]])
                break
        return items[:max_items]

    def crawl(self, url: str) -> dict:
        """Crawl a job posting URL and return a dict of extracted fields."""
        markdown = self.fetch_markdown(url)
        return {
            'title': self.extract_text(markdown, [r'^#\s+(.+)$', r'\*\*Job Title:\*\*\s*(.+)']),
            'company': self.extract_text(markdown, [r'[Cc]ompany[:\s]+([^\n]+)', r'at\s+([A-Z][^\n,]+)']),
            'location': self.extract_text(markdown, [r'[Ll]ocation[:\s]+([^\n]+)', r'\b(Remote|Hybrid|On-?site)\b']),
            'salary': self.extract_text(markdown, [r'[Ss]alary[:\s]+([^\n]+)', r'\$[\d,]+-\$?[\d,]+']),
            'job_type': self.extract_text(markdown, [r'\b(Full[- ]?time|Part[- ]?time|Contract|Internship)\b']),
            'experience': self.extract_text(markdown, [r'(\d+)\+?\s*years?\s*(?:of\s*)?experience', r'\b(Entry[- ]?level|Mid[- ]?level|Senior|Junior)\b']),
            'requirements': self.extract_list_items(markdown, r'requirements?|qualifications?'),
            'responsibilities': self.extract_list_items(markdown, r'responsibilities?|duties|what you\'?ll do'),
            'skills': self.extract_list_items(markdown, r'skills?|technologies?|tools?', 15),
            'benefits': self.extract_list_items(markdown, r'benefits?|perks?|what we offer'),
            'raw_markdown': markdown,
            'source_url': url
        }

    def to_markdown(self, job_data: dict) -> str:
        """Render a dict produced by ``crawl`` back into markdown."""
        md = [
            f"# {job_data['title']}\n",
            f"**Company:** {job_data['company']} ",
            f"**Location:** {job_data['location']} ",
            f"**Salary:** {job_data['salary']} ",
            f"**Job Type:** {job_data['job_type']} ",
            f"**Experience:** {job_data['experience']}\n",
            "\n---\n"
        ]
        if job_data['requirements']:
            md.append("\n## Requirements\n")
            md.extend([f"- {req}\n" for req in job_data['requirements']])
        if job_data['responsibilities']:
            md.append("\n## Responsibilities\n")
            md.extend([f"- {resp}\n" for resp in job_data['responsibilities']])
        if job_data['skills']:
            md.append("\n## Skills\n")
            md.extend([f"- {skill}\n" for skill in job_data['skills']])
        if job_data['benefits']:
            md.append("\n## Benefits\n")
            md.extend([f"- {benefit}\n" for benefit in job_data['benefits']])
        return "".join(md)
# use resume
load_dotenv()

# SECURITY FIX: read the Gemini key from the environment only. The previous
# code shipped a hard-coded API key as the os.getenv() fallback — a leaked
# credential that must be revoked and never committed to source.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
else:
    print("⚠️ No GEMINI_API_KEY found. Using local enhancement fallback.")
# === Local fallback ===
def local_enhance(text: str) -> list[str]:
    """Build three canned, ATS-style bullets from a raw description string."""
    subject = text.strip().capitalize()
    templates = (
        "Led {} to measurable outcomes and improved efficiency.",
        "Implemented {} using scalable, modern frameworks.",
        "Optimized processes delivering consistent, ATS-friendly results.",
    )
    # str.format ignores unused positional args, so the third template is fine.
    return [tpl.format(subject) for tpl in templates]
# === Gemini enrichment ===
def enrich_bullets(data):
    """Attach an "enhanced" 3-bullet list to each experience/project entry.

    Uses Gemini when GEMINI_API_KEY is configured; otherwise (or on any API
    failure) falls back to local_enhance(). Mutates ``data`` in place and
    returns it.
    """
    items, mapping = [], []
    for section_name in ["experience", "projects"]:
        for i, section in enumerate(data.get(section_name, [])):
            text = " ".join(section.get("description", []))
            if text.strip():
                items.append(text)
                mapping.append((section_name, i))
    if not items:
        return data
    if not GEMINI_API_KEY:
        for section, i in mapping:
            data[section][i]["enhanced"] = local_enhance(" ".join(data[section][i].get("description", [])))
        return data
    try:
        model = genai.GenerativeModel("gemini-2.0-flash")
        prompt = (
            "You are a professional resume writer optimizing for ATS systems.\n"
            "For each summary, generate exactly THREE short, single-line bullets (under 20 words).\n"
            "Each bullet should start with an action verb and highlight quantifiable results.\n"
            "Number them 1., 2., 3.\n\n"
        )
        combined = "\n".join([f"Item {i+1}: {text}" for i, text in enumerate(items)])
        response = model.generate_content(prompt + combined)
        enriched_blocks = response.text.strip().split("Item")
        for idx, block in enumerate(enriched_blocks):
            if idx == 0 or not block.strip():
                continue
            # BUG FIX: guard against the model echoing more "Item" markers
            # than we sent; the old code indexed mapping[idx-1] unchecked.
            if idx - 1 >= len(mapping):
                break
            # BUG FIX: parse "N. bullet" lines with a regex. The old
            # `ln.split(". ", 1)[1]` raised IndexError whenever the model
            # omitted the space after the number ("1.Foo").
            lines = []
            for ln in block.splitlines():
                m = re.match(r'\s*[123]\.\s*(.+)', ln)
                if m:
                    lines.append(m.group(1).strip())
            if not lines:
                lines = local_enhance(items[idx - 1])
            section, i = mapping[idx - 1]
            data[section][i]["enhanced"] = lines[:3]
    except Exception as e:
        # Best-effort: any Gemini failure degrades to the local fallback.
        print("⚠️ Gemini failed:", e)
        for section, i in mapping:
            data[section][i]["enhanced"] = local_enhance(" ".join(data[section][i].get("description", [])))
    return data
# === PDF Generator ===
def create_pdf(data, output_path):
    """Render parsed/enriched resume ``data`` to an A4 PDF at ``output_path``.

    ``data`` is a plain dict (see the Pydantic models in this file) whose
    experience/project entries may carry an "enhanced" bullet list produced
    by enrich_bullets().
    """
    styles = getSampleStyleSheet()
    # Fonts & spacing
    normal = ParagraphStyle(
        "NormalCustom",
        parent=styles["Normal"],
        fontName="Times-Roman",
        fontSize=10.5,
        leading=13,
        spaceAfter=2,
    )
    indent = ParagraphStyle(
        "Indented",
        parent=normal,
        leftIndent=25,  # Indent for bullets and sub-lines
        spaceAfter=1.5,
    )
    heading = ParagraphStyle(
        "HeadingCustom",
        parent=styles["Heading2"],
        fontName="Times-Bold",
        fontSize=12.5,
        leading=14,
        spaceBefore=10,
        spaceAfter=4,
        alignment=TA_LEFT,
    )
    name_style = ParagraphStyle(
        "NameStyle",
        parent=styles["Title"],
        fontName="Times-Bold",
        fontSize=16,
        leading=18,
        spaceAfter=6,
    )
    # Margins in points (36pt = 0.5 inch all around).
    doc = SimpleDocTemplate(
        output_path,
        pagesize=A4,
        rightMargin=36,
        leftMargin=36,
        topMargin=36,
        bottomMargin=36,
    )
    story = []
    # === Header ===
    story.append(Paragraph(f"<b>{data.get('name', '')}</b>", name_style))
    profile = data.get("profile", {})
    # Join only the contact fields that are actually present.
    contact = " | ".join(
        filter(
            None,
            [
                profile.get("location"),
                profile.get("phone"),
                profile.get("email"),
                profile.get("linkedin"),
                profile.get("github"),
            ],
        )
    )
    story.append(Paragraph(contact, normal))
    story.append(Spacer(1, 0.12 * inch))

    # Helper: aligned two-column row (Title left, Date right)
    def aligned_row(left_text, right_text):
        left = Paragraph(left_text, normal)
        right = Paragraph(right_text or "", normal)
        table = Table([[left, right]], colWidths=[4.8 * inch, 1.5 * inch])
        table.setStyle(
            TableStyle(
                [
                    ("VALIGN", (0, 0), (-1, -1), "TOP"),
                    ("ALIGN", (1, 0), (1, 0), "RIGHT"),
                    ("BOTTOMPADDING", (0, 0), (-1, -1), 0),
                    ("TOPPADDING", (0, 0), (-1, -1), 0),
                ]
            )
        )
        return table

    # === Education ===
    if data.get("education"):
        story.append(Paragraph("EDUCATION", heading))
        for edu in data["education"]:
            # NOTE(review): edu['degree']/edu['institution'] are hard-keyed and
            # would raise KeyError if missing — presumably guaranteed by the
            # parser; confirm against classify_resume_with_gemini's schema.
            story.append(aligned_row(f"<b>{edu['degree']}</b> — {edu['institution']}", edu.get("dates")))
            if edu.get("specialization"):
                story.append(Paragraph(edu["specialization"], indent))
            if edu.get("gpa"):
                story.append(Paragraph(f"GPA: {edu['gpa']}", indent))
        story.append(Spacer(1, 5))
    # === Skills ===
    if data.get("skills"):
        story.append(Paragraph("SKILLS", heading))
        for skill in data["skills"]:
            story.append(Paragraph(f"<b>{skill['category']}:</b> {', '.join(skill['technologies'])}", indent))
        story.append(Spacer(1, 5))
    # === Experience ===
    if data.get("experience"):
        story.append(Paragraph("EXPERIENCE", heading))
        for exp in data["experience"]:
            story.append(aligned_row(f"<b>{exp['title']}</b> — {exp['company']} ({exp.get('location','')})", exp.get("dates")))
            if exp.get("position_details"):
                story.append(Paragraph(exp["position_details"], indent))
            # Only the Gemini/local "enhanced" bullets are printed, not the
            # raw "description" list.
            for desc in exp.get("enhanced", []):
                story.append(Paragraph(f"• {desc}", indent))
        story.append(Spacer(1, 5))
    # === Projects ===
    if data.get("projects"):
        story.append(Paragraph("PROJECTS", heading))
        for proj in data["projects"]:
            story.append(aligned_row(f"<b>{proj['name']}</b>", proj.get("dates")))
            if proj.get("tech_stack"):
                story.append(Paragraph(f"Tech Stack: {proj['tech_stack']}", indent))
            for desc in proj.get("enhanced", []):
                story.append(Paragraph(f"• {desc}", indent))
        story.append(Spacer(1, 5))
    doc.build(story)
    print(f"✅ Resume PDF created successfully: {output_path}")
# === Main ===
def generate_pdf_resume(data: dict, output_path: str):
    """Enrich resume bullets (Gemini or local fallback), then render the PDF."""
    create_pdf(enrich_bullets(data), output_path)
# === Resume PDF Parser ===
# -------------------------------
# Pydantic Models
# -------------------------------
class Experience(BaseModel):
    """One employment entry extracted from a resume; every field is optional
    because extraction may be incomplete."""
    title: Optional[str] = None
    company: Optional[str] = None
    location: Optional[str] = None
    dates: Optional[str] = None
    description: Optional[List[str]] = None
class Education(BaseModel):
    """One education entry (degree, school, dates, GPA, free-form details)."""
    degree: Optional[str] = None
    institution: Optional[str] = None
    dates: Optional[str] = None
    gpa: Optional[str] = None
    details: Optional[List[str]] = None
class SkillCategory(BaseModel):
    """A named skill group, e.g. category="Languages", technologies=["Python"]."""
    category: Optional[str] = None
    technologies: Optional[List[str]] = None
class Project(BaseModel):
    """A personal/professional project with a bullet-list description."""
    name: Optional[str] = None
    description: Optional[List[str]] = None
class Profile(BaseModel):
    """Contact/profile block rendered in the PDF header."""
    location: Optional[str] = None
    phone: Optional[str] = None
    email: Optional[str] = None
    linkedin: Optional[str] = None
    github: Optional[str] = None
class ResumeData(BaseModel):
    """Top-level parsed resume, mirroring the JSON schema requested from Gemini.

    NOTE: the mutable list defaults are safe here only because pydantic
    deep-copies field defaults per instance (unlike plain Python classes).
    """
    name: Optional[str] = None
    profile: Optional[Profile] = None
    summary: Optional[str] = ""
    experience: Optional[List[Experience]] = []
    education: Optional[List[Education]] = []
    skills: Optional[List[SkillCategory]] = []
    projects: Optional[List[Project]] = []
    certifications: Optional[List[str]] = []
    publications: Optional[List[str]] = []
    languages: Optional[List[str]] = []
# -------------------------------
# HELPER FUNCTIONS
# -------------------------------
def extract_text_from_pdf(file_path: str) -> str:
    """Extract raw text from a PDF via PyMuPDF, falling back to pdfplumber.

    Any PyMuPDF failure (open or mid-iteration) triggers the pdfplumber pass;
    text gathered before the failure is kept, matching the original behavior.
    """
    chunks = []
    try:
        with fitz.open(file_path) as doc:
            for page in doc:
                chunks.append(page.get_text())
    except Exception:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                chunks.append(page.extract_text() or "")
    return "".join(chunks).strip()
def clean_json_output(output: str) -> str:
    """Strip markdown code fences (``` / ```json) from model output and trim."""
    without_fences = re.sub(r"```(?:json)?", "", output)
    return without_fences.replace("```", "").strip()
def classify_resume_with_gemini(resume_text: str) -> dict:
    """Parse raw resume text into a structured dict via Gemini.

    Returns:
        The parsed resume as a dict matching the JSON schema in the prompt.

    Raises:
        RuntimeError: if Gemini returns invalid JSON or the API call fails.
            (BUG FIX: the original raised HTTPException, which is never
            imported anywhere in this file and would itself crash with
            NameError; a web layer should translate RuntimeError as needed.)
    """
    prompt = f"""
You are an expert resume parser.
Given the resume text below, extract and classify the information into this JSON format:
{{
"name": "string",
"profile": {{
"location": "string",
"phone": "string",
"email": "string",
"linkedin": "string",
"github": "string"
}},
"experience": [{{"title": "string","company": "string","location": "string","dates": "string","description": ["string"]}}],
"education": [{{"degree": "string","institution": "string","dates": "string","gpa": "string","details": ["string"]}}],
"skills": [{{"category": "string","technologies": ["string"]}}],
"projects": [{{"name": "string","description": ["string"]}}],
"certifications": ["string"],
"publications": ["string"],
"languages": ["string"]
}}
Resume Text:
\"\"\"{resume_text}\"\"\"
Return only valid JSON (no explanations or markdown).
"""
    try:
        model = genai.GenerativeModel("gemini-2.5-flash")
        response = model.generate_content(prompt)
        raw_output = response.text.strip()
        cleaned_output = clean_json_output(raw_output)
        return json.loads(cleaned_output)
    except json.JSONDecodeError as e:
        raise RuntimeError("Gemini returned invalid JSON output.") from e
    except Exception as e:
        raise RuntimeError(f"Gemini API Error: {str(e)}") from e