# edutech / linkdin_job_data.py
# (Hugging Face page residue, commented out so the module parses:
#  RajatMalviya's picture / Update linkdin_job_data.py / d3cba69 verified)
import requests
import re
import json
from typing import List, Dict, Any
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.pagesizes import A4
from reportlab.lib.units import inch
from reportlab.lib.enums import TA_LEFT
import google.generativeai as genai
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import List, Dict, Optional
import pdfplumber
import fitz # PyMuPDF
import tempfile
import os
load_dotenv()
class JobCrawler:
    """Job description crawler that reads postings through Jina.ai's reader proxy."""

    def __init__(self):
        # One shared HTTP session (connection pooling) with a browser-like UA.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def fetch_markdown(self, url: str) -> str:
        """Fetch the job posting rendered as markdown via https://r.jina.ai/.

        Raises:
            Exception: if the HTTP request fails or returns an error status.
        """
        jina_url = f"https://r.jina.ai/{url}"
        try:
            response = self.session.get(jina_url, timeout=30)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            raise Exception(f"Failed to fetch job description: {str(e)}")

    def extract_text(self, markdown: str, patterns: List[str]) -> str:
        """Return the first match among ``patterns`` or "Not Specified".

        Group 1 is returned when the pattern defines a capture group,
        otherwise the whole match.
        """
        for pattern in patterns:
            match = re.search(pattern, markdown, re.MULTILINE | re.IGNORECASE)
            if match:
                return match.group(1).strip() if match.lastindex else match.group(0)
        return "Not Specified"

    def extract_list_items(self, markdown: str, section_name: str, max_items: int = 10) -> List[str]:
        """Extract up to ``max_items`` bullet/numbered items from a named section.

        ``section_name`` may be a regex alternation, e.g. ``r'skills?|tools?'``.
        """
        # BUG FIX: inside an f-string the braces of the regex quantifier must
        # be doubled. The original `#{1,3}` was evaluated as the tuple (1, 3),
        # producing the literal lookahead `#(1, 3)` which never matches a
        # markdown heading, so sections were never terminated at the next
        # heading and bullets leaked in from following sections.
        section_pattern = rf'(?:{section_name})[:\s]*\n([\s\S]*?)(?=\n#{{1,3}}\s|\n\*\*[A-Z]|$)'
        section_match = re.search(section_pattern, markdown, re.IGNORECASE)
        if not section_match:
            return []
        section_text = section_match.group(1)
        items = []
        # Prefer bullet markers; fall back to numbered lists.
        for pattern in [r'[-•*]\s*([^\n]+)', r'\d+\.\s*([^\n]+)']:
            matches = re.findall(pattern, section_text)
            if matches:
                items.extend([m.strip() for m in matches[:max_items]])
                break
        return items[:max_items]

    def crawl(self, url: str) -> dict:
        """Crawl a job posting URL and return a dict of extracted fields."""
        markdown = self.fetch_markdown(url)
        return {
            'title': self.extract_text(markdown, [r'^#\s+(.+)$', r'\*\*Job Title:\*\*\s*(.+)']),
            'company': self.extract_text(markdown, [r'[Cc]ompany[:\s]+([^\n]+)', r'at\s+([A-Z][^\n,]+)']),
            'location': self.extract_text(markdown, [r'[Ll]ocation[:\s]+([^\n]+)', r'\b(Remote|Hybrid|On-?site)\b']),
            'salary': self.extract_text(markdown, [r'[Ss]alary[:\s]+([^\n]+)', r'\$[\d,]+-\$?[\d,]+']),
            'job_type': self.extract_text(markdown, [r'\b(Full[- ]?time|Part[- ]?time|Contract|Internship)\b']),
            'experience': self.extract_text(markdown, [r'(\d+)\+?\s*years?\s*(?:of\s*)?experience', r'\b(Entry[- ]?level|Mid[- ]?level|Senior|Junior)\b']),
            'requirements': self.extract_list_items(markdown, r'requirements?|qualifications?'),
            'responsibilities': self.extract_list_items(markdown, r'responsibilities?|duties|what you\'?ll do'),
            'skills': self.extract_list_items(markdown, r'skills?|technologies?|tools?', 15),
            'benefits': self.extract_list_items(markdown, r'benefits?|perks?|what we offer'),
            'raw_markdown': markdown,
            'source_url': url
        }

    def to_markdown(self, job_data: dict) -> str:
        """Render a dict produced by ``crawl`` back into markdown."""
        md = [
            f"# {job_data['title']}\n",
            f"**Company:** {job_data['company']} ",
            f"**Location:** {job_data['location']} ",
            f"**Salary:** {job_data['salary']} ",
            f"**Job Type:** {job_data['job_type']} ",
            f"**Experience:** {job_data['experience']}\n",
            "\n---\n"
        ]
        if job_data['requirements']:
            md.append("\n## Requirements\n")
            md.extend([f"- {req}\n" for req in job_data['requirements']])
        if job_data['responsibilities']:
            md.append("\n## Responsibilities\n")
            md.extend([f"- {resp}\n" for resp in job_data['responsibilities']])
        if job_data['skills']:
            md.append("\n## Skills\n")
            md.extend([f"- {skill}\n" for skill in job_data['skills']])
        if job_data['benefits']:
            md.append("\n## Benefits\n")
            md.extend([f"- {benefit}\n" for benefit in job_data['benefits']])
        return "".join(md)
# use resume
load_dotenv()

# SECURITY FIX: read the Gemini key from the environment only. The previous
# code shipped a hard-coded API key as the os.getenv() fallback — a leaked
# credential that must be revoked and never committed to source.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
else:
    print("⚠️ No GEMINI_API_KEY found. Using local enhancement fallback.")
# === Local fallback ===
def local_enhance(text: str) -> list[str]:
    """Build three canned, ATS-style bullets from a raw description string."""
    subject = text.strip().capitalize()
    templates = (
        "Led {} to measurable outcomes and improved efficiency.",
        "Implemented {} using scalable, modern frameworks.",
        "Optimized processes delivering consistent, ATS-friendly results.",
    )
    # str.format ignores unused positional args, so the third template is fine.
    return [tpl.format(subject) for tpl in templates]
# === Gemini enrichment ===
def enrich_bullets(data):
    """Attach an "enhanced" 3-bullet list to each experience/project entry.

    Uses Gemini when GEMINI_API_KEY is configured; otherwise (or on any API
    failure) falls back to local_enhance(). Mutates ``data`` in place and
    returns it.
    """
    items, mapping = [], []
    for section_name in ["experience", "projects"]:
        for i, section in enumerate(data.get(section_name, [])):
            text = " ".join(section.get("description", []))
            if text.strip():
                items.append(text)
                mapping.append((section_name, i))
    if not items:
        return data
    if not GEMINI_API_KEY:
        for section, i in mapping:
            data[section][i]["enhanced"] = local_enhance(" ".join(data[section][i].get("description", [])))
        return data
    try:
        model = genai.GenerativeModel("gemini-2.0-flash")
        prompt = (
            "You are a professional resume writer optimizing for ATS systems.\n"
            "For each summary, generate exactly THREE short, single-line bullets (under 20 words).\n"
            "Each bullet should start with an action verb and highlight quantifiable results.\n"
            "Number them 1., 2., 3.\n\n"
        )
        combined = "\n".join([f"Item {i+1}: {text}" for i, text in enumerate(items)])
        response = model.generate_content(prompt + combined)
        enriched_blocks = response.text.strip().split("Item")
        for idx, block in enumerate(enriched_blocks):
            if idx == 0 or not block.strip():
                continue
            # BUG FIX: guard against the model echoing more "Item" markers
            # than we sent; the old code indexed mapping[idx-1] unchecked.
            if idx - 1 >= len(mapping):
                break
            # BUG FIX: parse "N. bullet" lines with a regex. The old
            # `ln.split(". ", 1)[1]` raised IndexError whenever the model
            # omitted the space after the number ("1.Foo").
            lines = []
            for ln in block.splitlines():
                m = re.match(r'\s*[123]\.\s*(.+)', ln)
                if m:
                    lines.append(m.group(1).strip())
            if not lines:
                lines = local_enhance(items[idx - 1])
            section, i = mapping[idx - 1]
            data[section][i]["enhanced"] = lines[:3]
    except Exception as e:
        # Best-effort: any Gemini failure degrades to the local fallback.
        print("⚠️ Gemini failed:", e)
        for section, i in mapping:
            data[section][i]["enhanced"] = local_enhance(" ".join(data[section][i].get("description", [])))
    return data
# === PDF Generator ===
def create_pdf(data, output_path):
    """Render parsed/enriched resume ``data`` to an A4 PDF at ``output_path``.

    ``data`` is a plain dict (see the Pydantic models in this file) whose
    experience/project entries may carry an "enhanced" bullet list produced
    by enrich_bullets().
    """
    styles = getSampleStyleSheet()
    # Fonts & spacing
    normal = ParagraphStyle(
        "NormalCustom",
        parent=styles["Normal"],
        fontName="Times-Roman",
        fontSize=10.5,
        leading=13,
        spaceAfter=2,
    )
    indent = ParagraphStyle(
        "Indented",
        parent=normal,
        leftIndent=25,  # Indent for bullets and sub-lines
        spaceAfter=1.5,
    )
    heading = ParagraphStyle(
        "HeadingCustom",
        parent=styles["Heading2"],
        fontName="Times-Bold",
        fontSize=12.5,
        leading=14,
        spaceBefore=10,
        spaceAfter=4,
        alignment=TA_LEFT,
    )
    name_style = ParagraphStyle(
        "NameStyle",
        parent=styles["Title"],
        fontName="Times-Bold",
        fontSize=16,
        leading=18,
        spaceAfter=6,
    )
    # Margins in points (36pt = 0.5 inch all around).
    doc = SimpleDocTemplate(
        output_path,
        pagesize=A4,
        rightMargin=36,
        leftMargin=36,
        topMargin=36,
        bottomMargin=36,
    )
    story = []
    # === Header ===
    story.append(Paragraph(f"<b>{data.get('name', '')}</b>", name_style))
    profile = data.get("profile", {})
    # Join only the contact fields that are actually present.
    contact = " | ".join(
        filter(
            None,
            [
                profile.get("location"),
                profile.get("phone"),
                profile.get("email"),
                profile.get("linkedin"),
                profile.get("github"),
            ],
        )
    )
    story.append(Paragraph(contact, normal))
    story.append(Spacer(1, 0.12 * inch))

    # Helper: aligned two-column row (Title left, Date right)
    def aligned_row(left_text, right_text):
        left = Paragraph(left_text, normal)
        right = Paragraph(right_text or "", normal)
        table = Table([[left, right]], colWidths=[4.8 * inch, 1.5 * inch])
        table.setStyle(
            TableStyle(
                [
                    ("VALIGN", (0, 0), (-1, -1), "TOP"),
                    ("ALIGN", (1, 0), (1, 0), "RIGHT"),
                    ("BOTTOMPADDING", (0, 0), (-1, -1), 0),
                    ("TOPPADDING", (0, 0), (-1, -1), 0),
                ]
            )
        )
        return table

    # === Education ===
    if data.get("education"):
        story.append(Paragraph("EDUCATION", heading))
        for edu in data["education"]:
            # NOTE(review): edu['degree']/edu['institution'] are hard-keyed and
            # would raise KeyError if missing — presumably guaranteed by the
            # parser; confirm against classify_resume_with_gemini's schema.
            story.append(aligned_row(f"<b>{edu['degree']}</b> — {edu['institution']}", edu.get("dates")))
            if edu.get("specialization"):
                story.append(Paragraph(edu["specialization"], indent))
            if edu.get("gpa"):
                story.append(Paragraph(f"GPA: {edu['gpa']}", indent))
        story.append(Spacer(1, 5))
    # === Skills ===
    if data.get("skills"):
        story.append(Paragraph("SKILLS", heading))
        for skill in data["skills"]:
            story.append(Paragraph(f"<b>{skill['category']}:</b> {', '.join(skill['technologies'])}", indent))
        story.append(Spacer(1, 5))
    # === Experience ===
    if data.get("experience"):
        story.append(Paragraph("EXPERIENCE", heading))
        for exp in data["experience"]:
            story.append(aligned_row(f"<b>{exp['title']}</b> — {exp['company']} ({exp.get('location','')})", exp.get("dates")))
            if exp.get("position_details"):
                story.append(Paragraph(exp["position_details"], indent))
            # Only the Gemini/local "enhanced" bullets are printed, not the
            # raw "description" list.
            for desc in exp.get("enhanced", []):
                story.append(Paragraph(f"• {desc}", indent))
        story.append(Spacer(1, 5))
    # === Projects ===
    if data.get("projects"):
        story.append(Paragraph("PROJECTS", heading))
        for proj in data["projects"]:
            story.append(aligned_row(f"<b>{proj['name']}</b>", proj.get("dates")))
            if proj.get("tech_stack"):
                story.append(Paragraph(f"Tech Stack: {proj['tech_stack']}", indent))
            for desc in proj.get("enhanced", []):
                story.append(Paragraph(f"• {desc}", indent))
        story.append(Spacer(1, 5))
    doc.build(story)
    print(f"✅ Resume PDF created successfully: {output_path}")
# === Main ===
def generate_pdf_resume(data: dict, output_path: str):
    """Enrich resume bullets (Gemini or local fallback), then render the PDF."""
    create_pdf(enrich_bullets(data), output_path)
# === Resume PDF Parser ===
# -------------------------------
# Pydantic Models
# -------------------------------
class Experience(BaseModel):
    """One employment entry extracted from a resume; every field is optional
    because extraction may be incomplete."""
    title: Optional[str] = None
    company: Optional[str] = None
    location: Optional[str] = None
    dates: Optional[str] = None
    description: Optional[List[str]] = None
class Education(BaseModel):
    """One education entry (degree, school, dates, GPA, free-form details)."""
    degree: Optional[str] = None
    institution: Optional[str] = None
    dates: Optional[str] = None
    gpa: Optional[str] = None
    details: Optional[List[str]] = None
class SkillCategory(BaseModel):
    """A named skill group, e.g. category="Languages", technologies=["Python"]."""
    category: Optional[str] = None
    technologies: Optional[List[str]] = None
class Project(BaseModel):
    """A personal/professional project with a bullet-list description."""
    name: Optional[str] = None
    description: Optional[List[str]] = None
class Profile(BaseModel):
    """Contact/profile block rendered in the PDF header."""
    location: Optional[str] = None
    phone: Optional[str] = None
    email: Optional[str] = None
    linkedin: Optional[str] = None
    github: Optional[str] = None
class ResumeData(BaseModel):
    """Top-level parsed resume, mirroring the JSON schema requested from Gemini.

    NOTE: the mutable list defaults are safe here only because pydantic
    deep-copies field defaults per instance (unlike plain Python classes).
    """
    name: Optional[str] = None
    profile: Optional[Profile] = None
    summary: Optional[str] = ""
    experience: Optional[List[Experience]] = []
    education: Optional[List[Education]] = []
    skills: Optional[List[SkillCategory]] = []
    projects: Optional[List[Project]] = []
    certifications: Optional[List[str]] = []
    publications: Optional[List[str]] = []
    languages: Optional[List[str]] = []
# -------------------------------
# HELPER FUNCTIONS
# -------------------------------
def extract_text_from_pdf(file_path: str) -> str:
    """Extract raw text from a PDF via PyMuPDF, falling back to pdfplumber.

    Any PyMuPDF failure (open or mid-iteration) triggers the pdfplumber pass;
    text gathered before the failure is kept, matching the original behavior.
    """
    chunks = []
    try:
        with fitz.open(file_path) as doc:
            for page in doc:
                chunks.append(page.get_text())
    except Exception:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                chunks.append(page.extract_text() or "")
    return "".join(chunks).strip()
def clean_json_output(output: str) -> str:
    """Strip markdown code fences (``` / ```json) from model output and trim."""
    without_fences = re.sub(r"```(?:json)?", "", output)
    return without_fences.replace("```", "").strip()
def classify_resume_with_gemini(resume_text: str) -> dict:
    """Parse raw resume text into a structured dict via Gemini.

    Returns:
        The parsed resume as a dict matching the JSON schema in the prompt.

    Raises:
        RuntimeError: if Gemini returns invalid JSON or the API call fails.
            (BUG FIX: the original raised HTTPException, which is never
            imported anywhere in this file and would itself crash with
            NameError; a web layer should translate RuntimeError as needed.)
    """
    prompt = f"""
You are an expert resume parser.
Given the resume text below, extract and classify the information into this JSON format:
{{
"name": "string",
"profile": {{
"location": "string",
"phone": "string",
"email": "string",
"linkedin": "string",
"github": "string"
}},
"experience": [{{"title": "string","company": "string","location": "string","dates": "string","description": ["string"]}}],
"education": [{{"degree": "string","institution": "string","dates": "string","gpa": "string","details": ["string"]}}],
"skills": [{{"category": "string","technologies": ["string"]}}],
"projects": [{{"name": "string","description": ["string"]}}],
"certifications": ["string"],
"publications": ["string"],
"languages": ["string"]
}}
Resume Text:
\"\"\"{resume_text}\"\"\"
Return only valid JSON (no explanations or markdown).
"""
    try:
        model = genai.GenerativeModel("gemini-2.5-flash")
        response = model.generate_content(prompt)
        raw_output = response.text.strip()
        cleaned_output = clean_json_output(raw_output)
        return json.loads(cleaned_output)
    except json.JSONDecodeError as e:
        raise RuntimeError("Gemini returned invalid JSON output.") from e
    except Exception as e:
        raise RuntimeError(f"Gemini API Error: {str(e)}") from e