Spaces:

learnopolis
/

JobPulse-Bouncer

Running

App Files Files Community

JobPulse-Bouncer / server.py

learnopolis

Update server.py

39e4cd3 verified about 1 month ago

Raw

History Blame Contribute Delete

32.1 kB

	from groq import Groq
	from fastapi import FastAPI, HTTPException, Response
	from fastapi.responses import HTMLResponse
	from pydantic import BaseModel
	from bs4 import BeautifulSoup
	from typing import List, Dict
	import email as email_lib
	import json
	import os
	import re
	import hashlib
	import subprocess
	import tempfile
	from dotenv import load_dotenv
	from datetime import datetime, timedelta, timezone
	from urllib.parse import urlparse, urlunparse, parse_qs, parse_qsl, urlencode, unquote

	import firebase_admin
	from firebase_admin import credentials, firestore


	# ─────────────────────────────────────────
	# 1. LOAD ENVIRONMENT VARIABLES
	# ─────────────────────────────────────────
	# Load variables from a .env file (when one exists) before the
	# os.getenv() lookups below.
	load_dotenv()
	GROQ_API_KEY = os.getenv("GROQ_API_KEY")
	# Module-wide Groq client used by the LLM-backed functions in this file.
	groq_client = Groq(api_key=GROQ_API_KEY)


	# ─────────────────────────────────────────
	# 2. INITIALIZE FIREBASE
	# ─────────────────────────────────────────
	firebase_secret = os.getenv("FIREBASE_CREDENTIALS")
	if firebase_secret:
	    # FIREBASE_CREDENTIALS holds the service-account JSON inline.
	    cred_dict = json.loads(firebase_secret)
	    cred = credentials.Certificate(cred_dict)
	else:
	    # No env secret: read the service-account file from the working directory.
	    cred = credentials.Certificate("firebase-credentials.json")

	firebase_admin.initialize_app(cred)
	# Module-wide Firestore client used by the helpers and routes below.
	db = firestore.client()

	# FastAPI application object that the route decorators below attach to.
	app = FastAPI(title="JobPulse AI Parser")


	# ─────────────────────────────────────────
	# PYDANTIC MODELS
	# ─────────────────────────────────────────
	# Request body for POST /api/parse-email.
	class EmailPayload(BaseModel):
	    # Address used to look up the owning user document in Firestore.
	    user_email: str
	    # Raw RFC-2822 message or plain HTML; decoded by extract_html_from_email().
	    email_text: str

	# Request body for POST /api/extract-skills.
	class JDPayload(BaseModel):
	    # Job description; it is parsed as HTML and reduced to its text content.
	    jd_text: str


	# Request body for POST /api/compile-latex.
	class LatexPayload(BaseModel):
	    # Complete LaTeX document source, written verbatim to resume.tex.
	    latex_code: str


	# ═════════════════════════════════════════════════════════════════
	# STAGE 0: MIME + Quoted-Printable Decoder
	# Emails arriving as raw RFC-2822 messages are:
	# - Multipart MIME -> must extract only the text/html part
	# - QP-encoded -> =3D means =, line-ending = means line continuation
	# Running quopri on the full raw email (headers + body) corrupts everything.
	# Python stdlib `email` module splits MIME correctly first.
	# ═════════════════════════════════════════════════════════════════
	def extract_html_from_email(raw: str) -> str:
	    """
	    Parse a raw RFC-2822 email and return the decoded HTML body.

	    The first ``text/html`` MIME part is decoded (base64 / quoted-printable
	    are handled by the stdlib ``email`` package) using the charset the part
	    declares, defaulting to UTF-8. If the declared charset is not known to
	    Python the body is decoded as UTF-8 instead of giving up, so the caller
	    never receives the undecoded MIME message just because of a bad label.

	    Returns ``raw`` unchanged when no HTML part exists (the input is then
	    assumed to be plain HTML already) or when MIME parsing fails.
	    """
	    try:
	        msg = email_lib.message_from_string(raw)
	        for part in msg.walk():
	            if part.get_content_type() != "text/html":
	                continue
	            # get_payload(decode=True) handles both base64 and QP automatically
	            payload = part.get_payload(decode=True)
	            if payload is None:
	                # Nothing decodable in this part; keep looking.
	                continue
	            charset = part.get_content_charset() or "utf-8"
	            try:
	                return payload.decode(charset, errors="replace")
	            except LookupError:
	                # Unknown/bogus charset label: best-effort UTF-8 decode.
	                return payload.decode("utf-8", errors="replace")
	    except Exception:
	        # Deliberate best-effort: any parsing problem falls back to the input.
	        return raw
	    # No HTML part found — maybe input is already plain HTML
	    return raw


	# ═════════════════════════════════════════════════════════════════
	# STAGE 1: Platform Detector
	# ═════════════════════════════════════════════════════════════════
	def detect_platform(soup: BeautifulSoup, raw_text: str) -> str:
	    """
	    Guess which job platform sent the email.

	    Anchor hrefs are checked first, in a fixed priority order; if no link
	    domain matches, the visible text is searched for a platform name.
	    Returns one of "glassdoor", "linkedin", "naukri", "foundit", "indeed",
	    "instahyre", or "generic" when nothing matches.
	    """
	    hrefs = " ".join(
	        anchor.get("href", "") for anchor in soup.find_all("a", href=True)
	    ).lower()
	    body = raw_text.lower()

	    # (domains, platform) pairs — order is the match priority.
	    link_rules = (
	        (("glassdoor.com",), "glassdoor"),
	        (("linkedin.com",), "linkedin"),
	        (("naukri.com",), "naukri"),
	        (("foundit.in", "monster.com"), "foundit"),
	        (("indeed.com",), "indeed"),
	        (("instahyre.com",), "instahyre"),
	    )
	    for domains, name in link_rules:
	        if any(domain in hrefs for domain in domains):
	            return name

	    # Text fallback only knows these three names, in this order.
	    for name in ("glassdoor", "linkedin", "naukri"):
	        if name in body:
	            return name

	    return "generic"


	# ═════════════════════════════════════════════════════════════════
	# STAGE 2: URL Utilities
	# ═════════════════════════════════════════════════════════════════
	# Query-string keys that carry only tracking/session data and are stripped
	# by clean_url(). clean_url() compares the LOWER-CASED key against this set,
	# so every entry must be lower-case ("autoApply" could never match before).
	JUNK_PARAMS = {
	    "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term",
	    "jrtk", "guid", "ja", "uido", "cs", "cb", "ao", "s", "vt", "ea",
	    "tgt", "src", "t", "pos",
	    "trackingid", "refid", "lipi", "midtoken", "midsig", "trk", "trkemail", "eid", "otptoken",
	    "spl", "notification_frequency", "autoapply", "jr_source", "apop", "notificationid", "response", "type",
	    # Indeed tracking — 'jk' is intentionally NOT here, it is the job ID
	    "qd", "rd", "tk", "alid", "bb", "mo", "ad", "xkcb", "camk", "p", "jsa", "rjs", "gdfvj", "plid", "fvj",
	}

	# Substrings that mark a URL as NOT a job posting (unsubscribe links,
	# tracking pixels, icons, social links, account pages, ...).
	# is_job_link() tests each entry with a plain `in` check against the
	# unquoted, lower-cased URL, so entries must be lower-case.
	# NOTE(review): the match is a bare substring test, so short words also
	# hit inside job-title slugs — e.g. "manage" matches ".../job-listings-
	# product-manager-..." — which would reject those links. Confirm whether
	# these generic entries should be anchored (e.g. to a path segment).
	NOISE_SIGNALS = [
	"unsubscribe", "privacy", "terms", "manage", "email-pref",
	"brand-views", "brandview", "wf/open", "logomark", "logo.png",
	"easy-apply-icon", "location-icon", "bell-icon", "jobmatch",
	"twitter.com", "facebook.com", "instagram.com", "youtube.com",
	"glassdoor.com/about", "mailto:", "jobalertajax", "emailsettings",
	"job-alert/jobalert", "job-alert-email-unsubscribe", "jobs/alerts",
	"jobs/search", "comm/feed", "comm/mynetwork", "comm/messaging",
	"comm/notifications", "comm/premium", "comm/widgets",
	"linkedin.com/help", "in.linkedin.com/comm/in/",
	"static.licdn.com", "media.licdn.com",
	"naukri.com/mnjuser", "naukri.com/user",
	"seeker/dashboard", "seeker/profile", "seeker/jobalert-feedback",
	"trex/unsubscribe", "appurl.io", "play.google.com", "itunes.apple.com",
	"media.monsterindia.com", "media.foundit.in",
	"widget", "promo", "feed", "mynetwork",
	]

	# Per-platform substrings that identify a job-posting URL.
	# is_job_link() matches these against the unquoted, LOWER-CASED URL, so
	# every signal must itself be lower-case; the foundit entry used to be
	# "/rio/autoLogin/", which could never match a lower-cased URL.
	PLATFORM_JOB_SIGNALS = {
	    "glassdoor": ["/partner/joblisting", "joblistingid="],
	    "linkedin": ["/comm/jobs/view/", "/jobs/view/"],
	    "naukri": ["/job-listings-", "naukri.com/view"],
	    "foundit": ["/rio/autologin/"],
	    "indeed": ["/viewjob", "indeed.com/rc/clk", "indeed.com/pagead/clk", "cts.indeed.com"],
	    "instahyre": ["instahyre.com/job-"],
	    "generic": ["/job", "/career", "/apply", "/position", "/vacancy"],
	}


	def unwrap_autologin_url(url: str) -> str:
	    """
	    Return the real job URL hidden inside a redirect/auto-login wrapper.

	    Two wrappers are recognised:
	    - any URL whose percent-decoded form embeds an
	      ``https://www.instahyre.com/job-...`` address: that address is
	      returned, cut at the first ``/`` or ``?`` and given a trailing slash;
	    - URLs whose path contains ``/autoLogin/`` (any letter case, which
	      matches how is_job_link() detects them): the ``return_url`` query
	      parameter is returned when present and non-empty.

	    Anything else — including input that fails to parse — is returned
	    unchanged.
	    """
	    try:
	        unquoted = unquote(url)
	        if "instahyre.com/job-" in unquoted:
	            match = re.search(r"(https://www\.instahyre\.com/job-[^/?]+)", unquoted)
	            if match:
	                return match.group(1) + "/"
	        parsed = urlparse(url)
	        # "/rio/autoLogin/" is covered by the broader "/autologin/" test.
	        if "/autologin/" in parsed.path.lower():
	            return_url = parse_qs(parsed.query).get("return_url", [None])[0]
	            if return_url:
	                return return_url
	    except Exception:
	        # Best-effort helper: a malformed URL is handed back untouched.
	        pass
	    return url


	def clean_url(url: str) -> str:
	    """
	    Normalise a job link: unwrap redirect wrappers and drop tracking data.

	    Query keys whose lower-cased name is in JUNK_PARAMS are removed. If the
	    resulting URL contains one of the known canonical job-path markers, the
	    whole query string is dropped. On any error the URL as it stood when
	    the error occurred is returned.
	    """
	    try:
	        url = unwrap_autologin_url(url)
	        parts = urlparse(url)
	        kept_pairs = [
	            pair
	            for pair in parse_qsl(parts.query, keep_blank_values=True)
	            if pair[0].lower() not in JUNK_PARAMS
	        ]
	        candidate = urlunparse(parts._replace(query=urlencode(kept_pairs)))
	        # For these paths the job is identified by the path alone.
	        for marker in ("/comm/jobs/view/", "/jobs/view/", "/job/", "/job-listings-"):
	            if marker in candidate:
	                return urlunparse(parts._replace(query=""))
	        return candidate
	    except Exception:
	        return url


	def is_job_link(url: str, platform: str = "generic") -> bool:
	    """
	    Decide whether ``url`` points at a job posting for ``platform``.

	    The URL is percent-decoded and lower-cased, then:
	    1. rejected if it contains any NOISE_SIGNALS substring;
	    2. for foundit auto-login links, accepted only if the unwrapped target
	       contains "/job/";
	    3. otherwise accepted if it contains any signal listed for the platform
	       in PLATFORM_JOB_SIGNALS (unknown platforms use the "generic" list).
	    """
	    lowered = unquote(url).lower()

	    # NOTE(review): plain substring test — short noise words can also match
	    # inside job-title slugs (e.g. "manage" in "...-manager-...").
	    for noise in NOISE_SIGNALS:
	        if noise in lowered:
	            return False

	    if platform == "foundit" and "/rio/autologin/" in lowered:
	        return "/job/" in unwrap_autologin_url(url).lower()

	    wanted = PLATFORM_JOB_SIGNALS.get(platform, PLATFORM_JOB_SIGNALS["generic"])
	    for signal in wanted:
	        if signal in lowered:
	            return True
	    return False


	# ═════════════════════════════════════════════════════════════════
	# STAGE 3: Platform-Specific Card Extractors
	# CRITICAL: Each card gets its OWN individual job_link.
	# We never extract one link and paste it across multiple cards.
	# ═════════════════════════════════════════════════════════════════

	def extract_glassdoor(soup: BeautifulSoup) -> List[Dict]:
	    """
	    Build one card per Glassdoor card table.

	    Cards are the <table> elements with class "gd-dbe9ce2b4a". The link is
	    the first anchor in the table accepted by is_job_link(); company and
	    role come from fixed-class <span>/<p> elements. Tables yielding neither
	    a role nor a company are skipped.
	    """
	    containers = soup.find_all("table", class_="gd-dbe9ce2b4a")
	    print(f" [Glassdoor] Found {len(containers)} card containers")

	    found = []
	    for container in containers:
	        # First qualifying anchor inside THIS container only.
	        link = next(
	            (
	                clean_url(anchor["href"])
	                for anchor in container.find_all("a", href=True)
	                if is_job_link(anchor["href"], "glassdoor")
	            ),
	            None,
	        )
	        company_node = container.find("span", class_="gd-628b46d9ce")
	        company = company_node.get_text(strip=True) if company_node else ""
	        role_node = container.find("p", class_="gd-6c2846d4dc")
	        role = role_node.get_text(strip=True) if role_node else ""

	        if not (role or company):
	            continue
	        found.append({"company": company, "role": role, "job_link": link})
	    return found


	def extract_linkedin(soup: BeautifulSoup) -> List[Dict]:
	    """
	    Build one card per LinkedIn <td data-test-id="job-card"> cell.

	    The link is the first anchor in the cell accepted by is_job_link().
	    The role is the anchor whose class contains both "font-bold" and
	    "text-md"; the company is the text before the first "·" (middle dot)
	    of the <p> whose class contains "text-system-gray-100". Cells yielding
	    neither a role nor a company are skipped.
	    """
	    def _is_title_class(value):
	        return value and "font-bold" in value and "text-md" in value

	    def _is_company_class(value):
	        return value and "text-system-gray-100" in value

	    cells = soup.find_all("td", attrs={"data-test-id": "job-card"})
	    print(f" [LinkedIn] Found {len(cells)} job-card containers")

	    found = []
	    for cell in cells:
	        link = next(
	            (
	                clean_url(anchor["href"])
	                for anchor in cell.find_all("a", href=True)
	                if is_job_link(anchor["href"], "linkedin")
	            ),
	            None,
	        )
	        title_node = cell.find("a", class_=_is_title_class)
	        role = title_node.get_text(strip=True) if title_node else ""

	        company = ""
	        company_node = cell.find("p", class_=_is_company_class)
	        if company_node:
	            # Keep only the part before the first middle dot.
	            company = company_node.get_text(strip=True).partition("·")[0].strip()

	        if role or company:
	            found.append({"company": company, "role": role, "job_link": link})
	    return found


	def extract_indeed(soup: BeautifulSoup) -> List[Dict]:
	    """
	    Indeed emails: each job title is <a class="strong-text-link">.
	    That anchor's own href is the link for THAT specific job.
	    Company is in the next <tr> sibling of the title's parent <tr>.

	    Returns one card dict (company, role, job_link) per title anchor;
	    job_link stays None when the href is missing or is_job_link() rejects
	    it. Cards with neither a role nor a company are skipped.
	    """
	    # Same runtime value as before (space, backslash, pipe, space), now
	    # spelled with a valid escape: "\|" is an invalid escape sequence and
	    # triggers a SyntaxWarning on recent Python versions.
	    separator = " \\| "

	    cards = []
	    title_links = soup.find_all("a", class_="strong-text-link")
	    print(f" [Indeed] Found {len(title_links)} job title links")
	    for title_tag in title_links:
	        card: Dict = {"company": "", "role": "", "job_link": None}
	        href = title_tag.get("href")
	        if href and is_job_link(href, "indeed"):
	            card["job_link"] = clean_url(href)
	        card["role"] = title_tag.get_text(strip=True)
	        parent_tr = title_tag.find_parent("tr")
	        if parent_tr:
	            next_tr = parent_tr.find_next_sibling("tr")
	            if next_tr:
	                # Join the row's text pieces, then keep only the first one.
	                company_text = next_tr.get_text(separator=separator, strip=True)
	                card["company"] = company_text.split(separator)[0].strip()
	        if card["role"] or card["company"]:
	            cards.append(card)
	    return cards


	def extract_instahyre(soup: BeautifulSoup) -> List[Dict]:
	    """
	    Build one card per Instahyre <div class="job-block">.

	    Only the first anchor in the block is considered for the link (it must
	    pass is_job_link()). The first two <strong> tags are company and role,
	    in that order; with fewer than two, both stay empty and the block is
	    skipped.
	    """
	    blocks = soup.find_all("div", class_="job-block")
	    print(f" [Instahyre] Found {len(blocks)} job blocks")

	    found = []
	    for block in blocks:
	        link = None
	        first_anchor = block.find("a", href=True)
	        if first_anchor and is_job_link(first_anchor["href"], "instahyre"):
	            link = clean_url(first_anchor["href"])

	        company, role = "", ""
	        strongs = block.find_all("strong")
	        if len(strongs) >= 2:
	            company, role = (node.get_text(strip=True) for node in strongs[:2])

	        if not (role or company):
	            continue
	        found.append({"company": company, "role": role, "job_link": link})
	    return found


	def extract_naukri(soup: BeautifulSoup) -> List[Dict]:
	    """Extract Naukri cards via the generic anchor scan with Naukri link signals."""
	    return _generic_extract(soup, platform="naukri")


	def extract_foundit(soup: BeautifulSoup) -> List[Dict]:
	    """Extract Foundit cards via the generic anchor scan with Foundit link signals."""
	    return _generic_extract(soup, platform="foundit")


	def _generic_extract(soup: BeautifulSoup, platform: str = "generic") -> List[Dict]:
	    """
	    Generic fallback: scan all anchors matching job-link signals.
	    Each unique URL = one card. Surrounding text used for company/role context.

	    The role is the anchor's own text. The company field is the joined text
	    of the nearest td/div/li/tr/table ancestor whose text is shorter than
	    400 characters (truncated to 200), or "" when no ancestor qualifies.
	    """
	    # Same runtime value as before (space, backslash, pipe, space), now
	    # spelled with a valid escape: "\|" is an invalid escape sequence and
	    # triggers a SyntaxWarning on recent Python versions.
	    separator = " \\| "
	    container_tags = ("td", "div", "li", "tr", "table")

	    cards = []
	    seen_links: set = set()
	    for a_tag in soup.find_all("a", href=True):
	        href = a_tag["href"]
	        if not is_job_link(href, platform):
	            continue
	        cleaned = clean_url(href)
	        if cleaned in seen_links:
	            # Same job linked more than once (e.g. title and button).
	            continue
	        seen_links.add(cleaned)
	        role_text = a_tag.get_text(strip=True)
	        company_text = ""
	        for parent in a_tag.parents:
	            if parent.name in container_tags:
	                all_text = parent.get_text(separator=separator, strip=True)
	                if len(all_text) < 400:
	                    company_text = all_text
	                    break
	        cards.append({
	            "company": company_text[:200],
	            "role": role_text,
	            "job_link": cleaned,
	        })
	    print(f" [Generic/{platform}] Found {len(cards)} unique job links")
	    return cards


	# Dispatch table: platform name (as returned by detect_platform) -> card
	# extractor. extract_cards() calls the selected function with the soup only.
	PLATFORM_EXTRACTORS = {
	"glassdoor": extract_glassdoor,
	"linkedin": extract_linkedin,
	"naukri": extract_naukri,
	"foundit": extract_foundit,
	"indeed": extract_indeed,
	"instahyre": extract_instahyre,
	"generic": _generic_extract,
	}


	def extract_cards(soup: BeautifulSoup, platform: str) -> List[Dict]:
	    """Run the extractor registered for ``platform`` (generic scan if none is)."""
	    if platform in PLATFORM_EXTRACTORS:
	        handler = PLATFORM_EXTRACTORS[platform]
	    else:
	        handler = _generic_extract
	    return handler(soup)


	# ═════════════════════════════════════════════════════════════════
	# STAGE 4: Bouncer
	# ═════════════════════════════════════════════════════════════════
	# Lower-case words/phrases whose presence marks an email as job-related.
	# is_job_email() uses a plain substring test on the lower-cased text, so an
	# entry also matches inside longer words (e.g. "intern" in "internet").
	JOB_KEYWORDS = [
	"applied", "application", "interview", "rejection", "job alert",
	"offer", "hiring", "shortlisted", "assessment", "jobs", "apply",
	"internship", "intern", "career", "glassdoor", "linkedin", "naukri",
	"opportunity", "resume", "foundit", "indeed", "instahyre",
	"position", "role", "vacancy", "opening",
	]


	def is_job_email(text: str) -> bool:
	    """Return True if any JOB_KEYWORDS entry occurs (case-insensitively) in ``text``."""
	    for keyword in JOB_KEYWORDS:
	        if keyword in text.lower():
	            return True
	    return False


	# ═════════════════════════════════════════════════════════════════
	# STAGE 5: LLM Enrichment
	# Cards have company, role, job_link already set correctly.
	# LLM adds: status, sourcePlatform, domainCategory, coreTech, interpretation.
	# After LLM returns, we FORCE re-inject the original job_link from the card
	# so even if LLM disobeys, the correct link is always used.
	# ═════════════════════════════════════════════════════════════════

	# Prompt template for enrich_cards_with_llm(); filled with str.format()
	# using the placeholders {platform}, {email_text} and {card_summary}.
	# The pipes in the "status" line are written as "\\|" so the runtime text
	# is unchanged while the invalid "\|" escape sequence (a SyntaxWarning on
	# recent Python versions) is gone.
	LLM_CARD_PROMPT = """
	You are a structured data extraction engine for a job application tracker.
	You receive pre-parsed job cards AND the full original email text as context.

	Each card has: company, role, job_link (job_link was extracted by code — do NOT change it).
	Company and role may be empty or wrong — use the FULL EMAIL TEXT below to find the correct values.

	Return a JSON ARRAY — one object per card, SAME COUNT and SAME ORDER as input.

	STRICT RULES:
	1. Return ONLY a raw JSON array []. No markdown, no backticks, no explanation.
	2. Exactly one object per card — same count, same order as input.
	3. Copy job_link EXACTLY as given. Never modify, guess, or omit it.
	4. If job_link is null, output null (not the string "null").
	5. For companyName: if the card value is empty/Unknown/wrong, find the REAL hiring company name from the EMAIL TEXT. Never output "Unknown Company" if the email text contains the company name.
	6. For jobRole: if the card value is empty, find the real job title from the EMAIL TEXT.
	7. Clean company: if "CompanyName · Location" format, extract only company name.
	8. Clean role: remove extra whitespace or codes like [T500-25894].

	FIELDS per object:
	- "companyName": string — real hiring company name (use email text if card value is missing)
	- "jobRole": string — clean job title (use email text if card value is missing)
	- "jobLink": string or null — EXACT copy of job_link provided, never change this
	- "status": one of: "Opportunity" \\| "Applied" \\| "Interview" \\| "Selection" \\| "Rejection"
	* Opportunity = job alert, new opening not yet applied to
	* Applied = application submitted confirmation
	* Interview = interview or assessment invite
	* Selection = offer letter or selected to proceed
	* Rejection = application declined
	- "sourcePlatform": one of: LinkedIn, Naukri, Indeed, Glassdoor, Wellfound, Instahyre, Workday, Greenhouse, Direct Email, Company Portal, Other
	- "domainCategory": e.g. "Mobile Development", "Backend Engineering", "Data Science", "DevOps", "Frontend", "Full Stack", "Design", "Product Management", "Other"
	- "coreTech": array of 1-3 strings — tech skills inferred from the role title
	- "interpretation": 1 sentence describing what this role involves for the applicant

	SOURCE PLATFORM HINT: {platform}

	FULL EMAIL TEXT (use this to fill missing company/role):
	{email_text}

	JOB CARDS:
	{card_summary}
	"""


	def build_card_summary(cards: List[Dict]) -> str:
	    """
	    Render cards as the numbered text block embedded in the LLM prompt.

	    Cards are numbered from 1 and separated by a blank line. Missing or
	    empty values are shown as "Unknown" (company), "Unspecified" (role)
	    and "null" (job_link).
	    """
	    entry = "Job {number}:\n company: {company}\n role: {role}\n job_link: {link}"
	    rendered = [
	        entry.format(
	            number=number,
	            company=card.get("company") or "Unknown",
	            role=card.get("role") or "Unspecified",
	            link=card.get("job_link") or "null",
	        )
	        for number, card in enumerate(cards, 1)
	    ]
	    return "\n\n".join(rendered)


	def _fallback_enrichment(card: Dict, platform: str) -> Dict:
	    """Build a minimal enriched record from a raw card when the LLM gave none."""
	    return {
	        "companyName": card.get("company") or "Unknown Company",
	        "jobRole": card.get("role") or "Unspecified Role",
	        "jobLink": card.get("job_link"),
	        "status": "Opportunity",
	        "sourcePlatform": platform.capitalize(),
	        "domainCategory": "Other",
	        "coreTech": [],
	        "interpretation": "Could not enrich — LLM call failed.",
	    }


	def enrich_cards_with_llm(cards: List[Dict], platform: str, email_text: str = "") -> List[Dict]:
	    """
	    Enrich extracted cards with LLM-derived fields, ten cards per request.

	    Args:
	        cards: card dicts with "company", "role" and "job_link".
	        platform: detected platform name, passed to the prompt as a hint.
	        email_text: visible email text; only the first 3000 characters are
	            sent, to keep the prompt small.

	    Returns:
	        Exactly one dict per input card, in input order. A card gets a
	        fallback record (see _fallback_enrichment) when the request fails,
	        the reply cannot be parsed, the reply has fewer objects than cards,
	        or the object at its position is not a dict. Objects beyond the
	        chunk size are discarded. "jobLink" is always overwritten with the
	        card's own job_link, so the LLM can never alter a link.
	    """
	    all_results: List[Dict] = []
	    chunk_size = 10

	    for i in range(0, len(cards), chunk_size):
	        chunk = cards[i : i + chunk_size]
	        card_summary = build_card_summary(chunk)
	        prompt = LLM_CARD_PROMPT.format(
	            platform=platform.capitalize(),
	            email_text=email_text[:3000],  # cap to avoid token overflow
	            card_summary=card_summary,
	        )
	        print(f"🧠 Enriching cards {i + 1}–{i + len(chunk)} via Groq...")

	        batch_result = None
	        try:
	            response = groq_client.chat.completions.create(
	                model="llama-3.3-70b-versatile",
	                messages=[{"role": "user", "content": prompt}],
	                temperature=0.1,
	            )
	            raw = response.choices[0].message.content
	            batch_result = _safe_parse_json(raw)
	        except Exception as e:
	            print(f"⚠️ LLM enrichment failed for chunk {i + 1}–{i + len(chunk)}: {e}")

	        if batch_result is None:
	            batch_result = []
	        elif len(batch_result) != len(chunk):
	            # Previously a short/empty reply silently dropped cards and
	            # surplus objects were kept with unverified links.
	            print(
	                f"⚠️ LLM returned {len(batch_result)} objects for "
	                f"{len(chunk)} cards; using fallback records for the rest."
	            )

	        # Objects are paired with cards by position — the prompt requires
	        # same count and order; alignment cannot be verified beyond that.
	        for j, card in enumerate(chunk):
	            enriched = batch_result[j] if j < len(batch_result) else None
	            if not isinstance(enriched, dict):
	                enriched = _fallback_enrichment(card, platform)
	            # HARD SAFETY: the extractor's link always wins over the LLM's.
	            enriched["jobLink"] = card.get("job_link")
	            all_results.append(enriched)

	    return all_results


	def _safe_parse_json(raw_text: str) -> list:
	raw_text = raw_text.replace("```json", "").replace("```", "").strip()
	match = re.search(r"\[.*\]", raw_text, re.DOTALL)
	if not match:
	print("⚠️ No JSON array found in LLM response.")
	return []
	try:
	return json.loads(match.group())
	except json.JSONDecodeError as e:
	print(f"⚠️ JSON parse failed: {e}")
	partial = re.findall(r"\{[^{}]+\}", match.group(), re.DOTALL)
	results = []
	for obj_str in partial:
	try:
	results.append(json.loads(obj_str))
	except Exception:
	pass
	if results:
	print(f" Salvaged {len(results)} partial objects.")
	return results


	# ═════════════════════════════════════════════════════════════════
	# FIREBASE HELPERS
	# ═════════════════════════════════════════════════════════════════
	def generate_job_fingerprint(user_email: str, job: dict) -> str:
	    """
	    Return the MD5 hex digest used as the Firestore document ID for a job.

	    The digest covers the lower-cased user email, companyName and jobRole
	    (missing keys count as ""), so the same job for the same user always
	    maps to the same document. MD5 is used only as a stable ID, not for
	    security.
	    """
	    # The field separator is backslash + pipe, exactly as before, so
	    # existing document IDs stay valid; it is now written as "\\|" because
	    # "\|" is an invalid escape sequence (SyntaxWarning on recent Pythons).
	    raw = f"{user_email}\\|{job.get('companyName', '')}\\|{job.get('jobRole', '')}".lower()
	    return hashlib.md5(raw.encode()).hexdigest()


	def cleanup_expired_jobs(user_doc_id: str) -> None:
	    """
	    Delete the user's application documents whose expireAt is in the past.

	    All matching documents under users/<user_doc_id>/applications are
	    removed in a single batch commit. Errors are printed and swallowed so
	    the sweep never breaks the calling request.
	    """
	    try:
	        cutoff = datetime.now(timezone.utc)
	        applications = (
	            db.collection("users")
	            .document(user_doc_id)
	            .collection("applications")
	        )
	        expired_refs = [
	            snapshot.reference
	            for snapshot in applications.where("expireAt", "<", cutoff).stream()
	        ]
	        if not expired_refs:
	            return

	        sweep = db.batch()
	        for reference in expired_refs:
	            sweep.delete(reference)
	        sweep.commit()
	        print(f"🧹 Sweeper: Deleted {len(expired_refs)} expired jobs.")
	    except Exception as e:
	        print(f"⚠️ Sweeper Error: {e}")


	def extract_json_array(raw_text: str) -> list:
	    """
	    Return the JSON array embedded in ``raw_text``, or [] if there is none.

	    Markdown code fences are removed first; the span from the first "[" to
	    the last "]" is then parsed. Invalid JSON yields [] (no salvage pass).
	    """
	    unfenced = raw_text.replace("```json", "").replace("```", "").strip()
	    found = re.search(r"\[.*\]", unfenced, re.DOTALL)
	    if found is None:
	        return []
	    try:
	        parsed = json.loads(found.group())
	    except json.JSONDecodeError:
	        return []
	    return parsed


	# ═════════════════════════════════════════════════════════════════
	# ROUTES
	# ═════════════════════════════════════════════════════════════════

	# Root route: returns a static HTML heading, usable as a liveness check.
	@app.get("/", response_class=HTMLResponse)
	def get_testing_ui():
	    return "<h1>JobPulse Server is Running!</h1>"


	# ─────────────────────────────────────────
	# ROUTE 1: Parse Email → Extract Cards → Enrich → Save to Firebase
	# ─────────────────────────────────────────
	# POST /api/parse-email
	# Pipeline: decode MIME -> strip HTML noise -> keyword bouncer -> look up the
	# user -> sweep expired jobs -> detect platform -> extract cards -> LLM
	# enrichment -> deduplicated Firestore batch write.
	# Raises HTTPException(404) when no user document has payload.user_email.
	# Returns a dict with "status" and "message"; on the full path it also
	# carries "platform", "cardsExtracted" and "data" (the enriched jobs).
	@app.post("/api/parse-email")
	def parse_email_with_ai(payload: EmailPayload):

	    # STEP 1: Decode MIME + QP properly
	    html_body = extract_html_from_email(payload.email_text)

	    # STEP 2: Parse HTML, strip noise tags
	    soup = BeautifulSoup(html_body, "html.parser")
	    for tag in soup(["script", "style", "meta", "noscript", "head"]):
	        tag.extract()

	    raw_text = soup.get_text(separator=" ", strip=True)

	    # STEP 3: Bouncer
	    if not is_job_email(raw_text):
	        print("🛡️ BOUNCER: Not a job email. Skipped.")
	        return {"status": "success", "message": "Ignored: Not a job email."}

	    # STEP 4: Find user in Firebase
	    users_ref = db.collection("users")
	    query = users_ref.where("email", "==", payload.user_email).limit(1).stream()
	    user_doc_id = None
	    for doc in query:
	        user_doc_id = doc.id
	        break

	    if not user_doc_id:
	        raise HTTPException(
	            status_code=404,
	            detail=f"User with email {payload.user_email} not found in database.",
	        )

	    cleanup_expired_jobs(user_doc_id)

	    # STEP 5: Detect platform
	    platform = detect_platform(soup, raw_text)
	    print(f"🎯 Detected platform: {platform.upper()}")

	    # STEP 6: Extract job cards — each card gets its OWN individual link
	    print("📦 Extracting job cards...")
	    cards = extract_cards(soup, platform)

	    if not cards:
	        print("⚠️ No cards found. Trying generic fallback...")
	        cards = _generic_extract(soup, "generic")

	    if not cards:
	        return {"status": "success", "message": "No job listings found in this email."}

	    print(f"✅ Extracted {len(cards)} job cards — each with its own unique link.")

	    # STEP 7: Enrich with LLM (adds status, coreTech, domainCategory, etc.)
	    enriched_jobs = enrich_cards_with_llm(cards, platform, email_text=raw_text)

	    if not enriched_jobs:
	        return {"status": "success", "message": "LLM enrichment returned no results."}

	    # STEP 8: IST timestamp
	    ist_tz = timezone(timedelta(hours=5, minutes=30))
	    exact_timestamp = datetime.now(ist_tz).strftime("%H-%M %d/%m/%Y")

	    # STEP 9: Firebase batch write with deduplication + TTL
	    batch = db.batch()
	    applications_ref = (
	        db.collection("users")
	        .document(user_doc_id)
	        .collection("applications")
	    )
	    expiry_date = datetime.now(timezone.utc) + timedelta(days=60)

	    saved_count = 0
	    updated_count = 0
	    skipped_count = 0

	    for job in enriched_jobs:
	        job["dateApplied"] = exact_timestamp
	        # Only plain opportunities expire; tracked applications are kept.
	        if job.get("status") == "Opportunity":
	            job["expireAt"] = expiry_date

	        fingerprint = generate_job_fingerprint(payload.user_email, job)
	        job_doc_ref = applications_ref.document(fingerprint)
	        existing_snap = job_doc_ref.get()

	        if existing_snap.exists:
	            existing_status = existing_snap.to_dict().get("status")
	            new_status = job.get("status")
	            if existing_status != new_status and new_status != "Opportunity":
	                batch.update(job_doc_ref, {
	                    "status": new_status,
	                    "dateApplied": exact_timestamp,
	                    # The job is no longer a mere opportunity: drop the TTL
	                    # stamped when it was first saved, otherwise the sweeper
	                    # would delete this application once it passes.
	                    "expireAt": firestore.DELETE_FIELD,
	                })
	                updated_count += 1
	                print(f"🔄 Updated status: {job.get('companyName')} → {new_status}")
	            else:
	                skipped_count += 1
	                print(f"⏭️ Skipped duplicate: {job.get('companyName')} - {job.get('jobRole')}")
	            continue

	        batch.set(job_doc_ref, job)
	        saved_count += 1

	    # Nothing queued means nothing to commit.
	    if (saved_count + updated_count) > 0:
	        batch.commit()
	        print(f"💾 Firebase: Saved {saved_count} new jobs, Updated {updated_count} jobs.")

	    return {
	        "status": "success",
	        "message": f"Saved {saved_count} jobs. Updated {updated_count}. Skipped {skipped_count} duplicates.",
	        "platform": platform,
	        "cardsExtracted": len(cards),
	        "data": enriched_jobs,
	    }


	# ─────────────────────────────────────────
	# ROUTE 2: JD Skill Extractor
	# ─────────────────────────────────────────
	# POST /api/extract-skills
	# Reduces the submitted job description to plain text and asks the LLM for
	# its core hard skills. Responds 400 when the text is shorter than 50
	# characters and 500 when the LLM call fails.
	@app.post("/api/extract-skills")
	def extract_jd_skills(payload: JDPayload):
	    jd_plain = BeautifulSoup(payload.jd_text, "html.parser").get_text(
	        separator="\n", strip=True
	    )

	    # An empty string has length 0, so this also rejects empty input.
	    if len(jd_plain) < 50:
	        raise HTTPException(status_code=400, detail="Job description text is too short or empty.")

	    prompt = f"""
	Extract the top 5 to 10 core 'hard skills' (technical skills, tools, languages, frameworks)
	from the following Job Description. Ignore soft skills like communication or teamwork.
	OUTPUT FORMAT: Return ONLY a raw JSON array of strings. No markdown, no explanation.
	Example: ["Python", "SQL", "React", "AWS", "Docker"]

	Job Description:
	{jd_plain}
	"""
	    chat_messages = [{"role": "user", "content": prompt}]
	    try:
	        completion = groq_client.chat.completions.create(
	            model="llama-3.3-70b-versatile",
	            messages=chat_messages,
	            temperature=0.0,
	        )
	        skills = extract_json_array(completion.choices[0].message.content)
	        if not skills:
	            skills = []
	        return {"status": "success", "skills": skills}
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=str(e))


	# ─────────────────────────────────────────
	# ROUTE 3: LaTeX Resume → PDF Compiler
	# ─────────────────────────────────────────
	# POST /api/compile-latex
	# Writes the submitted LaTeX to a temporary directory, runs pdflatex twice
	# (the second pass resolves cross-references) and returns the PDF as an
	# attachment. Responds 500 when no PDF is produced, when a pass times out,
	# or on any other error.
	# The LaTeX source is untrusted input, so each pdflatex run:
	#   - passes -no-shell-escape explicitly (no \write18 command execution),
	#   - runs with the temp directory as working directory, so relative
	#     \input / \include paths cannot reach files next to the server,
	#   - is bounded by a timeout, so a looping document cannot hang a worker.
	# NOTE(review): absolute-path \input is still governed by the TeX
	# installation's own settings (openin_any) — confirm on the deployment image.
	@app.post("/api/compile-latex")
	def compile_latex_to_pdf(payload: LatexPayload):
	    compile_timeout_seconds = 60  # per pdflatex pass

	    try:
	        with tempfile.TemporaryDirectory() as temp_dir:
	            tex_file_path = os.path.join(temp_dir, "resume.tex")
	            pdf_file_path = os.path.join(temp_dir, "resume.pdf")

	            with open(tex_file_path, "w", encoding="utf-8") as f:
	                f.write(payload.latex_code)

	            for _ in range(2):
	                # Output is never read, so it is discarded rather than
	                # captured as text: decoding a pdflatex log that is not
	                # valid in the locale encoding used to raise and turn a
	                # successful compile into a 500.
	                subprocess.run(
	                    [
	                        "pdflatex",
	                        "-no-shell-escape",
	                        "-interaction=nonstopmode",
	                        "-output-directory", temp_dir,
	                        tex_file_path,
	                    ],
	                    stdout=subprocess.DEVNULL,
	                    stderr=subprocess.DEVNULL,
	                    cwd=temp_dir,
	                    timeout=compile_timeout_seconds,
	                )

	            # pdflatex's exit status is not checked: in nonstopmode a PDF
	            # can still be produced despite errors, so the file is the test.
	            if not os.path.exists(pdf_file_path):
	                raise HTTPException(
	                    status_code=500,
	                    detail="LaTeX compilation failed. Check your LaTeX syntax.",
	                )

	            # Read the bytes before the temporary directory is removed.
	            with open(pdf_file_path, "rb") as pdf_file:
	                pdf_bytes = pdf_file.read()

	            return Response(
	                content=pdf_bytes,
	                media_type="application/pdf",
	                headers={"Content-Disposition": 'attachment; filename="Tailored_Resume.pdf"'},
	            )
	    except HTTPException:
	        raise
	    except subprocess.TimeoutExpired:
	        raise HTTPException(status_code=500, detail="LaTeX compilation timed out.")
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=str(e))