Spaces:

learnopolis
/

JobPulse-Bouncer

Running

File size: 32,128 Bytes

4d7533d
 
 
 
 
39e4cd3
 
4d7533d
da0c6a1
67aedc8
 
2e619fa
 
8fc7730
4d7533d
a12f13c
da0c6a1
 
 
 
39e4cd3
d74b98a
4d7533d
d74b98a
da0c6a1
f724e04
 
da0c6a1
39e4cd3
d74b98a
4d7533d
d74b98a
da0c6a1
 
 
 
 
d74b98a
da0c6a1
 
 
 
 
 
39e4cd3
d74b98a
 
 
da0c6a1
 
 
 
39e4cd3
d74b98a
 
 
39e4cd3
d74b98a
 
 
 
 
a12f13c
39e4cd3
 
 
 
 
d74b98a
a12f13c
 
 
 
 
67aedc8
a12f13c
 
 
 
 
 
 
 
 
 
 
67aedc8
d74b98a
 
a12f13c
d74b98a
a12f13c
 
 
 
 
39e4cd3
 
 
 
 
 
a12f13c
39e4cd3
 
 
a12f13c
 
67aedc8
d74b98a
 
a12f13c
d74b98a
a12f13c
39e4cd3
 
 
 
 
a12f13c
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a152e7c
a12f13c
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a152e7c
da0c6a1
d74b98a
a12f13c
39e4cd3
 
d74b98a
 
a12f13c
 
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
39e4cd3
 
 
 
 
a12f13c
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39e4cd3
 
 
 
a12f13c
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
d74b98a
 
a12f13c
 
d74b98a
 
a12f13c
 
d74b98a
 
a12f13c
39e4cd3
 
 
 
a12f13c
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39e4cd3
a12f13c
 
 
d74b98a
 
a12f13c
 
 
 
 
 
 
 
 
 
d74b98a
39e4cd3
a12f13c
 
d74b98a
 
a12f13c
 
39e4cd3
 
 
 
a12f13c
d74b98a
a12f13c
 
5bb5bea
 
 
 
a12f13c
 
 
 
 
 
 
 
5bb5bea
 
 
 
a12f13c
 
5bb5bea
 
 
a12f13c
 
 
 
 
 
 
 
 
 
 
 
 
5bb5bea
 
 
a12f13c
 
 
d74b98a
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
 
 
5bb5bea
39e4cd3
a12f13c
 
 
39e4cd3
a12f13c
 
 
5bb5bea
39e4cd3
a12f13c
39e4cd3
d74b98a
 
a12f13c
 
 
 
 
 
 
 
 
39e4cd3
 
a12f13c
 
 
 
 
 
d74b98a
39e4cd3
 
a12f13c
 
 
 
 
 
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
d74b98a
a12f13c
 
 
 
 
 
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d74b98a
 
 
 
 
 
4d7533d
 
 
da0c6a1
d74b98a
 
a12f13c
d74b98a
4d7533d
 
d74b98a
a12f13c
 
 
 
 
 
 
 
 
 
 
 
 
da0c6a1
 
a12f13c
67aedc8
 
 
 
 
 
da0c6a1
67aedc8
d74b98a
 
39e4cd3
d74b98a
4d7533d
a152e7c
 
a12f13c
 
 
d74b98a
39e4cd3
a12f13c
 
da0c6a1
a12f13c
 
 
da0c6a1
a12f13c
 
d74b98a
a12f13c
da0c6a1
39e4cd3
5bb5bea
d74b98a
a12f13c
 
d74b98a
a12f13c
 
 
da0c6a1
a12f13c
 
 
 
 
 
 
 
d74b98a
a12f13c
39e4cd3
a12f13c
d74b98a
a12f13c
 
 
 
67aedc8
a12f13c
 
39e4cd3
67aedc8
39e4cd3
 
a650320
 
 
 
39e4cd3
a650320
39e4cd3
 
a650320
 
39e4cd3
a12f13c
da0c6a1
a12f13c
 
da0c6a1
a650320
a12f13c
a650320
da0c6a1
a12f13c
 
a650320
a12f13c
 
39e4cd3
a12f13c
99f2d40
4d7533d
99f2d40
d74b98a
99f2d40
 
 
 
 
 
 
a12f13c
99f2d40
 
a12f13c
d74b98a
 
 
 
 
 
 
99f2d40
f724e04
 
 
 
 
 
 
a152e7c
99f2d40
2e619fa
 
4d7533d
2e619fa
d74b98a
2e619fa
 
 
 
 
 
 
 
 
 
 
 
a12f13c
 
2e619fa
 
39e4cd3
2e619fa
 
 
39e4cd3
 
 
 
2e619fa
ee62e00
 
 
39e4cd3
ee62e00
d74b98a
 
39e4cd3
ee62e00
d74b98a
 
2e619fa
d74b98a

from groq import Groq
from fastapi import FastAPI, HTTPException, Response
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
from bs4 import BeautifulSoup
from typing import List, Dict
import email as email_lib
import json
import os
import re
import hashlib
import subprocess
import tempfile
from dotenv import load_dotenv
from datetime import datetime, timedelta, timezone
from urllib.parse import urlparse, urlunparse, parse_qs, parse_qsl, urlencode, unquote

import firebase_admin
from firebase_admin import credentials, firestore


# ─────────────────────────────────────────
# 1. LOAD ENVIRONMENT VARIABLES
# ─────────────────────────────────────────
# Load variables from a .env file (when one is present) into the environment.
load_dotenv()
# None when GROQ_API_KEY is not set; the value is handed to the client as-is.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
# Module-wide Groq client used for every chat-completion call in this file.
groq_client = Groq(api_key=GROQ_API_KEY)


# ─────────────────────────────────────────
# 2. INITIALIZE FIREBASE
# ─────────────────────────────────────────
# Credentials come from the FIREBASE_CREDENTIALS environment variable (a JSON
# document) when it is set; otherwise from a local credentials file.
firebase_secret = os.getenv("FIREBASE_CREDENTIALS")
if firebase_secret:
    cred_dict = json.loads(firebase_secret)
    cred = credentials.Certificate(cred_dict)
else:
    cred = credentials.Certificate("firebase-credentials.json")

firebase_admin.initialize_app(cred)
# Firestore client shared by the helpers and routes in this module.
db = firestore.client()

# ASGI application object that the route decorators below register against.
app = FastAPI(title="JobPulse AI Parser")


# ─────────────────────────────────────────
# PYDANTIC MODELS
# ─────────────────────────────────────────
# Request body for POST /api/parse-email.
class EmailPayload(BaseModel):
    user_email: str  # matched against the "email" field of the "users" collection
    email_text: str  # raw email (RFC-2822 message or plain HTML) to parse


# Request body for POST /api/extract-skills.
class JDPayload(BaseModel):
    jd_text: str  # job description; parsed as HTML and reduced to text before use


# Request body for POST /api/compile-latex.
class LatexPayload(BaseModel):
    latex_code: str  # LaTeX source written to resume.tex and compiled with pdflatex


# ═════════════════════════════════════════════════════════════════
# STAGE 0: MIME + Quoted-Printable Decoder
# Emails arriving as raw RFC-2822 messages are:
#   - Multipart MIME  ->  must extract only the text/html part
#   - QP-encoded      ->  =3D means =, line-ending = means line continuation
# Running quopri on the full raw email (headers + body) corrupts everything.
# Python stdlib `email` module splits MIME correctly first.
# ═════════════════════════════════════════════════════════════════
def extract_html_from_email(raw: str) -> str:
    """
    Parse a raw RFC-2822 email and return its decoded HTML body.

    The first ``text/html`` part with a decodable payload is returned with its
    transfer encoding (base64 / quoted-printable) already removed. When the
    part declares a charset Python does not know, the bytes are decoded as
    UTF-8 instead of abandoning the message.

    Args:
        raw: Full email source, or plain HTML.

    Returns:
        The decoded HTML body, or ``raw`` unchanged when no HTML part exists or
        the input cannot be parsed as an email.
    """
    try:
        msg = email_lib.message_from_string(raw)
        for part in msg.walk():
            if part.get_content_type() != "text/html":
                continue
            # decode=True undoes base64 / quoted-printable and yields bytes.
            payload = part.get_payload(decode=True)
            if payload is None:
                continue
            charset = part.get_content_charset() or "utf-8"
            try:
                return payload.decode(charset, errors="replace")
            except LookupError:
                # Unknown/bogus charset label: still return the decoded HTML
                # rather than the undecoded MIME source.
                return payload.decode("utf-8", errors="replace")
    except Exception:
        # Best-effort by design: anything unparseable is treated as plain HTML.
        return raw
    # No HTML part found — the input may already be plain HTML.
    return raw


# ═════════════════════════════════════════════════════════════════
# STAGE 1: Platform Detector
# ═════════════════════════════════════════════════════════════════
def detect_platform(soup: BeautifulSoup, raw_text: str) -> str:
    """
    Guess which job board produced the email.

    Anchor hrefs are checked first, in a fixed priority order; if no known
    domain appears there, the visible text is searched for a brand name.
    Returns "generic" when nothing matches.
    """
    hrefs = " ".join(a.get("href", "") for a in soup.find_all("a", href=True)).lower()
    body = raw_text.lower()

    # (domains, platform) pairs — order defines precedence.
    by_link = (
        (("glassdoor.com",), "glassdoor"),
        (("linkedin.com",), "linkedin"),
        (("naukri.com",), "naukri"),
        (("foundit.in", "monster.com"), "foundit"),
        (("indeed.com",), "indeed"),
        (("instahyre.com",), "instahyre"),
    )
    for domains, name in by_link:
        if any(domain in hrefs for domain in domains):
            return name

    # Text fallback only covers these three brands.
    for name in ("glassdoor", "linkedin", "naukri"):
        if name in body:
            return name

    return "generic"


# ═════════════════════════════════════════════════════════════════
# STAGE 2: URL Utilities
# ═════════════════════════════════════════════════════════════════
# Query-parameter names stripped from job URLs by clean_url().
# Every entry MUST be lowercase: clean_url() compares `key.lower()` against this
# set, so a mixed-case entry would never match.
JUNK_PARAMS = {
    "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term",
    "jrtk", "guid", "ja", "uido", "cs", "cb", "ao", "s", "vt", "ea",
    "tgt", "src", "t", "pos",
    "trackingid", "refid", "lipi", "midtoken", "midsig", "trk", "trkemail", "eid", "otptoken",
    "spl", "notification_frequency", "autoapply", "jr_source", "apop", "notificationid", "response", "type",
    # Indeed tracking — 'jk' is intentionally NOT here, it is the job ID
    "qd", "rd", "tk", "alid", "bb", "mo", "ad", "xkcb", "camk", "p", "jsa", "rjs", "gdfvj", "plid", "fvj",
}

# Substrings that disqualify a URL as a job link. is_job_link() tests each one
# against the unquoted, lowercased URL, so entries are written in lowercase.
NOISE_SIGNALS = [
    "unsubscribe", "privacy", "terms", "manage", "email-pref",
    "brand-views", "brandview", "wf/open", "logomark", "logo.png",
    "easy-apply-icon", "location-icon", "bell-icon", "jobmatch",
    "twitter.com", "facebook.com", "instagram.com", "youtube.com",
    "glassdoor.com/about", "mailto:", "jobalertajax", "emailsettings",
    "job-alert/jobalert", "job-alert-email-unsubscribe", "jobs/alerts",
    "jobs/search", "comm/feed", "comm/mynetwork", "comm/messaging",
    "comm/notifications", "comm/premium", "comm/widgets",
    "linkedin.com/help", "in.linkedin.com/comm/in/",
    "static.licdn.com", "media.licdn.com",
    "naukri.com/mnjuser", "naukri.com/user",
    "seeker/dashboard", "seeker/profile", "seeker/jobalert-feedback",
    "trex/unsubscribe", "appurl.io", "play.google.com", "itunes.apple.com",
    "media.monsterindia.com", "media.foundit.in",
    "widget", "promo", "feed", "mynetwork",
]

# Per-platform substrings that mark a URL as a job posting. is_job_link()
# matches them against the unquoted, LOWERCASED URL, so every signal must be
# lowercase to be able to match.
PLATFORM_JOB_SIGNALS = {
    "glassdoor": ["/partner/joblisting", "joblistingid="],
    "linkedin":  ["/comm/jobs/view/", "/jobs/view/"],
    "naukri":    ["/job-listings-", "naukri.com/view"],
    "foundit":   ["/rio/autologin/"],
    "indeed":    ["/viewjob", "indeed.com/rc/clk", "indeed.com/pagead/clk", "cts.indeed.com"],
    "instahyre": ["instahyre.com/job-"],
    "generic":   ["/job", "/career", "/apply", "/position", "/vacancy"],
}


def unwrap_autologin_url(url: str) -> str:
    """
    Return the real job URL hidden inside a tracking / auto-login wrapper.

    Two wrappers are recognised:
      * any URL whose percent-decoded form embeds
        ``https://www.instahyre.com/job-<slug>`` — the embedded job URL is
        returned with a trailing slash;
      * URLs whose path contains ``/autoLogin/`` (any letter case) — the
        decoded ``return_url`` query parameter is returned when present.

    Args:
        url: The link as found in the email.

    Returns:
        The unwrapped URL, or ``url`` unchanged when no wrapper is recognised
        or the URL cannot be parsed.
    """
    try:
        unquoted = unquote(url)
        if "instahyre.com/job-" in unquoted:
            match = re.search(r"(https://www\.instahyre\.com/job-[^/?]+)", unquoted)
            if match:
                return match.group(1) + "/"
        parsed = urlparse(url)
        # Case-insensitive so this agrees with is_job_link(), which detects
        # the marker on a lowercased URL. "/autologin/" also covers
        # "/rio/autologin/".
        if "/autologin/" in parsed.path.lower():
            # parse_qs already percent-decodes the value.
            return_url = parse_qs(parsed.query).get("return_url", [None])[0]
            if return_url:
                return return_url
    except (ValueError, TypeError, AttributeError):
        # Malformed or non-string URL: unwrapping is best-effort, so fall
        # through and hand the input back untouched.
        pass
    return url


def clean_url(url: str) -> str:
    """
    Normalise a job link: unwrap auto-login wrappers, drop tracking query
    parameters, and remove the query string entirely for URLs containing a
    canonical job-path marker. Returns the (possibly unwrapped) input as-is
    if anything goes wrong.
    """
    try:
        url = unwrap_autologin_url(url)
        parts = urlparse(url)

        # Keep only parameters whose lowercased name is not a known tracker.
        kept = []
        for key, value in parse_qsl(parts.query, keep_blank_values=True):
            if key.lower() in JUNK_PARAMS:
                continue
            kept.append((key, value))
        filtered = urlunparse(parts._replace(query=urlencode(kept)))

        # These URL shapes identify the job on their own; no query needed.
        for marker in ("/comm/jobs/view/", "/jobs/view/", "/job/", "/job-listings-"):
            if marker in filtered:
                return urlunparse(parts._replace(query=""))
        return filtered
    except Exception:
        return url


def is_job_link(url: str, platform: str = "generic") -> bool:
    """
    Decide whether ``url`` points at a job posting for ``platform``.

    A URL containing any noise substring is rejected outright. Foundit
    auto-login links are judged by their unwrapped target; everything else is
    matched against the platform's signal list (generic signals for unknown
    platforms).
    """
    lowered = unquote(url).lower()

    for noise in NOISE_SIGNALS:
        if noise in lowered:
            return False

    if platform == "foundit" and "/rio/autologin/" in lowered:
        return "/job/" in unwrap_autologin_url(url).lower()

    platform_signals = PLATFORM_JOB_SIGNALS.get(platform, PLATFORM_JOB_SIGNALS["generic"])
    for signal in platform_signals:
        if signal in lowered:
            return True
    return False


# ═════════════════════════════════════════════════════════════════
# STAGE 3: Platform-Specific Card Extractors
# CRITICAL: Each card gets its OWN individual job_link.
# We never extract one link and paste it across multiple cards.
# ═════════════════════════════════════════════════════════════════

def extract_glassdoor(soup: BeautifulSoup) -> List[Dict]:
    """
    Build one card per Glassdoor card table.

    The link is the first anchor in the table that passes is_job_link();
    company and role come from the table's own span / paragraph. Tables with
    neither a role nor a company are dropped.
    """
    containers = soup.find_all("table", class_="gd-dbe9ce2b4a")
    print(f"   [Glassdoor] Found {len(containers)} card containers")

    results = []
    for container in containers:
        # First qualifying anchor inside THIS container only.
        link = next(
            (
                clean_url(anchor["href"])
                for anchor in container.find_all("a", href=True)
                if is_job_link(anchor["href"], "glassdoor")
            ),
            None,
        )
        company_el = container.find("span", class_="gd-628b46d9ce")
        role_el = container.find("p", class_="gd-6c2846d4dc")
        company = company_el.get_text(strip=True) if company_el else ""
        role = role_el.get_text(strip=True) if role_el else ""

        if not (role or company):
            continue
        results.append({"company": company, "role": role, "job_link": link})
    return results


def extract_linkedin(soup: BeautifulSoup) -> List[Dict]:
    """
    Build one card per LinkedIn ``data-test-id="job-card"`` cell.

    The link is the first anchor in the cell that passes is_job_link(); the
    role is the bold title anchor; the company is the text before the first
    middle dot in the grey subtitle paragraph.
    """
    def _is_title_class(c):
        return c and "font-bold" in c and "text-md" in c

    def _is_subtitle_class(c):
        return c and "text-system-gray-100" in c

    containers = soup.find_all("td", attrs={"data-test-id": "job-card"})
    print(f"   [LinkedIn] Found {len(containers)} job-card containers")

    results = []
    for container in containers:
        link = next(
            (
                clean_url(anchor["href"])
                for anchor in container.find_all("a", href=True)
                if is_job_link(anchor["href"], "linkedin")
            ),
            None,
        )

        title_el = container.find("a", class_=_is_title_class)
        role = title_el.get_text(strip=True) if title_el else ""

        company = ""
        subtitle_el = container.find("p", class_=_is_subtitle_class)
        if subtitle_el:
            # Subtitle reads "Company · Location"; the separator is the
            # middle dot, not a period.
            company = subtitle_el.get_text(strip=True).split("·")[0].strip()

        if role or company:
            results.append({"company": company, "role": role, "job_link": link})
    return results


def extract_indeed(soup: BeautifulSoup) -> List[Dict]:
    """
    Build one card per Indeed title anchor (``<a class="strong-text-link">``).

    The anchor's own href is that job's link (kept only if it passes
    is_job_link()). The company is the first text fragment of the <tr> that
    follows the title's enclosing <tr>.
    """
    anchors = soup.find_all("a", class_="strong-text-link")
    print(f"   [Indeed] Found {len(anchors)} job title links")

    results = []
    for anchor in anchors:
        target = anchor.get("href")
        link = clean_url(target) if target and is_job_link(target, "indeed") else None
        role = anchor.get_text(strip=True)

        company = ""
        title_row = anchor.find_parent("tr")
        company_row = title_row.find_next_sibling("tr") if title_row else None
        if company_row:
            fragments = company_row.get_text(separator=" | ", strip=True)
            company = fragments.split(" | ")[0].strip()

        if role or company:
            results.append({"company": company, "role": role, "job_link": link})
    return results


def extract_instahyre(soup: BeautifulSoup) -> List[Dict]:
    """
    Build one card per Instahyre ``<div class="job-block">``.

    The first anchor in the block supplies the link (if it passes
    is_job_link()); the first two <strong> tags supply company and role, in
    that order. Blocks with fewer than two <strong> tags yield no card.
    """
    blocks = soup.find_all("div", class_="job-block")
    print(f"   [Instahyre] Found {len(blocks)} job blocks")

    results = []
    for block in blocks:
        link = None
        first_anchor = block.find("a", href=True)
        if first_anchor and is_job_link(first_anchor["href"], "instahyre"):
            link = clean_url(first_anchor["href"])

        company = role = ""
        strongs = block.find_all("strong")
        if len(strongs) >= 2:
            company, role = (tag.get_text(strip=True) for tag in strongs[:2])

        if role or company:
            results.append({"company": company, "role": role, "job_link": link})
    return results


def extract_naukri(soup: BeautifulSoup) -> List[Dict]:
    """Extract Naukri cards via the generic anchor scan, using the "naukri" link signals."""
    return _generic_extract(soup, "naukri")


def extract_foundit(soup: BeautifulSoup) -> List[Dict]:
    """Extract Foundit cards via the generic anchor scan, using the "foundit" link rules."""
    return _generic_extract(soup, "foundit")


def _generic_extract(soup: BeautifulSoup, platform: str = "generic") -> List[Dict]:
    """
    Fallback extractor: every distinct cleaned job URL becomes one card.

    The anchor text is used as the role. The "company" field holds up to 200
    characters of text from the nearest enclosing td/div/li/tr/table whose
    text is shorter than 400 characters (empty if none qualifies).
    """
    container_tags = ("td", "div", "li", "tr", "table")
    found = []
    visited: set = set()

    for anchor in soup.find_all("a", href=True):
        raw_href = anchor["href"]
        if not is_job_link(raw_href, platform):
            continue

        link = clean_url(raw_href)
        if link in visited:
            continue
        visited.add(link)

        # Walk outwards until a small enough container is found.
        context = ""
        for ancestor in anchor.parents:
            if ancestor.name not in container_tags:
                continue
            candidate = ancestor.get_text(separator=" | ", strip=True)
            if len(candidate) < 400:
                context = candidate
                break

        found.append({
            "company": context[:200],
            "role": anchor.get_text(strip=True),
            "job_link": link,
        })

    print(f"   [Generic/{platform}] Found {len(found)} unique job links")
    return found


# Dispatch table: platform key (as returned by detect_platform) -> extractor.
# Each extractor is called with the parsed soup as its only argument.
PLATFORM_EXTRACTORS = {
    "glassdoor": extract_glassdoor,
    "linkedin":  extract_linkedin,
    "naukri":    extract_naukri,
    "foundit":   extract_foundit,
    "indeed":    extract_indeed,
    "instahyre": extract_instahyre,
    "generic":   _generic_extract,
}


def extract_cards(soup: BeautifulSoup, platform: str) -> List[Dict]:
    """Run the extractor registered for ``platform``; unknown platforms use the generic scan."""
    handler = PLATFORM_EXTRACTORS.get(platform)
    if handler is None:
        handler = _generic_extract
    return handler(soup)


# ═════════════════════════════════════════════════════════════════
# STAGE 4: Bouncer
# ═════════════════════════════════════════════════════════════════
# Lowercase substrings; an email containing any of them is treated as
# job-related by is_job_email().
JOB_KEYWORDS = [
    "applied", "application", "interview", "rejection", "job alert",
    "offer", "hiring", "shortlisted", "assessment", "jobs", "apply",
    "internship", "intern", "career", "glassdoor", "linkedin", "naukri",
    "opportunity", "resume", "foundit", "indeed", "instahyre",
    "position", "role", "vacancy", "opening",
]


def is_job_email(text: str) -> bool:
    """
    Return True when ``text`` contains any job keyword (case-insensitive).

    Matching is by plain substring, so a keyword embedded in a longer word
    also counts.
    """
    # Lowercase once instead of once per keyword.
    lowered = text.lower()
    return any(word in lowered for word in JOB_KEYWORDS)


# ═════════════════════════════════════════════════════════════════
# STAGE 5: LLM Enrichment
# Cards have company, role, job_link already set correctly.
# LLM adds: status, sourcePlatform, domainCategory, coreTech, interpretation.
# After LLM returns, we FORCE re-inject the original job_link from the card
# so even if LLM disobeys, the correct link is always used.
# ═════════════════════════════════════════════════════════════════

# Prompt template for enrich_cards_with_llm(). It is filled with str.format()
# using exactly three fields: {platform}, {email_text} and {card_summary}.
# Any other literal curly brace added to this text must be doubled ({{ }}),
# otherwise str.format() will raise.
LLM_CARD_PROMPT = """
You are a structured data extraction engine for a job application tracker.
You receive pre-parsed job cards AND the full original email text as context.

Each card has: company, role, job_link (job_link was extracted by code — do NOT change it).
Company and role may be empty or wrong — use the FULL EMAIL TEXT below to find the correct values.

Return a JSON ARRAY — one object per card, SAME COUNT and SAME ORDER as input.

STRICT RULES:
1. Return ONLY a raw JSON array []. No markdown, no backticks, no explanation.
2. Exactly one object per card — same count, same order as input.
3. Copy job_link EXACTLY as given. Never modify, guess, or omit it.
4. If job_link is null, output null (not the string "null").
5. For companyName: if the card value is empty/Unknown/wrong, find the REAL hiring company name from the EMAIL TEXT. Never output "Unknown Company" if the email text contains the company name.
6. For jobRole: if the card value is empty, find the real job title from the EMAIL TEXT.
7. Clean company: if "CompanyName · Location" format, extract only company name.
8. Clean role: remove extra whitespace or codes like [T500-25894].

FIELDS per object:
- "companyName": string — real hiring company name (use email text if card value is missing)
- "jobRole": string — clean job title (use email text if card value is missing)
- "jobLink": string or null — EXACT copy of job_link provided, never change this
- "status": one of: "Opportunity" | "Applied" | "Interview" | "Selection" | "Rejection"
    * Opportunity  = job alert, new opening not yet applied to
    * Applied      = application submitted confirmation
    * Interview    = interview or assessment invite
    * Selection    = offer letter or selected to proceed
    * Rejection    = application declined
- "sourcePlatform": one of: LinkedIn, Naukri, Indeed, Glassdoor, Wellfound, Instahyre, Workday, Greenhouse, Direct Email, Company Portal, Other
- "domainCategory": e.g. "Mobile Development", "Backend Engineering", "Data Science", "DevOps", "Frontend", "Full Stack", "Design", "Product Management", "Other"
- "coreTech": array of 1-3 strings — tech skills inferred from the role title
- "interpretation": 1 sentence describing what this role involves for the applicant

SOURCE PLATFORM HINT: {platform}

FULL EMAIL TEXT (use this to fill missing company/role):
{email_text}

JOB CARDS:
{card_summary}
"""


def build_card_summary(cards: List[Dict]) -> str:
    """
    Render cards as numbered plain-text blocks for the LLM prompt.

    Missing or empty values are shown as "Unknown" (company), "Unspecified"
    (role) and "null" (job_link). Blocks are separated by a blank line; an
    empty card list yields an empty string.
    """
    return "\n\n".join(
        f"Job {number}:\n"
        f"  company: {card.get('company') or 'Unknown'}\n"
        f"  role: {card.get('role') or 'Unspecified'}\n"
        f"  job_link: {card.get('job_link') or 'null'}"
        for number, card in enumerate(cards, start=1)
    )


# Platform keys whose display name is not simply str.capitalize(); values
# match the sourcePlatform spellings listed in LLM_CARD_PROMPT.
_PLATFORM_DISPLAY_NAMES = {
    "linkedin": "LinkedIn",
}


def _platform_display_name(platform: str) -> str:
    """Return the display name used in the prompt hint and fallback records."""
    return _PLATFORM_DISPLAY_NAMES.get(platform) or platform.capitalize()


def _fallback_enrichment(card: Dict, platform: str) -> Dict:
    """Build a minimally enriched record straight from an extracted card."""
    return {
        "companyName": card.get("company") or "Unknown Company",
        "jobRole": card.get("role") or "Unspecified Role",
        "jobLink": card.get("job_link"),
        "status": "Opportunity",
        "sourcePlatform": _platform_display_name(platform),
        "domainCategory": "Other",
        "coreTech": [],
        "interpretation": "Could not enrich — LLM call failed.",
    }


def enrich_cards_with_llm(cards: List[Dict], platform: str, email_text: str = "") -> List[Dict]:
    """
    Enrich extracted cards with LLM-derived fields, ten cards per request.

    For each chunk the model must return exactly one JSON object per card, in
    order. The card's own ``job_link`` is then written over whatever the model
    produced. If the call fails, or the reply is not a list of exactly
    ``len(chunk)`` objects, the whole chunk falls back to records built from
    the cards themselves — pairing by position is only trustworthy when the
    counts match, and no card may be silently dropped.

    Args:
        cards: Cards with "company", "role" and "job_link" keys.
        platform: Platform key as returned by detect_platform().
        email_text: Plain text of the email; the first 3000 characters are
            sent as context.

    Returns:
        One enriched dict per input card, in input order.
    """
    all_results: List[Dict] = []
    chunk_size = 10

    for i in range(0, len(cards), chunk_size):
        chunk = cards[i : i + chunk_size]
        prompt = LLM_CARD_PROMPT.format(
            platform=_platform_display_name(platform),
            email_text=email_text[:3000],  # cap to avoid token overflow
            card_summary=build_card_summary(chunk),
        )
        print(f"🧠 Enriching cards {i + 1}–{i + len(chunk)} via Groq...")

        try:
            response = groq_client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
            )
            raw = response.choices[0].message.content
            batch_result = _safe_parse_json(raw)

            # A short, long, or partially salvaged reply cannot be aligned
            # with the cards by index — treat it as a failed enrichment.
            if (
                not isinstance(batch_result, list)
                or len(batch_result) != len(chunk)
                or not all(isinstance(item, dict) for item in batch_result)
            ):
                raise ValueError(
                    f"expected {len(chunk)} JSON objects, "
                    f"got {len(batch_result) if isinstance(batch_result, list) else type(batch_result).__name__}"
                )

            # HARD SAFETY: the link found by the card extractor always wins
            # over anything the LLM returned.
            for card, enriched in zip(chunk, batch_result):
                enriched["jobLink"] = card.get("job_link")

            all_results.extend(batch_result)

        except Exception as e:
            print(f"⚠️ LLM enrichment failed for chunk {i + 1}–{i + len(chunk)}: {e}")
            # Fallback: preserve card data with minimal enrichment
            all_results.extend(_fallback_enrichment(card, platform) for card in chunk)

    return all_results


def _safe_parse_json(raw_text: str) -> list:
    """
    Pull a JSON array out of an LLM reply.

    Markdown code fences are stripped, then the span from the first "[" to the
    last "]" is parsed. If that span is not valid JSON, each flat ``{...}``
    fragment inside it is parsed on its own and the ones that succeed are
    returned. Returns [] when no array-like span exists.
    """
    cleaned = raw_text.replace("```json", "").replace("```", "").strip()
    found = re.search(r"\[.*\]", cleaned, re.DOTALL)
    if found is None:
        print("⚠️ No JSON array found in LLM response.")
        return []

    array_text = found.group()
    try:
        return json.loads(array_text)
    except json.JSONDecodeError as e:
        print(f"⚠️ JSON parse failed: {e}")

    # Salvage pass: objects without nested braces, one at a time.
    salvaged = []
    for fragment in re.findall(r"\{[^{}]+\}", array_text, re.DOTALL):
        try:
            parsed = json.loads(fragment)
        except Exception:
            continue
        salvaged.append(parsed)
    if salvaged:
        print(f"   Salvaged {len(salvaged)} partial objects.")
    return salvaged


# ═════════════════════════════════════════════════════════════════
# FIREBASE HELPERS
# ═════════════════════════════════════════════════════════════════
def generate_job_fingerprint(user_email: str, job: dict) -> str:
    """
    Return the document id used to deduplicate a job for a user.

    The id is the MD5 hex digest of "email|companyName|jobRole", lowercased.
    Missing keys contribute an empty string.
    """
    company = job.get("companyName", "")
    role = job.get("jobRole", "")
    key = f"{user_email}|{company}|{role}".lower()
    digest = hashlib.md5(key.encode())
    return digest.hexdigest()


def cleanup_expired_jobs(user_doc_id: str) -> None:
    """
    Delete the user's applications whose ``expireAt`` is in the past.

    All deletions go into one batch that is committed only if at least one
    document matched. Any error is printed and swallowed so the caller's
    request is never blocked by the sweep.
    """
    try:
        cutoff = datetime.now(timezone.utc)
        applications = (
            db.collection("users")
            .document(user_doc_id)
            .collection("applications")
        )
        expired_docs = applications.where("expireAt", "<", cutoff).stream()

        deleter = db.batch()
        removed = 0
        for snapshot in expired_docs:
            deleter.delete(snapshot.reference)
            removed += 1

        if removed > 0:
            deleter.commit()
            print(f"🧹 Sweeper: Deleted {removed} expired jobs.")
    except Exception as e:
        print(f"⚠️ Sweeper Error: {e}")


def extract_json_array(raw_text: str) -> list:
    """
    Return the JSON array embedded in ``raw_text``, or [] if there is none.

    Markdown code fences are removed first; the span from the first "[" to the
    last "]" is then parsed. Invalid JSON yields [].
    """
    stripped = raw_text.replace("```json", "").replace("```", "").strip()
    hit = re.search(r"\[.*\]", stripped, re.DOTALL)
    if hit:
        try:
            return json.loads(hit.group())
        except json.JSONDecodeError:
            pass
    return []


# ═════════════════════════════════════════════════════════════════
# ROUTES
# ═════════════════════════════════════════════════════════════════

# Root route: returns a static HTML heading, served as an HTML response.
@app.get("/", response_class=HTMLResponse)
def get_testing_ui():
    return "<h1>JobPulse Server is Running!</h1>"


# ─────────────────────────────────────────
# ROUTE 1: Parse Email → Extract Cards → Enrich → Save to Firebase
# ─────────────────────────────────────────
# Pipeline: decode MIME -> strip markup -> bouncer -> find user -> sweep expired
# rows -> detect platform -> extract cards -> LLM enrichment -> deduplicated
# Firestore batch write.
#
# Responds with a "success" payload in every non-error case (including
# "ignored" / "nothing found"); raises 404 when the user email is unknown.
@app.post("/api/parse-email")
def parse_email_with_ai(payload: EmailPayload):

    # STEP 1: Decode MIME + QP properly
    html_body = extract_html_from_email(payload.email_text)

    # STEP 2: Parse HTML, strip noise tags
    soup = BeautifulSoup(html_body, "html.parser")
    for tag in soup(["script", "style", "meta", "noscript", "head"]):
        tag.extract()

    raw_text = soup.get_text(separator=" ", strip=True)

    # STEP 3: Bouncer
    if not is_job_email(raw_text):
        print("🛡️ BOUNCER: Not a job email. Skipped.")
        return {"status": "success", "message": "Ignored: Not a job email."}

    # STEP 4: Find user in Firebase
    users_ref = db.collection("users")
    query = users_ref.where("email", "==", payload.user_email).limit(1).stream()
    user_doc_id = None
    for doc in query:
        user_doc_id = doc.id
        break

    if not user_doc_id:
        raise HTTPException(
            status_code=404,
            detail=f"User with email {payload.user_email} not found in database.",
        )

    cleanup_expired_jobs(user_doc_id)

    # STEP 5: Detect platform
    platform = detect_platform(soup, raw_text)
    print(f"🎯 Detected platform: {platform.upper()}")

    # STEP 6: Extract job cards — each card gets its OWN individual link
    print("📦 Extracting job cards...")
    cards = extract_cards(soup, platform)

    if not cards:
        print("⚠️ No cards found. Trying generic fallback...")
        cards = _generic_extract(soup, "generic")

    if not cards:
        return {"status": "success", "message": "No job listings found in this email."}

    print(f"✅ Extracted {len(cards)} job cards — each with its own unique link.")

    # STEP 7: Enrich with LLM (adds status, coreTech, domainCategory, etc.)
    enriched_jobs = enrich_cards_with_llm(cards, platform, email_text=raw_text)

    if not enriched_jobs:
        return {"status": "success", "message": "LLM enrichment returned no results."}

    # STEP 8: IST timestamp
    ist_tz = timezone(timedelta(hours=5, minutes=30))
    exact_timestamp = datetime.now(ist_tz).strftime("%H-%M %d/%m/%Y")

    # STEP 9: Firebase batch write with deduplication + TTL
    batch = db.batch()
    applications_ref = (
        db.collection("users")
        .document(user_doc_id)
        .collection("applications")
    )
    expiry_date = datetime.now(timezone.utc) + timedelta(days=60)

    saved_count = 0
    updated_count = 0
    skipped_count = 0
    # Fingerprints already handled in THIS email, so the same job listed twice
    # is written and counted once.
    seen_fingerprints = set()

    for job in enriched_jobs:
        job["dateApplied"] = exact_timestamp
        # Only un-acted-on opportunities carry a TTL.
        if job.get("status") == "Opportunity":
            job["expireAt"] = expiry_date

        fingerprint = generate_job_fingerprint(payload.user_email, job)
        if fingerprint in seen_fingerprints:
            skipped_count += 1
            print(f"⏭️  Skipped duplicate: {job.get('companyName')} - {job.get('jobRole')}")
            continue
        seen_fingerprints.add(fingerprint)

        job_doc_ref = applications_ref.document(fingerprint)
        existing_snap = job_doc_ref.get()

        if existing_snap.exists:
            existing_status = existing_snap.to_dict().get("status")
            new_status = job.get("status")
            # Never overwrite a stored status with a missing one, and never
            # downgrade to "Opportunity".
            if new_status and existing_status != new_status and new_status != "Opportunity":
                batch.update(job_doc_ref, {
                    "status": new_status,
                    "dateApplied": exact_timestamp,
                    # The job is no longer a mere opportunity: drop the TTL so
                    # the sweeper does not delete it later.
                    "expireAt": firestore.DELETE_FIELD,
                })
                updated_count += 1
                print(f"🔄 Updated status: {job.get('companyName')} → {new_status}")
            else:
                skipped_count += 1
                print(f"⏭️  Skipped duplicate: {job.get('companyName')} - {job.get('jobRole')}")
            continue

        batch.set(job_doc_ref, job)
        saved_count += 1

    if (saved_count + updated_count) > 0:
        batch.commit()
        print(f"💾 Firebase: Saved {saved_count} new jobs, Updated {updated_count} jobs.")

    return {
        "status": "success",
        "message": f"Saved {saved_count} jobs. Updated {updated_count}. Skipped {skipped_count} duplicates.",
        "platform": platform,
        "cardsExtracted": len(cards),
        "data": enriched_jobs,
    }


# ─────────────────────────────────────────
# ROUTE 2: JD Skill Extractor
# ─────────────────────────────────────────
# Reduces the submitted job description to plain text, asks the LLM for its
# core hard skills, and returns them as a JSON list. Responds 400 when the
# text is shorter than 50 characters and 500 when the LLM call fails.
@app.post("/api/extract-skills")
def extract_jd_skills(payload: JDPayload):
    clean_jd = BeautifulSoup(payload.jd_text, "html.parser").get_text(separator="\n", strip=True)

    if len(clean_jd) < 50:
        raise HTTPException(status_code=400, detail="Job description text is too short or empty.")

    prompt = f"""
Extract the top 5 to 10 core 'hard skills' (technical skills, tools, languages, frameworks)
from the following Job Description. Ignore soft skills like communication or teamwork.
OUTPUT FORMAT: Return ONLY a raw JSON array of strings. No markdown, no explanation.
Example: ["Python", "SQL", "React", "AWS", "Docker"]

Job Description:
{clean_jd}
"""
    request_messages = [{"role": "user", "content": prompt}]
    try:
        completion = groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=request_messages,
            temperature=0.0,
        )
        skills = extract_json_array(completion.choices[0].message.content)
        if not skills:
            skills = []
        return {"status": "success", "skills": skills}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


# ─────────────────────────────────────────
# ROUTE 3: LaTeX Resume → PDF Compiler
# ─────────────────────────────────────────
# Compiles the submitted LaTeX source with pdflatex inside a throwaway
# directory and returns the PDF as an attachment. Responds 500 when no PDF is
# produced, when a pass exceeds the time limit, or on any other failure.
#
# The LaTeX source is untrusted input, so each pdflatex run:
#   * has shell escape disabled (-no-shell-escape),
#   * runs with the temp directory as its working directory, so relative
#     \input / \include paths cannot reach files next to the application,
#   * is killed after a timeout so a non-terminating document cannot hang the
#     worker.
# NOTE(review): absolute-path file reads from LaTeX are still possible; run
# this service in a sandbox/container without secrets on disk if that matters.
@app.post("/api/compile-latex")
def compile_latex_to_pdf(payload: LatexPayload):
    pass_timeout_seconds = 60

    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            tex_file_path = os.path.join(temp_dir, "resume.tex")
            pdf_file_path = os.path.join(temp_dir, "resume.pdf")

            with open(tex_file_path, "w", encoding="utf-8") as f:
                f.write(payload.latex_code)

            command = [
                "pdflatex",
                "-interaction=nonstopmode",
                "-no-shell-escape",
                "-output-directory", temp_dir,
                tex_file_path,
            ]
            # Two passes so cross-references settle.
            for _ in range(2):
                subprocess.run(
                    command,
                    cwd=temp_dir,
                    stdin=subprocess.DEVNULL,
                    # Output is never inspected; discard it instead of
                    # buffering the whole log in memory.
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    timeout=pass_timeout_seconds,
                    check=False,  # success is judged by the PDF existing
                )

            if not os.path.exists(pdf_file_path):
                raise HTTPException(
                    status_code=500,
                    detail="LaTeX compilation failed. Check your LaTeX syntax.",
                )

            # Read before the temporary directory is removed.
            with open(pdf_file_path, "rb") as pdf_file:
                pdf_bytes = pdf_file.read()

        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={"Content-Disposition": 'attachment; filename="Tailored_Resume.pdf"'},
        )
    except HTTPException:
        raise
    except subprocess.TimeoutExpired:
        raise HTTPException(status_code=500, detail="LaTeX compilation timed out.")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))