Spaces:
Running
Running
| from groq import Groq | |
| from fastapi import FastAPI, HTTPException, Response | |
| from fastapi.responses import HTMLResponse | |
| from pydantic import BaseModel | |
| from bs4 import BeautifulSoup | |
| from typing import List, Dict | |
| import email as email_lib | |
| import json | |
| import os | |
| import re | |
| import hashlib | |
| import subprocess | |
| import tempfile | |
| from dotenv import load_dotenv | |
| from datetime import datetime, timedelta, timezone | |
| from urllib.parse import urlparse, urlunparse, parse_qs, parse_qsl, urlencode, unquote | |
| import firebase_admin | |
| from firebase_admin import credentials, firestore | |
# ─────────────────────────────────────────
# 1. LOAD ENVIRONMENT VARIABLES
# ─────────────────────────────────────────
# Read a local .env file (if present) into the process environment before
# any os.getenv() lookups below.
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")  # None when the variable is unset
# Shared Groq client used by every LLM call in this module.
groq_client = Groq(api_key=GROQ_API_KEY)
# ─────────────────────────────────────────
# 2. INITIALIZE FIREBASE
# ─────────────────────────────────────────
# Credentials come from the FIREBASE_CREDENTIALS environment variable (a JSON
# document) when it is set; otherwise from firebase-credentials.json on disk.
firebase_secret = os.getenv("FIREBASE_CREDENTIALS")
if firebase_secret:
    cred_dict = json.loads(firebase_secret)
    cred = credentials.Certificate(cred_dict)
else:
    cred = credentials.Certificate("firebase-credentials.json")
firebase_admin.initialize_app(cred)
# Firestore handle shared by the helpers and routes below.
db = firestore.client()
app = FastAPI(title="JobPulse AI Parser")
# ─────────────────────────────────────────
# PYDANTIC MODELS
# ─────────────────────────────────────────
# Request body consumed by parse_email_with_ai().
class EmailPayload(BaseModel):
    user_email: str  # looked up against the "email" field of the users collection
    email_text: str  # raw email (RFC-2822 message or bare HTML) to be parsed
# Request body consumed by extract_jd_skills().
class JDPayload(BaseModel):
    jd_text: str  # job description; HTML markup is stripped before use
# Request body consumed by compile_latex_to_pdf().
class LatexPayload(BaseModel):
    latex_code: str  # complete LaTeX document source, written to resume.tex
# ─────────────────────────────────────────────────────────────────
# STAGE 0: MIME + Quoted-Printable Decoder
# Emails arriving as raw RFC-2822 messages are:
# - Multipart MIME -> must extract only the text/html part
# - QP-encoded -> =3D means =, a trailing = means line continuation
# Running quopri on the full raw email (headers + body) corrupts everything.
# The Python stdlib `email` module splits the MIME parts correctly first.
# ─────────────────────────────────────────────────────────────────
def extract_html_from_email(raw: str) -> str:
    """
    Return the decoded body of the first text/html part of a raw email.

    The input is parsed as an RFC-2822 message. When no text/html part is
    present, or anything goes wrong while parsing/decoding, the input is
    returned unchanged on the assumption that it already is HTML.
    """
    try:
        message = email_lib.message_from_string(raw)
        html_part = next(
            (
                candidate
                for candidate in message.walk()
                if candidate.get_content_type() == "text/html"
            ),
            None,
        )
        if html_part is None:
            # No HTML part found: the input may already be plain HTML.
            return raw
        # decode=True undoes the transfer encoding (base64 / quoted-printable).
        body_bytes = html_part.get_payload(decode=True)
        encoding = html_part.get_content_charset() or "utf-8"
        return body_bytes.decode(encoding, errors="replace")
    except Exception:
        return raw
# ─────────────────────────────────────────────────────────────────
# STAGE 1: Platform Detector
# ─────────────────────────────────────────────────────────────────
def detect_platform(soup: BeautifulSoup, raw_text: str) -> str:
    """
    Guess which job platform sent the email.

    Domains found in anchor hrefs are checked first, in a fixed priority
    order; if none match, the visible text is searched for a platform name.
    Returns "generic" when nothing matches.
    """
    hrefs = " ".join(
        anchor.get("href", "") for anchor in soup.find_all("a", href=True)
    ).lower()
    body = raw_text.lower()

    # (domains, platform) pairs; the first pair with a domain present wins.
    domain_rules = (
        (("glassdoor.com",), "glassdoor"),
        (("linkedin.com",), "linkedin"),
        (("naukri.com",), "naukri"),
        (("foundit.in", "monster.com"), "foundit"),
        (("indeed.com",), "indeed"),
        (("instahyre.com",), "instahyre"),
    )
    for domains, name in domain_rules:
        if any(domain in hrefs for domain in domains):
            return name

    # Fallback: platform name mentioned anywhere in the text.
    for name in ("glassdoor", "linkedin", "naukri"):
        if name in body:
            return name
    return "generic"
# ─────────────────────────────────────────────────────────────────
# STAGE 2: URL Utilities
# ─────────────────────────────────────────────────────────────────
# Query-string keys that carry only tracking/session data and are stripped
# from job links. Keys are lowercased before being tested against this set,
# so every entry MUST be lowercase or it can never match.
JUNK_PARAMS = {
    "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term",
    "jrtk", "guid", "ja", "uido", "cs", "cb", "ao", "s", "vt", "ea",
    "tgt", "src", "t", "pos",
    "trackingid", "refid", "lipi", "midtoken", "midsig", "trk", "trkemail", "eid", "otptoken",
    # "autoapply" was previously spelled "autoApply" and therefore never matched.
    "spl", "notification_frequency", "autoapply", "jr_source", "apop", "notificationid", "response", "type",
    # Indeed tracking — 'jk' is intentionally NOT here, it is the job ID
    "qd", "rd", "tk", "alid", "bb", "mo", "ad", "xkcb", "camk", "p", "jsa", "rjs", "gdfvj", "plid", "fvj",
}

# Substrings that mark a URL as NOT a job posting (settings, social links,
# images, feeds, ...). Matched against the lowercased, percent-decoded URL.
NOISE_SIGNALS = [
    "unsubscribe", "privacy", "terms", "manage", "email-pref",
    "brand-views", "brandview", "wf/open", "logomark", "logo.png",
    "easy-apply-icon", "location-icon", "bell-icon", "jobmatch",
    "twitter.com", "facebook.com", "instagram.com", "youtube.com",
    "glassdoor.com/about", "mailto:", "jobalertajax", "emailsettings",
    "job-alert/jobalert", "job-alert-email-unsubscribe", "jobs/alerts",
    "jobs/search", "comm/feed", "comm/mynetwork", "comm/messaging",
    "comm/notifications", "comm/premium", "comm/widgets",
    "linkedin.com/help", "in.linkedin.com/comm/in/",
    "static.licdn.com", "media.licdn.com",
    "naukri.com/mnjuser", "naukri.com/user",
    "seeker/dashboard", "seeker/profile", "seeker/jobalert-feedback",
    "trex/unsubscribe", "appurl.io", "play.google.com", "itunes.apple.com",
    "media.monsterindia.com", "media.foundit.in",
    "widget", "promo", "feed", "mynetwork",
]

# Per-platform substrings that identify a job-posting URL. These are matched
# against a lowercased URL, so they must be lowercase as well.
PLATFORM_JOB_SIGNALS = {
    "glassdoor": ["/partner/joblisting", "joblistingid="],
    "linkedin": ["/comm/jobs/view/", "/jobs/view/"],
    "naukri": ["/job-listings-", "naukri.com/view"],
    # Was "/rio/autoLogin/": the capital L made the entry unmatchable.
    "foundit": ["/rio/autologin/"],
    "indeed": ["/viewjob", "indeed.com/rc/clk", "indeed.com/pagead/clk", "cts.indeed.com"],
    "instahyre": ["instahyre.com/job-"],
    "generic": ["/job", "/career", "/apply", "/position", "/vacancy"],
}
def unwrap_autologin_url(url: str) -> str:
    """
    Strip redirect/auto-login wrappers from a link.

    * If the percent-decoded URL embeds an https://www.instahyre.com/job-...
      address, that address (with a trailing slash) is returned.
    * If the path contains an autoLogin segment, the decoded ``return_url``
      query parameter is returned when present.

    In every other case, including any parsing error, ``url`` is returned
    unchanged.
    """
    try:
        decoded = unquote(url)
        if "instahyre.com/job-" in decoded:
            hit = re.search(r"(https://www\.instahyre\.com/job-[^/?]+)", decoded)
            if hit is not None:
                return hit.group(1) + "/"

        pieces = urlparse(url)
        wrapped = any(
            marker in pieces.path for marker in ("/rio/autoLogin/", "/autoLogin/")
        )
        if wrapped:
            targets = parse_qs(pieces.query).get("return_url", [None])
            destination = targets[0]
            if destination:
                return destination
    except Exception:
        # Best effort only: fall through and hand back the original link.
        pass
    return url
def clean_url(url: str) -> str:
    """
    Normalise a job link so the same posting always yields the same URL.

    The link is first unwrapped from auto-login redirects, then every query
    parameter whose lowercased key is in JUNK_PARAMS is removed. For URL
    shapes that identify the job by path alone, the whole query string is
    dropped. If anything raises, the URL in its current state is returned.
    """
    try:
        url = unwrap_autologin_url(url)
        pieces = urlparse(url)
        kept = [
            (key, value)
            for key, value in parse_qsl(pieces.query, keep_blank_values=True)
            if key.lower() not in JUNK_PARAMS
        ]
        pieces = pieces._replace(query=urlencode(kept))
        cleaned = urlunparse(pieces)

        # Path-identified postings: nothing in the query string is needed.
        for marker in ("/comm/jobs/view/", "/jobs/view/", "/job/", "/job-listings-"):
            if marker in cleaned:
                return urlunparse(pieces._replace(query=""))
        return cleaned
    except Exception:
        return url
def is_job_link(url: str, platform: str = "generic") -> bool:
    """
    Decide whether ``url`` points at an individual job posting.

    Any NOISE_SIGNALS substring vetoes the link. Foundit auto-login links are
    judged by their unwrapped destination (it must contain "/job/"). All
    other links need one of the platform's PLATFORM_JOB_SIGNALS substrings;
    unknown platforms use the "generic" list.
    """
    lowered = unquote(url).lower()
    for noise in NOISE_SIGNALS:
        if noise in lowered:
            return False

    if platform == "foundit" and "/rio/autologin/" in lowered:
        destination = unwrap_autologin_url(url)
        return "/job/" in destination.lower()

    markers = PLATFORM_JOB_SIGNALS.get(platform, PLATFORM_JOB_SIGNALS["generic"])
    for marker in markers:
        if marker in lowered:
            return True
    return False
# ─────────────────────────────────────────────────────────────────
# STAGE 3: Platform-Specific Card Extractors
# CRITICAL: Each card gets its OWN individual job_link.
# We never extract one link and paste it across multiple cards.
# ─────────────────────────────────────────────────────────────────
def extract_glassdoor(soup: BeautifulSoup) -> List[Dict]:
    """
    Extract one card per Glassdoor card table.

    Each card is a dict with "company", "role" and "job_link"; the link is
    the first anchor inside the table that qualifies as a Glassdoor job link
    (None if there is none). Tables yielding neither a role nor a company
    are skipped.
    """
    containers = soup.find_all("table", class_="gd-dbe9ce2b4a")
    print(f" [Glassdoor] Found {len(containers)} card containers")

    results = []
    for container in containers:
        # First qualifying anchor inside THIS container only.
        link = next(
            (
                clean_url(anchor["href"])
                for anchor in container.find_all("a", href=True)
                if is_job_link(anchor["href"], "glassdoor")
            ),
            None,
        )
        company_node = container.find("span", class_="gd-628b46d9ce")
        role_node = container.find("p", class_="gd-6c2846d4dc")
        entry: Dict = {
            "company": company_node.get_text(strip=True) if company_node else "",
            "role": role_node.get_text(strip=True) if role_node else "",
            "job_link": link,
        }
        if entry["role"] or entry["company"]:
            results.append(entry)
    return results
def extract_linkedin(soup: BeautifulSoup) -> List[Dict]:
    """
    Extract one card per LinkedIn ``<td data-test-id="job-card">`` cell.

    Each card is a dict with "company", "role" and "job_link". The link is
    the first anchor inside the cell that qualifies as a LinkedIn job link
    (None if there is none). Cells yielding neither a role nor a company are
    skipped.
    """
    cards = []
    card_tds = soup.find_all("td", attrs={"data-test-id": "job-card"})
    print(f" [LinkedIn] Found {len(card_tds)} job-card containers")
    for card_td in card_tds:
        card: Dict = {"company": "", "role": "", "job_link": None}
        for a_tag in card_td.find_all("a", href=True):
            href = a_tag["href"]
            if is_job_link(href, "linkedin"):
                card["job_link"] = clean_url(href)
                break
        role_a = card_td.find("a", class_=lambda c: c and "font-bold" in c and "text-md" in c)
        if role_a:
            card["role"] = role_a.get_text(strip=True)
        company_p = card_td.find("p", class_=lambda c: c and "text-system-gray-100" in c)
        if company_p:
            raw = company_p.get_text(strip=True)
            # The line reads "Company · Location"; keep the part before the
            # middle dot (U+00B7). The separator is written as an escape so a
            # mis-decoded source file cannot turn it into a two-character
            # sequence that never occurs in real emails.
            card["company"] = raw.split("\u00b7")[0].strip()
        if card["role"] or card["company"]:
            cards.append(card)
    return cards
def extract_indeed(soup: BeautifulSoup) -> List[Dict]:
    """
    Extract one card per ``<a class="strong-text-link">`` title anchor.

    The anchor's own href is the link for that specific job (kept only when
    it qualifies as an Indeed job link), its text is the role, and the
    company is the first text fragment of the <tr> that follows the anchor's
    enclosing <tr>.
    """
    anchors = soup.find_all("a", class_="strong-text-link")
    print(f" [Indeed] Found {len(anchors)} job title links")

    results = []
    for anchor in anchors:
        target = anchor.get("href")
        link = clean_url(target) if target and is_job_link(target, "indeed") else None
        role = anchor.get_text(strip=True)

        company = ""
        title_row = anchor.find_parent("tr")
        following_row = title_row.find_next_sibling("tr") if title_row else None
        if following_row:
            fragments = following_row.get_text(separator=" | ", strip=True)
            company = fragments.split(" | ")[0].strip()

        entry: Dict = {"company": company, "role": role, "job_link": link}
        if role or company:
            results.append(entry)
    return results
def extract_instahyre(soup: BeautifulSoup) -> List[Dict]:
    """
    Extract one card per ``<div class="job-block">``.

    Only the first anchor of the block is considered for the link. When the
    block has at least two <strong> tags, the first is the company and the
    second the role. Blocks yielding neither are skipped.
    """
    blocks = soup.find_all("div", class_="job-block")
    print(f" [Instahyre] Found {len(blocks)} job blocks")

    results = []
    for block in blocks:
        link = None
        first_anchor = block.find("a", href=True)
        if first_anchor and is_job_link(first_anchor["href"], "instahyre"):
            link = clean_url(first_anchor["href"])

        company = role = ""
        emphasised = block.find_all("strong")
        if len(emphasised) >= 2:
            company = emphasised[0].get_text(strip=True)
            role = emphasised[1].get_text(strip=True)

        if role or company:
            results.append({"company": company, "role": role, "job_link": link})
    return results
def extract_naukri(soup: BeautifulSoup) -> List[Dict]:
    """Extract Naukri cards via the generic anchor scan with Naukri link signals."""
    return _generic_extract(soup, platform="naukri")
def extract_foundit(soup: BeautifulSoup) -> List[Dict]:
    """Extract Foundit cards via the generic anchor scan with Foundit link signals."""
    return _generic_extract(soup, platform="foundit")
def _generic_extract(soup: BeautifulSoup, platform: str = "generic") -> List[Dict]:
    """
    Fallback extractor: one card per distinct job link found in any anchor.

    The anchor text becomes the role. The text of the nearest enclosing
    td/div/li/tr/table, when shorter than 400 characters, is used as company
    context and truncated to 200 characters.
    """
    container_tags = ("td", "div", "li", "tr", "table")
    results = []
    visited: set = set()

    for anchor in soup.find_all("a", href=True):
        target = anchor["href"]
        if not is_job_link(target, platform):
            continue
        link = clean_url(target)
        if link in visited:
            continue
        visited.add(link)

        # Only the nearest container can qualify: every further ancestor
        # contains at least the same text, so it is never shorter.
        context = ""
        nearest = next(
            (node for node in anchor.parents if node.name in container_tags),
            None,
        )
        if nearest is not None:
            surrounding = nearest.get_text(separator=" | ", strip=True)
            if len(surrounding) < 400:
                context = surrounding

        results.append({
            "company": context[:200],
            "role": anchor.get_text(strip=True),
            "job_link": link,
        })

    print(f" [Generic/{platform}] Found {len(results)} unique job links")
    return results
# Platform name (as returned by detect_platform) -> card extractor.
PLATFORM_EXTRACTORS = {
    "glassdoor": extract_glassdoor,
    "linkedin": extract_linkedin,
    "naukri": extract_naukri,
    "foundit": extract_foundit,
    "indeed": extract_indeed,
    "instahyre": extract_instahyre,
    "generic": _generic_extract,
}


def extract_cards(soup: BeautifulSoup, platform: str) -> List[Dict]:
    """Run the extractor registered for ``platform``; unknown names use the generic one."""
    try:
        handler = PLATFORM_EXTRACTORS[platform]
    except KeyError:
        handler = _generic_extract
    return handler(soup)
# ─────────────────────────────────────────────────────────────────
# STAGE 4: Bouncer
# ─────────────────────────────────────────────────────────────────
# Lowercase substrings; one hit anywhere in the email text is enough for the
# email to be treated as job-related.
JOB_KEYWORDS = [
    "applied", "application", "interview", "rejection", "job alert",
    "offer", "hiring", "shortlisted", "assessment", "jobs", "apply",
    "internship", "intern", "career", "glassdoor", "linkedin", "naukri",
    "opportunity", "resume", "foundit", "indeed", "instahyre",
    "position", "role", "vacancy", "opening",
]


def is_job_email(text: str) -> bool:
    """
    Return True when ``text`` contains any JOB_KEYWORDS entry.

    Matching is a case-insensitive substring test, so keywords also match
    inside longer words.
    """
    # Lowercase once instead of once per keyword.
    lowered = text.lower()
    return any(word in lowered for word in JOB_KEYWORDS)
# ─────────────────────────────────────────────────────────────────
# STAGE 5: LLM Enrichment
# Cards already have company, role and job_link set by the extractors.
# The LLM adds: status, sourcePlatform, domainCategory, coreTech, interpretation.
# After the LLM returns, we FORCE re-inject the original job_link from the card,
# so even if the LLM disobeys, the correct link is always used.
# ─────────────────────────────────────────────────────────────────
# Prompt template for card enrichment. Placeholders: {platform}, {email_text},
# {card_summary}. The dashes and the middle dot below were mis-encoded in the
# source; rule 7 in particular must show the real "·" separator.
LLM_CARD_PROMPT = """
You are a structured data extraction engine for a job application tracker.
You receive pre-parsed job cards AND the full original email text as context.
Each card has: company, role, job_link (job_link was extracted by code — do NOT change it).
Company and role may be empty or wrong — use the FULL EMAIL TEXT below to find the correct values.
Return a JSON ARRAY — one object per card, SAME COUNT and SAME ORDER as input.
STRICT RULES:
1. Return ONLY a raw JSON array []. No markdown, no backticks, no explanation.
2. Exactly one object per card — same count, same order as input.
3. Copy job_link EXACTLY as given. Never modify, guess, or omit it.
4. If job_link is null, output null (not the string "null").
5. For companyName: if the card value is empty/Unknown/wrong, find the REAL hiring company name from the EMAIL TEXT. Never output "Unknown Company" if the email text contains the company name.
6. For jobRole: if the card value is empty, find the real job title from the EMAIL TEXT.
7. Clean company: if "CompanyName · Location" format, extract only company name.
8. Clean role: remove extra whitespace or codes like [T500-25894].
FIELDS per object:
- "companyName": string — real hiring company name (use email text if card value is missing)
- "jobRole": string — clean job title (use email text if card value is missing)
- "jobLink": string or null — EXACT copy of job_link provided, never change this
- "status": one of: "Opportunity" | "Applied" | "Interview" | "Selection" | "Rejection"
* Opportunity = job alert, new opening not yet applied to
* Applied = application submitted confirmation
* Interview = interview or assessment invite
* Selection = offer letter or selected to proceed
* Rejection = application declined
- "sourcePlatform": one of: LinkedIn, Naukri, Indeed, Glassdoor, Wellfound, Instahyre, Workday, Greenhouse, Direct Email, Company Portal, Other
- "domainCategory": e.g. "Mobile Development", "Backend Engineering", "Data Science", "DevOps", "Frontend", "Full Stack", "Design", "Product Management", "Other"
- "coreTech": array of 1-3 strings — tech skills inferred from the role title
- "interpretation": 1 sentence describing what this role involves for the applicant
SOURCE PLATFORM HINT: {platform}
FULL EMAIL TEXT (use this to fill missing company/role):
{email_text}
JOB CARDS:
{card_summary}
"""
def build_card_summary(cards: List[Dict]) -> str:
    """
    Render cards as the numbered plain-text list embedded in the LLM prompt.

    Missing or empty values are shown as "Unknown" (company), "Unspecified"
    (role) and "null" (job_link). Entries are separated by a blank line.
    """
    def render(position: int, card: Dict) -> str:
        company = card.get("company") or "Unknown"
        role = card.get("role") or "Unspecified"
        link = card.get("job_link") or "null"
        return (
            f"Job {position}:\n"
            f" company: {company}\n"
            f" role: {role}\n"
            f" job_link: {link}"
        )

    return "\n\n".join(
        render(position, card) for position, card in enumerate(cards, start=1)
    )
def _fallback_enrichment(card: Dict, platform: str) -> Dict:
    """Build a minimally enriched record for a card the LLM could not handle."""
    return {
        "companyName": card.get("company") or "Unknown Company",
        "jobRole": card.get("role") or "Unspecified Role",
        "jobLink": card.get("job_link"),
        "status": "Opportunity",
        "sourcePlatform": platform.capitalize(),
        "domainCategory": "Other",
        "coreTech": [],
        "interpretation": "Could not enrich — LLM call failed.",
    }


def enrich_cards_with_llm(cards: List[Dict], platform: str, email_text: str = "") -> List[Dict]:
    """
    Enrich extracted cards with LLM-derived fields, in chunks of 10.

    Args:
        cards: Cards with "company", "role" and "job_link" keys.
        platform: Detected platform name, passed to the model as a hint.
        email_text: Visible email text; only the first 3000 characters are sent.

    Returns:
        Exactly one dict per input card, in input order. Each dict's
        "jobLink" is always the card's own ``job_link``. Cards for which the
        model gave no usable object (failed call, short answer, or a
        non-object entry) get a fallback record instead of being dropped,
        and surplus objects returned by the model are ignored.
    """
    all_results: List[Dict] = []
    chunk_size = 10
    for start in range(0, len(cards), chunk_size):
        chunk = cards[start : start + chunk_size]
        first, last = start + 1, start + len(chunk)
        prompt = LLM_CARD_PROMPT.format(
            platform=platform.capitalize(),
            email_text=email_text[:3000],  # cap to avoid token overflow
            card_summary=build_card_summary(chunk),
        )
        print(f"🧠 Enriching cards {first}–{last} via Groq...")
        try:
            response = groq_client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
            )
            batch_result = _safe_parse_json(response.choices[0].message.content)
        except Exception as e:
            print(f"⚠️ LLM enrichment failed for chunk {first}–{last}: {e}")
            batch_result = []

        # Walk the CARDS, not the model output, so the result count and order
        # always match the input regardless of what the model returned.
        for index, card in enumerate(chunk):
            enriched = batch_result[index] if index < len(batch_result) else None
            if not isinstance(enriched, dict):
                enriched = _fallback_enrichment(card, platform)
            # HARD SAFETY: the extractor's link always wins over the model's.
            enriched["jobLink"] = card.get("job_link")
            all_results.append(enriched)
    return all_results
| def _safe_parse_json(raw_text: str) -> list: | |
| raw_text = raw_text.replace("```json", "").replace("```", "").strip() | |
| match = re.search(r"\[.*\]", raw_text, re.DOTALL) | |
| if not match: | |
| print("β οΈ No JSON array found in LLM response.") | |
| return [] | |
| try: | |
| return json.loads(match.group()) | |
| except json.JSONDecodeError as e: | |
| print(f"β οΈ JSON parse failed: {e}") | |
| partial = re.findall(r"\{[^{}]+\}", match.group(), re.DOTALL) | |
| results = [] | |
| for obj_str in partial: | |
| try: | |
| results.append(json.loads(obj_str)) | |
| except Exception: | |
| pass | |
| if results: | |
| print(f" Salvaged {len(results)} partial objects.") | |
| return results | |
# ─────────────────────────────────────────────────────────────────
# FIREBASE HELPERS
# ─────────────────────────────────────────────────────────────────
def generate_job_fingerprint(user_email: str, job: dict) -> str:
    """
    Return a stable document ID for a (user, company, role) combination.

    The three values are joined with "|", lowercased and MD5-hashed, so the
    same job reported again for the same user yields the same 32-character
    hex string. Missing "companyName"/"jobRole" keys count as empty strings.
    """
    company = job.get("companyName", "")
    role = job.get("jobRole", "")
    identity = f"{user_email}|{company}|{role}".lower()
    digest = hashlib.md5(identity.encode())
    return digest.hexdigest()
def cleanup_expired_jobs(user_doc_id: str) -> None:
    """
    Delete the user's application documents whose ``expireAt`` is in the past.

    All matching documents under users/<user_doc_id>/applications are removed
    in a single batch. This is best-effort housekeeping: any error is logged
    and swallowed so the calling request can continue.
    """
    try:
        now = datetime.now(timezone.utc)
        expired_query = (
            db.collection("users")
            .document(user_doc_id)
            .collection("applications")
            .where("expireAt", "<", now)
            .stream()
        )
        batch = db.batch()
        count = 0
        for doc in expired_query:
            batch.delete(doc.reference)
            count += 1
        if count > 0:
            # Commit only when something was queued for deletion.
            batch.commit()
            print(f"🧹 Sweeper: Deleted {count} expired jobs.")
    except Exception as e:
        print(f"⚠️ Sweeper Error: {e}")
def extract_json_array(raw_text: str) -> list:
    """
    Return the JSON array embedded in ``raw_text``, or [] if there is none.

    Markdown code fences are stripped first; the span from the first "[" to
    the last "]" is then parsed. Invalid JSON yields [].
    """
    stripped = raw_text.replace("```json", "").replace("```", "").strip()
    found = re.search(r"\[.*\]", stripped, re.DOTALL)
    if found is None:
        return []
    try:
        parsed = json.loads(found.group())
    except json.JSONDecodeError:
        return []
    return parsed
# ─────────────────────────────────────────────────────────────────
# ROUTES
# ─────────────────────────────────────────────────────────────────
def get_testing_ui():
    """Return a small HTML snippet confirming that the server is up."""
    banner = "<h1>JobPulse Server is Running!</h1>"
    return banner
# ─────────────────────────────────────────
# ROUTE 1: Parse Email → Extract Cards → Enrich → Save to Firebase
# ─────────────────────────────────────────
def parse_email_with_ai(payload: EmailPayload):
    """
    Parse one email into job records and store them for the given user.

    Pipeline: decode the email to HTML, strip non-content tags, reject
    non-job emails, look up the user, sweep expired records, detect the
    platform, extract cards, enrich them via the LLM and write the results
    to users/<id>/applications.

    Deduplication: the document ID is the job fingerprint. An existing
    document is only touched when the new status differs and is not
    "Opportunity"; otherwise the job is counted as a skipped duplicate.
    Records with status "Opportunity" receive an ``expireAt`` 60 days ahead.

    Returns:
        A dict with "status" and "message"; on a full run also "platform",
        "cardsExtracted" and "data" (the enriched jobs).

    Raises:
        HTTPException: 404 when no user document has ``payload.user_email``.
    """
    # STEP 1: Decode MIME + QP properly
    html_body = extract_html_from_email(payload.email_text)

    # STEP 2: Parse HTML, strip noise tags
    soup = BeautifulSoup(html_body, "html.parser")
    for tag in soup(["script", "style", "meta", "noscript", "head"]):
        tag.extract()
    raw_text = soup.get_text(separator=" ", strip=True)

    # STEP 3: Bouncer
    if not is_job_email(raw_text):
        print("🛡️ BOUNCER: Not a job email. Skipped.")
        return {"status": "success", "message": "Ignored: Not a job email."}

    # STEP 4: Find user in Firebase
    users_ref = db.collection("users")
    query = users_ref.where("email", "==", payload.user_email).limit(1).stream()
    user_doc_id = None
    for doc in query:
        user_doc_id = doc.id
        break
    if not user_doc_id:
        raise HTTPException(
            status_code=404,
            detail=f"User with email {payload.user_email} not found in database.",
        )
    cleanup_expired_jobs(user_doc_id)

    # STEP 5: Detect platform
    platform = detect_platform(soup, raw_text)
    print(f"🎯 Detected platform: {platform.upper()}")

    # STEP 6: Extract job cards — each card gets its OWN individual link
    print("📦 Extracting job cards...")
    cards = extract_cards(soup, platform)
    if not cards:
        print("⚠️ No cards found. Trying generic fallback...")
        cards = _generic_extract(soup, "generic")
    if not cards:
        return {"status": "success", "message": "No job listings found in this email."}
    print(f"✅ Extracted {len(cards)} job cards — each with its own unique link.")

    # STEP 7: Enrich with LLM (adds status, coreTech, domainCategory, etc.)
    enriched_jobs = enrich_cards_with_llm(cards, platform, email_text=raw_text)
    if not enriched_jobs:
        return {"status": "success", "message": "LLM enrichment returned no results."}

    # STEP 8: IST timestamp
    ist_tz = timezone(timedelta(hours=5, minutes=30))
    exact_timestamp = datetime.now(ist_tz).strftime("%H-%M %d/%m/%Y")

    # STEP 9: Firebase batch write with deduplication + TTL
    batch = db.batch()
    applications_ref = (
        db.collection("users")
        .document(user_doc_id)
        .collection("applications")
    )
    expiry_date = datetime.now(timezone.utc) + timedelta(days=60)
    saved_count = 0
    updated_count = 0
    skipped_count = 0
    for job in enriched_jobs:
        job["dateApplied"] = exact_timestamp
        if job.get("status") == "Opportunity":
            job["expireAt"] = expiry_date
        fingerprint = generate_job_fingerprint(payload.user_email, job)
        job_doc_ref = applications_ref.document(fingerprint)
        existing_snap = job_doc_ref.get()
        if existing_snap.exists:
            existing_status = existing_snap.to_dict().get("status")
            new_status = job.get("status")
            # Only a real status change other than "Opportunity" updates
            # the stored record.
            if existing_status != new_status and new_status != "Opportunity":
                batch.update(job_doc_ref, {
                    "status": new_status,
                    "dateApplied": exact_timestamp,
                })
                updated_count += 1
                print(f"🔄 Updated status: {job.get('companyName')} → {new_status}")
            else:
                skipped_count += 1
                print(f"⏭️ Skipped duplicate: {job.get('companyName')} - {job.get('jobRole')}")
            continue
        batch.set(job_doc_ref, job)
        saved_count += 1

    if (saved_count + updated_count) > 0:
        batch.commit()
        print(f"💾 Firebase: Saved {saved_count} new jobs, Updated {updated_count} jobs.")

    return {
        "status": "success",
        "message": f"Saved {saved_count} jobs. Updated {updated_count}. Skipped {skipped_count} duplicates.",
        "platform": platform,
        "cardsExtracted": len(cards),
        "data": enriched_jobs,
    }
# ─────────────────────────────────────────
# ROUTE 2: JD Skill Extractor
# ─────────────────────────────────────────
def extract_jd_skills(payload: JDPayload):
    """
    Ask the LLM for the core hard skills named in a job description.

    HTML markup is stripped from ``payload.jd_text`` first.

    Returns:
        {"status": "success", "skills": [...]}; the list is empty when the
        model's answer contains no parsable JSON array.

    Raises:
        HTTPException: 400 when the cleaned text is shorter than 50
            characters; 500 when the LLM call or its handling fails.
    """
    jd_plain = BeautifulSoup(payload.jd_text, "html.parser").get_text(
        separator="\n", strip=True
    )
    # An empty string is covered by the length check as well.
    if len(jd_plain) < 50:
        raise HTTPException(status_code=400, detail="Job description text is too short or empty.")

    clean_jd = jd_plain
    prompt = f"""
Extract the top 5 to 10 core 'hard skills' (technical skills, tools, languages, frameworks)
from the following Job Description. Ignore soft skills like communication or teamwork.
OUTPUT FORMAT: Return ONLY a raw JSON array of strings. No markdown, no explanation.
Example: ["Python", "SQL", "React", "AWS", "Docker"]
Job Description:
{clean_jd}
"""
    try:
        completion = groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,
        )
        answer = completion.choices[0].message.content
        skills = extract_json_array(answer)
        if not skills:
            skills = []
        return {"status": "success", "skills": skills}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# ─────────────────────────────────────────
# ROUTE 3: LaTeX Resume → PDF Compiler
# ─────────────────────────────────────────
def compile_latex_to_pdf(payload: LatexPayload, timeout_seconds: float = 60):
    """
    Compile the submitted LaTeX source with pdflatex and return the PDF.

    The source is written to a temporary directory and pdflatex is run twice
    so cross-references can settle. Because the LaTeX comes from the request
    body, shell escape is explicitly disabled and each run is time-limited.

    Args:
        payload: Request body carrying the LaTeX document in ``latex_code``.
        timeout_seconds: Maximum wall-clock time allowed per pdflatex run.

    Returns:
        A ``Response`` with the PDF bytes, served as an attachment named
        "Tailored_Resume.pdf".

    Raises:
        HTTPException: 500 when compilation times out, produces no PDF, or
            any other error occurs.
    """
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            tex_file_path = os.path.join(temp_dir, "resume.tex")
            pdf_file_path = os.path.join(temp_dir, "resume.pdf")
            with open(tex_file_path, "w", encoding="utf-8") as f:
                f.write(payload.latex_code)

            command = [
                "pdflatex",
                "-interaction=nonstopmode",
                # Untrusted input: never let the document run shell commands.
                "-no-shell-escape",
                "-output-directory", temp_dir,
                tex_file_path,
            ]
            for _ in range(2):
                try:
                    # Output is not used; success is judged by the PDF file.
                    subprocess.run(
                        command,
                        stdout=subprocess.DEVNULL,
                        stderr=subprocess.DEVNULL,
                        timeout=timeout_seconds,
                        check=False,
                    )
                except subprocess.TimeoutExpired:
                    raise HTTPException(
                        status_code=500,
                        detail="LaTeX compilation timed out.",
                    )

            if not os.path.exists(pdf_file_path):
                raise HTTPException(
                    status_code=500,
                    detail="LaTeX compilation failed. Check your LaTeX syntax.",
                )
            # Read the PDF before the temporary directory is removed.
            with open(pdf_file_path, "rb") as pdf_file:
                pdf_bytes = pdf_file.read()

        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={"Content-Disposition": 'attachment; filename="Tailored_Resume.pdf"'},
        )
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))