# JobPulse-Bouncer / server.py
# File-viewer header captured together with the source:
#   uploader: learnopolis · commit: "Update server.py" · revision 39e4cd3 (verified) · 32.1 kB
from groq import Groq
from fastapi import FastAPI, HTTPException, Response
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
from bs4 import BeautifulSoup
from typing import List, Dict
import email as email_lib
import json
import os
import re
import hashlib
import subprocess
import tempfile
from dotenv import load_dotenv
from datetime import datetime, timedelta, timezone
from urllib.parse import urlparse, urlunparse, parse_qs, parse_qsl, urlencode, unquote
import firebase_admin
from firebase_admin import credentials, firestore
# ─────────────────────────────────────────
# 1. LOAD ENVIRONMENT VARIABLES
# ─────────────────────────────────────────
# Load variables from a .env file into the process environment before reading them.
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
# Module-level Groq client shared by every route that calls the LLM.
groq_client = Groq(api_key=GROQ_API_KEY)
# ─────────────────────────────────────────
# 2. INITIALIZE FIREBASE
# ─────────────────────────────────────────
# Credentials come from the FIREBASE_CREDENTIALS environment variable (a JSON
# document) when it is set; otherwise from a local service-account file.
firebase_secret = os.getenv("FIREBASE_CREDENTIALS")
if not firebase_secret:
    cred = credentials.Certificate("firebase-credentials.json")
else:
    cred_dict = json.loads(firebase_secret)
    cred = credentials.Certificate(cred_dict)

firebase_admin.initialize_app(cred)

# Shared Firestore client and the FastAPI application object.
db = firestore.client()
app = FastAPI(title="JobPulse AI Parser")
# ─────────────────────────────────────────
# PYDANTIC MODELS
# ─────────────────────────────────────────
class EmailPayload(BaseModel):
    """Request body for ``POST /api/parse-email``."""

    # Address used to look up the user's document in the "users" collection.
    user_email: str
    # Email content: a raw RFC-2822 message or an already-extracted HTML body.
    email_text: str
class JDPayload(BaseModel):
    """Request body for ``POST /api/extract-skills``."""

    # Job-description text; it is passed through an HTML parser before use.
    jd_text: str
class LatexPayload(BaseModel):
    """Request body for ``POST /api/compile-latex``."""

    # Complete LaTeX source; it is written verbatim to ``resume.tex``.
    latex_code: str
# ═════════════════════════════════════════════════════════════════
# STAGE 0: MIME + Quoted-Printable Decoder
# Emails arriving as raw RFC-2822 messages are:
# - Multipart MIME -> must extract only the text/html part
# - QP-encoded -> =3D means =, line-ending = means line continuation
# Running quopri on the full raw email (headers + body) corrupts everything.
# Python stdlib `email` module splits MIME correctly first.
# ═════════════════════════════════════════════════════════════════
def extract_html_from_email(raw: str) -> str:
    """
    Parse a raw RFC-2822 email and return the decoded HTML body.

    The message is split into MIME parts first, then the first decodable
    ``text/html`` part is transfer-decoded (base64 / quoted-printable) and
    converted to text using its declared charset.

    Args:
        raw: Full raw email, or plain HTML when no MIME envelope is present.

    Returns:
        The decoded HTML of the first usable ``text/html`` part. If no such
        part exists, or parsing fails, ``raw`` is returned unchanged so the
        caller can treat it as plain HTML.
    """
    try:
        msg = email_lib.message_from_string(raw)
        for part in msg.walk():
            if part.get_content_type() != "text/html":
                continue
            # get_payload(decode=True) handles both base64 and QP automatically.
            payload = part.get_payload(decode=True)
            if payload is None:
                # Nothing decodable in this part; keep looking instead of
                # abandoning the whole message.
                continue
            charset = part.get_content_charset() or "utf-8"
            try:
                return payload.decode(charset, errors="replace")
            except LookupError:
                # Unknown charset label: decode as UTF-8 rather than falling
                # back to the undecoded message, which still contains MIME
                # headers and transfer encoding.
                return payload.decode("utf-8", errors="replace")
        # No HTML part found — maybe the input is already plain HTML.
        return raw
    except Exception:
        # Deliberate best-effort: any parsing problem means "treat as HTML".
        return raw
# ═════════════════════════════════════════════════════════════════
# STAGE 1: Platform Detector
# ═════════════════════════════════════════════════════════════════
def detect_platform(soup: BeautifulSoup, raw_text: str) -> str:
    """
    Guess which job platform produced the email.

    Link hosts are checked first, in a fixed priority order; if none match,
    a few platform names are searched for in the visible text.

    Args:
        soup: Parsed email HTML.
        raw_text: Visible text of the email.

    Returns:
        A platform key such as ``"linkedin"``, or ``"generic"`` when nothing
        matches.
    """
    hrefs = " ".join(
        anchor.get("href", "") for anchor in soup.find_all("a", href=True)
    ).lower()
    body = raw_text.lower()

    # (needle, platform) pairs; order is significant because the first hit wins.
    link_rules = (
        ("glassdoor.com", "glassdoor"),
        ("linkedin.com", "linkedin"),
        ("naukri.com", "naukri"),
        ("foundit.in", "foundit"),
        ("monster.com", "foundit"),
        ("indeed.com", "indeed"),
        ("instahyre.com", "instahyre"),
    )
    text_rules = (
        ("glassdoor", "glassdoor"),
        ("linkedin", "linkedin"),
        ("naukri", "naukri"),
    )

    for needle, name in link_rules:
        if needle in hrefs:
            return name
    for needle, name in text_rules:
        if needle in body:
            return name
    return "generic"
# ═════════════════════════════════════════════════════════════════
# STAGE 2: URL Utilities
# ═════════════════════════════════════════════════════════════════
# Query-string keys that clean_url() strips from job links.
# clean_url() tests membership with the LOWER-CASED key, so an entry only takes
# effect if it is spelled in lower case.
JUNK_PARAMS = {
    "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term",
    "jrtk", "guid", "ja", "uido", "cs", "cb", "ao", "s", "vt", "ea",
    "tgt", "src", "t", "pos",
    "trackingid", "refid", "lipi", "midtoken", "midsig", "trk", "trkemail", "eid", "otptoken",
    "spl", "notification_frequency", "jr_source", "apop", "notificationid", "response", "type",
    # "autoApply" could never match a lower-cased key; the lower-case spelling
    # is the effective one. The original spelling is kept for any
    # case-sensitive consumer.
    "autoapply", "autoApply",
    # Indeed tracking — 'jk' is intentionally NOT here, it is the job ID
    "qd", "rd", "tk", "alid", "bb", "mo", "ad", "xkcb", "camk", "p", "jsa", "rjs", "gdfvj", "plid", "fvj",
}
# Substrings that mark a URL as non-job noise. is_job_link() rejects any URL
# whose percent-decoded, lower-cased form contains one of these.
# NOTE(review): matching is plain substring containment, so short entries such
# as "feed", "manage" or "promo" also hit longer words that contain them.
NOISE_SIGNALS = [
    "unsubscribe", "privacy", "terms", "manage", "email-pref",
    "brand-views", "brandview", "wf/open", "logomark", "logo.png",
    "easy-apply-icon", "location-icon", "bell-icon", "jobmatch",
    "twitter.com", "facebook.com", "instagram.com", "youtube.com",
    "glassdoor.com/about", "mailto:", "jobalertajax", "emailsettings",
    "job-alert/jobalert", "job-alert-email-unsubscribe", "jobs/alerts",
    "jobs/search", "comm/feed", "comm/mynetwork", "comm/messaging",
    "comm/notifications", "comm/premium", "comm/widgets",
    "linkedin.com/help", "in.linkedin.com/comm/in/",
    "static.licdn.com", "media.licdn.com",
    "naukri.com/mnjuser", "naukri.com/user",
    "seeker/dashboard", "seeker/profile", "seeker/jobalert-feedback",
    "trex/unsubscribe", "appurl.io", "play.google.com", "itunes.apple.com",
    "media.monsterindia.com", "media.foundit.in",
    "widget", "promo", "feed", "mynetwork",
]
# Per-platform substrings that identify a URL as a job posting. is_job_link()
# looks the platform up here (falling back to "generic") and accepts the URL
# if any signal occurs in its percent-decoded, lower-cased form.
# NOTE(review): because the URL is lower-cased before matching, the mixed-case
# "/rio/autoLogin/" entry cannot match on this path; foundit autologin links
# are handled by a dedicated branch in is_job_link() instead.
PLATFORM_JOB_SIGNALS = {
    "glassdoor": ["/partner/joblisting", "joblistingid="],
    "linkedin": ["/comm/jobs/view/", "/jobs/view/"],
    "naukri": ["/job-listings-", "naukri.com/view"],
    "foundit": ["/rio/autoLogin/"],
    "indeed": ["/viewjob", "indeed.com/rc/clk", "indeed.com/pagead/clk", "cts.indeed.com"],
    "instahyre": ["instahyre.com/job-"],
    "generic": ["/job", "/career", "/apply", "/position", "/vacancy"],
}
def unwrap_autologin_url(url: str) -> str:
    """
    Replace a tracking / auto-login wrapper URL with the job URL it carries.

    Two wrappers are recognised:

    * any URL whose percent-decoded form embeds
      ``https://www.instahyre.com/job-...``: the embedded job URL is returned
      with a trailing slash;
    * any URL whose path contains ``/autoLogin/`` (matched case-insensitively):
      the value of its ``return_url`` query parameter is returned.

    Args:
        url: The link as found in the email.

    Returns:
        The unwrapped job URL, or ``url`` unchanged when no wrapper is
        recognised or parsing fails.
    """
    try:
        unquoted = unquote(url)
        if "instahyre.com/job-" in unquoted:
            match = re.search(r"(https://www\.instahyre\.com/job-[^/?]+)", unquoted)
            if match:
                return match.group(1) + "/"
        parsed = urlparse(url)
        # Case-insensitive so "/rio/autologin/" and "/rio/autoLogin/" are
        # treated alike; "/autologin/" also covers the "/rio/autologin/" form.
        if "/autologin/" in parsed.path.lower():
            return_url = parse_qs(parsed.query).get("return_url", [None])[0]
            if return_url:
                return return_url
    except Exception:
        # Best-effort: a malformed link is passed through untouched.
        pass
    return url
def clean_url(url: str) -> str:
    """
    Normalise a job link: unwrap auto-login wrappers and drop tracking params.

    Query keys whose lower-cased name is in ``JUNK_PARAMS`` are removed. If the
    resulting URL contains one of a few canonical job-path markers, the whole
    query string is dropped.

    Args:
        url: The link as found in the email.

    Returns:
        The cleaned URL. On any error, the unwrapped URL when unwrapping had
        already succeeded, otherwise the input.
    """
    try:
        url = unwrap_autologin_url(url)
        parts = urlparse(url)

        kept_pairs = []
        for key, value in parse_qsl(parts.query, keep_blank_values=True):
            if key.lower() in JUNK_PARAMS:
                continue
            kept_pairs.append((key, value))

        parts = parts._replace(query=urlencode(kept_pairs))
        cleaned = urlunparse(parts)

        # Canonical job paths carry the job id in the path, so no query is needed.
        for marker in ("/comm/jobs/view/", "/jobs/view/", "/job/", "/job-listings-"):
            if marker in cleaned:
                return urlunparse(parts._replace(query=""))
        return cleaned
    except Exception:
        return url
def is_job_link(url: str, platform: str = "generic") -> bool:
    """
    Decide whether ``url`` points at a job posting for the given platform.

    The URL is percent-decoded and lower-cased, rejected if it contains any
    entry of ``NOISE_SIGNALS``, and otherwise accepted when it contains one of
    the platform's entries in ``PLATFORM_JOB_SIGNALS``. Foundit auto-login
    links are judged by the URL they wrap.

    Args:
        url: Candidate link.
        platform: Platform key; unknown keys use the ``"generic"`` signals.

    Returns:
        True when the link looks like a job posting.
    """
    decoded = unquote(url).lower()

    for noise in NOISE_SIGNALS:
        if noise in decoded:
            return False

    if platform == "foundit" and "/rio/autologin/" in decoded:
        target = unwrap_autologin_url(url)
        return "/job/" in target.lower()

    signals = PLATFORM_JOB_SIGNALS.get(platform, PLATFORM_JOB_SIGNALS["generic"])
    for signal in signals:
        if signal in decoded:
            return True
    return False
# ═════════════════════════════════════════════════════════════════
# STAGE 3: Platform-Specific Card Extractors
# CRITICAL: Each card gets its OWN individual job_link.
# We never extract one link and paste it across multiple cards.
# ═════════════════════════════════════════════════════════════════
def extract_glassdoor(soup: BeautifulSoup) -> List[Dict]:
    """
    Extract job cards from a Glassdoor alert email.

    Each ``<table class="gd-dbe9ce2b4a">`` is one card. The card's link is the
    first anchor inside it that passes ``is_job_link``; company and role come
    from fixed CSS classes inside the same table.

    Returns:
        One dict per card with keys ``company``, ``role`` and ``job_link``;
        cards with neither a role nor a company are dropped.
    """
    tables = soup.find_all("table", class_="gd-dbe9ce2b4a")
    print(f" [Glassdoor] Found {len(tables)} card containers")

    results = []
    for table in tables:
        # First job-looking anchor inside this card only, never a global one.
        first_href = next(
            (
                anchor["href"]
                for anchor in table.find_all("a", href=True)
                if is_job_link(anchor["href"], "glassdoor")
            ),
            None,
        )
        link = clean_url(first_href) if first_href is not None else None

        company_node = table.find("span", class_="gd-628b46d9ce")
        company = company_node.get_text(strip=True) if company_node else ""

        role_node = table.find("p", class_="gd-6c2846d4dc")
        role = role_node.get_text(strip=True) if role_node else ""

        if not (role or company):
            continue
        results.append({"company": company, "role": role, "job_link": link})
    return results
def extract_linkedin(soup: BeautifulSoup) -> List[Dict]:
    """
    Extract job cards from a LinkedIn alert email.

    Each ``<td data-test-id="job-card">`` is one card. The card's link is the
    first anchor inside it that passes ``is_job_link``; the role is the bold
    title anchor; the company is the text before the first middle dot in the
    grey sub-line ("Company · Location").

    Returns:
        One dict per card with keys ``company``, ``role`` and ``job_link``;
        cards with neither a role nor a company are dropped.
    """
    cards = []
    card_tds = soup.find_all("td", attrs={"data-test-id": "job-card"})
    print(f" [LinkedIn] Found {len(card_tds)} job-card containers")
    for card_td in card_tds:
        card: Dict = {"company": "", "role": "", "job_link": None}
        for a_tag in card_td.find_all("a", href=True):
            href = a_tag["href"]
            if is_job_link(href, "linkedin"):
                card["job_link"] = clean_url(href)
                break
        role_a = card_td.find("a", class_=lambda c: c and "font-bold" in c and "text-md" in c)
        if role_a:
            card["role"] = role_a.get_text(strip=True)
        company_p = card_td.find("p", class_=lambda c: c and "text-system-gray-100" in c)
        if company_p:
            raw = company_p.get_text(strip=True)
            # The separator is a middle dot (U+00B7), not a period. The
            # literal used to be stored mis-encoded as "Β·", which never
            # matches correctly decoded text; the mis-encoded form is still
            # folded in so input that was mangled the same way keeps working.
            card["company"] = raw.replace("Β·", "·").split("·")[0].strip()
        if card["role"] or card["company"]:
            cards.append(card)
    return cards
def extract_indeed(soup: BeautifulSoup) -> List[Dict]:
    """
    Extract job cards from an Indeed alert email.

    Every ``<a class="strong-text-link">`` is a job title, and that anchor's
    own href is the link for that specific job. The company is the first text
    fragment of the ``<tr>`` that follows the title's enclosing ``<tr>``.

    Returns:
        One dict per title with keys ``company``, ``role`` and ``job_link``;
        entries with neither a role nor a company are dropped.
    """
    anchors = soup.find_all("a", class_="strong-text-link")
    print(f" [Indeed] Found {len(anchors)} job title links")

    results = []
    for anchor in anchors:
        link = None
        target = anchor.get("href")
        if target and is_job_link(target, "indeed"):
            link = clean_url(target)

        role = anchor.get_text(strip=True)

        company = ""
        row = anchor.find_parent("tr")
        following_row = row.find_next_sibling("tr") if row else None
        if following_row:
            row_text = following_row.get_text(separator=" | ", strip=True)
            company = row_text.split(" | ")[0].strip()

        if not (role or company):
            continue
        results.append({"company": company, "role": role, "job_link": link})
    return results
def extract_instahyre(soup: BeautifulSoup) -> List[Dict]:
    """
    Extract job cards from an Instahyre email.

    Cards are ``<div class="job-block">``. The first ``<strong>`` is the
    company and the second is the role (both are read only when at least two
    exist). The link is the first anchor in the block, kept only if it passes
    ``is_job_link``.

    Returns:
        One dict per block with keys ``company``, ``role`` and ``job_link``;
        blocks with neither a role nor a company are dropped.
    """
    blocks = soup.find_all("div", class_="job-block")
    print(f" [Instahyre] Found {len(blocks)} job blocks")

    results = []
    for block in blocks:
        company, role, link = "", "", None

        anchor = block.find("a", href=True)
        if anchor and is_job_link(anchor["href"], "instahyre"):
            link = clean_url(anchor["href"])

        headings = block.find_all("strong")
        if len(headings) >= 2:
            company = headings[0].get_text(strip=True)
            role = headings[1].get_text(strip=True)

        if not (role or company):
            continue
        results.append({"company": company, "role": role, "job_link": link})
    return results
def extract_naukri(soup: BeautifulSoup) -> List[Dict]:
    """Extract Naukri cards via the generic anchor scan with Naukri link signals."""
    return _generic_extract(soup, platform="naukri")
def extract_foundit(soup: BeautifulSoup) -> List[Dict]:
    """Extract Foundit cards via the generic anchor scan with Foundit link signals."""
    return _generic_extract(soup, platform="foundit")
def _generic_extract(soup: BeautifulSoup, platform: str = "generic") -> List[Dict]:
    """
    Generic fallback: scan all anchors matching job-link signals.
    Each unique URL = one card. Surrounding text used for company/role context.

    Args:
        soup: Parsed email HTML.
        platform: Platform key passed to ``is_job_link`` to select link signals.

    Returns:
        One dict per unique cleaned URL with keys ``company`` (context text,
        at most 200 characters), ``role`` (the anchor text) and ``job_link``.
    """
    cards = []
    seen_links: set = set()
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        if not is_job_link(href, platform):
            continue
        cleaned = clean_url(href)
        # De-duplicate on the CLEANED url so tracking variants of the same
        # job collapse into one card.
        if cleaned in seen_links:
            continue
        seen_links.add(cleaned)
        role_text = a_tag.get_text(strip=True)
        company_text = ""
        # Walk outwards and use an enclosing container's text as context, but
        # only if it is short (< 400 chars).
        # NOTE(review): the nesting of the `break` below was reconstructed from
        # a listing with its indentation stripped — confirm against the file.
        for parent in a_tag.parents:
            if parent.name in ["td", "div", "li", "tr", "table"]:
                all_text = parent.get_text(separator=" | ", strip=True)
                if len(all_text) < 400:
                    company_text = all_text
                    break
        cards.append({
            "company": company_text[:200],
            "role": role_text,
            "job_link": cleaned,
        })
    print(f" [Generic/{platform}] Found {len(cards)} unique job links")
    return cards
# Dispatch table used by extract_cards(): platform key -> extractor function.
# Every extractor is called with the parsed soup as its only argument.
PLATFORM_EXTRACTORS = {
    "glassdoor": extract_glassdoor,
    "linkedin": extract_linkedin,
    "naukri": extract_naukri,
    "foundit": extract_foundit,
    "indeed": extract_indeed,
    "instahyre": extract_instahyre,
    "generic": _generic_extract,
}
def extract_cards(soup: BeautifulSoup, platform: str) -> List[Dict]:
    """
    Run the extractor registered for ``platform`` on the parsed email.

    Unknown platform keys fall back to the generic anchor scan.
    """
    if platform in PLATFORM_EXTRACTORS:
        handler = PLATFORM_EXTRACTORS[platform]
    else:
        handler = _generic_extract
    return handler(soup)
# ═════════════════════════════════════════════════════════════════
# STAGE 4: Bouncer
# ═════════════════════════════════════════════════════════════════
# Substrings whose presence (case-insensitively) marks an email as job-related.
JOB_KEYWORDS = [
    "applied", "application", "interview", "rejection", "job alert",
    "offer", "hiring", "shortlisted", "assessment", "jobs", "apply",
    "internship", "intern", "career", "glassdoor", "linkedin", "naukri",
    "opportunity", "resume", "foundit", "indeed", "instahyre",
    "position", "role", "vacancy", "opening",
]


def is_job_email(text: str) -> bool:
    """
    Return True when ``text`` contains any job keyword.

    Matching is case-insensitive substring containment, so a keyword also
    matches inside a longer word.
    """
    haystack = text.lower()
    for keyword in JOB_KEYWORDS:
        if keyword in haystack:
            return True
    return False
# ═════════════════════════════════════════════════════════════════
# STAGE 5: LLM Enrichment
# Cards have company, role, job_link already set correctly.
# LLM adds: status, sourcePlatform, domainCategory, coreTech, interpretation.
# After LLM returns, we FORCE re-inject the original job_link from the card
# so even if LLM disobeys, the correct link is always used.
# ═════════════════════════════════════════════════════════════════
# Prompt template for enrich_cards_with_llm(). Filled with str.format(), so it
# must contain no literal braces other than the three placeholders
# {platform}, {email_text} and {card_summary}.
# The em dashes and the middle dot in rule 7 were previously stored
# mis-encoded; rule 7 in particular showed the model a separator that never
# occurs in real email text.
LLM_CARD_PROMPT = """
You are a structured data extraction engine for a job application tracker.
You receive pre-parsed job cards AND the full original email text as context.
Each card has: company, role, job_link (job_link was extracted by code — do NOT change it).
Company and role may be empty or wrong — use the FULL EMAIL TEXT below to find the correct values.
Return a JSON ARRAY — one object per card, SAME COUNT and SAME ORDER as input.
STRICT RULES:
1. Return ONLY a raw JSON array []. No markdown, no backticks, no explanation.
2. Exactly one object per card — same count, same order as input.
3. Copy job_link EXACTLY as given. Never modify, guess, or omit it.
4. If job_link is null, output null (not the string "null").
5. For companyName: if the card value is empty/Unknown/wrong, find the REAL hiring company name from the EMAIL TEXT. Never output "Unknown Company" if the email text contains the company name.
6. For jobRole: if the card value is empty, find the real job title from the EMAIL TEXT.
7. Clean company: if "CompanyName · Location" format, extract only company name.
8. Clean role: remove extra whitespace or codes like [T500-25894].
FIELDS per object:
- "companyName": string — real hiring company name (use email text if card value is missing)
- "jobRole": string — clean job title (use email text if card value is missing)
- "jobLink": string or null — EXACT copy of job_link provided, never change this
- "status": one of: "Opportunity" | "Applied" | "Interview" | "Selection" | "Rejection"
* Opportunity = job alert, new opening not yet applied to
* Applied = application submitted confirmation
* Interview = interview or assessment invite
* Selection = offer letter or selected to proceed
* Rejection = application declined
- "sourcePlatform": one of: LinkedIn, Naukri, Indeed, Glassdoor, Wellfound, Instahyre, Workday, Greenhouse, Direct Email, Company Portal, Other
- "domainCategory": e.g. "Mobile Development", "Backend Engineering", "Data Science", "DevOps", "Frontend", "Full Stack", "Design", "Product Management", "Other"
- "coreTech": array of 1-3 strings — tech skills inferred from the role title
- "interpretation": 1 sentence describing what this role involves for the applicant
SOURCE PLATFORM HINT: {platform}
FULL EMAIL TEXT (use this to fill missing company/role):
{email_text}
JOB CARDS:
{card_summary}
"""
def build_card_summary(cards: List[Dict]) -> str:
    """
    Render extracted cards as the numbered text block embedded in the LLM prompt.

    Missing or empty values are shown as ``Unknown`` (company),
    ``Unspecified`` (role) and ``null`` (job_link). Cards are numbered from 1
    and separated by a blank line.
    """

    def _render(position: int, card: Dict) -> str:
        company = card.get("company") or "Unknown"
        role = card.get("role") or "Unspecified"
        link = card.get("job_link") or "null"
        return "\n".join(
            (
                f"Job {position}:",
                f" company: {company}",
                f" role: {role}",
                f" job_link: {link}",
            )
        )

    return "\n\n".join(_render(n, card) for n, card in enumerate(cards, 1))
def _fallback_enrichment(card: Dict, platform: str) -> Dict:
    """Build a minimally-enriched record straight from an extracted card."""
    return {
        "companyName": card.get("company") or "Unknown Company",
        "jobRole": card.get("role") or "Unspecified Role",
        "jobLink": card.get("job_link"),
        "status": "Opportunity",
        "sourcePlatform": platform.capitalize(),
        "domainCategory": "Other",
        "coreTech": [],
        "interpretation": "Could not enrich — LLM call failed.",
    }


def enrich_cards_with_llm(cards: List[Dict], platform: str, email_text: str = "") -> List[Dict]:
    """
    Enrich extracted cards with LLM-derived fields, one result per input card.

    Cards are sent to Groq in chunks of 10 together with the first 3000
    characters of the email text. Each returned object is matched to its card
    by position and the card's own ``job_link`` is always re-injected as
    ``jobLink``, so a link changed by the model is never kept.

    The output has exactly one entry per input card, in input order. A card
    gets a fallback record built from its own data when the call fails, when
    the response has no object at its position, or when that entry is not a
    JSON object. Objects returned beyond the chunk size are discarded.

    Args:
        cards: Dicts with ``company``, ``role`` and ``job_link``.
        platform: Detected platform key, passed to the prompt as a hint.
        email_text: Visible email text used by the model to fill gaps.

    Returns:
        List of enriched job dicts, ``len(result) == len(cards)``.
    """
    all_results: List[Dict] = []
    chunk_size = 10
    for i in range(0, len(cards), chunk_size):
        chunk = cards[i : i + chunk_size]
        prompt = LLM_CARD_PROMPT.format(
            platform=platform.capitalize(),
            email_text=email_text[:3000],  # cap to avoid token overflow
            card_summary=build_card_summary(chunk),
        )
        print(f"🧠 Enriching cards {i + 1}–{i + len(chunk)} via Groq...")
        try:
            response = groq_client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
            )
            batch_result = _safe_parse_json(response.choices[0].message.content)
        except Exception as e:
            print(f"⚠️ LLM enrichment failed for chunk {i + 1}–{i + len(chunk)}: {e}")
            batch_result = []

        if len(batch_result) != len(chunk):
            print(
                f"⚠️ LLM returned {len(batch_result)} objects for {len(chunk)} cards; "
                "missing entries use fallback data."
            )

        # Iterate over the CARDS, not the model output, so no card is dropped
        # and no surplus object survives. Pairing is positional, as the prompt
        # requires.
        for j, card in enumerate(chunk):
            enriched = batch_result[j] if j < len(batch_result) else None
            if not isinstance(enriched, dict):
                enriched = _fallback_enrichment(card, platform)
            # HARD SAFETY: the extractor's link always wins over the model's.
            enriched["jobLink"] = card.get("job_link")
            all_results.append(enriched)
    return all_results
def _safe_parse_json(raw_text: str) -> list:
raw_text = raw_text.replace("```json", "").replace("```", "").strip()
match = re.search(r"\[.*\]", raw_text, re.DOTALL)
if not match:
print("⚠️ No JSON array found in LLM response.")
return []
try:
return json.loads(match.group())
except json.JSONDecodeError as e:
print(f"⚠️ JSON parse failed: {e}")
partial = re.findall(r"\{[^{}]+\}", match.group(), re.DOTALL)
results = []
for obj_str in partial:
try:
results.append(json.loads(obj_str))
except Exception:
pass
if results:
print(f" Salvaged {len(results)} partial objects.")
return results
# ═════════════════════════════════════════════════════════════════
# FIREBASE HELPERS
# ═════════════════════════════════════════════════════════════════
def generate_job_fingerprint(user_email: str, job: dict) -> str:
    """
    Return a stable document id for a (user, company, role) triple.

    The three values are joined with ``|``, lower-cased and MD5-hashed; the
    hex digest is used as the Firestore document id for de-duplication.
    """
    company = job.get("companyName", "")
    role = job.get("jobRole", "")
    key = "|".join((f"{user_email}", f"{company}", f"{role}")).lower()
    return hashlib.md5(key.encode()).hexdigest()
def cleanup_expired_jobs(user_doc_id: str) -> None:
    """
    Delete a user's application documents whose ``expireAt`` is in the past.

    All deletions are queued on one batch and committed together. Any error
    is printed and swallowed so the sweep never blocks the calling request.

    Args:
        user_doc_id: Id of the user's document in the ``users`` collection.
    """
    try:
        cutoff = datetime.now(timezone.utc)
        applications = (
            db.collection("users")
            .document(user_doc_id)
            .collection("applications")
        )
        expired_docs = applications.where("expireAt", "<", cutoff).stream()

        batch = db.batch()
        count = 0
        for count, doc in enumerate(expired_docs, start=1):
            batch.delete(doc.reference)

        if count:
            batch.commit()
            print(f"🧹 Sweeper: Deleted {count} expired jobs.")
    except Exception as e:
        print(f"⚠️ Sweeper Error: {e}")
def extract_json_array(raw_text: str) -> list:
    """
    Parse the JSON array contained in an LLM reply.

    Markdown code fences are stripped and the span from the first ``[`` to the
    last ``]`` is parsed.

    Returns:
        The parsed list, or ``[]`` when no array is present or it is not
        valid JSON.
    """
    text = raw_text.replace("```json", "").replace("```", "").strip()
    found = re.search(r"\[.*\]", text, re.DOTALL)
    if found is None:
        return []
    try:
        parsed = json.loads(found.group())
    except json.JSONDecodeError:
        return []
    return parsed
# ═════════════════════════════════════════════════════════════════
# ROUTES
# ═════════════════════════════════════════════════════════════════
@app.get("/", response_class=HTMLResponse)
def get_testing_ui():
    """Root route: return a fixed HTML heading confirming the server is up."""
    return "<h1>JobPulse Server is Running!</h1>"
# ─────────────────────────────────────────
# ROUTE 1: Parse Email → Extract Cards → Enrich → Save to Firebase
# ─────────────────────────────────────────
@app.post("/api/parse-email")
def parse_email_with_ai(payload: EmailPayload):
    """
    Parse one email, extract its job cards, enrich them and persist them.

    Pipeline: decode MIME -> strip non-content tags -> keyword bouncer ->
    look up the user -> sweep expired jobs -> detect platform -> extract
    cards -> LLM enrichment -> batched Firestore write with de-duplication.

    Persistence rules:
      * A job is keyed by ``generate_job_fingerprint`` (user, company, role).
      * New jobs are written as-is; ``Opportunity`` jobs get an ``expireAt``
        60 days ahead so the sweeper can remove stale alerts.
      * An existing job is updated only when the new status is present,
        differs from the stored one and is not ``Opportunity``. The update
        also removes ``expireAt`` so the sweeper does not later delete a job
        that has progressed past the alert stage.

    Args:
        payload: The user's email address and the email content.

    Returns:
        A dict with ``status`` and ``message``; when jobs were processed it
        also has ``platform``, ``cardsExtracted`` and ``data``.

    Raises:
        HTTPException: 404 when no user document has ``payload.user_email``.
    """
    # STEP 1: Decode MIME + QP properly
    html_body = extract_html_from_email(payload.email_text)
    # STEP 2: Parse HTML, strip noise tags
    soup = BeautifulSoup(html_body, "html.parser")
    for tag in soup(["script", "style", "meta", "noscript", "head"]):
        tag.extract()
    raw_text = soup.get_text(separator=" ", strip=True)
    # STEP 3: Bouncer
    if not is_job_email(raw_text):
        print("πŸ›‘οΈ BOUNCER: Not a job email. Skipped.")
        return {"status": "success", "message": "Ignored: Not a job email."}
    # STEP 4: Find user in Firebase
    users_ref = db.collection("users")
    query = users_ref.where("email", "==", payload.user_email).limit(1).stream()
    user_doc_id = None
    for doc in query:
        user_doc_id = doc.id
        break
    if not user_doc_id:
        raise HTTPException(
            status_code=404,
            detail=f"User with email {payload.user_email} not found in database.",
        )
    cleanup_expired_jobs(user_doc_id)
    # STEP 5: Detect platform
    platform = detect_platform(soup, raw_text)
    print(f"🎯 Detected platform: {platform.upper()}")
    # STEP 6: Extract job cards — each card gets its OWN individual link
    print("πŸ“¦ Extracting job cards...")
    cards = extract_cards(soup, platform)
    if not cards:
        print("⚠️ No cards found. Trying generic fallback...")
        cards = _generic_extract(soup, "generic")
    if not cards:
        return {"status": "success", "message": "No job listings found in this email."}
    print(f"βœ… Extracted {len(cards)} job cards β€” each with its own unique link.")
    # STEP 7: Enrich with LLM (adds status, coreTech, domainCategory, etc.)
    enriched_jobs = enrich_cards_with_llm(cards, platform, email_text=raw_text)
    if not enriched_jobs:
        return {"status": "success", "message": "LLM enrichment returned no results."}
    # STEP 8: IST timestamp
    ist_tz = timezone(timedelta(hours=5, minutes=30))
    exact_timestamp = datetime.now(ist_tz).strftime("%H-%M %d/%m/%Y")
    # STEP 9: Firebase batch write with deduplication + TTL
    batch = db.batch()
    applications_ref = (
        db.collection("users")
        .document(user_doc_id)
        .collection("applications")
    )
    expiry_date = datetime.now(timezone.utc) + timedelta(days=60)
    saved_count = 0
    updated_count = 0
    skipped_count = 0
    for job in enriched_jobs:
        job["dateApplied"] = exact_timestamp
        if job.get("status") == "Opportunity":
            job["expireAt"] = expiry_date
        fingerprint = generate_job_fingerprint(payload.user_email, job)
        job_doc_ref = applications_ref.document(fingerprint)
        existing_snap = job_doc_ref.get()
        if existing_snap.exists:
            existing_status = existing_snap.to_dict().get("status")
            new_status = job.get("status")
            # `new_status` must be present: a reply without a status must not
            # overwrite the stored one with None.
            if new_status and existing_status != new_status and new_status != "Opportunity":
                batch.update(job_doc_ref, {
                    "status": new_status,
                    "dateApplied": exact_timestamp,
                    # The job is no longer a passive alert: drop the TTL that
                    # was set while it was an Opportunity, otherwise
                    # cleanup_expired_jobs() would delete it after 60 days.
                    "expireAt": firestore.DELETE_FIELD,
                })
                updated_count += 1
                print(f"πŸ”„ Updated status: {job.get('companyName')} β†’ {new_status}")
            else:
                skipped_count += 1
                print(f"⏭️ Skipped duplicate: {job.get('companyName')} - {job.get('jobRole')}")
            continue
        batch.set(job_doc_ref, job)
        saved_count += 1
    if (saved_count + updated_count) > 0:
        batch.commit()
        print(f"πŸ’Ύ Firebase: Saved {saved_count} new jobs, Updated {updated_count} jobs.")
    return {
        "status": "success",
        "message": f"Saved {saved_count} jobs. Updated {updated_count}. Skipped {skipped_count} duplicates.",
        "platform": platform,
        "cardsExtracted": len(cards),
        "data": enriched_jobs,
    }
# ─────────────────────────────────────────
# ROUTE 2: JD Skill Extractor
# ─────────────────────────────────────────
@app.post("/api/extract-skills")
def extract_jd_skills(payload: JDPayload):
    """
    Extract the core hard skills from a job description via the LLM.

    The text is reduced to its visible content first; fewer than 50
    characters is rejected.

    Returns:
        ``{"status": "success", "skills": [...]}``; the list is empty when
        the model reply holds no parseable JSON array.

    Raises:
        HTTPException: 400 when the description is too short, 500 when the
            LLM call or reply handling fails.
    """
    clean_jd = BeautifulSoup(payload.jd_text, "html.parser").get_text(
        separator="\n", strip=True
    )
    # An empty string has length 0, so this single check covers "empty" too.
    if len(clean_jd) < 50:
        raise HTTPException(status_code=400, detail="Job description text is too short or empty.")

    prompt = f"""
Extract the top 5 to 10 core 'hard skills' (technical skills, tools, languages, frameworks)
from the following Job Description. Ignore soft skills like communication or teamwork.
OUTPUT FORMAT: Return ONLY a raw JSON array of strings. No markdown, no explanation.
Example: ["Python", "SQL", "React", "AWS", "Docker"]
Job Description:
{clean_jd}
"""
    try:
        completion = groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,
        )
        skills = extract_json_array(completion.choices[0].message.content)
        return {"status": "success", "skills": skills or []}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# ─────────────────────────────────────────
# ROUTE 3: LaTeX Resume → PDF Compiler
# ─────────────────────────────────────────
@app.post("/api/compile-latex")
def compile_latex_to_pdf(payload: LatexPayload):
    """
    Compile caller-supplied LaTeX into a PDF and return it as a download.

    The source is written to ``resume.tex`` in a temporary directory and
    ``pdflatex`` is run twice in non-stop mode (the second pass resolves
    references). Because the LaTeX is untrusted input, shell escape is
    disabled explicitly and each pass is time-limited so a document that
    never terminates cannot tie up the worker.

    Returns:
        A ``Response`` with the PDF bytes and an attachment
        ``Content-Disposition`` header.

    Raises:
        HTTPException: 500 when compilation times out, produces no PDF, or
            any other error occurs.
    """
    # Upper bound for ONE pdflatex pass, in seconds.
    pass_timeout_seconds = 30
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            tex_file_path = os.path.join(temp_dir, "resume.tex")
            pdf_file_path = os.path.join(temp_dir, "resume.pdf")
            with open(tex_file_path, "w", encoding="utf-8") as f:
                f.write(payload.latex_code)
            for _ in range(2):
                try:
                    subprocess.run(
                        [
                            "pdflatex",
                            "-no-shell-escape",  # never let the document run commands
                            "-interaction=nonstopmode",
                            "-output-directory", temp_dir,
                            tex_file_path,
                        ],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE,
                        text=True,
                        timeout=pass_timeout_seconds,
                    )
                except subprocess.TimeoutExpired:
                    raise HTTPException(
                        status_code=500,
                        detail="LaTeX compilation timed out.",
                    )
            # The exit code is not checked: in non-stop mode pdflatex can
            # report errors and still produce a PDF, so the file's existence
            # is the success test.
            if not os.path.exists(pdf_file_path):
                raise HTTPException(
                    status_code=500,
                    detail="LaTeX compilation failed. Check your LaTeX syntax.",
                )
            # Read the bytes before the temporary directory is removed.
            with open(pdf_file_path, "rb") as pdf_file:
                pdf_bytes = pdf_file.read()
            return Response(
                content=pdf_bytes,
                media_type="application/pdf",
                headers={"Content-Disposition": 'attachment; filename="Tailored_Resume.pdf"'},
            )
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))