File size: 32,128 Bytes
4d7533d
 
 
 
 
39e4cd3
 
4d7533d
da0c6a1
67aedc8
 
2e619fa
 
8fc7730
4d7533d
a12f13c
da0c6a1
 
 
 
39e4cd3
d74b98a
4d7533d
d74b98a
da0c6a1
f724e04
 
da0c6a1
39e4cd3
d74b98a
4d7533d
d74b98a
da0c6a1
 
 
 
 
d74b98a
da0c6a1
 
 
 
 
 
39e4cd3
d74b98a
 
 
da0c6a1
 
 
 
39e4cd3
d74b98a
 
 
39e4cd3
d74b98a
 
 
 
 
a12f13c
39e4cd3
 
 
 
 
d74b98a
a12f13c
 
 
 
 
67aedc8
a12f13c
 
 
 
 
 
 
 
 
 
 
67aedc8
d74b98a
 
a12f13c
d74b98a
a12f13c
 
 
 
 
39e4cd3
 
 
 
 
 
a12f13c
39e4cd3
 
 
a12f13c
 
67aedc8
d74b98a
 
a12f13c
d74b98a
a12f13c
39e4cd3
 
 
 
 
a12f13c
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a152e7c
a12f13c
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a152e7c
da0c6a1
d74b98a
a12f13c
39e4cd3
 
d74b98a
 
a12f13c
 
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
39e4cd3
 
 
 
 
a12f13c
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39e4cd3
 
 
 
a12f13c
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
d74b98a
 
a12f13c
 
d74b98a
 
a12f13c
 
d74b98a
 
a12f13c
39e4cd3
 
 
 
a12f13c
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39e4cd3
a12f13c
 
 
d74b98a
 
a12f13c
 
 
 
 
 
 
 
 
 
d74b98a
39e4cd3
a12f13c
 
d74b98a
 
a12f13c
 
39e4cd3
 
 
 
a12f13c
d74b98a
a12f13c
 
5bb5bea
 
 
 
a12f13c
 
 
 
 
 
 
 
5bb5bea
 
 
 
a12f13c
 
5bb5bea
 
 
a12f13c
 
 
 
 
 
 
 
 
 
 
 
 
5bb5bea
 
 
a12f13c
 
 
d74b98a
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
 
 
5bb5bea
39e4cd3
a12f13c
 
 
39e4cd3
a12f13c
 
 
5bb5bea
39e4cd3
a12f13c
39e4cd3
d74b98a
 
a12f13c
 
 
 
 
 
 
 
 
39e4cd3
 
a12f13c
 
 
 
 
 
d74b98a
39e4cd3
 
a12f13c
 
 
 
 
 
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
d74b98a
a12f13c
 
 
 
 
 
 
 
 
39e4cd3
a12f13c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d74b98a
 
 
 
 
 
4d7533d
 
 
da0c6a1
d74b98a
 
a12f13c
d74b98a
4d7533d
 
d74b98a
a12f13c
 
 
 
 
 
 
 
 
 
 
 
 
da0c6a1
 
a12f13c
67aedc8
 
 
 
 
 
da0c6a1
67aedc8
d74b98a
 
39e4cd3
d74b98a
4d7533d
a152e7c
 
a12f13c
 
 
d74b98a
39e4cd3
a12f13c
 
da0c6a1
a12f13c
 
 
da0c6a1
a12f13c
 
d74b98a
a12f13c
da0c6a1
39e4cd3
5bb5bea
d74b98a
a12f13c
 
d74b98a
a12f13c
 
 
da0c6a1
a12f13c
 
 
 
 
 
 
 
d74b98a
a12f13c
39e4cd3
a12f13c
d74b98a
a12f13c
 
 
 
67aedc8
a12f13c
 
39e4cd3
67aedc8
39e4cd3
 
a650320
 
 
 
39e4cd3
a650320
39e4cd3
 
a650320
 
39e4cd3
a12f13c
da0c6a1
a12f13c
 
da0c6a1
a650320
a12f13c
a650320
da0c6a1
a12f13c
 
a650320
a12f13c
 
39e4cd3
a12f13c
99f2d40
4d7533d
99f2d40
d74b98a
99f2d40
 
 
 
 
 
 
a12f13c
99f2d40
 
a12f13c
d74b98a
 
 
 
 
 
 
99f2d40
f724e04
 
 
 
 
 
 
a152e7c
99f2d40
2e619fa
 
4d7533d
2e619fa
d74b98a
2e619fa
 
 
 
 
 
 
 
 
 
 
 
a12f13c
 
2e619fa
 
39e4cd3
2e619fa
 
 
39e4cd3
 
 
 
2e619fa
ee62e00
 
 
39e4cd3
ee62e00
d74b98a
 
39e4cd3
ee62e00
d74b98a
 
2e619fa
d74b98a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
from groq import Groq
from fastapi import FastAPI, HTTPException, Response
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
from bs4 import BeautifulSoup
from typing import List, Dict
import email as email_lib
import json
import os
import re
import hashlib
import subprocess
import tempfile
from dotenv import load_dotenv
from datetime import datetime, timedelta, timezone
from urllib.parse import urlparse, urlunparse, parse_qs, parse_qsl, urlencode, unquote

import firebase_admin
from firebase_admin import credentials, firestore


# ─────────────────────────────────────────
# 1. LOAD ENVIRONMENT VARIABLES
# ─────────────────────────────────────────
# Load environment variables (python-dotenv) before any of them are read.
load_dotenv()
# Groq API key; os.getenv yields None when the variable is not set.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
# Module-level Groq client used by the LLM-calling code in this file.
groq_client = Groq(api_key=GROQ_API_KEY)


# ─────────────────────────────────────────
# 2. INITIALIZE FIREBASE
# ─────────────────────────────────────────
# Prefer service-account credentials supplied inline as JSON through the
# FIREBASE_CREDENTIALS environment variable; otherwise fall back to the
# "firebase-credentials.json" key file.
firebase_secret = os.getenv("FIREBASE_CREDENTIALS")
if firebase_secret:
    cred_dict = json.loads(firebase_secret)
    cred = credentials.Certificate(cred_dict)
else:
    cred = credentials.Certificate("firebase-credentials.json")

# Initialise the Firebase app at import time and keep one Firestore client,
# referenced as `db` by the helpers and routes below.
firebase_admin.initialize_app(cred)
db = firestore.client()

# FastAPI application object; the route decorators further down register on it.
app = FastAPI(title="JobPulse AI Parser")


# ─────────────────────────────────────────
# PYDANTIC MODELS
# ─────────────────────────────────────────
class EmailPayload(BaseModel):
    # Request body of POST /api/parse-email.
    # user_email: looked up against the "email" field of the "users" collection.
    user_email: str
    # email_text: raw email to parse (RFC-2822 message or bare HTML).
    email_text: str


class JDPayload(BaseModel):
    # Request body of POST /api/extract-skills.
    # jd_text: job description; it is run through BeautifulSoup, so HTML is accepted.
    jd_text: str


class LatexPayload(BaseModel):
    # Request body of POST /api/compile-latex.
    # latex_code: complete LaTeX source, written verbatim to resume.tex and compiled.
    latex_code: str


# ═════════════════════════════════════════════════════════════════
# STAGE 0: MIME + Quoted-Printable Decoder
# Emails arriving as raw RFC-2822 messages are:
#   - Multipart MIME  ->  must extract only the text/html part
#   - QP-encoded      ->  =3D means =, line-ending = means line continuation
# Running quopri on the full raw email (headers + body) corrupts everything.
# Python stdlib `email` module splits MIME correctly first.
# ═════════════════════════════════════════════════════════════════
def extract_html_from_email(raw: str) -> str:
    """
    Return the decoded HTML body of a raw RFC-2822 email.

    The message is split into MIME parts with the stdlib ``email`` package
    and the first ``text/html`` part that carries a payload is decoded;
    ``get_payload(decode=True)`` undoes base64 / quoted-printable.

    Args:
        raw: Full raw email text, or an already-plain HTML document.

    Returns:
        The decoded HTML string. When the message has no ``text/html`` part,
        or parsing fails, ``raw`` is returned unchanged so the caller can
        treat the input as plain HTML.
    """
    try:
        msg = email_lib.message_from_string(raw)
        for part in msg.walk():
            if part.get_content_type() != "text/html":
                continue
            payload = part.get_payload(decode=True)
            if payload is None:
                # Nothing decodable in this part; keep looking.
                continue
            charset = part.get_content_charset() or "utf-8"
            try:
                return payload.decode(charset, errors="replace")
            except LookupError:
                # Unknown or misspelled charset label: decoding as UTF-8 with
                # replacement keeps the HTML usable instead of discarding it
                # and handing back the raw message (headers included).
                return payload.decode("utf-8", errors="replace")
    except Exception:
        # Best effort by design: any parsing problem falls back to the input.
        return raw
    # No HTML part found; the input may already be plain HTML.
    return raw


# ═════════════════════════════════════════════════════════════════
# STAGE 1: Platform Detector
# ═════════════════════════════════════════════════════════════════
def detect_platform(soup: BeautifulSoup, raw_text: str) -> str:
    """
    Guess which job platform sent the email.

    Link targets are checked first, in a fixed priority order; if no anchor
    points at a known domain, the visible text is searched for a platform
    name. Returns the platform key, or "generic" when nothing matches.
    """
    hrefs = " ".join(
        [anchor.get("href", "") for anchor in soup.find_all("a", href=True)]
    ).lower()
    body = raw_text.lower()

    # (platform key, domains that identify it) in priority order.
    domain_rules = (
        ("glassdoor", ("glassdoor.com",)),
        ("linkedin", ("linkedin.com",)),
        ("naukri", ("naukri.com",)),
        ("foundit", ("foundit.in", "monster.com")),
        ("indeed", ("indeed.com",)),
        ("instahyre", ("instahyre.com",)),
    )
    for platform_key, domains in domain_rules:
        for domain in domains:
            if domain in hrefs:
                return platform_key

    # Text fallback covers only these three platform names.
    for platform_key in ("glassdoor", "linkedin", "naukri"):
        if platform_key in body:
            return platform_key

    return "generic"


# ═════════════════════════════════════════════════════════════════
# STAGE 2: URL Utilities
# ═════════════════════════════════════════════════════════════════
# Query-parameter names that carry only tracking/session data and are dropped
# by clean_url(). clean_url() compares the LOWER-CASED parameter name against
# this set, so every entry here must itself be lower-case.
JUNK_PARAMS = {
    "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term",
    "jrtk", "guid", "ja", "uido", "cs", "cb", "ao", "s", "vt", "ea",
    "tgt", "src", "t", "pos",
    "trackingid", "refid", "lipi", "midtoken", "midsig", "trk", "trkemail", "eid", "otptoken",
    # "autoapply" was previously spelled "autoApply" and therefore never matched.
    "spl", "notification_frequency", "autoapply", "jr_source", "apop", "notificationid", "response", "type",
    # Indeed tracking - 'jk' is intentionally NOT here, it is the job ID
    "qd", "rd", "tk", "alid", "bb", "mo", "ad", "xkcb", "camk", "p", "jsa", "rjs", "gdfvj", "plid", "fvj",
}

# Substrings that mark a link as NOT a job posting (unsubscribe/settings pages,
# social links, images, app-store links, ...). is_job_link() tests each entry
# with a plain `in` check against the unquoted, lower-cased URL.
# NOTE(review): because matching is by substring, short entries such as
# "manage", "feed" or "promo" also match inside longer words (e.g. "manage"
# in a ".../product-manager-..." job URL) - confirm this is acceptable.
NOISE_SIGNALS = [
    "unsubscribe", "privacy", "terms", "manage", "email-pref",
    "brand-views", "brandview", "wf/open", "logomark", "logo.png",
    "easy-apply-icon", "location-icon", "bell-icon", "jobmatch",
    "twitter.com", "facebook.com", "instagram.com", "youtube.com",
    "glassdoor.com/about", "mailto:", "jobalertajax", "emailsettings",
    "job-alert/jobalert", "job-alert-email-unsubscribe", "jobs/alerts",
    "jobs/search", "comm/feed", "comm/mynetwork", "comm/messaging",
    "comm/notifications", "comm/premium", "comm/widgets",
    "linkedin.com/help", "in.linkedin.com/comm/in/",
    "static.licdn.com", "media.licdn.com",
    "naukri.com/mnjuser", "naukri.com/user",
    "seeker/dashboard", "seeker/profile", "seeker/jobalert-feedback",
    "trex/unsubscribe", "appurl.io", "play.google.com", "itunes.apple.com",
    "media.monsterindia.com", "media.foundit.in",
    "widget", "promo", "feed", "mynetwork",
]

# Per-platform substrings that identify a link as a job posting. is_job_link()
# looks the platform up here (falling back to "generic") and tests each signal
# against the unquoted, lower-cased URL.
# NOTE(review): the "foundit" signal contains an upper-case "L" and so cannot
# match a lower-cased URL; foundit autologin links are instead handled by a
# dedicated branch in is_job_link() - confirm the entry is intentional.
PLATFORM_JOB_SIGNALS = {
    "glassdoor": ["/partner/joblisting", "joblistingid="],
    "linkedin":  ["/comm/jobs/view/", "/jobs/view/"],
    "naukri":    ["/job-listings-", "naukri.com/view"],
    "foundit":   ["/rio/autoLogin/"],
    "indeed":    ["/viewjob", "indeed.com/rc/clk", "indeed.com/pagead/clk", "cts.indeed.com"],
    "instahyre": ["instahyre.com/job-"],
    "generic":   ["/job", "/career", "/apply", "/position", "/vacancy"],
}


def unwrap_autologin_url(url: str) -> str:
    """
    Resolve a wrapped/redirect link to the job URL it points at.

    Two wrappers are recognised:
      * any URL whose percent-decoded form embeds an Instahyre job link -
        the canonical ``https://www.instahyre.com/job-<slug>/`` is returned;
      * ``.../autoLogin/...`` URLs - the ``return_url`` query value is returned.

    Anything else (or any error while parsing) yields ``url`` unchanged.
    """
    try:
        decoded = unquote(url)
        instahyre_hit = (
            re.search(r"(https://www\.instahyre\.com/job-[^/?]+)", decoded)
            if "instahyre.com/job-" in decoded
            else None
        )
        if instahyre_hit is not None:
            return instahyre_hit.group(1) + "/"

        pieces = urlparse(url)
        # A path containing "/rio/autoLogin/" necessarily contains "/autoLogin/",
        # so this single membership test covers both spellings.
        if "/autoLogin/" in pieces.path:
            targets = parse_qs(pieces.query).get("return_url", [None])
            if targets[0]:
                return targets[0]
    except Exception:
        return url
    return url


def clean_url(url: str) -> str:
    """
    Normalise a job link: unwrap redirects and strip tracking parameters.

    Query parameters whose lower-cased name is in JUNK_PARAMS are removed.
    For URLs that contain one of the known "clean" job path markers the
    whole query string is dropped. On any error the (possibly already
    unwrapped) URL is returned as-is.
    """
    try:
        url = unwrap_autologin_url(url)
        pieces = urlparse(url)

        kept_pairs = []
        for key, value in parse_qsl(pieces.query, keep_blank_values=True):
            if key.lower() in JUNK_PARAMS:
                continue
            kept_pairs.append((key, value))

        pieces = pieces._replace(query=urlencode(kept_pairs))
        cleaned = urlunparse(pieces)

        # These paths identify the job on their own, so no query is needed.
        for marker in ("/comm/jobs/view/", "/jobs/view/", "/job/", "/job-listings-"):
            if marker in cleaned:
                return urlunparse(pieces._replace(query=""))
        return cleaned
    except Exception:
        return url


def is_job_link(url: str, platform: str = "generic") -> bool:
    """
    Decide whether ``url`` points at a job posting for ``platform``.

    A link is rejected as soon as any NOISE_SIGNALS entry occurs in its
    unquoted, lower-cased form. Foundit autologin links are judged by the
    URL they unwrap to; everything else must contain one of the platform's
    PLATFORM_JOB_SIGNALS (unknown platforms use the "generic" signals).
    """
    lowered = unquote(url).lower()

    for noise in NOISE_SIGNALS:
        if noise in lowered:
            return False

    if platform == "foundit" and "/rio/autologin/" in lowered:
        return "/job/" in unwrap_autologin_url(url).lower()

    wanted = PLATFORM_JOB_SIGNALS.get(platform, PLATFORM_JOB_SIGNALS["generic"])
    for signal in wanted:
        if signal in lowered:
            return True
    return False


# ═════════════════════════════════════════════════════════════════
# STAGE 3: Platform-Specific Card Extractors
# CRITICAL: Each card gets its OWN individual job_link.
# We never extract one link and paste it across multiple cards.
# ═════════════════════════════════════════════════════════════════

def extract_glassdoor(soup: BeautifulSoup) -> List[Dict]:
    """
    Build one card per Glassdoor card table (class "gd-dbe9ce2b4a").

    Each card takes the first job link found inside its own table, plus the
    company span and role paragraph. Cards with neither role nor company
    are discarded.
    """
    containers = soup.find_all("table", class_="gd-dbe9ce2b4a")
    print(f"   [Glassdoor] Found {len(containers)} card containers")

    results = []
    for container in containers:
        # First anchor in this table that qualifies as a job link, if any.
        link = next(
            (
                clean_url(anchor["href"])
                for anchor in container.find_all("a", href=True)
                if is_job_link(anchor["href"], "glassdoor")
            ),
            None,
        )
        company_node = container.find("span", class_="gd-628b46d9ce")
        role_node = container.find("p", class_="gd-6c2846d4dc")
        entry: Dict = {
            "company": company_node.get_text(strip=True) if company_node else "",
            "role": role_node.get_text(strip=True) if role_node else "",
            "job_link": link,
        }
        if entry["role"] or entry["company"]:
            results.append(entry)
    return results


def extract_linkedin(soup: BeautifulSoup) -> List[Dict]:
    """
    Build one card per LinkedIn job-card cell (<td data-test-id="job-card">).

    For each cell:
      * job_link - first anchor inside the cell that qualifies as a job link;
      * role     - text of the anchor whose class contains "font-bold" and "text-md";
      * company  - text of the paragraph whose class contains
                   "text-system-gray-100", cut at the first middle dot.

    Cards with neither role nor company are discarded.
    """
    cards = []
    card_tds = soup.find_all("td", attrs={"data-test-id": "job-card"})
    print(f"   [LinkedIn] Found {len(card_tds)} job-card containers")
    for card_td in card_tds:
        card: Dict = {"company": "", "role": "", "job_link": None}
        for a_tag in card_td.find_all("a", href=True):
            href = a_tag["href"]
            if is_job_link(href, "linkedin"):
                card["job_link"] = clean_url(href)
                break
        role_a = card_td.find("a", class_=lambda c: c and "font-bold" in c and "text-md" in c)
        if role_a:
            card["role"] = role_a.get_text(strip=True)
        company_p = card_td.find("p", class_=lambda c: c and "text-system-gray-100" in c)
        if company_p:
            raw = company_p.get_text(strip=True)
            # The separator is the middle dot U+00B7. The delimiter literal
            # used to be a mis-encoded two-character sequence, which never
            # occurs in correctly decoded text, so the line was never split.
            # Mis-decoded variants are folded into the real middle dot first
            # so both forms are handled.
            normalized = raw.replace("\u0392\u00b7", "\u00b7").replace("\u00c2\u00b7", "\u00b7")
            card["company"] = normalized.split("\u00b7", 1)[0].strip()
        if card["role"] or card["company"]:
            cards.append(card)
    return cards


def extract_indeed(soup: BeautifulSoup) -> List[Dict]:
    """
    Build one card per Indeed title anchor (<a class="strong-text-link">).

    The anchor's own href is that job's link (kept only if it qualifies as
    an Indeed job link), its text is the role, and the company is the first
    " | "-separated text fragment of the table row following the title's row.
    """
    title_anchors = soup.find_all("a", class_="strong-text-link")
    print(f"   [Indeed] Found {len(title_anchors)} job title links")

    found = []
    for anchor in title_anchors:
        target = anchor.get("href")
        link = clean_url(target) if target and is_job_link(target, "indeed") else None
        role = anchor.get_text(strip=True)

        company = ""
        title_row = anchor.find_parent("tr")
        following_row = title_row.find_next_sibling("tr") if title_row else None
        if following_row:
            row_text = following_row.get_text(separator=" | ", strip=True)
            company = row_text.split(" | ")[0].strip()

        if role or company:
            found.append({"company": company, "role": role, "job_link": link})
    return found


def extract_instahyre(soup: BeautifulSoup) -> List[Dict]:
    """
    Build one card per Instahyre <div class="job-block">.

    The first anchor in the block supplies the link (if it qualifies as an
    Instahyre job link). When the block has at least two <strong> tags, the
    first is the company and the second the role.
    """
    blocks = soup.find_all("div", class_="job-block")
    print(f"   [Instahyre] Found {len(blocks)} job blocks")

    found = []
    for block in blocks:
        link = None
        first_anchor = block.find("a", href=True)
        if first_anchor and is_job_link(first_anchor["href"], "instahyre"):
            link = clean_url(first_anchor["href"])

        company = role = ""
        emphasised = block.find_all("strong")
        if len(emphasised) >= 2:
            company, role = (tag.get_text(strip=True) for tag in emphasised[:2])

        if role or company:
            found.append({"company": company, "role": role, "job_link": link})
    return found


def extract_naukri(soup: BeautifulSoup) -> List[Dict]:
    """Extract Naukri cards via the generic anchor scan with Naukri link signals."""
    naukri_cards = _generic_extract(soup, platform="naukri")
    return naukri_cards


def extract_foundit(soup: BeautifulSoup) -> List[Dict]:
    """Extract Foundit cards via the generic anchor scan with Foundit link rules."""
    foundit_cards = _generic_extract(soup, platform="foundit")
    return foundit_cards


def _generic_extract(soup: BeautifulSoup, platform: str = "generic") -> List[Dict]:
    """
    Fallback extractor: every distinct job link in the document becomes a card.

    The anchor text is used as the role. The "company" field holds the text
    (truncated to 200 characters) of the nearest td/div/li/tr/table ancestor
    whose full text is shorter than 400 characters, or "" if there is none.
    """
    results = []
    visited: set = set()

    for anchor in soup.find_all("a", href=True):
        target = anchor["href"]
        if not is_job_link(target, platform):
            continue
        link = clean_url(target)
        if link in visited:
            continue
        visited.add(link)

        # Walk outwards and take the first small-enough container's text.
        context = ""
        for ancestor in anchor.parents:
            if ancestor.name not in ["td", "div", "li", "tr", "table"]:
                continue
            candidate = ancestor.get_text(separator=" | ", strip=True)
            if len(candidate) < 400:
                context = candidate
                break

        results.append(
            {
                "company": context[:200],
                "role": anchor.get_text(strip=True),
                "job_link": link,
            }
        )

    print(f"   [Generic/{platform}] Found {len(results)} unique job links")
    return results


# Dispatch table: platform key (as returned by detect_platform) -> extractor.
# extract_cards() calls the selected function with the soup only, so
# _generic_extract runs with its default platform="generic" when chosen here.
PLATFORM_EXTRACTORS = {
    "glassdoor": extract_glassdoor,
    "linkedin":  extract_linkedin,
    "naukri":    extract_naukri,
    "foundit":   extract_foundit,
    "indeed":    extract_indeed,
    "instahyre": extract_instahyre,
    "generic":   _generic_extract,
}


def extract_cards(soup: BeautifulSoup, platform: str) -> List[Dict]:
    """Run the extractor registered for ``platform``; unknown keys use the generic one."""
    if platform in PLATFORM_EXTRACTORS:
        handler = PLATFORM_EXTRACTORS[platform]
    else:
        handler = _generic_extract
    return handler(soup)


# ═════════════════════════════════════════════════════════════════
# STAGE 4: Bouncer
# ═════════════════════════════════════════════════════════════════
# Lower-case keywords used by is_job_email(): an email passes the bouncer when
# ANY of these occurs as a substring of its lower-cased text.
JOB_KEYWORDS = [
    "applied", "application", "interview", "rejection", "job alert",
    "offer", "hiring", "shortlisted", "assessment", "jobs", "apply",
    "internship", "intern", "career", "glassdoor", "linkedin", "naukri",
    "opportunity", "resume", "foundit", "indeed", "instahyre",
    "position", "role", "vacancy", "opening",
]


def is_job_email(text: str) -> bool:
    """Return True when the lower-cased text contains any JOB_KEYWORDS entry."""
    haystack = text.lower()
    for keyword in JOB_KEYWORDS:
        if keyword in haystack:
            return True
    return False


# ═════════════════════════════════════════════════════════════════
# STAGE 5: LLM Enrichment
# Cards have company, role, job_link already set correctly.
# LLM adds: status, sourcePlatform, domainCategory, coreTech, interpretation.
# After LLM returns, we FORCE re-inject the original job_link from the card
# so even if LLM disobeys, the correct link is always used.
# ═════════════════════════════════════════════════════════════════

# Prompt template for card enrichment. It is filled with str.format() and has
# exactly three placeholders: {platform}, {email_text} and {card_summary}.
# The text previously contained mis-encoded punctuation (dashes and the middle
# dot in rule 7); it is restored here so the model sees the real separator.
LLM_CARD_PROMPT = """
You are a structured data extraction engine for a job application tracker.
You receive pre-parsed job cards AND the full original email text as context.

Each card has: company, role, job_link (job_link was extracted by code — do NOT change it).
Company and role may be empty or wrong — use the FULL EMAIL TEXT below to find the correct values.

Return a JSON ARRAY — one object per card, SAME COUNT and SAME ORDER as input.

STRICT RULES:
1. Return ONLY a raw JSON array []. No markdown, no backticks, no explanation.
2. Exactly one object per card — same count, same order as input.
3. Copy job_link EXACTLY as given. Never modify, guess, or omit it.
4. If job_link is null, output null (not the string "null").
5. For companyName: if the card value is empty/Unknown/wrong, find the REAL hiring company name from the EMAIL TEXT. Never output "Unknown Company" if the email text contains the company name.
6. For jobRole: if the card value is empty, find the real job title from the EMAIL TEXT.
7. Clean company: if "CompanyName · Location" format, extract only company name.
8. Clean role: remove extra whitespace or codes like [T500-25894].

FIELDS per object:
- "companyName": string — real hiring company name (use email text if card value is missing)
- "jobRole": string — clean job title (use email text if card value is missing)
- "jobLink": string or null — EXACT copy of job_link provided, never change this
- "status": one of: "Opportunity" | "Applied" | "Interview" | "Selection" | "Rejection"
    * Opportunity  = job alert, new opening not yet applied to
    * Applied      = application submitted confirmation
    * Interview    = interview or assessment invite
    * Selection    = offer letter or selected to proceed
    * Rejection    = application declined
- "sourcePlatform": one of: LinkedIn, Naukri, Indeed, Glassdoor, Wellfound, Instahyre, Workday, Greenhouse, Direct Email, Company Portal, Other
- "domainCategory": e.g. "Mobile Development", "Backend Engineering", "Data Science", "DevOps", "Frontend", "Full Stack", "Design", "Product Management", "Other"
- "coreTech": array of 1-3 strings — tech skills inferred from the role title
- "interpretation": 1 sentence describing what this role involves for the applicant

SOURCE PLATFORM HINT: {platform}

FULL EMAIL TEXT (use this to fill missing company/role):
{email_text}

JOB CARDS:
{card_summary}
"""


def build_card_summary(cards: List[Dict]) -> str:
    """
    Render cards as numbered text blocks for the LLM prompt.

    Blocks are numbered from 1 and separated by a blank line. Missing or
    empty fields are shown as "Unknown" (company), "Unspecified" (role) and
    "null" (job_link).
    """

    def render(position: int, card: Dict) -> str:
        company = card.get("company") or "Unknown"
        role = card.get("role") or "Unspecified"
        link = card.get("job_link") or "null"
        return (
            f"Job {position}:\n"
            f"  company: {company}\n"
            f"  role: {role}\n"
            f"  job_link: {link}"
        )

    return "\n\n".join(render(n, card) for n, card in enumerate(cards, start=1))


def enrich_cards_with_llm(cards: List[Dict], platform: str, email_text: str = "") -> List[Dict]:
    """
    Enrich extracted cards with LLM-derived fields, ten cards per request.

    Every input card yields exactly one output record, in input order:
      * objects returned by the model are paired with cards by position;
      * a card with no usable object (request failed, reply unparseable,
        reply shorter than the chunk, or a non-dict item) gets a fallback
        record built from the card itself, so no card is silently dropped;
      * surplus objects returned by the model are ignored;
      * ``jobLink`` is always overwritten with the card's own ``job_link``,
        so a link changed or invented by the model can never be stored.

    Args:
        cards: Cards with "company", "role" and "job_link" keys.
        platform: Platform key, passed to the prompt as a hint.
        email_text: Plain text of the email; only the first 3000 characters
            are sent, to limit prompt size.

    Returns:
        One enriched dict per input card.
    """
    all_results: List[Dict] = []
    chunk_size = 10

    for i in range(0, len(cards), chunk_size):
        chunk = cards[i : i + chunk_size]
        prompt = LLM_CARD_PROMPT.format(
            platform=platform.capitalize(),
            email_text=email_text[:3000],  # cap to avoid token overflow
            card_summary=build_card_summary(chunk),
        )
        print(f"🧠 Enriching cards {i + 1}–{i + len(chunk)} via Groq...")

        try:
            response = groq_client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
            )
            batch_result = _safe_parse_json(response.choices[0].message.content or "")
        except Exception as e:
            print(f"⚠️ LLM enrichment failed for chunk {i + 1}–{i + len(chunk)}: {e}")
            batch_result = []

        for j, card in enumerate(chunk):
            enriched = batch_result[j] if j < len(batch_result) else None
            if not isinstance(enriched, dict):
                enriched = _fallback_enrichment(card, platform)
            # HARD SAFETY: the link found by the card extractor always wins.
            enriched["jobLink"] = card.get("job_link")
            all_results.append(enriched)

    return all_results


def _fallback_enrichment(card: Dict, platform: str) -> Dict:
    """Build a minimally enriched record straight from an extracted card."""
    return {
        "companyName": card.get("company") or "Unknown Company",
        "jobRole": card.get("role") or "Unspecified Role",
        "jobLink": card.get("job_link"),
        "status": "Opportunity",
        "sourcePlatform": platform.capitalize(),
        "domainCategory": "Other",
        "coreTech": [],
        "interpretation": "Could not enrich — LLM call failed.",
    }


def _safe_parse_json(raw_text: str) -> list:
    """
    Pull a JSON array out of an LLM reply, salvaging what it can.

    Markdown code fences are removed and the span from the first "[" to the
    last "]" is parsed. If that span is not valid JSON, each flat
    ``{...}`` fragment inside it is parsed on its own and the ones that
    succeed are returned. Returns [] when no array is present.
    """
    stripped = raw_text.replace("```json", "").replace("```", "").strip()
    found = re.search(r"\[.*\]", stripped, re.DOTALL)
    if found is None:
        print("⚠️ No JSON array found in LLM response.")
        return []

    array_text = found.group()
    try:
        return json.loads(array_text)
    except json.JSONDecodeError as e:
        print(f"⚠️ JSON parse failed: {e}")

    # Salvage pass: objects without nested braces, parsed one at a time.
    salvaged = []
    for fragment in re.findall(r"\{[^{}]+\}", array_text, re.DOTALL):
        try:
            salvaged.append(json.loads(fragment))
        except Exception:
            continue
    if salvaged:
        print(f"   Salvaged {len(salvaged)} partial objects.")
    return salvaged


# ═════════════════════════════════════════════════════════════════
# FIREBASE HELPERS
# ═════════════════════════════════════════════════════════════════
def generate_job_fingerprint(user_email: str, job: dict) -> str:
    """
    Return the deduplication key for a job: the MD5 hex digest of the
    lower-cased string "<user_email>|<companyName>|<jobRole>".

    Missing keys contribute an empty string. The digest is used as a
    Firestore document ID, not for any security purpose.
    """
    company = job.get("companyName", "")
    role = job.get("jobRole", "")
    key_text = "{}|{}|{}".format(user_email, company, role).lower()
    digest = hashlib.md5(key_text.encode())
    return digest.hexdigest()


def cleanup_expired_jobs(user_doc_id: str) -> None:
    """
    Delete the user's applications whose "expireAt" is earlier than now (UTC).

    All deletions go into one Firestore batch that is committed only when at
    least one document matched. Errors are printed and swallowed so that a
    failed sweep never blocks the caller.
    """
    try:
        cutoff = datetime.now(timezone.utc)
        applications = (
            db.collection("users").document(user_doc_id).collection("applications")
        )
        stale_docs = applications.where("expireAt", "<", cutoff).stream()

        pending = db.batch()
        removed = 0
        for removed, snapshot in enumerate(stale_docs, start=1):
            pending.delete(snapshot.reference)

        if removed > 0:
            pending.commit()
            print(f"🧹 Sweeper: Deleted {removed} expired jobs.")
    except Exception as e:
        print(f"⚠️ Sweeper Error: {e}")


def extract_json_array(raw_text: str) -> list:
    """
    Parse the JSON array embedded in ``raw_text``.

    Markdown code fences are stripped, then the span from the first "[" to
    the last "]" is decoded. Returns [] when there is no such span or it is
    not valid JSON.
    """
    without_fences = raw_text.replace("```json", "").replace("```", "").strip()
    found = re.search(r"\[.*\]", without_fences, re.DOTALL)
    if found is None:
        return []
    try:
        parsed = json.loads(found.group())
    except json.JSONDecodeError:
        return []
    return parsed


# ═════════════════════════════════════════════════════════════════
# ROUTES
# ═════════════════════════════════════════════════════════════════

@app.get("/", response_class=HTMLResponse)
def get_testing_ui():
    # Landing page at "/": a fixed HTML banner showing the server is running.
    banner = "<h1>JobPulse Server is Running!</h1>"
    return banner


# ─────────────────────────────────────────
# ROUTE 1: Parse Email → Extract Cards → Enrich → Save to Firebase
# ─────────────────────────────────────────
@app.post("/api/parse-email")
def parse_email_with_ai(payload: EmailPayload):
    # Pipeline: decode email -> bouncer -> find user -> sweep expired jobs ->
    # detect platform -> extract cards -> LLM enrichment -> deduplicated
    # Firestore batch write.
    #
    # Returns a dict with "status" and "message"; when jobs were processed it
    # also carries "platform", "cardsExtracted" and "data" (the enriched jobs).
    # Raises HTTPException(404) when no user document has the given email.

    # STEP 1: Decode MIME + QP properly
    html_body = extract_html_from_email(payload.email_text)

    # STEP 2: Parse HTML, strip noise tags
    soup = BeautifulSoup(html_body, "html.parser")
    for tag in soup(["script", "style", "meta", "noscript", "head"]):
        tag.extract()

    raw_text = soup.get_text(separator=" ", strip=True)

    # STEP 3: Bouncer
    if not is_job_email(raw_text):
        print("🛡️ BOUNCER: Not a job email. Skipped.")
        return {"status": "success", "message": "Ignored: Not a job email."}

    # STEP 4: Find user in Firebase
    users_ref = db.collection("users")
    query = users_ref.where("email", "==", payload.user_email).limit(1).stream()
    user_doc_id = None
    for doc in query:
        user_doc_id = doc.id
        break

    if not user_doc_id:
        raise HTTPException(
            status_code=404,
            detail=f"User with email {payload.user_email} not found in database.",
        )

    cleanup_expired_jobs(user_doc_id)

    # STEP 5: Detect platform
    platform = detect_platform(soup, raw_text)
    print(f"🎯 Detected platform: {platform.upper()}")

    # STEP 6: Extract job cards — each card gets its OWN individual link
    print("📦 Extracting job cards...")
    cards = extract_cards(soup, platform)

    if not cards:
        print("⚠️ No cards found. Trying generic fallback...")
        cards = _generic_extract(soup, "generic")

    if not cards:
        return {"status": "success", "message": "No job listings found in this email."}

    print(f"✅ Extracted {len(cards)} job cards — each with its own unique link.")

    # STEP 7: Enrich with LLM (adds status, coreTech, domainCategory, etc.)
    enriched_jobs = enrich_cards_with_llm(cards, platform, email_text=raw_text)

    if not enriched_jobs:
        return {"status": "success", "message": "LLM enrichment returned no results."}

    # STEP 8: IST timestamp
    ist_tz = timezone(timedelta(hours=5, minutes=30))
    exact_timestamp = datetime.now(ist_tz).strftime("%H-%M %d/%m/%Y")

    # STEP 9: Firebase batch write with deduplication + TTL
    batch = db.batch()
    applications_ref = (
        db.collection("users")
        .document(user_doc_id)
        .collection("applications")
    )
    expiry_date = datetime.now(timezone.utc) + timedelta(days=60)

    saved_count = 0
    updated_count = 0
    skipped_count = 0

    for job in enriched_jobs:
        job["dateApplied"] = exact_timestamp
        # Only plain opportunities carry a TTL; cleanup_expired_jobs() deletes
        # every document whose "expireAt" lies in the past.
        if job.get("status") == "Opportunity":
            job["expireAt"] = expiry_date

        fingerprint = generate_job_fingerprint(payload.user_email, job)
        job_doc_ref = applications_ref.document(fingerprint)
        existing_snap = job_doc_ref.get()

        if existing_snap.exists:
            existing_status = existing_snap.to_dict().get("status")
            new_status = job.get("status")
            if existing_status != new_status and new_status != "Opportunity":
                batch.update(job_doc_ref, {
                    "status": new_status,
                    "dateApplied": exact_timestamp,
                    # The job has moved past "Opportunity": drop the TTL set
                    # when it was first stored, otherwise the sweeper would
                    # later delete an applied/interviewing job. Deleting a
                    # field that is absent is harmless.
                    "expireAt": firestore.DELETE_FIELD,
                })
                updated_count += 1
                print(f"🔄 Updated status: {job.get('companyName')} → {new_status}")
            else:
                skipped_count += 1
                print(f"⏭️  Skipped duplicate: {job.get('companyName')} - {job.get('jobRole')}")
            continue

        batch.set(job_doc_ref, job)
        saved_count += 1

    # Commit only when something was actually queued.
    if (saved_count + updated_count) > 0:
        batch.commit()
        print(f"💾 Firebase: Saved {saved_count} new jobs, Updated {updated_count} jobs.")

    return {
        "status": "success",
        "message": f"Saved {saved_count} jobs. Updated {updated_count}. Skipped {skipped_count} duplicates.",
        "platform": platform,
        "cardsExtracted": len(cards),
        "data": enriched_jobs,
    }


# ─────────────────────────────────────────
# ROUTE 2: JD Skill Extractor
# ─────────────────────────────────────────
@app.post("/api/extract-skills")
def extract_jd_skills(payload: JDPayload):
    # Strips markup from the job description, asks the LLM for the core hard
    # skills and returns them as {"status": "success", "skills": [...]}.
    # Raises HTTPException(400) for text shorter than 50 characters and
    # HTTPException(500) when the LLM call fails.
    clean_jd = BeautifulSoup(payload.jd_text, "html.parser").get_text(separator="\n", strip=True)

    # An empty string also fails this length check.
    if len(clean_jd) < 50:
        raise HTTPException(status_code=400, detail="Job description text is too short or empty.")

    prompt = f"""
Extract the top 5 to 10 core 'hard skills' (technical skills, tools, languages, frameworks)
from the following Job Description. Ignore soft skills like communication or teamwork.
OUTPUT FORMAT: Return ONLY a raw JSON array of strings. No markdown, no explanation.
Example: ["Python", "SQL", "React", "AWS", "Docker"]

Job Description:
{clean_jd}
"""
    try:
        completion = groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,
        )
        skills = extract_json_array(completion.choices[0].message.content)
        return {"status": "success", "skills": skills or []}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


# ─────────────────────────────────────────
# ROUTE 3: LaTeX Resume → PDF Compiler
# ─────────────────────────────────────────
@app.post("/api/compile-latex")
def compile_latex_to_pdf(payload: LatexPayload):
    # Compiles the submitted LaTeX in a throw-away directory and returns the
    # PDF as an attachment named "Tailored_Resume.pdf".
    #
    # The LaTeX source comes from the request body, i.e. it is untrusted:
    #   * -no-shell-escape explicitly disables \write18 command execution;
    #   * each pdflatex pass is bounded by a timeout so a looping document
    #     cannot tie up the worker indefinitely;
    #   * stdin is closed so pdflatex can never block waiting for input.
    #
    # Raises HTTPException(500) when no PDF is produced, on timeout, or on any
    # other failure (e.g. pdflatex not installed).
    timeout_seconds = 60
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            tex_file_path = os.path.join(temp_dir, "resume.tex")
            pdf_file_path = os.path.join(temp_dir, "resume.pdf")

            with open(tex_file_path, "w", encoding="utf-8") as f:
                f.write(payload.latex_code)

            command = [
                "pdflatex",
                "-interaction=nonstopmode",
                "-no-shell-escape",
                "-output-directory", temp_dir,
                tex_file_path,
            ]
            # Two passes so cross-references are resolved. The compiler
            # output is not used, so it is discarded rather than buffered.
            for _ in range(2):
                subprocess.run(
                    command,
                    stdin=subprocess.DEVNULL,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    timeout=timeout_seconds,
                )

            # Success is judged by the presence of the PDF, not the exit code:
            # in nonstopmode pdflatex can report errors and still emit a PDF.
            if not os.path.exists(pdf_file_path):
                raise HTTPException(
                    status_code=500,
                    detail="LaTeX compilation failed. Check your LaTeX syntax.",
                )

            # Read the bytes before the temporary directory is removed.
            with open(pdf_file_path, "rb") as pdf_file:
                pdf_bytes = pdf_file.read()

        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={"Content-Disposition": 'attachment; filename="Tailored_Resume.pdf"'},
        )
    except HTTPException:
        raise
    except subprocess.TimeoutExpired as e:
        raise HTTPException(status_code=500, detail="LaTeX compilation timed out.") from e
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e