File size: 2,104 Bytes
017869d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import gradio as gr
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download the NLTK resources required by word_tokenize ('punkt_tab') and
# stopwords.words ('stopwords'), both of which are used during preprocessing.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the tokenizer and a 2-label DistilBERT classification head from the
# base checkpoint, then overwrite its weights with the fine-tuned ones.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
# The checkpoint is consumed purely as a state dict, so restrict unpickling to
# tensors and plain containers (weights_only=True). This prevents a tampered
# .pth file from executing arbitrary code while it is being loaded.
model.load_state_dict(
    torch.load("job_model.pth", map_location=torch.device("cpu"), weights_only=True)
)
# Put the model in evaluation mode; it is only used for inference here.
model.eval()

# ✅ Preprocess a single job description string (adapted from your DataFrame version)
# Compiled once at import time; matches every character that is neither a
# lowercase ASCII letter nor whitespace.
_NON_ALPHA_RE = re.compile(r'[^a-z\s]')

# Lazily filled cache of the English stopword set, so the stopword list is
# fetched once instead of on every call.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalize a single job description string for the classifier.

    The text is lowercased, every character other than ``a-z`` and whitespace
    is removed, the remainder is tokenized with ``word_tokenize``, English
    stopwords are dropped, and the surviving tokens are joined with single
    spaces.

    Args:
        text: Raw job description. ``None`` and the empty string are accepted.

    Returns:
        The cleaned, space-joined token string. An empty string is returned
        when the input is empty/``None`` or when nothing but whitespace is
        left after stripping non-alphabetic characters.
    """
    # Guard against missing input instead of failing on ``None.lower()``.
    if not text:
        return ''

    cleaned = _NON_ALPHA_RE.sub('', text.lower())
    # Nothing alphabetic survived (e.g. digits/punctuation only): the result
    # would be empty anyway, so skip tokenization entirely.
    if not cleaned.strip():
        return ''

    stop_words = _english_stop_words()
    return ' '.join(tok for tok in word_tokenize(cleaned) if tok not in stop_words)

# ✅ Inference function
def classify_job(description):
    """Classify a job description as legitimate or fraudulent.

    The description is cleaned with ``preprocess_text``, encoded with the
    module-level ``tokenizer`` (truncated to 256 tokens) and scored by the
    module-level ``model``. The class with the highest softmax probability is
    reported together with that probability.

    Args:
        description: Raw job description text as entered by the user. May be
            ``None`` or blank.

    Returns:
        A two-line string ``"<label>\\nConfidence: <pct>"`` for classifiable
        input, or a short explanatory message when there is no text to
        classify (blank input, or nothing left after preprocessing).
    """
    # Blank input carries no signal; scoring it would yield a label and a
    # confidence figure that mean nothing, so tell the user instead.
    if description is None or not description.strip():
        return "Please enter a job description to classify."

    cleaned_text = preprocess_text(description)
    # Input made up solely of digits, punctuation or stopwords cleans to "".
    if not cleaned_text:
        return "No usable text found after preprocessing. Please provide a longer description."

    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True, max_length=256)

    # Class index -> user-facing label.
    label_map = {
        0: "✅ Legitimate Job Post (Real)",
        1: "🚨 FAKE Job Post (Fraudulent)"
    }

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()
        confidence = probs[0][pred].item()

    label = label_map[pred]
    return f"{label}\nConfidence: {confidence:.2%}"

# ✅ Gradio Interface
# Build the input and output widgets first, then wire them to classify_job.
_description_box = gr.Textbox(lines=6, placeholder="Paste the job description here...")
_result_box = gr.Textbox()

interface = gr.Interface(
    fn=classify_job,
    inputs=_description_box,
    outputs=_result_box,
    title="Job Description Fraud Detector",
    description="Classifies job descriptions as real or fake using DistilBERT. Uses full text preprocessing.",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()