Spaces:
Sleeping
Sleeping
File size: 2,104 Bytes
017869d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import gradio as gr
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK data this script relies on: "punkt_tab" for word_tokenize
# and "stopwords" for the stopword filter in preprocess_text.
for _nltk_resource in ("punkt_tab", "stopwords"):
    nltk.download(_nltk_resource)

# Build the tokenizer and a two-label DistilBERT classifier from the base
# checkpoint, then replace the classifier's weights with those stored in
# job_model.pth (tensors are mapped onto the CPU while loading).
_BASE_CHECKPOINT = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(_BASE_CHECKPOINT)
model = DistilBertForSequenceClassification.from_pretrained(
    _BASE_CHECKPOINT,
    num_labels=2,
)
# NOTE(review): torch.load unpickles the file, so job_model.pth must come
# from a trusted source. The call is kept nested so the loaded state dict is
# not held alive by a module-level name after the weights are copied in.
model.load_state_dict(
    torch.load("job_model.pth", map_location=torch.device("cpu"))
)
# Put the model into evaluation mode before serving predictions.
model.eval()
# ✅ Preprocess a single job description string (adapted from the DataFrame-based version)
_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, built only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Reading the corpus on every request is wasted work; the list does
        # not change while the process is running.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalize one job description into a space-joined string of tokens.

    The steps are: lowercase the text, delete every character that is not
    a-z or whitespace, tokenize with NLTK's ``word_tokenize``, drop English
    stopwords, and join the remaining tokens with single spaces.

    Args:
        text: The raw job description. ``None`` is treated as empty input.

    Returns:
        The cleaned text, or ``""`` when no letters remain after cleaning.
    """
    if text is None:
        return ""

    # Characters are deleted, not replaced by a space, so "full-time" becomes
    # "fulltime". NOTE(review): presumably this mirrors the preprocessing used
    # when the model was trained - confirm before changing the pattern.
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    if not letters_only.strip():
        # Only whitespace is left, so there is nothing to tokenize.
        return ""

    stop_words = _english_stop_words()
    return ' '.join(
        word for word in word_tokenize(letters_only) if word not in stop_words
    )
# ✅ Inference function
# Returned instead of a prediction when there is no usable text to classify.
_EMPTY_INPUT_MESSAGE = "Please enter a job description to classify."

# Predicted class index -> text shown to the user.
# NOTE(review): assumes index 0 was the "real" class and 1 the "fraudulent"
# class during training - confirm against the training labels.
_LABEL_MAP = {
    0: "✅ Legitimate Job Post (Real)",
    1: "🚨 FAKE Job Post (Fraudulent)",
}


def classify_job(description):
    """Classify a job description and report the predicted label.

    The text is cleaned with ``preprocess_text``, tokenized (truncated to at
    most 256 tokens), and run through the module-level ``model`` without
    gradient tracking. The class with the highest softmax probability is
    reported together with that probability.

    Args:
        description: The job description entered by the user. ``None``,
            blank text, and text with nothing left after preprocessing are
            not sent to the model.

    Returns:
        ``"<label>\\nConfidence: <percent>"`` for a classified description,
        or a short prompt asking for input when there is nothing to classify.
    """
    # Guard before any preprocessing: None would fail on string methods, and
    # a blank description would only produce a meaningless prediction.
    if description is None or not description.strip():
        return _EMPTY_INPUT_MESSAGE

    cleaned_text = preprocess_text(description)
    if not cleaned_text:
        # e.g. input made only of digits, punctuation, or stopwords
        return _EMPTY_INPUT_MESSAGE

    inputs = tokenizer(
        cleaned_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256,
    )
    with torch.no_grad():
        logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()
        confidence = probs[0][pred].item()

    return f"{_LABEL_MAP[pred]}\nConfidence: {confidence:.2%}"
# ✅ Gradio Interface
# Input and output widgets, created up front so the Interface call stays short.
description_input = gr.Textbox(
    lines=6,
    placeholder="Paste the job description here...",
)
prediction_output = gr.Textbox()

# Single-function UI: the pasted text is passed to classify_job and the
# string it returns is shown in the output box.
interface = gr.Interface(
    fn=classify_job,
    inputs=description_input,
    outputs=prediction_output,
    title="Job Description Fraud Detector",
    description="Classifies job descriptions as real or fake using DistilBERT. Uses full text preprocessing.",
)

# Launch the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    interface.launch()
|