"""Gradio demo that labels a job description as legitimate or fraudulent.

A DistilBERT sequence classifier (weights read from ``job_model.pth``) scores
text that has first been lowercased, reduced to the characters a-z plus
whitespace, tokenized with NLTK and filtered for English stopwords.
"""

import functools
import re

import gradio as gr
import nltk
import torch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Download required NLTK resources (tokenizer data and the stopword corpus).
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load tokenizer and model.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)
# NOTE(review): torch.load unpickles the checkpoint, which can execute
# arbitrary code if the file is not trusted. Only load a job_model.pth you
# produced yourself; consider passing weights_only=True if the installed
# PyTorch version supports it.
model.load_state_dict(torch.load("job_model.pth", map_location=torch.device("cpu")))
model.eval()

# Matches every character that is neither a lowercase ASCII letter nor
# whitespace. Compiled once because it is applied on every request.
_NON_LETTER_RE = re.compile(r'[^a-z\s]')

# Longer inputs are truncated to this many tokens before inference.
_MAX_TOKENS = 256

# Maps the predicted class index to the text shown to the user.
_LABEL_MAP = {
    0: "✅ Legitimate Job Post (Real)",
    1: "🚨 FAKE Job Post (Fraudulent)",
}

_EMPTY_INPUT_MESSAGE = "⚠️ Please enter a job description to classify."
_NO_CONTENT_MESSAGE = (
    "⚠️ Unable to classify: no recognizable words were found in the description."
)


@functools.lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, built on first use only."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text: str) -> str:
    """Normalize a single job description string before tokenization.

    The text is lowercased, every character other than a-z and whitespace is
    removed, the remainder is split with NLTK's ``word_tokenize``, English
    stopwords are dropped, and the surviving tokens are joined with single
    spaces.

    Args:
        text: Raw job description.

    Returns:
        The cleaned text; an empty string when no token survives.
    """
    lowered = text.lower()
    letters_only = _NON_LETTER_RE.sub('', lowered)
    stop_words = _english_stop_words()
    kept = [word for word in word_tokenize(letters_only) if word not in stop_words]
    return ' '.join(kept)


def classify_job(description: str) -> str:
    """Classify a job description and report the predicted label.

    Args:
        description: Text entered by the user. ``None`` or blank input is
            accepted and answered with a prompt instead of a prediction.

    Returns:
        The label for the highest-probability class followed by a
        ``Confidence:`` line (softmax probability formatted as a percentage
        with two decimals), or an explanatory message when there is nothing
        to classify.
    """
    if description is None or not description.strip():
        return _EMPTY_INPUT_MESSAGE

    cleaned_text = preprocess_text(description)
    if not cleaned_text:
        # Everything was stripped (digits, punctuation, stopwords, non a-z
        # scripts). Scoring an empty sequence would yield a confident-looking
        # but meaningless verdict, so say so instead.
        return _NO_CONTENT_MESSAGE

    inputs = tokenizer(
        cleaned_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=_MAX_TOKENS,
    )

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()

    label = _LABEL_MAP[pred]
    confidence = probs[0][pred].item()
    return f"{label}\nConfidence: {confidence:.2%}"


# Gradio interface
interface = gr.Interface(
    fn=classify_job,
    inputs=gr.Textbox(lines=6, placeholder="Paste the job description here..."),
    outputs=gr.Textbox(),
    title="Job Description Fraud Detector",
    description="Classifies job descriptions as real or fake using DistilBERT. Uses full text preprocessing.",
)

if __name__ == "__main__":
    interface.launch()