ma4389 committed on
Commit
017869d
·
verified ·
1 Parent(s): 6873958

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -63
app.py CHANGED
@@ -1,63 +1,63 @@
1
- import torch
2
- from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
3
- import gradio as gr
4
- import re
5
- import nltk
6
- from nltk.tokenize import word_tokenize
7
- from nltk.corpus import stopwords
8
-
9
- # Download required NLTK resources
10
- nltk.download('punkt')
11
- nltk.download('stopwords')
12
-
13
- # Load tokenizer and model
14
- tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
15
- model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
16
- model.load_state_dict(torch.load("job_model.pth", map_location=torch.device("cpu")))
17
- model.eval()
18
-
19
- # ✅ Preprocess a single job description string (adapted from your DataFrame version)
20
- def preprocess_text(text):
21
- # Lowercase
22
- text = text.lower()
23
- # Remove non-alphabetic characters
24
- text = re.sub(r'[^a-z\s]', '', text)
25
- # Tokenize
26
- tokens = word_tokenize(text)
27
- # Remove stopwords
28
- stop_words = set(stopwords.words('english'))
29
- tokens = [word for word in tokens if word not in stop_words]
30
- # Join tokens back into string
31
- return ' '.join(tokens)
32
-
33
- # ✅ Inference function
34
- def classify_job(description):
35
- cleaned_text = preprocess_text(description)
36
- inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True, max_length=256)
37
-
38
- with torch.no_grad():
39
- outputs = model(**inputs)
40
- probs = torch.softmax(outputs.logits, dim=1)
41
- pred = torch.argmax(probs, dim=1).item()
42
-
43
- label_map = {
44
- 0: "βœ… Legitimate Job Post (Real)",
45
- 1: "🚨 FAKE Job Post (Fraudulent)"
46
- }
47
-
48
- label = label_map[pred]
49
- confidence = probs[0][pred].item()
50
-
51
- return f"{label}\nConfidence: {confidence:.2%}"
52
-
53
- # ✅ Gradio Interface
54
- interface = gr.Interface(
55
- fn=classify_job,
56
- inputs=gr.Textbox(lines=6, placeholder="Paste the job description here..."),
57
- outputs=gr.Textbox(),
58
- title="Job Description Fraud Detector",
59
- description="Classifies job descriptions as real or fake using DistilBERT. Uses full text preprocessing."
60
- )
61
-
62
- if __name__ == "__main__":
63
- interface.launch()
 
1
+ import torch
2
+ from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
3
+ import gradio as gr
4
+ import re
5
+ import nltk
6
+ from nltk.tokenize import word_tokenize
7
+ from nltk.corpus import stopwords
8
+
9
# Fetch the NLTK data that preprocess_text relies on: the tokenizer tables
# used by word_tokenize and the English stop-word list.
for _resource in ('punkt_tab', 'stopwords'):
    nltk.download(_resource)

# Start from the base DistilBERT tokenizer and a 2-label classification head,
# then overwrite the model weights with the checkpoint stored next to the app.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

# map_location pins every tensor in the checkpoint to the CPU while loading.
_checkpoint_weights = torch.load("job_model.pth", map_location=torch.device("cpu"))
model.load_state_dict(_checkpoint_weights)

# Put the model in evaluation mode before serving predictions.
model.eval()
18
+
19
+ # ✅ Preprocess a single job description string (adapted from your DataFrame version)
20
# Compiled once: matches any character that is neither a lowercase ASCII
# letter nor whitespace.
_NON_ALPHA_RE = re.compile(r'[^a-z\s]')

# English stop words; filled in lazily by _english_stop_words().
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, building it only on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Reading the corpus and building the set on every request is wasted
        # work; the word list does not change while the app is running.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Clean a single job description string.

    The text is lowercased, every character other than ``a-z`` and
    whitespace is removed, the remainder is split with ``word_tokenize``,
    English stop words are dropped, and the surviving tokens are joined
    with single spaces.

    Args:
        text: Raw job description. ``None`` and the empty string are
            accepted and yield an empty result.

    Returns:
        The cleaned text as one space-separated string (possibly empty).
    """
    # Nothing to clean; also avoids AttributeError on None.
    if not text:
        return ''

    lowered = text.lower()
    letters_only = _NON_ALPHA_RE.sub('', lowered)
    stop_words = _english_stop_words()
    kept = [token for token in word_tokenize(letters_only) if token not in stop_words]
    return ' '.join(kept)
32
+
33
+ # ✅ Inference function
34
# Predicted class index -> verdict shown to the user.
_LABEL_MAP = {
    0: "✅ Legitimate Job Post (Real)",
    1: "🚨 FAKE Job Post (Fraudulent)",
}

# Returned when there is no text left to score.
_NO_TEXT_MESSAGE = "No classifiable text found. Please paste a job description."


def classify_job(description):
    """Classify a job description and report the verdict with its confidence.

    The description is cleaned with ``preprocess_text``, encoded with the
    module-level ``tokenizer`` (truncated to at most 256 tokens), and scored
    by the module-level ``model`` with gradients disabled. The class with
    the highest softmax probability is reported.

    Args:
        description: Raw job description text as entered by the user.

    Returns:
        A two-line string: the verdict label, then ``Confidence:`` followed
        by the winning probability formatted as a percentage with two
        decimals. If the description is empty, or nothing remains after
        cleaning, a short prompt asking for a job description is returned
        instead of a prediction.
    """
    # Guard before cleaning so a missing value never reaches preprocess_text.
    cleaned_text = preprocess_text(description) if description else ""
    if not cleaned_text:
        # Scoring an empty string would still produce a confident-looking
        # verdict, which is meaningless to the user.
        return _NO_TEXT_MESSAGE

    inputs = tokenizer(
        cleaned_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()

    label = _LABEL_MAP[pred]
    # probs has one row (a single input); take the winning class probability.
    confidence = probs[0][pred].item()

    return f"{label}\nConfidence: {confidence:.2%}"
52
+
53
+ # ✅ Gradio Interface
54
# UI components: a six-line input box for the posting and a plain text box
# for the verdict returned by classify_job.
_description_box = gr.Textbox(lines=6, placeholder="Paste the job description here...")
_verdict_box = gr.Textbox()

# Wire the components to the inference function.
interface = gr.Interface(
    fn=classify_job,
    inputs=_description_box,
    outputs=_verdict_box,
    title="Job Description Fraud Detector",
    description="Classifies job descriptions as real or fake using DistilBERT. Uses full text preprocessing.",
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()