Rohithm16 committed on
Commit
86a2576
·
verified ·
1 Parent(s): bc3fe8f

Upload 3 files

Browse files
Files changed (3) hide show
  1. handler.py +70 -0
  2. requirements.txt +6 -0
  3. test_model.joblib +3 -0
handler.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =======================
2
+ # IMPORTS
3
+ # =======================
4
+ import joblib
5
+ import re
6
+ from urllib.parse import urlparse
7
+ import tldextract
8
+ from PyPDF2 import PdfReader
9
+
10
+ # =======================
11
+ # LOAD MODEL
12
+ # =======================
13
# Load the model from the directory that contains this handler file rather
# than from the process's current working directory, so the handler works
# no matter where the serving process was started from.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code; only ship model artifacts from a trusted source.
import os

model = joblib.load(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_model.joblib")
)
14
+
15
+ # =======================
16
+ # URL FEATURES
17
+ # =======================
18
def extract_url_features(url):
    """Compute simple lexical features for a URL.

    Args:
        url: The URL string to analyse.

    Returns:
        A dict with the keys:
            "url_length": number of characters in ``url``.
            "num_dots": number of "." characters in ``url``.
            "has_ip": True when the host part of the URL is a literal
                IPv4/IPv6 address. Dotted numbers that only appear in the
                path or query (e.g. ``/v1.2.3.4/``) do not count.
            "https": True when the parsed scheme is exactly "https".
            "domain_length": length of ``tldextract.extract(url).domain``.
    """
    import ipaddress  # local import: only this function needs it

    parsed = urlparse(url)

    host = parsed.hostname
    if host is None and "://" not in url:
        # Scheme-less input such as "192.168.0.1/login": urlparse puts the
        # whole string in ``path``, so re-parse it as a network location.
        try:
            host = urlparse("//" + url).hostname
        except ValueError:
            host = None

    try:
        ipaddress.ip_address(host or "")
        has_ip = True
    except ValueError:
        # Empty host or a regular domain name.
        has_ip = False

    ext = tldextract.extract(url)
    return {
        "url_length": len(url),
        "num_dots": url.count("."),
        "has_ip": has_ip,
        "https": parsed.scheme == "https",
        "domain_length": len(ext.domain),
    }
28
+
29
+ # =======================
30
+ # PDF TEXT EXTRACTION
31
+ # =======================
32
def extract_pdf_text(pdf_path, max_chars=500):
    """Return the leading text of a PDF, truncated to ``max_chars``.

    Pages are read in order and extraction stops as soon as enough
    characters have been collected, so later pages of a long document are
    never parsed.

    Args:
        pdf_path: Passed unchanged to ``PdfReader``.
        max_chars: Maximum number of characters to return. Defaults to 500,
            the limit that was previously hard-coded. Expected to be
            non-negative.

    Returns:
        The concatenated page text cut to at most ``max_chars`` characters.
        Pages for which ``extract_text()`` returns a falsy value contribute
        nothing.
    """
    reader = PdfReader(pdf_path)

    parts = []
    collected = 0
    for page in reader.pages:
        if collected >= max_chars:
            break
        page_text = page.extract_text() or ""
        parts.append(page_text)
        collected += len(page_text)

    return "".join(parts)[:max_chars]
38
+
39
+ # =======================
40
+ # PREDICTION FUNCTION
41
+ # =======================
42
def predict(data):
    """Classify the supplied text (plus optional PDF text) with ``model``.

    Expects JSON-like input:
        {"inputs": {"text": "...", "url": "...", "pdf_path": "..."}}

    All three keys are optional; a missing key or an explicit null is
    treated as an empty string. ``"inputs"`` may also be a plain string, in
    which case it is used as the text.

    Args:
        data: Mapping with an "inputs" entry as described above.

    Returns:
        A dict with the keys:
            "prediction": ``model.predict`` result for the combined text,
                converted to int.
            "probability": entry at index 1 of the ``model.predict_proba``
                row, converted to float (presumably the positive class;
                confirm against the trained model's class order).
            "url_features": output of ``extract_url_features`` for the URL,
                or an empty dict when no URL was given.
            "pdf_text_sample": first 100 characters of the extracted PDF
                text, or "" when no PDF path was given.

    Raises:
        KeyError: If ``data`` has no "inputs" key.
    """
    inputs = data["inputs"]
    if isinstance(inputs, str):
        # Accept the bare-string payload form: {"inputs": "some text"}.
        inputs = {"text": inputs}

    # ``or ""`` also covers explicit JSON nulls, which ``.get(key, "")``
    # would pass through as None and break the concatenation below.
    text = inputs.get("text") or ""
    url = inputs.get("url") or ""
    pdf_path = inputs.get("pdf_path") or ""

    # URL features
    url_features = extract_url_features(url) if url else {}

    # PDF text (optional)
    # NOTE(security): pdf_path comes straight from the request and is opened
    # on the local filesystem; part of its text is returned to the caller.
    # Restrict it to a trusted directory if requests are untrusted.
    pdf_text = extract_pdf_text(pdf_path) if pdf_path else ""

    # Combine text + PDF text
    combined_text = text + " " + pdf_text

    # ML prediction
    samples = [combined_text]
    pred = model.predict(samples)[0]
    prob = model.predict_proba(samples)[0][1]

    return {
        "prediction": int(pred),
        "probability": float(prob),
        "url_features": url_features,
        "pdf_text_sample": pdf_text[:100],  # sample only
    }
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ scikit-learn
2
+ joblib
3
+ numpy
4
+ pandas
5
+ tldextract
6
+ PyPDF2
test_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:927b553bb88479d6fa73fbd3f85a57db155cda264aed15cae09fb461d3a4ce2f
3
+ size 2113