albin committed on
Commit
02c4993
·
1 Parent(s): 19d8bb3

Use extraction features before calling model

Browse files
Files changed (3) hide show
  1. Dockerfile +13 -9
  2. app.py +19 -2
  3. extraction_features.py +6 -6
Dockerfile CHANGED
@@ -1,16 +1,20 @@
1
- # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
- # you will also find guides on how best to write your Dockerfile
3
-
4
  FROM python:3.9
5
 
 
 
 
 
 
 
6
  RUN useradd -m -u 1000 user
 
7
  USER user
8
- ENV PATH="/home/user/.local/bin:$PATH"
9
 
10
- WORKDIR /app
 
 
 
11
 
12
- COPY --chown=user ./requirements.txt requirements.txt
13
- RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
 
15
- COPY --chown=user . /app
16
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
 
1
  FROM python:3.9
2
 
3
+ WORKDIR /code
4
+
5
+ COPY ./requirements.txt /code/requirements.txt
6
+
7
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
8
+
9
  RUN useradd -m -u 1000 user
10
+
11
  USER user
 
12
 
13
+ ENV HOME=/home/user \
14
+ PATH=/home/user/.local/bin:$PATH
15
+
16
+ WORKDIR $HOME/app
17
 
18
+ COPY --chown=user . $HOME/app
 
19
 
20
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
app.py CHANGED
@@ -7,6 +7,7 @@ from fastapi.responses import JSONResponse
7
  from fastapi.middleware.cors import CORSMiddleware
8
  from pydantic import BaseModel
9
  import pickle
 
10
 
11
  app = FastAPI()
12
 
@@ -36,7 +37,7 @@ class Resp(BaseModel):
36
 
37
  @app.get("/")
38
  async def root():
39
- return {"message": "Hello World. Welcome to FastAPI!"}
40
 
41
  def form_req(url: str = Form(...)):
42
  return Req(url=str(url))
@@ -69,7 +70,23 @@ async def predict(request: Request, requess: Req = Depends(form_req)):
69
  '''
70
  url = requess.url
71
 
72
- prediction = model.predict([str(url)])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  output = prediction[0]
74
 
75
  output_text = "Legitimate" if output == 1 else "Phishing"
 
7
  from fastapi.middleware.cors import CORSMiddleware
8
  from pydantic import BaseModel
9
  import pickle
10
+ from extraction_features import extract_features
11
 
12
  app = FastAPI()
13
 
 
37
 
38
  @app.get("/")
39
  async def root():
40
+ return {"message": "Hello, Welcome to the final project from Albin Tardivel"}
41
 
42
  def form_req(url: str = Form(...)):
43
  return Req(url=str(url))
 
70
  '''
71
  url = requess.url
72
 
73
+ features = extract_features(str(url))
74
+ data = []
75
+ data.append(str(features.URL))
76
+ data.extend(int(features.URLLength))
77
+ data.extend(str(features.Domain))
78
+ data.extend(int(features.DomainLength))
79
+ data.extend(str(features.TLD))
80
+ data.extend(float(features.CharContinuationRate))
81
+ data.extend(int(features.TLDLength))
82
+ data.extend(int(features.NoOfSubDomain))
83
+ data.extend(float(features.DegitRatioInURL))
84
+ data.extend(float(features.SpacialCharRatioInURL))
85
+ data.extend(int(features.IsHTTPS))
86
+
87
+ print(data)
88
+
89
+ prediction = model.predict([data])
90
  output = prediction[0]
91
 
92
  output_text = "Legitimate" if output == 1 else "Phishing"
extraction_features.py CHANGED
@@ -30,13 +30,13 @@ def extract_features(url):
30
  tld_match = re.search(r'\.[a-z]+$', domain_no_www)
31
  features['TLD'] = tld_match.group(0)[1:] if tld_match else ''
32
 
33
- # TLDLength
34
- features['TLDLength'] = len(features['TLD'])
35
-
36
  # CharContinuationRate
37
  char_sequences = re.findall(r'[a-zA-Z]+', domain_body)
38
  total_chars = sum(len(seq) for seq in char_sequences)
39
  features['CharContinuationRate'] = total_chars / len(domain_body) if len(domain_body) > 0 else 0
 
 
 
40
 
41
  # NoOfSubDomain
42
  subdomains = domain_no_www.split('.')[:-1]
@@ -58,6 +58,6 @@ def extract_features(url):
58
  # url_example = "https://www.southbankmosaics.com"
59
  url_example = "https://www.ooty.ind.in"
60
  features = extract_features(url_example)
61
-
62
- for key, value in features.items():
63
- print(f"{key}: {value}")
 
30
  tld_match = re.search(r'\.[a-z]+$', domain_no_www)
31
  features['TLD'] = tld_match.group(0)[1:] if tld_match else ''
32
 
 
 
 
33
  # CharContinuationRate
34
  char_sequences = re.findall(r'[a-zA-Z]+', domain_body)
35
  total_chars = sum(len(seq) for seq in char_sequences)
36
  features['CharContinuationRate'] = total_chars / len(domain_body) if len(domain_body) > 0 else 0
37
+
38
+ # TLDLength
39
+ features['TLDLength'] = len(features['TLD'])
40
 
41
  # NoOfSubDomain
42
  subdomains = domain_no_www.split('.')[:-1]
 
58
  # url_example = "https://www.southbankmosaics.com"
59
  url_example = "https://www.ooty.ind.in"
60
  features = extract_features(url_example)
61
+ print(features)
62
+ # for key, value in features.items():
63
+ # print(f"{key}: {value}")