RohitGuptaAI committed on
Commit
cdd73b6
·
1 Parent(s): daa113d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +148 -109
app.py CHANGED
@@ -10,125 +10,158 @@ import gradio as gr
10
  import torch
11
  import nltk
12
 
 
13
  def check_by_url(txt_url):
14
- #if txt_url.startswith("http://") or txt_url.startswith("https://"):
15
- parsed_url = urlparse(txt_url)
16
- url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rsplit('/', 1)[0]}/"
17
- print(url)
18
-
19
- new_data = []
20
- page = urlopen(url=url).read().decode("utf-8")
21
- soup = BeautifulSoup(page, 'html.parser')
22
- title = soup.find('title').get_text()
23
-
24
- # remove punctuations from title
25
- def remove_punctuation(title):
26
- punctuationfree = "".join([i for i in title if i not in string.punctuation])
27
- return punctuationfree
28
-
29
- css_class_to_remove = "dp-highlighter" # Replace with the CSS class you want to remove
30
- # Find <div> tags with the specified CSS class and remove their content
31
- div_tags = soup.find_all(['code', 'pre'])
32
- for div_tag in div_tags:
33
- div_tag.clear()
34
-
35
- div_tags = soup.find_all('div', class_=css_class_to_remove)
36
- for div_tag in div_tags:
37
- div_tag.clear()
38
-
39
- # Fetch content of remaining tags
40
- content_with_style = ""
41
- p_tags_with_style = soup.find_all('p', style=True)
42
- for p_tag in p_tags_with_style:
43
- p_content = re.sub(r'\n', '', p_tag.get_text())
44
- content_with_style += p_content
45
-
46
- # Fetch content of <p> tags without style
47
- content_without_style = ""
48
- p_tags_without_style = soup.find_all('p', style=False)
49
- for p_tag in p_tags_without_style:
50
- p_content = re.sub(r'\n', '', p_tag.get_text())
51
- content_without_style += p_content
52
-
53
- # Replace Unicode characters in the content and remove duplicates
54
- normalized_content_with_style = re.sub(r'\s+', ' ', content_with_style) # Remove extra spaces
55
- normalized_content_with_style = normalized_content_with_style.replace('\r', '') # Replace '\r' characters
56
- normalized_content_with_style = unicodedata.normalize('NFKD', normalized_content_with_style)
57
- normalized_content_with_style = unidecode.unidecode(normalized_content_with_style)
58
-
59
- normalized_content_without_style = re.sub(r'\s+', ' ', content_without_style) # Remove extra spaces
60
- normalized_content_without_style = normalized_content_without_style.replace('\r', '') # Replace '\r' characters
61
- normalized_content_without_style = unicodedata.normalize('NFKD', normalized_content_without_style)
62
- normalized_content_without_style = unidecode.unidecode(normalized_content_without_style)
63
-
64
- normalized_content_with_style += normalized_content_without_style
65
- new_data = {"title": title, "content": normalized_content_with_style}
66
-
67
- Save_model = "." # Replace with your saved model name
68
-
69
- model = DistilBertForSequenceClassification.from_pretrained(Save_model)
70
- tokenizer = DistilBertTokenizer.from_pretrained(Save_model)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
- test_encodings = tokenizer.encode_plus(
73
- title,
74
- truncation=True,
75
- padding=True,
76
- max_length=512,
77
- return_tensors="pt"
78
- )
79
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
80
- test_input_ids = test_encodings["input_ids"].to(device)
81
- test_attention_mask = test_encodings["attention_mask"].to(device)
82
- with torch.no_grad():
83
- model = model.to(device)
84
- model.eval()
85
- outputs = model(test_input_ids, attention_mask=test_attention_mask)
86
- logits = outputs.logits
87
- predicted_labels = torch.argmax(logits, dim=1)
88
- probabilities = F.softmax(logits, dim=1)
89
- confidence_score_title = torch.max(probabilities, dim=1).values.tolist()
90
- predicted_label_title = predicted_labels.item()
91
 
92
- test_encodings = tokenizer.encode_plus(
93
- normalized_content_with_style,
94
- truncation=True,
95
- padding=True,
96
- max_length=512,
97
- return_tensors="pt"
98
- )
99
- test_input_ids = test_encodings["input_ids"].to(device)
100
- test_attention_mask = test_encodings["attention_mask"].to(device)
101
- with torch.no_grad():
102
- outputs = model(test_input_ids, attention_mask=test_attention_mask)
103
- logits = outputs.logits
104
- predicted_labels = torch.argmax(logits, dim=1)
105
- probabilities = F.softmax(logits, dim=1)
106
- confidence_scores_content = torch.max(probabilities, dim=1).values.tolist()
107
- predicted_label_content = predicted_labels.item()
108
 
109
- label_mapping = {1: "SFW", 0: "NSFW"} # 1:True 0:false
110
- predicted_label_title = label_mapping[predicted_label_title]
111
- predicted_label_content = label_mapping[predicted_label_content]
112
 
113
- return predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data
114
-
115
- label_mapping = {1: "SFW", 0: "NSFW"} # 1:True 0:false
116
  def predict_2(txt_url, normalized_content_with_style):
117
- predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data = None, None, None, None, None
 
 
 
 
 
 
118
  predicted_label_text, confidence_score_text = None, None
119
 
120
  if txt_url.startswith("http://") or txt_url.startswith("https://"):
121
- predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data = check_by_url(txt_url)
 
 
 
 
 
 
122
  elif txt_url.startswith(""):
123
- model = DistilBertForSequenceClassification.from_pretrained(Save_model)
124
- tokenizer = DistilBertTokenizer.from_pretrained(Save_model)
125
 
126
  test_encodings = tokenizer.encode_plus(
127
  normalized_content_with_style,
128
  truncation=True,
129
  padding=True,
130
  max_length=512,
131
- return_tensors="pt"
132
  )
133
 
134
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -144,20 +177,26 @@ def predict_2(txt_url, normalized_content_with_style):
144
  probabilities = F.softmax(logits, dim=1)
145
  confidence_score_text = torch.max(probabilities, dim=1).values.tolist()
146
  predicted_label_text = label_mapping[predicted_labels.item()]
147
-
148
-
149
- #predicted_label_text, confidence_score_text=check_by_text(normalized_content_with_style)
150
  else:
151
- print("Done")
 
 
 
 
 
 
 
 
 
 
152
 
153
- return predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data, predicted_label_text, confidence_score_text
154
 
155
  demo = gr.Interface(
156
  fn=predict_2,
157
  inputs=[
158
  gr.inputs.Textbox(label="URL", placeholder="Enter URL"),
159
  gr.inputs.Textbox(label="Text", placeholder="Enter Text"),
160
- #gr.inputs.Textbox(label="Content", placeholder="Enter Content"),
161
  ],
162
  outputs=[
163
  gr.outputs.Textbox(label="Title_prediction"),
 
10
  import torch
11
  import nltk
12
 
13
+
14
  def check_by_url(txt_url):
15
+ parsed_url = urlparse(txt_url)
16
+ url = (
17
+ f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rsplit('/', 1)[0]}/"
18
+ )
19
+ print(url)
20
+
21
+ new_data = []
22
+ page = urlopen(url=url).read().decode("utf-8")
23
+ soup = BeautifulSoup(page, "html.parser")
24
+ title = soup.find("title").get_text()
25
+
26
+ # remove punctuations from title
27
+ def remove_punctuation(title):
28
+ punctuationfree = "".join([i for i in title if i not in string.punctuation])
29
+ return punctuationfree
30
+
31
+ css_class_to_remove = (
32
+ "dp-highlighter" # Replace with the CSS class you want to remove
33
+ )
34
+ # Find <div> tags with the specified CSS class and remove their content
35
+ div_tags = soup.find_all(["code", "pre"])
36
+ for div_tag in div_tags:
37
+ div_tag.clear()
38
+
39
+ div_tags = soup.find_all("div", class_=css_class_to_remove)
40
+ for div_tag in div_tags:
41
+ div_tag.clear()
42
+
43
+ # Fetch content of remaining tags
44
+ content_with_style = ""
45
+ p_tags_with_style = soup.find_all("p", style=True)
46
+ for p_tag in p_tags_with_style:
47
+ p_content = re.sub(r"\n", "", p_tag.get_text())
48
+ content_with_style += p_content
49
+
50
+ # Fetch content of <p> tags without style
51
+ content_without_style = ""
52
+ p_tags_without_style = soup.find_all("p", style=False)
53
+ for p_tag in p_tags_without_style:
54
+ p_content = re.sub(r"\n", "", p_tag.get_text())
55
+ content_without_style += p_content
56
+
57
+ # Replace Unicode characters in the content and remove duplicates
58
+ normalized_content_with_style = re.sub(
59
+ r"\s+", " ", content_with_style
60
+ ) # Remove extra spaces
61
+ normalized_content_with_style = normalized_content_with_style.replace(
62
+ "\r", ""
63
+ ) # Replace '\r' characters
64
+ normalized_content_with_style = unicodedata.normalize(
65
+ "NFKD", normalized_content_with_style
66
+ )
67
+ normalized_content_with_style = unidecode.unidecode(normalized_content_with_style)
68
+
69
+ normalized_content_without_style = re.sub(
70
+ r"\s+", " ", content_without_style
71
+ ) # Remove extra spaces
72
+ normalized_content_without_style = normalized_content_without_style.replace(
73
+ "\r", ""
74
+ ) # Replace '\r' characters
75
+ normalized_content_without_style = unicodedata.normalize(
76
+ "NFKD", normalized_content_without_style
77
+ )
78
+ normalized_content_without_style = unidecode.unidecode(
79
+ normalized_content_without_style
80
+ )
81
+
82
+ normalized_content_with_style += normalized_content_without_style
83
+ new_data = {"title": title, "content": normalized_content_with_style}
84
+
85
+ model = DistilBertForSequenceClassification.from_pretrained(".")
86
+ tokenizer = DistilBertTokenizer.from_pretrained(".")
87
+
88
+ test_encodings = tokenizer.encode_plus(
89
+ title, truncation=True, padding=True, max_length=512, return_tensors="pt"
90
+ )
91
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
92
+ test_input_ids = test_encodings["input_ids"].to(device)
93
+ test_attention_mask = test_encodings["attention_mask"].to(device)
94
+ with torch.no_grad():
95
+ model = model.to(device)
96
+ model.eval()
97
+ outputs = model(test_input_ids, attention_mask=test_attention_mask)
98
+ logits = outputs.logits
99
+ predicted_labels = torch.argmax(logits, dim=1)
100
+ probabilities = F.softmax(logits, dim=1)
101
+ confidence_score_title = torch.max(probabilities, dim=1).values.tolist()
102
+ predicted_label_title = predicted_labels.item()
103
+
104
+ test_encodings = tokenizer.encode_plus(
105
+ normalized_content_with_style,
106
+ truncation=True,
107
+ padding=True,
108
+ max_length=512,
109
+ return_tensors="pt",
110
+ )
111
+ test_input_ids = test_encodings["input_ids"].to(device)
112
+ test_attention_mask = test_encodings["attention_mask"].to(device)
113
+ with torch.no_grad():
114
+ outputs = model(test_input_ids, attention_mask=test_attention_mask)
115
+ logits = outputs.logits
116
+ predicted_labels = torch.argmax(logits, dim=1)
117
+ probabilities = F.softmax(logits, dim=1)
118
+ confidence_scores_content = torch.max(probabilities, dim=1).values.tolist()
119
+ predicted_label_content = predicted_labels.item()
120
+
121
+ label_mapping = {1: "SFW", 0: "NSFW"} # 1:True 0:false
122
+ predicted_label_title = label_mapping[predicted_label_title]
123
+ predicted_label_content = label_mapping[predicted_label_content]
124
+
125
+ return (
126
+ predicted_label_title,
127
+ confidence_score_title,
128
+ predicted_label_content,
129
+ confidence_scores_content,
130
+ new_data,
131
+ )
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
+ label_mapping = {1: "SFW", 0: "NSFW"} # 1:True 0:false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
 
 
 
136
 
 
 
 
137
  def predict_2(txt_url, normalized_content_with_style):
138
+ (
139
+ predicted_label_title,
140
+ confidence_score_title,
141
+ predicted_label_content,
142
+ confidence_scores_content,
143
+ new_data,
144
+ ) = (None, None, None, None, None)
145
  predicted_label_text, confidence_score_text = None, None
146
 
147
  if txt_url.startswith("http://") or txt_url.startswith("https://"):
148
+ (
149
+ predicted_label_title,
150
+ confidence_score_title,
151
+ predicted_label_content,
152
+ confidence_scores_content,
153
+ new_data,
154
+ ) = check_by_url(txt_url)
155
  elif txt_url.startswith(""):
156
+ model = DistilBertForSequenceClassification.from_pretrained(".")
157
+ tokenizer = DistilBertTokenizer.from_pretrained(".")
158
 
159
  test_encodings = tokenizer.encode_plus(
160
  normalized_content_with_style,
161
  truncation=True,
162
  padding=True,
163
  max_length=512,
164
+ return_tensors="pt",
165
  )
166
 
167
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
177
  probabilities = F.softmax(logits, dim=1)
178
  confidence_score_text = torch.max(probabilities, dim=1).values.tolist()
179
  predicted_label_text = label_mapping[predicted_labels.item()]
180
+
 
 
181
  else:
182
+ print("Done")
183
+
184
+ return (
185
+ predicted_label_title,
186
+ confidence_score_title,
187
+ predicted_label_content,
188
+ confidence_scores_content,
189
+ new_data,
190
+ predicted_label_text,
191
+ confidence_score_text,
192
+ )
193
 
 
194
 
195
  demo = gr.Interface(
196
  fn=predict_2,
197
  inputs=[
198
  gr.inputs.Textbox(label="URL", placeholder="Enter URL"),
199
  gr.inputs.Textbox(label="Text", placeholder="Enter Text"),
 
200
  ],
201
  outputs=[
202
  gr.outputs.Textbox(label="Title_prediction"),