| import streamlit as st | |
| from annotated_text import annotated_text | |
| from transformers import AutoModelForTokenClassification | |
| from transformers import AutoTokenizer | |
| from transformers import pipeline | |
| import requests | |
| import random | |
| import justext | |
| import pickle | |
| from tqdm import tqdm | |
| import torch | |
| import jsonlines | |
# Browser-like headers sent with every page request.
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0",
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
}
# Upper bound (seconds) on connecting to / reading from the remote server,
# so an unresponsive host cannot hang the app indefinitely.
REQUEST_TIMEOUT_SECONDS = 15
# Hugging Face model used for token classification.
NER_MODEL_NAME = "cpi-connect/SecureBERT-NER"


def fetch_page(url, timeout=REQUEST_TIMEOUT_SECONDS):
    """Download *url* and return the raw response body as bytes.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The undecoded response body.

    Raises:
        requests.RequestException: On an invalid URL, a network failure,
            a timeout, or a 4xx/5xx HTTP status.
    """
    # The context manager closes the session's connections when done.
    with requests.Session() as session:
        session.headers.update(REQUEST_HEADERS)
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
        return response.content


def extract_text_blocks(paragraphs):
    """Return the non-empty text lines of all non-boilerplate paragraphs.

    Args:
        paragraphs: Iterable of objects exposing ``is_boilerplate`` and
            ``text`` attributes (as returned by ``justext.justext``).

    Returns:
        A list of non-empty strings, in document order. A paragraph whose
        text contains newlines contributes one entry per non-empty line.
    """
    blocks = []
    for paragraph in paragraphs:
        if paragraph.is_boilerplate:
            continue
        blocks.extend(line for line in paragraph.text.split("\n") if line)
    return blocks


def load_ner_pipeline():
    """Build the token-classification pipeline for ``NER_MODEL_NAME``.

    ``aggregation_strategy="simple"`` is the current spelling of the
    deprecated ``grouped_entities=True`` flag.
    """
    return pipeline(
        "token-classification",
        model=NER_MODEL_NAME,
        aggregation_strategy="simple",
    )


def annotate_entities(text_block, entities):
    """Split *text_block* into plain strings and ``(text, label)`` tuples.

    Args:
        text_block: The string the entities were detected in.
        entities: Iterable of dicts with ``"entity_group"``, ``"start"``
            and ``"end"`` keys, ordered by position in *text_block*.

    Returns:
        A list suitable for unpacking into ``annotated_text``: plain
        strings for unlabelled text and ``(text, label)`` tuples for
        entities. Consecutive entities that share a label and touch
        (one ends exactly where the next starts) are merged into a single
        tuple. Empty plain-text segments are omitted.
    """
    annotated = []
    cursor = 0  # Offset just past the last entity emitted so far.
    previous_label = None
    for entity in entities:
        start, end = entity.get("start"), entity.get("end")
        if start is None or end is None:
            # Without character offsets the entity cannot be located in
            # the text, so it is left as plain text.
            continue
        label = entity["entity_group"]
        if previous_label == label and start == cursor:
            # Same label, directly adjacent: extend the previous span.
            merged_text, _ = annotated[-1]
            annotated[-1] = (merged_text + text_block[start:end], label)
        else:
            gap = text_block[cursor:start]
            if gap:
                annotated.append(gap)
            annotated.append((text_block[start:end], label))
        previous_label = label
        cursor = end
    tail = text_block[cursor:]
    if tail:
        annotated.append(tail)
    return annotated


def main():
    """Render the app: fetch a URL, extract its content, highlight entities."""
    st.title('Identifying Cybersecurity Entities on Webpages')
    query_input = st.text_input("URL:")
    if not query_input:
        return

    try:
        page_content = fetch_page(query_input.strip())
    except requests.RequestException as exc:
        st.error(f"Could not fetch the page: {exc}")
        return
    if not page_content.strip():
        # An empty document would make the HTML parser fail.
        st.warning("The page returned no content.")
        return

    paragraphs = justext.justext(page_content, justext.get_stoplist("English"))
    text_blocks = extract_text_blocks(paragraphs)
    if not text_blocks:
        st.info("No main content could be extracted from the page.")
        return

    # Cache the model across reruns; loading it on every interaction is
    # slow. The wrapper is applied here rather than as a module-level
    # decorator so the helpers above can be imported without Streamlit.
    ner = st.cache_resource(load_ner_pipeline)()
    for text_block in text_blocks:
        annotated_text(*annotate_entities(text_block, ner(text_block)))


if __name__ == "__main__":
    main()