import os
import pandas as pd
import numpy as np
import gradio as gr
from transformers import DistilBertForTokenClassification, DistilBertTokenizerFast, pipeline
# Hugging Face access token for the private model repo; raises KeyError if unset.
auth_token = os.environ["TOKEN"]
# Tokenizer comes from the public base checkpoint; the fine-tuned weights below
# were trained on top of it, so the vocabularies match.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model_path = "inesani/ner-log"
# Downloads the fine-tuned token-classification weights from the Hub.
# NOTE(review): `use_auth_token` is deprecated in newer transformers releases
# in favour of `token` — confirm against the pinned transformers version.
model_loaded = DistilBertForTokenClassification.from_pretrained(model_path, use_auth_token=auth_token)
# 'simple' aggregation groups sub-word pieces into whole-word entities with an
# 'entity_group' key; adjacent same-group spans are merged later by
# aggregate_entities().
ner_pipeline = pipeline("ner", model=model_loaded, tokenizer=tokenizer,
aggregation_strategy='simple')
# UI copy shown at the top of the Gradio interface.
title = 'Hi, my name is NER! I am a ML model that detects IPs, KV and Timestamps in logs.'
description = """
I have been trained on a log corpus of only 10000 logs...I am a Work In Progress :)
You can paste below any log that you want to test or use one of the provided examples.
"""
# Sample logs offered as one-click examples: CSV rows, Apache/nginx access
# logs, key=value firewall events, Zeek/Bro, syslog, CEF/LEEF, etc.
examples = [
'John Doe, 37 - Google - 42 Wallaby Way, Sydney - 500000',
'campo1,campo2,campo3,"campo4,campo5"',
'188.210.113.80 - - [26/Jan/2019:20:17:17 +0330] "GET /image/4158/productModel/200x200 HTTP/1.1" 200 4022 "https://www.zanbil.ir/m/browse/electric-heaters/%D8%A8%D8%AE%D8%A7%D8%B1%DB%8C-%D8%A8%D8%B1%D9%82%DB%8C" "Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1" "-"',
'Nov 19 16:43:19 campo1 campo2 ahora pongo un kv start=Apr 29 2022 16:43:19 deviceExternalId=FMG3HFT718900147 ad.vd=root ad.itime=1651244591 ad.idseq=186791712926990336 ',
'1,2022/04/29 17:34:21,012501007018,CONFIG,0,0,2022/04/29 17:34:21,10.16.10.37,,validate-all,Panorama-FWTECSIS04,Panorama,Succeeded,,6966313332068319615,0x8000000000000000,0,0,0,0,,CD1FW-GICINT01A,0,',
'1331901007 C36a282Jljz7BsbGH 192.168.202.76 137 udp 57398 WPAD 1 C_INTERNET 32 NB - - F F T F 1 - - F',
'Jun 9 06:06:20 combo kernel: On node 0 totalpages: 32430',
# Raw string: the log contains a Windows path (Root\Policy.ini); without the
# r-prefix, "\P" is an invalid escape sequence (SyntaxWarning, future error).
r'2022-04-28T16:30:29Z S12KROVA1 LEEF:1.0|Cyber-Ark|Vault|11.5.0003|51|sev=6 Action=Retrieve File EventMessage=Retrieve File OSUser= usrName=PasswordManager src=127.0.0.0 SourceUser= TargetUser= File=Root\Policy.ini Safe=PasswordManagerShared Location= Category= RequestId= Reason= ExtraDetails= GatewayStation= CAPolicy=',
'x.x.x.90 - - [13/Sep/2006:06:58:52 -0700] "PROPFIND /svn/[xxxx]/Extranet/branches/SOW-101 HTTP/1.1" 401 587 key1=value1 key2=value2 key3=value3 key4=value4',
]
def aggregate_entities(pipeline_output):
    """Merge adjacent same-group entities from a HF NER pipeline into single spans.

    Two consecutive entities are merged when they share the same
    ``entity_group`` and the second starts exactly where the first ends
    (character-adjacent in the original text).

    Args:
        pipeline_output: list of dicts with keys ``entity_group``, ``score``,
            ``word``, ``start``, ``end`` (as produced with
            ``aggregation_strategy='simple'``).

    Returns:
        A new list of entity dicts. A merged span's ``score`` is the mean of
        ALL of its constituents' scores, rounded to 3 decimals. (The previous
        implementation cascaded pairwise averages, which over-weighted later
        fragments in chains of 3+ merged entities; it also raised IndexError
        on empty input — both fixed.)
    """
    if not pipeline_output:
        return []
    aggregated_output = [pipeline_output[0]]
    # Raw (un-rounded) scores of the entities merged into the current span.
    merged_scores = [pipeline_output[0]['score']]
    for entity in pipeline_output[1:]:
        reference_entity = aggregated_output[-1]
        if (entity['entity_group'] == reference_entity['entity_group']
                and entity['start'] == reference_entity['end']):
            merged_scores.append(entity['score'])
            # Replace the open span in place with its extended version.
            aggregated_output[-1] = {
                'entity_group': reference_entity['entity_group'],
                'score': np.round(np.mean(merged_scores), 3),
                'word': reference_entity['word'] + entity['word'],
                'start': reference_entity['start'],
                'end': entity['end'],
            }
        else:
            # Start a new span; reset the score accumulator.
            merged_scores = [entity['score']]
            aggregated_output.append(entity)
    return aggregated_output
def ner(text):
    """Run the NER pipeline on *text* and format results for the Gradio UI.

    Args:
        text: raw log line entered by the user.

    Returns:
        A two-element list:
          [0] dict for ``gr.HighlightedText`` — ``{"text": ..., "entities": ...}``;
          [1] ``pd.DataFrame`` with columns Word / Entity / Probability.
    """
    output = ner_pipeline(text)
    if len(output) != 0:
        output = aggregate_entities(output)
    # gr.HighlightedText expects the key 'entity', not the pipeline's
    # 'entity_group' — rename in place.
    for entity in output:
        entity['entity'] = entity.pop('entity_group')
    # Build all rows first and construct the DataFrame once: calling
    # pd.concat inside the loop is quadratic in the number of entities.
    rows = [
        {'Word': text[entity['start']:entity['end']],
         'Entity': entity['entity'],
         'Probability': np.round(entity['score'], 3)}
        for entity in output
    ]
    df = pd.DataFrame(rows, columns=['Word', 'Entity', 'Probability'])
    return [{"text": text, "entities": output}, df]
# Wire the ner() function into a Gradio interface: one textbox in, a
# highlighted-text view plus a results table out.
# NOTE(review): `allow_flagging` is deprecated in newer gradio releases in
# favour of `flagging_mode` — confirm against the pinned gradio version.
demo = gr.Interface(ner,
gr.Textbox(label='Log', placeholder="Enter your log here!"),
[gr.HighlightedText(label='NER output'),
gr.Dataframe(label='',
headers=["Word", "Entity", "Probability"],
datatype=["str", "str", "number"],
wrap=True
)],
title=title,
description=description,
examples=examples,
allow_flagging='never')
# Start the local Gradio server (blocking call).
demo.launch()