# ner-log / app.py
# Author: inesani — "back to original ner-log model" (commit 3584ed4)
import os
import pandas as pd
import numpy as np
import gradio as gr
from transformers import DistilBertForTokenClassification, DistilBertTokenizerFast, pipeline
# Hugging Face access token for the private model repo.
# NOTE(review): raises KeyError at startup if the TOKEN secret is unset —
# presumably intentional fail-fast behaviour; confirm.
auth_token = os.environ["TOKEN"]
# Tokenizer comes from the public base checkpoint the model was fine-tuned from.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model_path = "inesani/ner-log"
# NOTE(review): use_auth_token is deprecated in recent transformers releases
# in favour of token= — confirm against the pinned transformers version.
model_loaded = DistilBertForTokenClassification.from_pretrained(model_path, use_auth_token=auth_token)
# 'simple' aggregation groups sub-word tokens into word-level entities;
# aggregate_entities() below further merges adjacent same-group spans.
ner_pipeline = pipeline("ner", model=model_loaded, tokenizer=tokenizer,
aggregation_strategy='simple')
# UI copy rendered above the Gradio interface.
title = 'Hi, my name is NER! I am a ML model that detects IPs, KV and Timestamps in logs.'
description = """
I have been trained on a log corpus of only 10000 logs...I am a Work In Progress :)
You can paste below any log that you want to test or use one of the provided examples.
"""
# Sample log lines covering common formats (CSV, Apache access, CEF-ish KV,
# Palo Alto, Zeek, syslog, LEEF, WebDAV). The CyberArk LEEF example uses a
# doubled backslash: the original '\P' was an invalid escape sequence
# (DeprecationWarning now, a SyntaxError in a future Python); '\\P' produces
# the identical runtime string value.
examples = [
'John Doe, 37 - Google - 42 Wallaby Way, Sydney - 500000',
'campo1,campo2,campo3,"campo4,campo5"',
'188.210.113.80 - - [26/Jan/2019:20:17:17 +0330] "GET /image/4158/productModel/200x200 HTTP/1.1" 200 4022 "https://www.zanbil.ir/m/browse/electric-heaters/%D8%A8%D8%AE%D8%A7%D8%B1%DB%8C-%D8%A8%D8%B1%D9%82%DB%8C" "Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1" "-"',
'Nov 19 16:43:19 campo1 campo2 ahora pongo un kv start=Apr 29 2022 16:43:19 deviceExternalId=FMG3HFT718900147 ad.vd=root ad.itime=1651244591 ad.idseq=186791712926990336 ',
'1,2022/04/29 17:34:21,012501007018,CONFIG,0,0,2022/04/29 17:34:21,10.16.10.37,,validate-all,Panorama-FWTECSIS04,Panorama,Succeeded,,6966313332068319615,0x8000000000000000,0,0,0,0,,CD1FW-GICINT01A,0,',
'1331901007 C36a282Jljz7BsbGH 192.168.202.76 137 udp 57398 WPAD 1 C_INTERNET 32 NB - - F F T F 1 - - F',
'Jun 9 06:06:20 combo kernel: On node 0 totalpages: 32430',
'2022-04-28T16:30:29Z S12KROVA1 LEEF:1.0|Cyber-Ark|Vault|11.5.0003|51|sev=6 Action=Retrieve File EventMessage=Retrieve File OSUser= usrName=PasswordManager src=127.0.0.0 SourceUser= TargetUser= File=Root\\Policy.ini Safe=PasswordManagerShared Location= Category= RequestId= Reason= ExtraDetails= GatewayStation= CAPolicy=',
'x.x.x.90 - - [13/Sep/2006:06:58:52 -0700] "PROPFIND /svn/[xxxx]/Extranet/branches/SOW-101 HTTP/1.1" 401 587 key1=value1 key2=value2 key3=value3 key4=value4',
]
def aggregate_entities(pipeline_output):
    """Merge adjacent same-group entities into single character spans.

    Even with aggregation_strategy='simple', the HF pipeline can split one
    logical entity into contiguous pieces; this pass glues together any
    neighbours whose 'entity_group' matches and whose spans touch
    (previous 'end' == next 'start').

    Args:
        pipeline_output: list of entity dicts with keys 'entity_group',
            'score', 'word', 'start' and 'end'.

    Returns:
        A new list with touching same-group entities merged. Returns []
        for empty input (the original raised IndexError on pipeline_output[0]).
    """
    if not pipeline_output:
        return []
    merged = [pipeline_output[0]]
    for entity in pipeline_output[1:]:
        previous = merged[-1]
        if (entity['entity_group'] == previous['entity_group']
                and entity['start'] == previous['end']):
            # Extend the last accepted span in place instead of pop+append.
            merged[-1] = {
                'entity_group': previous['entity_group'],
                # NOTE: pairwise average of the (possibly already averaged)
                # running score, rounded to 3 decimals — kept for backward
                # compatibility; it is not a true mean over all merged parts.
                'score': np.round((previous['score'] + entity['score']) / 2, 3),
                'word': previous['word'] + entity['word'],
                'start': previous['start'],
                'end': entity['end'],
            }
        else:
            merged.append(entity)
    return merged
def ner(text):
    """Run the NER pipeline on *text* and format results for the Gradio UI.

    Args:
        text: the raw log line entered by the user.

    Returns:
        A two-element list: a dict for gr.HighlightedText
        ({"text": ..., "entities": [...]}) and a pandas DataFrame with
        columns Word / Entity / Probability for gr.Dataframe.
    """
    entities = ner_pipeline(text)
    if entities:
        entities = aggregate_entities(entities)
        # HighlightedText expects the key 'entity', not 'entity_group'.
        for ent in entities:
            ent['entity'] = ent.pop('entity_group')
    # Build all rows first and construct the DataFrame in one call — the
    # original pd.concat-per-row loop was quadratic and deprecated style.
    rows = [{'Word': text[ent['start']:ent['end']],
             'Entity': ent['entity'],
             'Probability': np.round(ent['score'], 3)}
            for ent in entities]
    df = pd.DataFrame(rows, columns=['Word', 'Entity', 'Probability'])
    return [{"text": text, "entities": entities}, df]
# Wire up the Gradio UI: one textbox input; two outputs — the highlighted
# entities over the raw log, plus a tabular Word/Entity/Probability breakdown.
demo = gr.Interface(ner,
gr.Textbox(label='Log', placeholder="Enter your log here!"),
[gr.HighlightedText(label='NER output'),
gr.Dataframe(label='',
headers=["Word", "Entity", "Probability"],
datatype=["str", "str", "number"],
wrap=True
)],
title=title,
description=description,
examples=examples,
# NOTE(review): allow_flagging is deprecated in Gradio >= 4 in favour of
# flagging_mode — confirm against the Space's pinned gradio version.
allow_flagging='never')
demo.launch()