File size: 4,589 Bytes
7fdec27
814ecb5
 
dfeac1f
3378f63
0da48c6
dfeac1f
 
 
3584ed4
7fdec27
dfeac1f
 
 
 
e9ee518
 
 
4a4563a
e9ee518
 
 
 
dfeac1f
e9ee518
 
dfeac1f
 
e9ee518
 
 
395a1d5
e9ee518
dfeac1f
 
395a1d5
 
 
dfeac1f
395a1d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dfeac1f
 
395a1d5
 
dfeac1f
 
e9ee518
 
 
 
 
 
 
dfeac1f
 
 
e9ee518
 
 
 
 
 
 
 
 
 
 
5d54a41
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import pandas as pd
import numpy as np
import gradio as gr
from transformers import DistilBertForTokenClassification, DistilBertTokenizerFast, pipeline
# Hugging Face access token for the private model repo.
# Raises KeyError at startup if the TOKEN env var is unset (fail-fast).
auth_token = os.environ["TOKEN"]

# Tokenizer comes from the public base checkpoint; the fine-tuned
# weights loaded below were trained on top of it.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Fine-tuned token-classification model hosted on the HF Hub.
# NOTE(review): `use_auth_token` is deprecated in newer transformers in
# favour of `token` — confirm the pinned transformers version before changing.
model_path = "inesani/ner-log"
model_loaded = DistilBertForTokenClassification.from_pretrained(model_path, use_auth_token=auth_token)

# aggregation_strategy='simple' merges sub-word pieces into word-level
# entity groups; a second merge pass is done in aggregate_entities().
ner_pipeline = pipeline("ner", model=model_loaded, tokenizer=tokenizer,
                        aggregation_strategy='simple')

# UI copy shown at the top of the Gradio interface (user-facing text —
# keep byte-identical unless the wording itself is being revised).
title = 'Hi, my name is NER! I am a ML model that detects IPs, KV and Timestamps in logs.'

description = """
I have been trained on a log corpus of only 10000 logs...I am a Work In Progress :)

You can paste below any log that you want to test or use one of the provided examples.
"""

# Sample log lines offered as one-click examples in the UI: CSV fields,
# an Apache access log, key=value logs, PAN-OS CSV, Zeek DNS (tab-separated),
# syslog, a LEEF event, and an SVN access log.
examples = [
    'John Doe, 37 - Google - 42 Wallaby Way, Sydney - 500000',
    'campo1,campo2,campo3,"campo4,campo5"',
    '188.210.113.80 - - [26/Jan/2019:20:17:17 +0330] "GET /image/4158/productModel/200x200 HTTP/1.1" 200 4022 "https://www.zanbil.ir/m/browse/electric-heaters/%D8%A8%D8%AE%D8%A7%D8%B1%DB%8C-%D8%A8%D8%B1%D9%82%DB%8C" "Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1" "-"',
    'Nov 19 16:43:19 campo1 campo2 ahora pongo un kv start=Apr 29 2022 16:43:19 deviceExternalId=FMG3HFT718900147 ad.vd=root ad.itime=1651244591 ad.idseq=186791712926990336 ',
    '1,2022/04/29 17:34:21,012501007018,CONFIG,0,0,2022/04/29 17:34:21,10.16.10.37,,validate-all,Panorama-FWTECSIS04,Panorama,Succeeded,,6966313332068319615,0x8000000000000000,0,0,0,0,,CD1FW-GICINT01A,0,',
    '1331901007	C36a282Jljz7BsbGH	192.168.202.76	137	udp	57398	WPAD	1	C_INTERNET	32	NB	-	-	F	F	T	F	1	-	-	F',
    'Jun  9 06:06:20 combo kernel: On node 0 totalpages: 32430',
    # Raw string: the literal contains `Root\Policy.ini`; in a normal string
    # `\P` is an invalid escape sequence (SyntaxWarning on Python >= 3.12).
    # The raw prefix keeps the value byte-identical while silencing it.
    r'2022-04-28T16:30:29Z S12KROVA1 LEEF:1.0|Cyber-Ark|Vault|11.5.0003|51|sev=6   Action=Retrieve File    EventMessage=Retrieve File  OSUser= usrName=PasswordManager src=127.0.0.0   SourceUser= TargetUser= File=Root\Policy.ini  Safe=PasswordManagerShared  Location=   Category=   RequestId=  Reason= ExtraDetails=   GatewayStation= CAPolicy=',
    'x.x.x.90 - - [13/Sep/2006:06:58:52 -0700] "PROPFIND /svn/[xxxx]/Extranet/branches/SOW-101 HTTP/1.1" 401 587 key1=value1 key2=value2 key3=value3 key4=value4',
]

def aggregate_entities(pipeline_output):
    """Merge adjacent same-group entity chunks into single spans.

    The HF pipeline's 'simple' aggregation can still split one logical
    entity into several contiguous chunks; this pass fuses any chunk whose
    `start` equals the previous chunk's `end` when the `entity_group`
    matches.

    Args:
        pipeline_output: list of dicts with keys 'entity_group', 'score',
            'word', 'start', 'end' (as produced by the NER pipeline).

    Returns:
        New list of entity dicts with contiguous same-group spans merged.
        A merged span's score is the pairwise running average of its
        chunks, rounded to 3 decimals at each fuse (matches the original
        behaviour). Returns [] for empty input (the original raised
        IndexError on []).
    """
    if not pipeline_output:
        return []

    aggregated = [pipeline_output[0]]
    for entity in pipeline_output[1:]:
        previous = aggregated[-1]
        if (entity['entity_group'] == previous['entity_group']
                and entity['start'] == previous['end']):
            # Fuse in place with the previous span instead of pop/append.
            aggregated[-1] = {
                'entity_group': previous['entity_group'],
                'score': np.round((previous['score'] + entity['score']) / 2, 3),
                'word': previous['word'] + entity['word'],
                'start': previous['start'],
                'end': entity['end'],
            }
        else:
            aggregated.append(entity)

    return aggregated

    
def ner(text):
    """Run the NER pipeline on `text` and format results for the Gradio UI.

    Args:
        text: raw log line to analyse.

    Returns:
        Two-element list:
        [0] dict for gr.HighlightedText: {"text": ..., "entities": [...]},
            where each entity dict uses the key 'entity' (renamed from the
            pipeline's 'entity_group').
        [1] pandas DataFrame with columns Word / Entity / Probability.
    """
    entities = ner_pipeline(text)
    if entities:
        entities = aggregate_entities(entities)
    # gr.HighlightedText expects the key 'entity', not 'entity_group'.
    for ent in entities:
        ent['entity'] = ent.pop('entity_group')
    # Build all rows first and construct the DataFrame once, instead of the
    # original quadratic pd.concat-inside-a-loop pattern.
    rows = [{"Word": text[ent['start']:ent['end']],
             "Entity": ent['entity'],
             "Probability": np.round(ent['score'], 3)}
            for ent in entities]
    df = pd.DataFrame(rows, columns=['Word', 'Entity', 'Probability'])
    return [{"text": text, "entities": entities}, df]


# Wire the UI: one textbox in, a highlighted-text view plus a table out.
# The Dataframe headers must match the columns produced by ner().
demo = gr.Interface(ner,
                    gr.Textbox(label='Log', placeholder="Enter your log here!"),
                    [gr.HighlightedText(label='NER output'),
                     gr.Dataframe(label='',
                                  headers=["Word", "Entity", "Probability"],
                                  datatype=["str", "str", "number"],
                                  wrap=True
                                  )],
                    title=title,
                    description=description,
                    examples=examples,
                    allow_flagging='never')
# Blocking call: starts the web server (module is run as a script).
demo.launch()