| import os |
| import pandas as pd |
| import numpy as np |
| import gradio as gr |
| from transformers import DistilBertForTokenClassification, DistilBertTokenizerFast, pipeline |
# Access token for the Hugging Face Hub, read from the TOKEN environment
# variable. A missing variable raises KeyError at import time.
auth_token = os.environ["TOKEN"]

# The tokenizer is loaded from the base DistilBERT checkpoint, not from
# `model_path`.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Token-classification model, fetched with the access token above.
model_path = "inesani/ner-log"
model_loaded = DistilBertForTokenClassification.from_pretrained(
    model_path,
    use_auth_token=auth_token,
)

# NER pipeline used by the app; the 'simple' aggregation strategy makes the
# pipeline return grouped entities (dicts keyed by 'entity_group').
ner_pipeline = pipeline(
    task="ner",
    model=model_loaded,
    tokenizer=tokenizer,
    aggregation_strategy='simple',
)
|
|
# Heading shown at the top of the Gradio interface.
title = (
    'Hi, my name is NER! I am a ML model that detects IPs, KV and '
    'Timestamps in logs.'
)

# Introductory text shown under the title. It starts and ends with a
# newline and has a blank line between the two sentences.
description = (
    "\n"
    "I have been trained on a log corpus of only 10000 logs..."
    "I am a Work In Progress :)\n"
    "\n"
    "You can paste below any log that you want to test or use one of the "
    "provided examples.\n"
)
|
|
# Sample log lines offered as clickable examples in the interface.
# NOTE: the backslash in "Root\\Policy.ini" is escaped explicitly; a bare
# "\P" is an invalid escape sequence and triggers a warning on modern Python.
examples = [
    'John Doe, 37 - Google - 42 Wallaby Way, Sydney - 500000',
    'campo1,campo2,campo3,"campo4,campo5"',
    '188.210.113.80 - - [26/Jan/2019:20:17:17 +0330] "GET /image/4158/productModel/200x200 HTTP/1.1" 200 4022 "https://www.zanbil.ir/m/browse/electric-heaters/%D8%A8%D8%AE%D8%A7%D8%B1%DB%8C-%D8%A8%D8%B1%D9%82%DB%8C" "Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1" "-"',
    'Nov 19 16:43:19 campo1 campo2 ahora pongo un kv start=Apr 29 2022 16:43:19 deviceExternalId=FMG3HFT718900147 ad.vd=root ad.itime=1651244591 ad.idseq=186791712926990336 ',
    '1,2022/04/29 17:34:21,012501007018,CONFIG,0,0,2022/04/29 17:34:21,10.16.10.37,,validate-all,Panorama-FWTECSIS04,Panorama,Succeeded,,6966313332068319615,0x8000000000000000,0,0,0,0,,CD1FW-GICINT01A,0,',
    '1331901007 C36a282Jljz7BsbGH 192.168.202.76 137 udp 57398 WPAD 1 C_INTERNET 32 NB - - F F T F 1 - - F',
    'Jun 9 06:06:20 combo kernel: On node 0 totalpages: 32430',
    '2022-04-28T16:30:29Z S12KROVA1 LEEF:1.0|Cyber-Ark|Vault|11.5.0003|51|sev=6 Action=Retrieve File EventMessage=Retrieve File OSUser= usrName=PasswordManager src=127.0.0.0 SourceUser= TargetUser= File=Root\\Policy.ini Safe=PasswordManagerShared Location= Category= RequestId= Reason= ExtraDetails= GatewayStation= CAPolicy=',
    'x.x.x.90 - - [13/Sep/2006:06:58:52 -0700] "PROPFIND /svn/[xxxx]/Extranet/branches/SOW-101 HTTP/1.1" 401 587 key1=value1 key2=value2 key3=value3 key4=value4',
]
|
|
def aggregate_entities(pipeline_output):
    """Merge consecutive entities that belong together into single entities.

    Two consecutive entities are merged when they have the same
    ``entity_group`` and the second one starts exactly at the character
    offset where the first one ends. Merging is cumulative, so a run of
    several touching entities collapses into one.

    Args:
        pipeline_output: List of entity dicts, each with the keys
            ``'entity_group'``, ``'score'``, ``'word'``, ``'start'`` and
            ``'end'``.

    Returns:
        A new list of entity dicts. Entities that were not merged are the
        same dict objects as in the input; merged entities are new dicts.
        An empty input yields an empty list.
    """
    # Guard against empty input instead of failing on pipeline_output[0].
    if not pipeline_output:
        return []

    aggregated_output = [pipeline_output[0]]

    for entity in pipeline_output[1:]:
        previous = aggregated_output[-1]
        same_group = entity['entity_group'] == previous['entity_group']
        touching = entity['start'] == previous['end']

        if same_group and touching:
            # Replace the last aggregated entity with the merged one.
            # The score is the average of the running (already merged)
            # score and the new piece, rounded to 3 decimals.
            aggregated_output[-1] = {
                'entity_group': previous['entity_group'],
                'score': np.round((previous['score'] + entity['score']) / 2, 3),
                'word': previous['word'] + entity['word'],
                'start': previous['start'],
                'end': entity['end'],
            }
        else:
            aggregated_output.append(entity)

    return aggregated_output
|
|
| |
def ner(text):
    """Run the NER pipeline on a log line and format the result for the UI.

    Args:
        text: The log line to analyse.

    Returns:
        A two-element list:
        * a dict ``{"text": text, "entities": [...]}`` where each entity
          dict carries its label under the key ``'entity'`` (renamed from
          ``'entity_group'``);
        * a DataFrame with the columns ``Word``, ``Entity`` and
          ``Probability`` (score rounded to 3 decimals), one row per entity.
    """
    output = ner_pipeline(text)
    if len(output) != 0:
        output = aggregate_entities(output)

    # Rename the label key in place to 'entity' for the highlighted-text
    # output.
    for entity in output:
        entity['entity'] = entity.pop('entity_group')

    # Build all rows first and create the DataFrame once, instead of
    # concatenating a one-row frame per entity.
    rows = [
        {
            "Word": text[entity['start']:entity['end']],
            "Entity": entity['entity'],
            "Probability": np.round(entity['score'], 3),
        }
        for entity in output
    ]
    df = pd.DataFrame(rows, columns=['Word', 'Entity', 'Probability'])

    return [{"text": text, "entities": output}, df]
|
|
|
|
# Gradio UI: one textbox in; highlighted text plus a table of the detected
# entities out. Flagging is disabled.
demo = gr.Interface(
    fn=ner,
    inputs=gr.Textbox(label='Log', placeholder="Enter your log here!"),
    outputs=[
        gr.HighlightedText(label='NER output'),
        gr.Dataframe(
            label='',
            headers=["Word", "Entity", "Probability"],
            datatype=["str", "str", "number"],
            wrap=True,
        ),
    ],
    title=title,
    description=description,
    examples=examples,
    allow_flagging='never',
)

# Start the app as soon as the module is executed.
demo.launch()
|
|