# ner-log / app.py
# Author: inesani — "back to original ner-log model" (commit 3584ed4)
import os
import pandas as pd
import numpy as np
import gradio as gr
from transformers import DistilBertForTokenClassification, DistilBertTokenizerFast, pipeline
# Hugging Face access token for the private model repo.
# NOTE(review): raises KeyError at startup if the TOKEN secret is unset —
# presumably intentional fail-fast behaviour; confirm.
auth_token = os.environ["TOKEN"]
# Tokenizer comes from the public base checkpoint the model was fine-tuned from.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model_path = "inesani/ner-log"
# NOTE(review): use_auth_token is deprecated in recent transformers releases
# in favour of token= — confirm against the pinned transformers version.
model_loaded = DistilBertForTokenClassification.from_pretrained(model_path, use_auth_token=auth_token)
# 'simple' aggregation groups sub-word tokens into word-level entities;
# aggregate_entities() below further merges adjacent same-group spans.
ner_pipeline = pipeline("ner", model=model_loaded, tokenizer=tokenizer,
aggregation_strategy='simple')
# UI copy rendered above the Gradio interface.
title = 'Hi, my name is NER! I am a ML model that detects IPs, KV and Timestamps in logs.'
description = """
I have been trained on a log corpus of only 10000 logs...I am a Work In Progress :)
You can paste below any log that you want to test or use one of the provided examples.
"""
# Sample log lines covering common formats (CSV, Apache access, CEF-ish KV,
# Palo Alto, Zeek, syslog, LEEF, WebDAV). The CyberArk LEEF example uses a
# doubled backslash: the original '\P' was an invalid escape sequence
# (DeprecationWarning now, a SyntaxError in a future Python); '\\P' produces
# the identical runtime string value.
examples = [
'John Doe, 37 - Google - 42 Wallaby Way, Sydney - 500000',
'campo1,campo2,campo3,"campo4,campo5"',
'188.210.113.80 - - [26/Jan/2019:20:17:17 +0330] "GET /image/4158/productModel/200x200 HTTP/1.1" 200 4022 "https://www.zanbil.ir/m/browse/electric-heaters/%D8%A8%D8%AE%D8%A7%D8%B1%DB%8C-%D8%A8%D8%B1%D9%82%DB%8C" "Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1" "-"',
'Nov 19 16:43:19 campo1 campo2 ahora pongo un kv start=Apr 29 2022 16:43:19 deviceExternalId=FMG3HFT718900147 ad.vd=root ad.itime=1651244591 ad.idseq=186791712926990336 ',
'1,2022/04/29 17:34:21,012501007018,CONFIG,0,0,2022/04/29 17:34:21,10.16.10.37,,validate-all,Panorama-FWTECSIS04,Panorama,Succeeded,,6966313332068319615,0x8000000000000000,0,0,0,0,,CD1FW-GICINT01A,0,',
'1331901007 C36a282Jljz7BsbGH 192.168.202.76 137 udp 57398 WPAD 1 C_INTERNET 32 NB - - F F T F 1 - - F',
'Jun 9 06:06:20 combo kernel: On node 0 totalpages: 32430',
'2022-04-28T16:30:29Z S12KROVA1 LEEF:1.0|Cyber-Ark|Vault|11.5.0003|51|sev=6 Action=Retrieve File EventMessage=Retrieve File OSUser= usrName=PasswordManager src=127.0.0.0 SourceUser= TargetUser= File=Root\\Policy.ini Safe=PasswordManagerShared Location= Category= RequestId= Reason= ExtraDetails= GatewayStation= CAPolicy=',
'x.x.x.90 - - [13/Sep/2006:06:58:52 -0700] "PROPFIND /svn/[xxxx]/Extranet/branches/SOW-101 HTTP/1.1" 401 587 key1=value1 key2=value2 key3=value3 key4=value4',
]
def aggregate_entities(pipeline_output):
    """Merge adjacent same-group entities into single character spans.

    Even with aggregation_strategy='simple', the HF pipeline can split one
    logical entity into contiguous pieces; this pass glues together any
    neighbours whose 'entity_group' matches and whose spans touch
    (previous 'end' == next 'start').

    Args:
        pipeline_output: list of entity dicts with keys 'entity_group',
            'score', 'word', 'start' and 'end'.

    Returns:
        A new list with touching same-group entities merged. Returns []
        for empty input (the original raised IndexError on pipeline_output[0]).
    """
    if not pipeline_output:
        return []
    merged = [pipeline_output[0]]
    for entity in pipeline_output[1:]:
        previous = merged[-1]
        if (entity['entity_group'] == previous['entity_group']
                and entity['start'] == previous['end']):
            # Extend the last accepted span in place instead of pop+append.
            merged[-1] = {
                'entity_group': previous['entity_group'],
                # NOTE: pairwise average of the (possibly already averaged)
                # running score, rounded to 3 decimals — kept for backward
                # compatibility; it is not a true mean over all merged parts.
                'score': np.round((previous['score'] + entity['score']) / 2, 3),
                'word': previous['word'] + entity['word'],
                'start': previous['start'],
                'end': entity['end'],
            }
        else:
            merged.append(entity)
    return merged
def ner(text):
    """Run the NER pipeline on *text* and format results for the Gradio UI.

    Args:
        text: the raw log line entered by the user.

    Returns:
        A two-element list: a dict for gr.HighlightedText
        ({"text": ..., "entities": [...]}) and a pandas DataFrame with
        columns Word / Entity / Probability for gr.Dataframe.
    """
    entities = ner_pipeline(text)
    if entities:
        entities = aggregate_entities(entities)
        # HighlightedText expects the key 'entity', not 'entity_group'.
        for ent in entities:
            ent['entity'] = ent.pop('entity_group')
    # Build all rows first and construct the DataFrame in one call — the
    # original pd.concat-per-row loop was quadratic and deprecated style.
    rows = [{'Word': text[ent['start']:ent['end']],
             'Entity': ent['entity'],
             'Probability': np.round(ent['score'], 3)}
            for ent in entities]
    df = pd.DataFrame(rows, columns=['Word', 'Entity', 'Probability'])
    return [{"text": text, "entities": entities}, df]
# Wire up the Gradio UI: one textbox input; two outputs — the highlighted
# entities over the raw log, plus a tabular Word/Entity/Probability breakdown.
demo = gr.Interface(ner,
gr.Textbox(label='Log', placeholder="Enter your log here!"),
[gr.HighlightedText(label='NER output'),
gr.Dataframe(label='',
headers=["Word", "Entity", "Probability"],
datatype=["str", "str", "number"],
wrap=True
)],
title=title,
description=description,
examples=examples,
# NOTE(review): allow_flagging is deprecated in Gradio >= 4 in favour of
# flagging_mode — confirm against the Space's pinned gradio version.
allow_flagging='never')
demo.launch()