Spaces:

PranavaKailash
/

CyNER2.0_Cyber_Entity_Recogonizer

Sleeping

CyNER2.0_Cyber_Entity_Recogonizer / app.py

Pranava Kailash

Fixed No data in Tensor error v1.1

5dde192 5 months ago

7.56 kB

	import streamlit as st
	from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
	from collections import defaultdict

	# Hugging Face Hub identifier of the checkpoint served by this app
	path_to_checkpoint = 'PranavaKailash/CyNER-2.0-DeBERTa-v3-base'


	@st.cache_resource
	def _build_ner_pipeline():
	    """Create the NER pipeline; any loading error propagates to the caller.

	    Only a successful build is stored by ``st.cache_resource``: a call that
	    raises leaves nothing in the cache, so the next run tries again.
	    """
	    tokenizer = AutoTokenizer.from_pretrained(path_to_checkpoint, use_fast=True)
	    model = AutoModelForTokenClassification.from_pretrained(path_to_checkpoint)

	    return pipeline(
	        "ner",
	        model=model,
	        tokenizer=tokenizer,
	        device=-1,  # Force CPU usage
	    )


	def load_model():
	    """Load model and tokenizer with proper error handling.

	    Returns:
	        The token-classification pipeline, or ``None`` when loading failed
	        (the error is reported to the user through ``st.error``).
	    """
	    # The error handling lives outside the cached function on purpose:
	    # returning None from inside it would cache the failure permanently.
	    try:
	        return _build_ner_pipeline()
	    except Exception as e:
	        st.error(f"Error loading model: {str(e)}")
	        return None


	# Keep the cache-clearing hook that the decorated function used to expose.
	load_model.clear = _build_ner_pipeline.clear

	def tag_sentence(sentence, entities):
	    """
	    Render ``sentence`` as HTML with every recognised entity highlighted.

	    Args:
	        sentence: The text that was analysed.
	        entities: Sequence of dicts, each with ``start``/``end`` character
	            offsets into ``sentence``, a label stored under ``entity_group``
	            or ``entity``, and optionally a ``score``.

	    Returns:
	        An HTML string. Everything copied from ``sentence`` and every entity
	        label is HTML-escaped, so the result can be rendered as markup even
	        when the input contains ``<``, ``>`` or ``&``. An entity that starts
	        inside a span that was already emitted is skipped, so no part of the
	        sentence is output twice.
	    """
	    import html  # local import: nothing else in the file needs it

	    if not entities:
	        return html.escape(sentence)

	    pieces = []
	    last_idx = 0

	    for entity in sorted(entities, key=lambda x: x['start']):
	        start, end = entity['start'], entity['end']
	        if start < last_idx:
	            # Overlaps the previous entity; emitting it would repeat text.
	            continue

	        # Plain text between the previous entity and this one
	        pieces.append(html.escape(sentence[last_idx:start]))

	        entity_text = html.escape(sentence[start:end])
	        raw_label = entity['entity_group'] if 'entity_group' in entity else entity['entity']
	        entity_label = html.escape(str(raw_label))
	        confidence = entity.get('score', 0)

	        # Built from single-line literals so that the indentation of this
	        # source file never ends up inside the generated markup.
	        pieces.append(
	            "<span style='background-color: #e6f3ff; padding: 2px 6px; "
	            "border-radius: 4px; border-left: 3px solid #007acc; margin: 1px;'>"
	            f"<strong style='color: #005299;'>{entity_label}</strong> "
	            f"<span style='color: #333;'>{entity_text}</span> "
	            f"<small style='color: #666; font-size: 0.8em;'>({confidence:.2f})</small>"
	            "</span>"
	        )

	        last_idx = end

	    # Remaining text after the last entity
	    pieces.append(html.escape(sentence[last_idx:]))
	    return "".join(pieces)

	@st.cache_data
	def perform_ner(text, _pipeline):
	    """
	    Run NER pipeline and prepare results for display.

	    Args:
	        text: The text to analyse.
	        _pipeline: Callable NER pipeline; called as ``_pipeline(text)`` and
	            expected to return dicts with ``start``, ``end``, ``score`` and
	            a label under ``entity_group`` or ``entity``.

	    Returns:
	        Always a 3-tuple ``(entities_by_type, tagged_sentence, entities)``:
	        a dict mapping each label to a list of summary dicts, the HTML
	        produced by ``tag_sentence``, and the raw pipeline output. When no
	        pipeline is given or processing fails, the tuple is
	        ``({}, text, [])``.
	    """
	    if not _pipeline:
	        # Same shape as every other return path, so callers can always
	        # unpack three values.
	        return {}, text, []

	    try:
	        # Get entities from pipeline
	        entities = _pipeline(text)

	        # Group entities by type for summary
	        entities_by_type = defaultdict(list)
	        for entity in entities:
	            entity_type = entity.get('entity_group', entity.get('entity', 'Unknown'))
	            entities_by_type[entity_type].append({
	                'text': text[entity['start']:entity['end']],
	                # float() turns a non-builtin numeric score into a plain
	                # Python float so the summary serialises cleanly as JSON.
	                'confidence': round(float(entity['score']), 3),
	                'start': entity['start'],
	                'end': entity['end'],
	            })

	        # Create tagged sentence
	        tagged_sentence = tag_sentence(text, entities)

	        return dict(entities_by_type), tagged_sentence, entities
	    except Exception as e:
	        st.error(f"Error during NER processing: {str(e)}")
	        return {}, text, []

	# ---- Page setup ----
	st.set_page_config(page_title="CyNER 2.0", page_icon="🔐", layout="wide")

	# Obtain the NER pipeline; without it the app cannot do anything useful,
	# so report the problem and halt the script run.
	ner_pipeline = load_model()
	if not ner_pipeline:
	    st.error("❌ Failed to load the model. Please refresh the page or contact support.")
	    st.stop()

	# ---- Page heading ----
	st.title("🔐 CyNER 2.0 - Cybersecurity Named Entity Recognition")
	st.markdown("Advanced NER for Cybersecurity Text Analysis using DeBERTa-v3")
	st.write("Enter cybersecurity-related text to identify and extract named entities.")

	# ---- Sample inputs, keyed by the title shown to the user ----
	examples = dict([
	    (
	        "Malware Analysis",
	        "The Zeus trojan was detected on the victim's Windows 10 system at IP address 192.168.1.100. The malware communicated with command and control server evil.example.com using port 8080.",
	    ),
	    (
	        "Vulnerability Report",
	        "CVE-2021-44228 affects Apache Log4j versions 2.0 to 2.15.0. The vulnerability allows remote code execution through LDAP injection.",
	    ),
	    (
	        "Incident Response",
	        "Suspicious network traffic detected from IP 203.0.113.1 attempting to access /admin/login.php on our web server nginx running on Ubuntu 20.04.",
	    ),
	    (
	        "Phishing Attack",
	        "Users received emails from admin@secur3-bank.com asking them to update their credentials by clicking on https://phishing-site.malicious.com/login",
	    ),
	])

	# Seed the text-area state once. The widget below is bound to this key, so
	# it is created without a separate ``value=`` argument; supplying both a
	# default value and a Session State value makes Streamlit emit a warning.
	if 'input_text' not in st.session_state:
	    st.session_state.input_text = "Enter your cybersecurity text here..."


	def _clear_input_text():
	    """Button callback: empty the text area.

	    Runs before the script is re-executed for the click, i.e. before the
	    text area is instantiated, which is the only point at which the state
	    of a keyed widget may be assigned.
	    """
	    st.session_state.input_text = ""


	# Sidebar for examples
	with st.sidebar:
	    st.header("📝 Example Texts")
	    st.write("Click to load example cybersecurity text:")
	    for title, text in examples.items():
	        if st.button(f"📋 {title}", key=f"example_{title}"):
	            # Allowed here: the text area has not been created yet in this run.
	            st.session_state.input_text = text

	# Main input
	input_text = st.text_area(
	    "Input Text",
	    height=150,
	    help="Paste any cybersecurity-related text to analyze",
	    key='input_text',
	)

	col1, col2, col3 = st.columns([2, 1, 3])
	with col1:
	    analyze_button = st.button("🔍 Analyze Text", type="primary")
	with col2:
	    # Clearing happens in the callback; the click itself already triggers a
	    # rerun, so no explicit rerun call is needed afterwards.
	    clear_button = st.button("🗑️ Clear", on_click=_clear_input_text)

	# Analyse only on an explicit click and with a usable pipeline
	if analyze_button and ner_pipeline:
	    if not input_text.strip() or input_text == "Enter your cybersecurity text here...":
	        # Blank input or the untouched prompt text: nothing to analyse
	        st.warning("⚠️ Please enter some text for analysis.")
	    else:
	        with st.spinner("🤖 Processing text with CyNER 2.0..."):
	            entities_dict, tagged_sentence, raw_entities = perform_ner(input_text, ner_pipeline)

	        if not entities_dict:
	            st.info("ℹ️ No cybersecurity entities detected in the provided text. Try using text with security-related terms like IP addresses, malware names, CVEs, etc.")
	        else:
	            total_found = sum(map(len, entities_dict.values()))
	            st.success(f"✅ Analysis complete! Found {total_found} entities")

	            # Results section
	            st.subheader("📊 Analysis Results")

	            # Highlighted text
	            st.markdown("🏷️ Tagged Entities:")
	            st.markdown(tagged_sentence, unsafe_allow_html=True)

	            # One metric per entity type, wrapping after four columns
	            st.markdown("📈 Entity Summary:")
	            metric_columns = st.columns(min(len(entities_dict), 4))
	            for position, (entity_type, entities_list) in enumerate(entities_dict.items()):
	                short_label = entity_type.replace('B-', '').replace('I-', '')
	                with metric_columns[position % 4]:
	                    st.metric(label=short_label, value=len(entities_list))

	            # Per-type listing of every hit with its confidence
	            with st.expander("📋 Detailed Entity Breakdown", expanded=True):
	                for entity_type, entities_list in entities_dict.items():
	                    st.markdown(f"{entity_type}:")
	                    for entity in entities_list:
	                        st.markdown(f"- `{entity['text']}` (confidence: {entity['confidence']})")

	            # Grouped summary as JSON, collapsed by default
	            with st.expander("🔧 Raw JSON Data", expanded=False):
	                st.json(entities_dict)

	# Footer: a horizontal rule followed by a centred attribution line.
	st.markdown("---")
	# Assembled from single-line literals so that source indentation cannot
	# leak into the markup, and with a plain "|" separator ("\|" is not a valid
	# string escape and would show a literal backslash on the page).
	st.markdown(
	    "<div style='text-align: center; color: #666; font-size: 0.9em;'>"
	    "<strong>CyNER 2.0</strong> - Cybersecurity Named Entity Recognition<br>"
	    "Model: <code>PranavaKailash/CyNER-2.0-DeBERTa-v3-base</code> | Built with Streamlit"
	    "</div>",
	    unsafe_allow_html=True,
	)