Spaces:

PranavaKailash
/

CyNER2.0_Cyber_Entity_Recogonizer

Sleeping

App Files Files Community

Pranava Kailash committed on Sep 26, 2025

Commit

5dde192

1 Parent(s): d2369e6

Fixed No data in Tensor error v1.1

Browse files

Files changed (1) hide show

app.py +115 -79

app.py CHANGED Viewed

@@ -1,91 +1,95 @@
 import streamlit as st
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 from collections import defaultdict
-import torch
 # Load model and tokenizer
 path_to_checkpoint = 'PranavaKailash/CyNER-2.0-DeBERTa-v3-base'
-# Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained(path_to_checkpoint, use_fast=True)
-# Load model with proper device handling
-model = AutoModelForTokenClassification.from_pretrained(
-    path_to_checkpoint,
-    torch_dtype='auto',
-    device_map='cpu'
-)
-# Initialize the NER pipeline
-ner_pipeline = pipeline(
-    "ner",
-    model=model,
-    tokenizer=tokenizer,
-    device=-1  # Explicitly use CPU
-)
-def tag_sentence(sentence, entities_dict):
     """
     Add HTML tags to entities for visualization.
     """
-    all_entities = sorted(
-        [(e['start'], e['end'], e['entity'], e['word']) for ents in entities_dict.values() for e in ents],
-        key=lambda x: x[0]
-    )
-    merged_entities = []
-    current_entity = None
-    for start, end, entity_type, word in all_entities:
-        if current_entity is None:
-            current_entity = [start, end, entity_type, word]
-        else:
-            if start == current_entity[1] and entity_type == current_entity[2] and entity_type.startswith('I-'):
-                current_entity[1] = end
-                current_entity[3] += word.replace('▁', ' ')
-            else:
-                merged_entities.append(tuple(current_entity))
-                current_entity = [start, end, entity_type, word]
-    if current_entity:
-        merged_entities.append(tuple(current_entity))
     tagged_sentence = ""
     last_idx = 0
-    for start, end, entity_type, _ in merged_entities:
-        tagged_sentence += sentence[last_idx:start]
-        entity_tag = entity_type.replace('I-', 'B-')
-        tagged_sentence += f"<span style='color:blue; background-color: #e6f3ff; padding: 2px; border-radius: 3px;'><strong>{entity_tag}</strong></span><span style='background-color: #fff3cd; padding: 2px; border-radius: 3px;'>{sentence[start:end]}</span>"
-        last_idx = end
     tagged_sentence += sentence[last_idx:]
     return tagged_sentence
 @st.cache_data
-def perform_ner(text):
     """
     Run NER pipeline and prepare results for display.
     """
     try:
-        entities = ner_pipeline(text)
-        entities_dict = defaultdict(list)
         for entity in entities:
-            entities_dict[entity['entity']].append({
-                "entity": entity['entity'],
-                "score": round(entity['score'], 4),
-                "index": entity['index'],
-                "word": entity['word'],
-                "start": entity['start'],
-                "end": entity['end']
             })
-        tagged_sentence = tag_sentence(text, entities_dict)
-        return dict(entities_dict), tagged_sentence
     except Exception as e:
         st.error(f"Error during NER processing: {str(e)}")
-        return {}, text
 # Streamlit UI
 st.set_page_config(
@@ -94,66 +98,98 @@ st.set_page_config(
     layout="wide"
 )
 st.title("🔐 CyNER 2.0 - Cybersecurity Named Entity Recognition")
-st.markdown("**Advanced NER for Cybersecurity Text Analysis**")
-st.write("Enter cybersecurity-related text to identify and extract named entities using the CyNER 2.0 model.")
 # Example texts
 examples = {
     "Malware Analysis": "The Zeus trojan was detected on the victim's Windows 10 system at IP address 192.168.1.100. The malware communicated with command and control server evil.example.com using port 8080.",
     "Vulnerability Report": "CVE-2021-44228 affects Apache Log4j versions 2.0 to 2.15.0. The vulnerability allows remote code execution through LDAP injection.",
-    "Incident Response": "Suspicious network traffic detected from IP 203.0.113.1 attempting to access /admin/login.php on our web server nginx running on Ubuntu 20.04."
 }
 # Sidebar for examples
 with st.sidebar:
-    st.header("Example Texts")
     for title, text in examples.items():
-        if st.button(f"Load: {title}"):
             st.session_state.input_text = text
 # Main input
 input_text = st.text_area(
-    "Input Text",
     value=st.session_state.get('input_text', "Enter your cybersecurity text here..."),
     height=150,
     key='input_text'
 )
-col1, col2 = st.columns([1, 4])
 with col1:
     analyze_button = st.button("🔍 Analyze Text", type="primary")
-if analyze_button:
     if input_text.strip() and input_text != "Enter your cybersecurity text here...":
-        with st.spinner("Processing text with CyNER 2.0..."):
-            entities_dict, tagged_sentence = perform_ner(input_text)
         if entities_dict:
             # Display results
             st.subheader("📊 Analysis Results")
             # Tagged visualization
-            st.markdown("**Tagged Entities:**")
             st.markdown(tagged_sentence, unsafe_allow_html=True)
-            # Entity summary
-            st.markdown("**Entity Summary:**")
-            entity_counts = {k: len(v) for k, v in entities_dict.items()}
-            cols = st.columns(min(len(entity_counts), 4))
-            for i, (entity_type, count) in enumerate(entity_counts.items()):
-                with cols[i % 4]:
-                    st.metric(entity_type.replace('B-', '').replace('I-', ''), count)
-            # Detailed results
-            with st.expander("📋 Detailed Entity Information", expanded=False):
                 st.json(entities_dict)
         else:
-            st.info("No entities detected in the provided text.")
     else:
         st.warning("⚠️ Please enter some text for analysis.")
 # Footer
 st.markdown("---")
-st.markdown("**CyNER 2.0** - Powered by DeBERTa-v3-base | Built with Streamlit")

 import streamlit as st
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 from collections import defaultdict
 # Load model and tokenizer
 path_to_checkpoint = 'PranavaKailash/CyNER-2.0-DeBERTa-v3-base'
@st.cache_resource
def _build_ner_pipeline():
    """Build the NER pipeline for ``path_to_checkpoint``; raise on failure.

    This is the cached part of model loading. It deliberately does not catch
    exceptions: ``st.cache_resource`` only stores values that are returned,
    so a failed load (e.g. a transient download error) is retried on the next
    run instead of a cached ``None`` being served until the app restarts.
    """
    tokenizer = AutoTokenizer.from_pretrained(path_to_checkpoint, use_fast=True)
    model = AutoModelForTokenClassification.from_pretrained(path_to_checkpoint)
    # device=-1 pins the pipeline to the CPU.
    return pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        device=-1,
    )


def load_model():
    """Load model and tokenizer with proper error handling.

    Returns:
        The token-classification pipeline, or ``None`` when loading failed.
        On failure the error is also reported in the UI via ``st.error``.
    """
    try:
        return _build_ner_pipeline()
    except Exception as e:  # UI boundary: report the failure, let the caller decide
        st.error(f"Error loading model: {str(e)}")
        return None
def tag_sentence(sentence, entities):
    """
    Add HTML tags to entities for visualization.

    Args:
        sentence: The original text that was analysed.
        entities: Iterable of dicts with ``start`` and ``end`` character
            offsets into ``sentence``, an optional ``score``, and a label
            under ``entity_group`` or ``entity``.

    Returns:
        An HTML string in which every entity span is wrapped in a styled
        ``<span>`` showing its label, text and confidence. All text taken
        from ``sentence`` and all labels are HTML-escaped, because the result
        is meant to be rendered with ``unsafe_allow_html=True``.
    """
    import html  # local import: keeps this block self-contained

    if not entities:
        return html.escape(sentence)

    parts = []
    last_idx = 0
    for entity in sorted(entities, key=lambda x: x['start']):
        start, end = entity['start'], entity['end']
        # Skip empty spans and spans overlapping one already emitted, so no
        # part of the sentence is rendered twice.
        if end <= start or start < last_idx:
            continue

        # Text between the previous entity and this one.
        parts.append(html.escape(sentence[last_idx:start]))

        entity_label = entity.get('entity_group', entity.get('entity', 'Unknown'))
        confidence = entity.get('score', 0)
        # Emitted as one line with no leading indentation: an indented
        # multi-line block can be interpreted as a Markdown code block.
        parts.append(
            "<span style='background-color: #e6f3ff; padding: 2px 6px; "
            "border-radius: 4px; border-left: 3px solid #007acc; margin: 1px;'>"
            f"<strong style='color: #005299;'>{html.escape(str(entity_label))}</strong> "
            f"<span style='color: #333;'>{html.escape(sentence[start:end])}</span> "
            f"<small style='color: #666; font-size: 0.8em;'>({confidence:.2f})</small>"
            "</span>"
        )
        last_idx = end

    # Remaining text after the last entity.
    parts.append(html.escape(sentence[last_idx:]))
    return "".join(parts)
 @st.cache_data
+def perform_ner(text, _pipeline):
     """
     Run NER pipeline and prepare results for display.
     """
+    if not _pipeline:
+        return [], text
     try:
+        # Get entities from pipeline
+        entities = _pipeline(text)
+        # Group entities by type for summary
+        entities_by_type = defaultdict(list)
         for entity in entities:
+            entity_type = entity.get('entity_group', entity.get('entity', 'Unknown'))
+            entities_by_type[entity_type].append({
+                'text': text[entity['start']:entity['end']],
+                'confidence': round(entity['score'], 3),
+                'start': entity['start'],
+                'end': entity['end']
             })
+        # Create tagged sentence
+        tagged_sentence = tag_sentence(text, entities)
+        return dict(entities_by_type), tagged_sentence, entities
     except Exception as e:
         st.error(f"Error during NER processing: {str(e)}")
+        return {}, text, []
 # Streamlit UI
 st.set_page_config(
     layout="wide"
 )
# Load the pipeline; load_model() returns None when loading failed.
ner_pipeline = load_model()
if not ner_pipeline:
    st.error("❌ Failed to load the model. Please refresh the page or contact support.")
    st.stop()

st.title("🔐 CyNER 2.0 - Cybersecurity Named Entity Recognition")
st.markdown("**Advanced NER for Cybersecurity Text Analysis using DeBERTa-v3**")
st.write("Enter cybersecurity-related text to identify and extract named entities.")

# Example texts
examples = {
    "Malware Analysis": "The Zeus trojan was detected on the victim's Windows 10 system at IP address 192.168.1.100. The malware communicated with command and control server evil.example.com using port 8080.",
    "Vulnerability Report": "CVE-2021-44228 affects Apache Log4j versions 2.0 to 2.15.0. The vulnerability allows remote code execution through LDAP injection.",
    "Incident Response": "Suspicious network traffic detected from IP 203.0.113.1 attempting to access /admin/login.php on our web server nginx running on Ubuntu 20.04.",
    "Phishing Attack": "Users received emails from admin@secur3-bank.com asking them to update their credentials by clicking on https://phishing-site.malicious.com/login"
}

# Initial content of the text area; also treated as "no input" on analysis.
DEFAULT_INPUT_TEXT = "Enter your cybersecurity text here..."

# Seed the widget state once instead of passing value= to the widget:
# combining value= with a session-state-backed key triggers a Streamlit
# warning.
if 'input_text' not in st.session_state:
    st.session_state.input_text = DEFAULT_INPUT_TEXT


def _clear_input():
    """Empty the text area.

    Used as a button callback: a widget's session-state entry may not be
    assigned after the widget has been instantiated in the current run, but
    callbacks execute before the script re-runs, where assignment is allowed.
    """
    st.session_state.input_text = ""


# Sidebar for examples
with st.sidebar:
    st.header("📝 Example Texts")
    st.write("Click to load example cybersecurity text:")
    for title, text in examples.items():
        # Runs before the text area below is created, so assigning its
        # session-state key here is permitted.
        if st.button(f"📋 {title}", key=f"example_{title}"):
            st.session_state.input_text = text

# Main input
input_text = st.text_area(
    "**Input Text**",
    height=150,
    help="Paste any cybersecurity-related text to analyze",
    key='input_text'
)

col1, col2, col3 = st.columns([2, 1, 3])
with col1:
    analyze_button = st.button("🔍 Analyze Text", type="primary")
with col2:
    # The click triggers a rerun by itself after the callback has run, so no
    # explicit st.experimental_rerun() (deprecated) is needed.
    st.button("🗑️ Clear", on_click=_clear_input)

if analyze_button and ner_pipeline:
    if input_text.strip() and input_text != DEFAULT_INPUT_TEXT:
        with st.spinner("🤖 Processing text with CyNER 2.0..."):
            entities_dict, tagged_sentence, raw_entities = perform_ner(input_text, ner_pipeline)
        if entities_dict:
            st.success(f"✅ Analysis complete! Found {sum(len(v) for v in entities_dict.values())} entities")
            # Display results
            st.subheader("📊 Analysis Results")
            # Tagged visualization
            st.markdown("**🏷️ Tagged Entities:**")
            st.markdown(tagged_sentence, unsafe_allow_html=True)
            # Entity summary metrics: at most four columns, wrapping around
            # when there are more than four entity types.
            st.markdown("**📈 Entity Summary:**")
            cols = st.columns(min(len(entities_dict), 4))
            for i, (entity_type, entities_list) in enumerate(entities_dict.items()):
                with cols[i % 4]:
                    st.metric(
                        label=entity_type.replace('B-', '').replace('I-', ''),
                        value=len(entities_list)
                    )
            # Detailed breakdown
            with st.expander("📋 Detailed Entity Breakdown", expanded=True):
                for entity_type, entities_list in entities_dict.items():
                    st.markdown(f"**{entity_type}:**")
                    for entity in entities_list:
                        st.markdown(f"- `{entity['text']}` (confidence: {entity['confidence']})")
            # Raw data for developers
            with st.expander("🔧 Raw JSON Data", expanded=False):
                st.json(entities_dict)
        else:
            st.info("ℹ️ No cybersecurity entities detected in the provided text. Try using text with security-related terms like IP addresses, malware names, CVEs, etc.")
    else:
        st.warning("⚠️ Please enter some text for analysis.")

# Footer
st.markdown("---")
st.markdown("""
<div style='text-align: center; color: #666; font-size: 0.9em;'>
    <strong>CyNER 2.0</strong> - Cybersecurity Named Entity Recognition<br>
    Model: <code>PranavaKailash/CyNER-2.0-DeBERTa-v3-base</code> | Built with Streamlit
</div>
""", unsafe_allow_html=True)