Nomi78600 committed
Commit 981a77e · 0 Parent(s)
.gitattributes ADDED
@@ -0,0 +1,2 @@
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,69 @@
+ model/model.safetensors
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+ results/
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # IDEs
+ .idea/
+ .vscode/
NER_Using_BERT_updated.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,182 @@
+
+ import streamlit as st
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
+ import torch
+ import numpy as np
+
+ # --- CONFIGURATION ---
+ MODEL_DIR = "./model"
+ st.set_page_config(page_title="NER with BERT", page_icon="🤖", layout="wide")
+
+ # --- MODEL LOADING ---
+ @st.cache_resource
+ def load_model_and_tokenizer(model_path):
+     """Load the fine-tuned model and tokenizer from a local directory."""
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(model_path)
+         model = AutoModelForTokenClassification.from_pretrained(model_path)
+         return tokenizer, model
+     except Exception as e:
+         st.error(f"Error loading model: {e}")
+         return None, None
+
+ tokenizer, model = load_model_and_tokenizer(MODEL_DIR)
+ if model is None:
+     st.stop()
+
+ # --- NER VISUALIZATION ---
+ ENTITY_COLORS = {
+     "PER": "#ffc107",   # Yellow
+     "ORG": "#007bff",   # Blue
+     "LOC": "#28a745",   # Green
+     "MISC": "#dc3545",  # Red
+     "O": "#adb5bd"      # Gray for non-entities, though we won't highlight them
+ }
+ LABEL_NAMES = model.config.id2label
+
+ def get_entity_html(text, label):
+     """Generates HTML for a single entity with a colored background."""
+     entity_type = label.split('-')[-1]
+     color = ENTITY_COLORS.get(entity_type, "#adb5bd")
+     return f'<span style="background-color: {color}; color: white; padding: 0.2em 0.4em; margin: 0 0.2em; border-radius: 0.3em; font-weight: bold;">{text} <span style="font-size: 0.8em; opacity: 0.7;">{entity_type}</span></span>'
+
+ def visualize_ner(text, predictions):
+     """Combines tokens and predictions into a visualized HTML string."""
+     html_output = ""
+     current_word = ""
+     current_label = "O"
+
+     for token, label in zip(text.split(), predictions):
+         # If the label is a B-tag, start a new entity
+         if label.startswith("B-"):
+             # If there was a previous entity, add it to the output
+             if current_word:
+                 if current_label != "O":
+                     html_output += get_entity_html(current_word, current_label)
+                 else:
+                     html_output += current_word + " "
+             current_word = token + " "
+             current_label = label
+         # If it's an I-tag and matches the current entity type, continue it
+         elif label.startswith("I-") and current_label.split('-')[-1] == label.split('-')[-1]:
+             current_word += token + " "
+         # Otherwise, it's a new word or an O-tag
+         else:
+             # Add the completed entity or word to the output
+             if current_word:
+                 if current_label != "O":
+                     html_output += get_entity_html(current_word.strip(), current_label) + " "
+                 else:
+                     html_output += current_word
+
+             # Reset for the current token
+             current_word = token + " "
+             current_label = "O"  # Default to O if the label isn't B- or I-
+
+     # Add the last processed word/entity
+     if current_word:
+         if current_label != "O":
+             html_output += get_entity_html(current_word.strip(), current_label)
+         else:
+             html_output += current_word
+
+     return html_output.strip()
+
+
+ # --- STREAMLIT APP LAYOUT ---
+ st.title("Named Entity Recognition (NER) with BERT")
+ st.markdown("Enter text below to identify entities like Persons (PER), Organizations (ORG), Locations (LOC), and Miscellaneous (MISC).")
+
+ text_input = st.text_area("Input Text", height=150, placeholder="Example: Elon Musk, the CEO of SpaceX, announced a new mission to Mars from their headquarters in California.")
+
+ if st.button("Analyze Text"):
+     if not text_input:
+         st.warning("Please enter some text to analyze.")
+     else:
+         with st.spinner("Analyzing..."):
+             # 1. Tokenization
+             inputs = tokenizer(text_input, return_tensors="pt", truncation=True, padding=True, is_split_into_words=False)
+
+             # 2. Model Prediction
+             with torch.no_grad():
+                 outputs = model(**inputs)
+
+             predictions = np.argmax(outputs.logits.detach().numpy(), axis=2)
+
+             # 3. Post-processing
+             predicted_labels = []
+             tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
+
+             for token, pred_id in zip(tokens, predictions[0]):
+                 if token not in (tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token):
+                     predicted_labels.append(LABEL_NAMES[pred_id])
+
+             # This is a simplified alignment. For a more robust solution, we'd align subwords to words.
+             # For this app, we'll assume a simple space-based tokenization for visualization.
+             words = text_input.split()
+             # Heuristic: Assign the first label of a word to the whole word.
+             aligned_predictions = []
+             label_idx = 0
+             for word in words:
+                 word_tokens = tokenizer.tokenize(word)
+                 if label_idx < len(predicted_labels):
+                     aligned_predictions.append(predicted_labels[label_idx])
+                     label_idx += len(word_tokens)
+                 else:
+                     aligned_predictions.append("O")
+
+
+             # 4. Visualization
+             st.subheader("Analysis Results")
+
+             # A more robust visualization that handles subword tokenization better
+             final_tokens = []
+             final_labels = []
+             word_ids = inputs.word_ids()
+
+             previous_word_id = None
+             for i, token_id in enumerate(inputs['input_ids'][0]):
+                 if token_id not in (tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id):
+                     word_id = word_ids[i]
+                     if word_id is not None and word_id != previous_word_id:
+                         start, end = inputs.token_to_chars(i)
+                         word = text_input[start:end]
+                         final_tokens.append(word)
+                         final_labels.append(LABEL_NAMES[predictions[0][i]])
+                     previous_word_id = word_id
+
+             # Grouping entities
+             grouped_entities = []
+             current_entity = {"text": "", "label": ""}
+             for token, label in zip(final_tokens, final_labels):
+                 if label.startswith("B-"):
+                     if current_entity["text"]:
+                         grouped_entities.append(current_entity)
+                     current_entity = {"text": token, "label": label.split("-")[1]}
+                 elif label.startswith("I-") and current_entity["label"] == label.split("-")[1]:
+                     current_entity["text"] += " " + token
+                 else:
+                     if current_entity["text"]:
+                         grouped_entities.append(current_entity)
+                     current_entity = {"text": "", "label": ""}
+                     # We don't add O-tagged words to the list of entities
+
+             if current_entity["text"]:
+                 grouped_entities.append(current_entity)
+
+             # Displaying the text with highlighted entities
+             display_text = text_input
+             for entity in reversed(grouped_entities):  # Reverse to avoid index shifting
+                 entity_html = get_entity_html(entity["text"], entity["label"])
+                 # This is a simple text replace, might fail on overlapping entities
+                 display_text = display_text.replace(entity["text"], entity_html, 1)
+
+             st.markdown(display_text, unsafe_allow_html=True)
+
+             # Displaying entities as a list
+             st.subheader("Extracted Entities")
+             if grouped_entities:
+                 for entity in grouped_entities:
+                     st.markdown(f"- **{entity['text']}** (`{entity['label']}`)")
+             else:
+                 st.info("No entities were found in the text.")
model/config.json ADDED
@@ -0,0 +1,47 @@
+ {
+   "architectures": [
+     "BertForTokenClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2",
+     "3": "LABEL_3",
+     "4": "LABEL_4",
+     "5": "LABEL_5",
+     "6": "LABEL_6",
+     "7": "LABEL_7",
+     "8": "LABEL_8"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2,
+     "LABEL_3": 3,
+     "LABEL_4": 4,
+     "LABEL_5": 5,
+     "LABEL_6": 6,
+     "LABEL_7": 7,
+     "LABEL_8": 8
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.55.2",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 28996
+ }
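
Note that id2label and label2id still carry the generic LABEL_0 through LABEL_8 names, so app.py's ENTITY_COLORS lookup (which expects PER/ORG/LOC/MISC suffixes) falls through to gray and the B-/I- grouping logic never fires. The nine labels match the CoNLL-2003 tag set; if the model was trained with the usual CoNLL-2003 ordering (an assumption that should be verified against NER_Using_BERT_updated.ipynb), the mapping can be restored with a sketch like this:

# Hypothetical fix: replace the generic id2label/label2id with CoNLL-2003 BIO tags.
# The label ordering below is an assumption; confirm it against the training notebook.
from transformers import AutoModelForTokenClassification

CONLL_LABELS = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG",
                "B-LOC", "I-LOC", "B-MISC", "I-MISC"]

model = AutoModelForTokenClassification.from_pretrained("./model")
model.config.id2label = {i: label for i, label in enumerate(CONLL_LABELS)}
model.config.label2id = {label: i for i, label in enumerate(CONLL_LABELS)}
model.save_pretrained("./model")  # rewrites config.json (and the weights file) in place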
model/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "101": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "102": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "103": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
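
The do_lower_case: false setting, together with vocab_size: 28996 in model/config.json, indicates a cased BERT vocabulary (the size matches bert-base-cased). A quick sanity check, assuming the committed ./model directory:

# Sanity check: the tokenizer should preserve case rather than lower-casing input.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./model")
print(tokenizer.tokenize("Paris is not paris"))  # capitalization survives tokenization
print(len(tokenizer))  # expected to match config.json's vocab_size of 28996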
model/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe8b8d384ad9d15cac07dd62997c984d3bd4c9e0460f4ebc4604319babdefd45
+ size 5777
model/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ torch
+ transformers
+ streamlit
+ datasets
+ seqeval
+ numpy
+ scipy
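
The dependencies are unpinned, so installs will track the latest releases; datasets and seqeval are only needed by the training notebook, not by the app itself, which is launched with "streamlit run app.py". A short, hypothetical smoke test that the serving-side imports resolve:

# Hypothetical environment check: confirm the serving dependencies import cleanly.
import importlib

for name in ("torch", "transformers", "streamlit", "numpy"):
    module = importlib.import_module(name)
    print(name, getattr(module, "__version__", "unknown"))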