Nomi78600 committed
Commit 981a77e · 0 Parent(s)
.gitattributes ADDED
@@ -0,0 +1,2 @@
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,69 @@
+ model/model.safetensors
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+ results/
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # IDEs
+ .idea/
+ .vscode/
NER_Using_BERT_updated.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,182 @@
+
+ import streamlit as st
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
+ import torch
+ import numpy as np
+
+ # --- CONFIGURATION ---
+ MODEL_DIR = "./model"
+ st.set_page_config(page_title="NER with BERT", page_icon="🤖", layout="wide")
+
+ # --- MODEL LOADING ---
+ @st.cache_resource
+ def load_model_and_tokenizer(model_path):
+     """Load the fine-tuned model and tokenizer from a local directory."""
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(model_path)
+         model = AutoModelForTokenClassification.from_pretrained(model_path)
+         return tokenizer, model
+     except Exception as e:
+         st.error(f"Error loading model: {e}")
+         return None, None
+
+ tokenizer, model = load_model_and_tokenizer(MODEL_DIR)
+ if model is None:
+     st.stop()
+
+ # --- NER VISUALIZATION ---
+ ENTITY_COLORS = {
+     "PER": "#ffc107",   # Yellow
+     "ORG": "#007bff",   # Blue
+     "LOC": "#28a745",   # Green
+     "MISC": "#dc3545",  # Red
+     "O": "#adb5bd"      # Gray for non-entities, though we won't highlight them
+ }
+ LABEL_NAMES = model.config.id2label
+
+ def get_entity_html(text, label):
+     """Generates HTML for a single entity with a colored background."""
+     entity_type = label.split('-')[-1]
+     color = ENTITY_COLORS.get(entity_type, "#adb5bd")
+     return f'<span style="background-color: {color}; color: white; padding: 0.2em 0.4em; margin: 0 0.2em; border-radius: 0.3em; font-weight: bold;">{text} <span style="font-size: 0.8em; opacity: 0.7;">{entity_type}</span></span>'
+
+ def visualize_ner(text, predictions):
+     """Combines tokens and predictions into a visualized HTML string."""
+     html_output = ""
+     current_word = ""
+     current_label = "O"
+
+     for token, label in zip(text.split(), predictions):
+         # If the label is a B-tag, start a new entity
+         if label.startswith("B-"):
+             # If there was a previous entity, add it to the output
+             if current_word:
+                 if current_label != "O":
+                     html_output += get_entity_html(current_word, current_label)
+                 else:
+                     html_output += current_word + " "
+             current_word = token + " "
+             current_label = label
+         # If it's an I-tag and matches the current entity type, continue it
+         elif label.startswith("I-") and current_label.split('-')[-1] == label.split('-')[-1]:
+             current_word += token + " "
+         # Otherwise, it's a new word or an O-tag
+         else:
+             # Add the completed entity or word to the output
+             if current_word:
+                 if current_label != "O":
+                     html_output += get_entity_html(current_word.strip(), current_label) + " "
+                 else:
+                     html_output += current_word
+
+             # Reset for the current token
+             current_word = token + " "
+             current_label = "O"  # Default to O if the label isn't B- or I-
+
+     # Add the last processed word/entity
+     if current_word:
+         if current_label != "O":
+             html_output += get_entity_html(current_word.strip(), current_label)
+         else:
+             html_output += current_word
+
+     return html_output.strip()
+
+
+ # --- STREAMLIT APP LAYOUT ---
+ st.title("Named Entity Recognition (NER) with BERT")
+ st.markdown("Enter text below to identify entities like Persons (PER), Organizations (ORG), Locations (LOC), and Miscellaneous (MISC).")
+
+ text_input = st.text_area("Input Text", height=150, placeholder="Example: Elon Musk, the CEO of SpaceX, announced a new mission to Mars from their headquarters in California.")
+
+ if st.button("Analyze Text"):
+     if not text_input:
+         st.warning("Please enter some text to analyze.")
+     else:
+         with st.spinner("Analyzing..."):
+             # 1. Tokenization
+             inputs = tokenizer(text_input, return_tensors="pt", truncation=True, padding=True, is_split_into_words=False)
+
+             # 2. Model Prediction
+             with torch.no_grad():
+                 outputs = model(**inputs)
+
+             predictions = np.argmax(outputs.logits.detach().numpy(), axis=2)
+
+             # 3. Post-processing
+             predicted_labels = []
+             tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
+
+             for token, pred_id in zip(tokens, predictions[0]):
+                 if token not in (tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token):
+                     predicted_labels.append(LABEL_NAMES[pred_id])
+
+             # This is a simplified alignment. For a more robust solution, we'd align subwords to words.
+             # For this app, we'll assume a simple space-based tokenization for visualization.
+             words = text_input.split()
+             # Heuristic: Assign the first label of a word to the whole word.
+             aligned_predictions = []
+             label_idx = 0
+             for word in words:
+                 word_tokens = tokenizer.tokenize(word)
+                 if label_idx < len(predicted_labels):
+                     aligned_predictions.append(predicted_labels[label_idx])
+                     label_idx += len(word_tokens)
+                 else:
+                     aligned_predictions.append("O")
+
+
+             # 4. Visualization
+             st.subheader("Analysis Results")
+
+             # A more robust visualization that handles subword tokenization better
+             final_tokens = []
+             final_labels = []
+             word_ids = inputs.word_ids()
+
+             previous_word_id = None
+             for i, token_id in enumerate(inputs['input_ids'][0]):
+                 if token_id not in (tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id):
+                     word_id = word_ids[i]
+                     if word_id is not None and word_id != previous_word_id:
+                         start, end = inputs.token_to_chars(i)
+                         word = text_input[start:end]
+                         final_tokens.append(word)
+                         final_labels.append(LABEL_NAMES[predictions[0][i]])
+                     previous_word_id = word_id
+
+             # Grouping entities
+             grouped_entities = []
+             current_entity = {"text": "", "label": ""}
+             for token, label in zip(final_tokens, final_labels):
+                 if label.startswith("B-"):
+                     if current_entity["text"]:
+                         grouped_entities.append(current_entity)
+                     current_entity = {"text": token, "label": label.split("-")[1]}
+                 elif label.startswith("I-") and current_entity["label"] == label.split("-")[1]:
+                     current_entity["text"] += " " + token
+                 else:
+                     if current_entity["text"]:
+                         grouped_entities.append(current_entity)
+                     current_entity = {"text": "", "label": ""}
+                     # We don't add O-tagged words to the list of entities
+
+             if current_entity["text"]:
+                 grouped_entities.append(current_entity)
+
+             # Displaying the text with highlighted entities
+             display_text = text_input
+             for entity in reversed(grouped_entities):  # Reverse to avoid index shifting
+                 entity_html = get_entity_html(entity["text"], entity["label"])
+                 # This is a simple text replace, might fail on overlapping entities
+                 display_text = display_text.replace(entity["text"], entity_html, 1)
+
+             st.markdown(display_text, unsafe_allow_html=True)
+
+             # Displaying entities as a list
+             st.subheader("Extracted Entities")
+             if grouped_entities:
+                 for entity in grouped_entities:
+                     st.markdown(f"- **{entity['text']}** (`{entity['label']}`)")
+             else:
+                 st.info("No entities were found in the text.")
model/config.json ADDED
@@ -0,0 +1,47 @@
+ {
+   "architectures": [
+     "BertForTokenClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2",
+     "3": "LABEL_3",
+     "4": "LABEL_4",
+     "5": "LABEL_5",
+     "6": "LABEL_6",
+     "7": "LABEL_7",
+     "8": "LABEL_8"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2,
+     "LABEL_3": 3,
+     "LABEL_4": 4,
+     "LABEL_5": 5,
+     "LABEL_6": 6,
+     "LABEL_7": 7,
+     "LABEL_8": 8
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.55.2",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 28996
+ }
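
Note that id2label and label2id still carry the generic LABEL_0 through LABEL_8 names, so app.py's ENTITY_COLORS lookup (which expects PER/ORG/LOC/MISC suffixes) falls through to gray and the B-/I- grouping logic never fires. The nine labels match the CoNLL-2003 tag set; if the model was trained with the usual CoNLL-2003 ordering (an assumption that should be verified against NER_Using_BERT_updated.ipynb), the mapping can be restored with a sketch like this:

# Hypothetical fix: replace the generic id2label/label2id with CoNLL-2003 BIO tags.
# The label ordering below is an assumption; confirm it against the training notebook.
from transformers import AutoModelForTokenClassification

CONLL_LABELS = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG",
                "B-LOC", "I-LOC", "B-MISC", "I-MISC"]

model = AutoModelForTokenClassification.from_pretrained("./model")
model.config.id2label = {i: label for i, label in enumerate(CONLL_LABELS)}
model.config.label2id = {label: i for i, label in enumerate(CONLL_LABELS)}
model.save_pretrained("./model")  # rewrites config.json (and the weights file) in place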
model/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "101": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "102": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "103": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
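
The do_lower_case: false setting, together with vocab_size: 28996 in model/config.json, indicates a cased BERT vocabulary (the size matches bert-base-cased). A quick sanity check, assuming the committed ./model directory:

# Sanity check: the tokenizer should preserve case rather than lower-casing input.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./model")
print(tokenizer.tokenize("Paris is not paris"))  # capitalization survives tokenization
print(len(tokenizer))  # expected to match config.json's vocab_size of 28996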
model/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe8b8d384ad9d15cac07dd62997c984d3bd4c9e0460f4ebc4604319babdefd45
+ size 5777
model/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ torch
+ transformers
+ streamlit
+ datasets
+ seqeval
+ numpy
+ scipy
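
The dependencies are unpinned, so installs will track the latest releases; datasets and seqeval are only needed by the training notebook, not by the app itself, which is launched with "streamlit run app.py". A short, hypothetical smoke test that the serving-side imports resolve:

# Hypothetical environment check: confirm the serving dependencies import cleanly.
import importlib

for name in ("torch", "transformers", "streamlit", "numpy"):
    module = importlib.import_module(name)
    print(name, getattr(module, "__version__", "unknown"))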