Upload 2 files
Browse files- inference.py +59 -0
- linkbert.pth +3 -0
inference.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import torch
|
| 3 |
+
from transformers import BertForTokenClassification, BertTokenizerFast # Import BertTokenizerFast
|
| 4 |
+
|
| 5 |
+
def load_model(model_name='linkbert.pth'):
|
| 6 |
+
model_path = model_name
|
| 7 |
+
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=2)
|
| 8 |
+
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
|
| 9 |
+
model.eval() # Set the model to inference mode
|
| 10 |
+
return model
|
| 11 |
+
|
| 12 |
+
def predict_and_annotate(model, tokenizer, text):
|
| 13 |
+
# Tokenize the input text with special tokens
|
| 14 |
+
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, return_offsets_mapping=True)
|
| 15 |
+
input_ids, attention_mask, offset_mapping = inputs["input_ids"], inputs["attention_mask"], inputs["offset_mapping"]
|
| 16 |
+
|
| 17 |
+
with torch.no_grad():
|
| 18 |
+
outputs = model(input_ids=input_ids, attention_mask=attention_mask)
|
| 19 |
+
predictions = torch.argmax(outputs.logits, dim=-1)
|
| 20 |
+
|
| 21 |
+
tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze().tolist())
|
| 22 |
+
predictions = predictions.squeeze().tolist()
|
| 23 |
+
offset_mapping = offset_mapping.squeeze().tolist()
|
| 24 |
+
|
| 25 |
+
annotated_text = ""
|
| 26 |
+
previous_end = 0
|
| 27 |
+
for offset, prediction in zip(offset_mapping, predictions):
|
| 28 |
+
start, end = offset
|
| 29 |
+
if start == end: # Skip special tokens
|
| 30 |
+
continue
|
| 31 |
+
if prediction == 1: # Anchor text
|
| 32 |
+
if start > previous_end:
|
| 33 |
+
annotated_text += text[previous_end:start]
|
| 34 |
+
annotated_text += f"<u>{text[start:end]}</u>"
|
| 35 |
+
else:
|
| 36 |
+
if start > previous_end:
|
| 37 |
+
annotated_text += text[previous_end:start]
|
| 38 |
+
annotated_text += text[start:end]
|
| 39 |
+
previous_end = end
|
| 40 |
+
annotated_text += text[previous_end:] # Append remaining text
|
| 41 |
+
|
| 42 |
+
return annotated_text
|
| 43 |
+
|
| 44 |
+
# Streamlit app setup
|
| 45 |
+
st.title("BERT Token Classification for Anchor Text Prediction")
|
| 46 |
+
|
| 47 |
+
# Load the model and tokenizer
|
| 48 |
+
model = load_model('linkbert.pth')
|
| 49 |
+
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased') # Use BertTokenizerFast
|
| 50 |
+
|
| 51 |
+
# User input text area
|
| 52 |
+
user_input = st.text_area("Paste the text you want to analyze:", "Type or paste text here.")
|
| 53 |
+
|
| 54 |
+
if st.button("Predict Anchor Texts"):
|
| 55 |
+
if user_input:
|
| 56 |
+
annotated_text = predict_and_annotate(model, tokenizer, user_input)
|
| 57 |
+
st.markdown(annotated_text, unsafe_allow_html=True)
|
| 58 |
+
else:
|
| 59 |
+
st.write("Please paste some text into the text area.")
|
linkbert.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:81dc286402b449bf1e0348dbd7f8bb0b64a284f452bd4e0b2bb41ddbac492a24
|
| 3 |
+
size 435654416
|