Spaces:

dejanseo
/

linkbert

Running

App Files Files Community

dejanseo committed on Aug 11, 2025

Commit

df3962f

verified ·

1 Parent(s): 88da8fc

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +69 -32

src/streamlit_app.py CHANGED Viewed

@@ -8,23 +8,28 @@ import trafilatura
 # Streamlit config
 st.set_page_config(layout="wide", page_title="LinkBERT")
-# Load tokenizer & model (avoid meta-tensor .to() issue)
 MODEL_ID = "dejanseo/LinkBERT-XL"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
-load_kwargs = {}
-if torch.cuda.is_available():
-    # Load directly onto GPU(s); do NOT call .to(...) afterward
-    load_kwargs.update(dict(device_map="auto", torch_dtype=torch.float16))
 else:
-    # CPU load without meta tensors
-    load_kwargs.update(dict(device_map=None))
-model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, **load_kwargs)
 model.eval()
-# Functions
 def tokenize_with_indices(text: str):
     encoded = tokenizer.encode_plus(
         text,
@@ -66,6 +71,7 @@ def process_text(inputs: str, confidence_threshold: float):
     with torch.no_grad():
         for chunk in chunk_texts:
             input_ids, token_offsets = tokenize_with_indices(chunk)
             input_ids_tensor = torch.tensor(input_ids).unsqueeze(0).to(model.device)
             outputs = model(input_ids_tensor)
@@ -73,53 +79,77 @@ def process_text(inputs: str, confidence_threshold: float):
             predictions = torch.argmax(logits, dim=-1).squeeze(0).tolist()
             softmax_scores = F.softmax(logits, dim=-1).squeeze(0).tolist()
             word_info = {}
             for idx, (start, end) in enumerate(token_offsets):
                 if idx == 0 or idx == len(token_offsets) - 1:
                     continue  # skip specials
                 word_start = start
-                while word_start > 0 and chunk[word_start - 1] != ' ':
                     word_start -= 1
                 if word_start not in word_info:
                     word_info[word_start] = {"prediction": 0, "confidence": 0.0, "subtokens": []}
                 conf_pct = softmax_scores[idx][predictions[idx]] * 100.0
                 if predictions[idx] == 1 and conf_pct >= confidence_threshold:
-                    word_info[word_start]["prediction"] = 1
                 word_info[word_start]["confidence"] = max(word_info[word_start]["confidence"], conf_pct)
                 word_info[word_start]["subtokens"].append((start, end, chunk[start:end]))
             last_end = 0
             for word_start in sorted(word_info.keys()):
                 word_data = word_info[word_start]
-                for subtoken_start, subtoken_end, subtoken_text in word_data["subtokens"]:
                     escaped = subtoken_text.replace("$", "\\$")
                     if last_end < subtoken_start:
                         reconstructed_text += chunk[last_end:subtoken_start]
                     if word_data["prediction"] == 1:
                         reconstructed_text += (
-                            f"<span style='background-color: rgba(0, 255, 0); display: inline;'>{escaped}</span>"
                         )
                     else:
-                        reconstructed_text += escaped
                     last_end = subtoken_end
                     df_data["Word"].append(escaped)
-                    df_data["Prediction"].append(word_data["prediction"])
-                    df_data["Confidence"].append(word_info[word_start]["confidence"])
                     df_data["Start"].append(subtoken_start + original_position_offset)
                     df_data["End"].append(subtoken_end + original_position_offset)
-                original_position_offset += len(chunk) + 1
-            reconstructed_text += chunk[last_end:].replace("$", "\\$")
     df_tokens = pd.DataFrame(df_data)
     return reconstructed_text, df_tokens
-# UI
 st.title("LinkBERT")
 st.markdown("""
 LinkBERT predicts natural link placement within web content. Enter text or a URL for extraction. Increase the threshold to reduce link predictions.
@@ -130,22 +160,29 @@ confidence_threshold = st.slider("Confidence Threshold", 50, 100, 50)
 tab1, tab2 = st.tabs(["Text Input", "URL Input"])
 with tab1:
-    user_input = st.text_area("Enter text to process:")
     if st.button("Process Text"):
-        highlighted_text, df_tokens = process_text(user_input, confidence_threshold)
-        st.markdown(highlighted_text, unsafe_allow_html=True)
-        st.dataframe(df_tokens)
 with tab2:
-    url_input = st.text_input("Enter URL to process:")
     if st.button("Fetch and Process"):
-        content = fetch_and_extract_content(url_input)
-        if content:
-            highlighted_text, df_tokens = process_text(content, confidence_threshold)
-            st.markdown(highlighted_text, unsafe_allow_html=True)
-            st.dataframe(df_tokens)
         else:
-            st.error("Could not fetch content from the URL. Please check the URL and try again.")
 st.divider()
 st.markdown("""
@@ -165,4 +202,4 @@ LinkBERT was fine-tuned on a dataset of organic web content and editorial links.
 Interested in using this in an automated pipeline for bulk link prediction?
 Please [book an appointment](https://dejanmarketing.com/conference/).
-""")

 # Streamlit config
 st.set_page_config(layout="wide", page_title="LinkBERT")
+# Load tokenizer & model
 MODEL_ID = "dejanseo/LinkBERT-XL"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
+# Determine the device
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load the model on CPU first, then move it to the determined device (GPU if available).
+# device_map="auto" is avoided because it can cause meta tensor issues with certain torch/transformers versions.
+model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
+# Explicitly move model to the determined device and dtype
+if device == "cuda":
+    model.half().to(device) # Use .half() for float16 on GPU
 else:
+    model.to(device) # For CPU, typically stick to float32 unless model was specifically trained with bfloat16 for CPU
 model.eval()
+# Functions
 def tokenize_with_indices(text: str):
     encoded = tokenizer.encode_plus(
         text,
     with torch.no_grad():
         for chunk in chunk_texts:
             input_ids, token_offsets = tokenize_with_indices(chunk)
+            # Ensure input_ids_tensor is on the same device as the model
             input_ids_tensor = torch.tensor(input_ids).unsqueeze(0).to(model.device)
             outputs = model(input_ids_tensor)
             predictions = torch.argmax(logits, dim=-1).squeeze(0).tolist()
             softmax_scores = F.softmax(logits, dim=-1).squeeze(0).tolist()
+            # The rest of your processing logic
             word_info = {}
             for idx, (start, end) in enumerate(token_offsets):
                 if idx == 0 or idx == len(token_offsets) - 1:
                     continue  # skip specials
                 word_start = start
+                # Find the actual start of the word corresponding to this token
+                # This logic assumes space-separated words for the most part
+                while word_start > 0 and chunk[word_start - 1] not in [' ', '\n', '\t']:
                     word_start -= 1
+                # If a word_start maps to multiple tokens (e.g., "don't" -> ["don", "'", "t"])
+                # ensure we pick the earliest start for that conceptual word
+                while word_start > 0 and (chunk[word_start-1:word_start] == ' ' or tokenizer.decode(tokenizer.encode(chunk[word_start-1:end], add_special_tokens=False))[0] == chunk[word_start-1]):
+                     word_start -= 1
+                # Use a tuple (word_start, actual_word_text_from_chunk) as key for more robust aggregation
+                # For simplicity here, we stick to word_start
                 if word_start not in word_info:
+                    # Initialize with default for "not link"
                     word_info[word_start] = {"prediction": 0, "confidence": 0.0, "subtokens": []}
                 conf_pct = softmax_scores[idx][predictions[idx]] * 100.0
+                # Only mark as 1 if the current token's prediction is 1 AND confidence meets threshold
                 if predictions[idx] == 1 and conf_pct >= confidence_threshold:
+                    word_info[word_start]["prediction"] = 1 # Mark the whole 'word' as a link
+                # Keep the max confidence for any token within the 'word'
                 word_info[word_start]["confidence"] = max(word_info[word_start]["confidence"], conf_pct)
                 word_info[word_start]["subtokens"].append((start, end, chunk[start:end]))
             last_end = 0
+            # Sort by word_start to maintain order
             for word_start in sorted(word_info.keys()):
                 word_data = word_info[word_start]
+                # Sort subtokens to ensure they are processed in order within a word
+                for subtoken_start, subtoken_end, subtoken_text in sorted(word_data["subtokens"], key=lambda x: x[0]):
                     escaped = subtoken_text.replace("$", "\\$")
+                    # Add any text between the last processed token and the current one
                     if last_end < subtoken_start:
                         reconstructed_text += chunk[last_end:subtoken_start]
                     if word_data["prediction"] == 1:
+                        # Apply highlight to the subtoken
                         reconstructed_text += (
+                            f"<span style='background-color: rgba(0, 255, 0, 0.5); display: inline;'>{escaped}</span>" # Added alpha for better readability
                         )
                     else:
+                        reconstructed_text += escaped # No highlight
                     last_end = subtoken_end
+                    # For DataFrame, append the info for each *subtoken*
                     df_data["Word"].append(escaped)
+                    df_data["Prediction"].append(word_data["prediction"]) # Prediction applies to the whole conceptual word
+                    df_data["Confidence"].append(word_data["confidence"]) # Confidence applies to the whole conceptual word
                     df_data["Start"].append(subtoken_start + original_position_offset)
                     df_data["End"].append(subtoken_end + original_position_offset)
+            # Add any remaining text from the current chunk after the last token
+            if last_end < len(chunk):
+                reconstructed_text += chunk[last_end:].replace("$", "\\$")
+            # Update offset for the next chunk. Add 1 for the space that was implicitly there.
+            original_position_offset += len(chunk) + 1
     df_tokens = pd.DataFrame(df_data)
     return reconstructed_text, df_tokens
+# UI
 st.title("LinkBERT")
 st.markdown("""
 LinkBERT predicts natural link placement within web content. Enter text or a URL for extraction. Increase the threshold to reduce link predictions.
 tab1, tab2 = st.tabs(["Text Input", "URL Input"])
 with tab1:
+    user_input = st.text_area("Enter text to process:", height=200) # Added height for better UX
     if st.button("Process Text"):
+        if user_input: # Ensure input is not empty
+            highlighted_text, df_tokens = process_text(user_input, confidence_threshold)
+            st.markdown(highlighted_text, unsafe_allow_html=True)
+            st.dataframe(df_tokens)
+        else:
+            st.warning("Please enter some text to process.")
 with tab2:
+    url_input = st.text_input("Enter URL to process:", value="https://dejan.ai/blog/gpt-5-made-seo-irreplaceable/") # Pre-fill with example
     if st.button("Fetch and Process"):
+        if url_input: # Ensure URL input is not empty
+            with st.spinner("Fetching and processing content..."):
+                content = fetch_and_extract_content(url_input)
+                if content:
+                    highlighted_text, df_tokens = process_text(content, confidence_threshold)
+                    st.markdown(highlighted_text, unsafe_allow_html=True)
+                    st.dataframe(df_tokens)
+                else:
+                    st.error("Could not fetch content from the URL. Please check the URL and try again.")
         else:
+            st.warning("Please enter a URL to process.")
 st.divider()
 st.markdown("""
 Interested in using this in an automated pipeline for bulk link prediction?
 Please [book an appointment](https://dejanmarketing.com/conference/).
+""")