Spaces:

dejanseo
/

linkbert-gradio

Paused

App Files Files Community

dejanseo committed on Jan 18

Commit

d8dd02e

verified ·

1 Parent(s): a2bc870

Create app.py

Browse files

Files changed (1) hide show

app.py +232 -0

app.py ADDED Viewed

	@@ -0,0 +1,232 @@

+import gradio as gr
+import torch
+from transformers import AutoModelForTokenClassification, AutoTokenizer
+import numpy as np
+import logging
+from dataclasses import dataclass
+from typing import Dict, List, Tuple, Optional
+# ----------------------------------
+# Logging
+# ----------------------------------
+# Configure the root logger once, at import time, so INFO-level messages are emitted.
+logging.basicConfig(level=logging.INFO)
+# Module-level logger used by the rest of this file.
+logger = logging.getLogger(__name__)
+# ----------------------------------
+# Config
+# ----------------------------------
+@dataclass
+class AppConfig:
+    """Settings for the demo: which model to load and how to window long text."""
+    # Identifier passed to ``from_pretrained`` for both the model and the tokenizer.
+    model_name: str = "dejanseo/link-prediction"
+    # Length of every inference window in tokens (special tokens and padding included).
+    max_length: int = 512
+    # Subtracted from the window capacity to get the step between window starts,
+    # i.e. how many content tokens consecutive windows share.
+    doc_stride: int = 128
+    # Evaluated once, when the class body runs: "cuda" if torch reports CUDA, else "cpu".
+    device: str = "cuda" if torch.cuda.is_available() else "cpu"
+# ----------------------------------
+# Load model/tokenizer
+# ----------------------------------
+# The statements below run at import time and create the module-level
+# ``config``, ``model`` and ``tokenizer`` objects used by the functions in this file.
+config = AppConfig()
+logger.info(f"Loading model: {config.model_name} on {config.device}")
+# Model and tokenizer are loaded from the same identifier.
+model = AutoModelForTokenClassification.from_pretrained(config.model_name)
+tokenizer = AutoTokenizer.from_pretrained(config.model_name)
+# Move the model to the configured device and put it in evaluation mode for inference.
+model.to(config.device)
+model.eval()
+logger.info("Model loaded successfully.")
+# ----------------------------------
+# Inference helpers
+# ----------------------------------
+def windowize_inference(
+    plain_text: str, tokenizer: AutoTokenizer, max_length: int, doc_stride: int
+) -> List[Dict]:
+    """Slice long text into overlapping windows for inference."""
+    specials = tokenizer.num_special_tokens_to_add(pair=False)
+    cap = max_length - specials
+    full_encoding = tokenizer(
+        plain_text, add_special_tokens=False, return_offsets_mapping=True, truncation=False
+    )
+    temp_tokenization = tokenizer(plain_text, truncation=False)
+    full_word_ids = temp_tokenization.word_ids(batch_index=0)
+    windows_data = []
+    step = max(cap - doc_stride, 1)
+    start_token_idx = 0
+    total_tokens = len(full_encoding["input_ids"])
+    if total_tokens == 0 and len(plain_text) > 0:
+        logger.warning("Tokenizer produced 0 tokens for a non-empty string.")
+        return []
+    while start_token_idx < total_tokens:
+        end_token_idx = min(start_token_idx + cap, total_tokens)
+        ids_slice = full_encoding["input_ids"][start_token_idx:end_token_idx]
+        offsets_slice = full_encoding["offset_mapping"][start_token_idx:end_token_idx]
+        word_ids_slice = []
+        current_token = 0
+        for i, wid in enumerate(full_word_ids):
+            if temp_tokenization.token_to_chars(i) is not None:
+                if current_token >= start_token_idx and current_token < end_token_idx:
+                    word_ids_slice.append(wid)
+                current_token += 1
+        input_ids = tokenizer.build_inputs_with_special_tokens(ids_slice)
+        attention_mask = [1] * len(input_ids)
+        padding_length = max_length - len(input_ids)
+        input_ids.extend([tokenizer.pad_token_id] * padding_length)
+        attention_mask.extend([0] * padding_length)
+        window_offset_mapping = [(0, 0)] + offsets_slice + [(0, 0)]
+        window_offset_mapping += [(0, 0)] * padding_length
+        window_word_ids = [None] + word_ids_slice + [None]
+        window_word_ids += [None] * padding_length
+        windows_data.append({
+            "input_ids": torch.tensor(input_ids, dtype=torch.long),
+            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
+            "word_ids": window_word_ids[:max_length],
+            "offset_mapping": window_offset_mapping[:max_length],
+        })
+        if end_token_idx >= total_tokens:
+            break
+        start_token_idx += step
+    return windows_data
+def classify_text(text: str, threshold_percent: float) -> Tuple[str, Optional[str]]:
+    """Classify link tokens with windowing. Returns (html, warning)."""
+    if not text.strip():
+        return "", "Input text is empty."
+    windows = windowize_inference(text, tokenizer, config.max_length, config.doc_stride)
+    if not windows:
+        return "", "Could not generate any windows for processing."
+    char_link_probabilities = np.zeros(len(text), dtype=np.float32)
+    with torch.no_grad():
+        for window in windows:
+            inputs = {
+                'input_ids': window['input_ids'].unsqueeze(0).to(config.device),
+                'attention_mask': window['attention_mask'].unsqueeze(0).to(config.device)
+            }
+            outputs = model(**inputs)
+            probabilities = torch.softmax(outputs.logits, dim=-1).squeeze(0)
+            link_probs = probabilities[:, 1].cpu().numpy()
+            for i, offset in enumerate(window['offset_mapping']):
+                if isinstance(offset, (list, tuple)) and len(offset) == 2:
+                    start, end = offset
+                    if window['word_ids'][i] is not None and start < end:
+                        char_link_probabilities[start:end] = np.maximum(
+                            char_link_probabilities[start:end], link_probs[i]
+                        )
+    final_threshold = threshold_percent / 100.0
+    full_encoding = tokenizer(text, return_offsets_mapping=True, truncation=False)
+    word_ids = full_encoding.word_ids(batch_index=0)
+    offsets = full_encoding['offset_mapping']
+    word_max_prob_map: Dict[int, float] = {}
+    word_char_spans: Dict[int, List[int]] = {}
+    for i, word_id in enumerate(word_ids):
+        if word_id is not None and i < len(offsets):
+            start_char, end_char = offsets[i]
+            if start_char < end_char:
+                current_token_max_prob = np.max(char_link_probabilities[start_char:end_char]) if start_char < len(char_link_probabilities) else 0.0
+                if word_id not in word_max_prob_map:
+                    word_max_prob_map[word_id] = current_token_max_prob
+                    word_char_spans[word_id] = [start_char, end_char]
+                else:
+                    word_max_prob_map[word_id] = max(word_max_prob_map[word_id], current_token_max_prob)
+                    word_char_spans[word_id][1] = end_char
+    highlight_candidates: Dict[int, float] = {}
+    for word_id, max_prob in word_max_prob_map.items():
+        if max_prob >= final_threshold:
+            highlight_candidates[word_id] = max_prob
+    max_highlight_prob = max(highlight_candidates.values()) if highlight_candidates else 0.0
+    html_parts, current_char = [], 0
+    sorted_word_ids = sorted(word_char_spans.keys(), key=lambda k: word_char_spans[k][0])
+    for word_id in sorted_word_ids:
+        start_char, end_char = word_char_spans[word_id]
+        if start_char > current_char:
+            html_parts.append(text[current_char:start_char])
+        word_text = text[start_char:end_char]
+        if word_id in highlight_candidates:
+            word_prob = highlight_candidates[word_id]
+            normalized_opacity = (word_prob / max_highlight_prob) * 0.9 + 0.1 if max_highlight_prob > 0 else 1.0
+            html_parts.append(
+                f"<span style='background-color: #D4EDDA; color: #155724; "
+                f"padding: 0.1em 0.2em; border-radius: 0.2em; opacity: {normalized_opacity:.2f};' "
+                f"title='Link Probability: {word_prob:.1%}'>{word_text}</span>"
+            )
+        else:
+            html_parts.append(word_text)
+        current_char = end_char
+    if current_char < len(text):
+        html_parts.append(text[current_char:])
+    return "".join(html_parts), None
+# ----------------------------------
+# Gradio Interface
+# ----------------------------------
+def predict(text: str, threshold: float) -> str:
+    """Main prediction function for Gradio."""
+    html, warning = classify_text(text, threshold)
+    if warning:
+        return f"<p style='color: orange;'>{warning}</p>"
+    return html
+# Build the interface
+# Two-column layout: inputs (text box, threshold slider, button) on the left,
+# rendered HTML result on the right.
+with gr.Blocks(title="LinkBERT by DEJAN AI") as demo:
+    gr.Markdown("# LinkBERT")
+    gr.Markdown("Predict natural link placement in plain text.")
+    with gr.Row():
+        with gr.Column():
+            # Pre-filled with a sample sentence so the demo can be run immediately.
+            text_input = gr.Textbox(
+                label="Input Text",
+                placeholder="Paste your text here...",
+                lines=8,
+                value="DEJAN AI is the world's leading AI SEO agency. This tool showcases the capability of our latest link prediction model called LinkBERT."
+            )
+            # Threshold is expressed in percent (0-100); predict() receives it as-is.
+            threshold_slider = gr.Slider(
+                minimum=0,
+                maximum=100,
+                value=70,
+                step=1,
+                label="Link Probability Threshold (%)"
+            )
+            submit_btn = gr.Button("Classify Text", variant="primary")
+        with gr.Column():
+            output_html = gr.HTML(label="Results")
+    # Clicking the button calls predict(text, threshold) and shows its return value.
+    submit_btn.click(
+        fn=predict,
+        inputs=[text_input, threshold_slider],
+        outputs=output_html,
+        api_name="predict"  # Exposes as /api/predict
+    )
+# Launch
+# Start the server only when the file is executed as a script, not when imported.
+if __name__ == "__main__":
+    # Listen on all network interfaces, port 7860.
+    demo.launch(server_name="0.0.0.0", server_port=7860)