Spaces:

z0u
/

sparky

Sleeping

App Files Files Community

z0u committed on Jan 30, 2025

Commit

cae066d

unverified ·

1 Parent(s): b479caf

added input validation

Browse files

Files changed (5) hide show

README.md +0 -3
app.py +93 -58
requirements.txt +1 -0
src/sparky/inference/calc_metrics.py +1 -2
src/sparky/inference/model.py +17 -0

README.md CHANGED Viewed

@@ -44,10 +44,7 @@ To run locally:
 ```bash
 uv venv
-# source .venv/bin/activate
 uv pip install -r requirements-dev.txt
-uv pip compile requirements-dev.txt -o uv.lock
 uv run app.py
 ```

 ```bash
 uv venv
 uv pip install -r requirements-dev.txt
 uv run app.py
 ```

app.py CHANGED Viewed

@@ -1,63 +1,113 @@
 import sys
 from pathlib import Path
 sys.path.append(str(Path(__file__).parent / "src"))
 import gradio as gr
 from sparky.inference import load_model, calc_token_metrics, visualize_batch
-# Load model on startup (cached)
 model, tokenizer = load_model("gpt2")
 def analyze_text(
     text: str,
-    line_width: int = 80,
-    surprisal: bool = False,
-    entropy: bool = False,
-    s2: bool = True,
 ) -> str:
-    # Default to S2 if nothing selected
-    if not any([s2, entropy, surprisal]):
-        s2 = True
-    # Build list of metrics to show
-    metrics_to_show = []
     if surprisal:
-        metrics_to_show.append("surprisal")
     if entropy:
-        metrics_to_show.append("entropy")
     if s2:
-        metrics_to_show.append("s2")
-    # Calculate metrics and generate visualization
-    metrics = calc_token_metrics([text], model, tokenizer)
-    svgs = visualize_batch(
-        metrics, metrics_to_show=metrics_to_show, line_width=line_width
     )
-    return svgs[0]  # Return first (only) SVG
-# Create Gradio interface
 demo = gr.Interface(
     fn=analyze_text,
-    inputs=[
-        gr.Textbox(
-            label="Text to analyze",
-            placeholder="Enter some text to analyze its information content...",
-            lines=3,
-            value="The quick brown fox jumps over the lazy dog.",
-        ),
-        gr.Slider(
-            label="Line width (characters)",
-            minimum=30,
-            maximum=200,
-            step=10,
-            value=30,
-        ),
-        gr.Checkbox(label="Show surprisal", value=True),
-        gr.Checkbox(label="Show entropy", value=True),
-        gr.Checkbox(label="Show S₂ (surprise-surprise)"),
-    ],
     outputs=gr.HTML(),
     title="Token Information Content Visualization",
     description="""
@@ -65,27 +115,12 @@ demo = gr.Interface(
     - **Surprisal**: Actual information content (-log probability) of each token
     - **Entropy**: Expected information content (uncertainty) at each position
-    - **S₂** (surprise-surprise): How much more/less surprising a token is than expected
     """,
-    examples=[
-        ["The quick brown fox jumps over the lazy dog.", 30, True, True, False],
-        [
-            "In a shocking turn of events, the seemingly impossible task",
-            30,
-            False,
-            False,
-            True,
-        ],
-        [
-            "In a shocking turn of table, the seemingly impossible task",
-            30,
-            False,
-            False,
-            True,
-        ],
-        ["A long time ago, in a galaxy far, far away...", 50, True, False, True],
-    ],
 )
 if __name__ == "__main__":
-    demo.launch(show_error=True)

 import sys
 from pathlib import Path
+from functools import lru_cache, wraps
 sys.path.append(str(Path(__file__).parent / "src"))
+from pydantic import validate_call, Field, ValidationError
 import gradio as gr
 from sparky.inference import load_model, calc_token_metrics, visualize_batch
+from sparky.inference.visualize import MetricType
 model, tokenizer = load_model("gpt2")
+def catch_all(fn):
+    @wraps(fn)
+    def wrapper(*args, **kwargs) -> str:
+        try:
+            return fn(*args, **kwargs)
+        except ValidationError as e:
+            return "<br>".join(
+                f"{', '.join(err['loc'])}: {err['msg']}"
+                for err in e.errors(
+                    include_url=False,
+                    include_context=False,
+                    include_input=True,
+                )
+            )
+        except Exception as e:
+            print(f"Error processing text: {str(e)}", file=sys.stderr)
+            return "Sorry, there was an error processing your text. Please try a different input."
+    return wrapper
+@catch_all
+@validate_call(validate_return=True)
+def _analyze_text(
+    text: str = Field(..., min_length=1, max_length=2000),
+    line_width: int = Field(..., ge=20, le=200),
+    metrics_to_show: tuple[MetricType, ...] = Field(..., min_length=1),
+) -> str:
+    metrics = calc_token_metrics([text], model, tokenizer)
+    svgs = visualize_batch(
+        metrics, metrics_to_show=metrics_to_show, line_width=line_width
+    )
+    return svgs[0]
+@lru_cache(128)
 def analyze_text(
     text: str,
+    line_width: int,
+    surprisal: bool,
+    entropy: bool,
+    s2: bool,
 ) -> str:
+    metrics_to_show = tuple()
     if surprisal:
+        metrics_to_show += ("surprisal",)
     if entropy:
+        metrics_to_show += ("entropy",)
     if s2:
+        metrics_to_show += ("s2",)
+    return _analyze_text(
+        text=text, line_width=line_width, metrics_to_show=metrics_to_show
     )
+# Define UI components
+text_input = gr.Textbox(
+    label="text",
+    placeholder="Enter some text to analyze...",
+    lines=3,
+    value="The quick brown fox jumps over the lazy dog.",
+)
+width_slider = gr.Slider(
+    label="line_width",
+    minimum=20,
+    maximum=120,
+    step=10,
+    value=30,
+)
+metric_toggles = [
+    gr.Checkbox(label="Surprisal", value=True),
+    gr.Checkbox(label="Entropy", value=True),
+    gr.Checkbox(label="S₂", value=False),
+]
+inputs = [text_input, width_slider, *metric_toggles]
+# Example inputs showing different linguistic patterns
+examples = [
+    ["The quick brown fox jumps over the lazy dog."],
+    ["In a shocking turn of events, the seemingly impossible task"],
+    ["In a shocking turn of table, the seemingly impossible task"],
+    ["A long time ago, in a galaxy far, far away..."],
+]
+empty_sample = [None] * len(inputs)
+examples = [sample + empty_sample[len(sample) :] for sample in examples]
+# Create interface
 demo = gr.Interface(
     fn=analyze_text,
+    inputs=inputs,
     outputs=gr.HTML(),
     title="Token Information Content Visualization",
     description="""
     - **Surprisal**: Actual information content (-log probability) of each token
     - **Entropy**: Expected information content (uncertainty) at each position
+    - **S₂** (surprise-surprise): How much more/less surprising a token is than expected (surprisal - entropy)
+    Read the paper for more details about this visualization and S₂: [Detecting out of distribution text with surprisal and entropy](https://www.lesswrong.com/posts/Kjo64rSWkFfc3sre5/detecting-out-of-distribution-text-with-surprisal-and#)
     """,
+    examples=examples,
 )
 if __name__ == "__main__":
+    demo.launch()

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 numpy~=2.2.2
 torch~=2.5.1
 transformers~=4.48.1

 numpy~=2.2.2
 torch~=2.5.1
 transformers~=4.48.1
+pydantic~=2.10.0

src/sparky/inference/calc_metrics.py CHANGED Viewed

@@ -16,7 +16,6 @@ def calc_token_metrics(
     texts: List[str],
     model: PreTrainedModel,
     tokenizer: PreTrainedTokenizer,
-    truncation=False,
 ) -> TokenMetrics:
     """Calculate per-token metrics for a batch of text sequences using a language model."""
     if tokenizer.pad_token is None:
@@ -30,7 +29,7 @@ def calc_token_metrics(
         texts,
         return_tensors="pt",
         padding=True,
-        truncation=truncation,
         return_length=True,
     ).to(device)

     texts: List[str],
     model: PreTrainedModel,
     tokenizer: PreTrainedTokenizer,
 ) -> TokenMetrics:
     """Calculate per-token metrics for a batch of text sequences using a language model."""
     if tokenizer.pad_token is None:
         texts,
         return_tensors="pt",
         padding=True,
+        truncation=True,
         return_length=True,
     ).to(device)

src/sparky/inference/model.py CHANGED Viewed

@@ -1,9 +1,26 @@
 import torch
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
 def load_model(name="gpt2"):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     model = GPT2LMHeadModel.from_pretrained(name).to(device)
     tokenizer = GPT2Tokenizer.from_pretrained(name, clean_up_tokenization_spaces=True)
     return model, tokenizer

+import time
 import torch
+import transformers
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
 def load_model(name="gpt2"):
+    print(f"Loading model {name}...")
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print("Loading model from pretrained...")
+    t0 = time.perf_counter()
     model = GPT2LMHeadModel.from_pretrained(name).to(device)
+    t1 = time.perf_counter()
+    print(f"Model loaded in {t1 - t0:.1f}s")
+    print("Loading tokenizer...")
     tokenizer = GPT2Tokenizer.from_pretrained(name, clean_up_tokenization_spaces=True)
+    t2 = time.perf_counter()
+    print(f"Tokenizer loaded in {t2 - t1:.1f}s")
+    # Truncate from start to preserve adversarial suffixes.
+    tokenizer.truncation_side = "left"
     return model, tokenizer