Spaces:

Jellyfish042
/

Compression-Lens

Running

App Files Community

Jellyfish042 committed on Jan 18

Commit

452ae9b

1 Parent(s): 04ab453

update

Browse files

Files changed (1) hide show

visualization/html_generator.py +75 -35

visualization/html_generator.py CHANGED Viewed

@@ -36,6 +36,7 @@ def get_rwkv_tokenizer():
     if _rwkv_tokenizer is None:
         from rwkv.rwkv_tokenizer import TRIE_TOKENIZER
         import os
         script_dir = os.path.dirname(os.path.abspath(__file__))
         vocab_path = os.path.join(os.path.dirname(script_dir), "support", "rwkv_vocab_v20230424.txt")
         _rwkv_tokenizer = TRIE_TOKENIZER(vocab_path)
@@ -297,24 +298,28 @@ def generate_comparison_html(
         rwkv_toks = get_tokens_for_range(start_byte, end_byte, rwkv_tokens)
         if re.search(r"\w", token_text, re.UNICODE):
-            tokens.append({
-                "type": "word",
-                "text": token_text,
-                "byte_start": start_byte,
-                "byte_end": end_byte,
-                "word_lower": token_text.lower(),
-                "qwen_tokens": qwen_toks,
-                "rwkv_tokens": rwkv_toks,
-            })
         else:
-            tokens.append({
-                "type": "non-word",
-                "text": token_text,
-                "byte_start": start_byte,
-                "byte_end": end_byte,
-                "qwen_tokens": qwen_toks,
-                "rwkv_tokens": rwkv_toks,
-            })
     # Track word occurrences
     word_occurrences = {}
@@ -335,14 +340,16 @@ def generate_comparison_html(
     def escape_for_attr(s):
         # Escape all characters that could break HTML attributes
         # Order matters: & must be escaped first
-        return (s.replace("&", "&amp;")
-                 .replace('"', "&quot;")
-                 .replace("'", "&#39;")
-                 .replace("<", "&lt;")
-                 .replace(">", "&gt;")
-                 .replace("\n", "&#10;")
-                 .replace("\r", "&#13;")
-                 .replace("\t", "&#9;"))
     for token in tokens:
         token_text = token["text"]
@@ -382,7 +389,8 @@ def generate_comparison_html(
                     ]
                     # Use base64 encoding to avoid escaping issues
                     import base64
-                    topk_a_json = base64.b64encode(json.dumps(decoded_pred, ensure_ascii=False).encode('utf-8')).decode('ascii')
                 except Exception as e:
                     pass
         if topk_predictions_b is not None and model_b_token_ranges:
@@ -393,7 +401,8 @@ def generate_comparison_html(
                     decoded_pred = [pred[0], pred[1], [[tid, prob, decode_token(tid, tokenizer_b, model_type_b)] for tid, prob in pred[2]]]
                     # Use base64 encoding to avoid escaping issues
                     import base64
-                    topk_b_json = base64.b64encode(json.dumps(decoded_pred, ensure_ascii=False).encode('utf-8')).decode('ascii')
                 except Exception as e:
                     pass
@@ -402,7 +411,13 @@ def generate_comparison_html(
         token_deltas = deltas[byte_start:byte_end]
         avg_token_delta = sum(token_deltas) / len(token_deltas) if token_deltas else 0
-        color = delta_to_color(avg_token_delta, avg_delta, max_deviation)
         r, g, b = color
         token_html_parts = []
@@ -857,6 +872,7 @@ def generate_comparison_html(
         const avgDelta = {avg_delta_compression};
         const slider = document.getElementById('saturation-slider');
         const saturationValue = document.getElementById('saturation-value');
         const allDeltas = [];
         tokenSpans.forEach(token => {{
@@ -864,6 +880,12 @@ def generate_comparison_html(
             if (!isNaN(delta)) allDeltas.push(delta);
         }});
         function percentile(arr, p) {{
             const sorted = [...arr].sort((a, b) => a - b);
             const idx = (p / 100) * (sorted.length - 1);
@@ -873,9 +895,9 @@ def generate_comparison_html(
             return sorted[lower] + (sorted[upper] - sorted[lower]) * (idx - lower);
         }}
-        function deltaToColor(delta, avgDelta, maxDeviation) {{
             if (maxDeviation === 0) return 'rgb(255, 255, 255)';
-            const deviation = delta - avgDelta;
             let normalized = Math.max(-1, Math.min(1, deviation / maxDeviation));
             let r, g, b;
             if (normalized < 0) {{
@@ -892,13 +914,31 @@ def generate_comparison_html(
             return `rgb(${{r}}, ${{g}}, ${{b}})`;
         }}
-        function updateColors(percentileValue) {{
-            const deviations = allDeltas.map(d => Math.abs(d - avgDelta));
-            const maxDeviation = Math.max(percentile(deviations, percentileValue), 1e-6);
-            tokenSpans.forEach(token => {{
                 const delta = parseFloat(token.getAttribute('data-delta'));
                 if (!isNaN(delta)) {{
-                    token.style.backgroundColor = deltaToColor(delta, avgDelta, maxDeviation);
                 }}
             }});
         }}

     if _rwkv_tokenizer is None:
         from rwkv.rwkv_tokenizer import TRIE_TOKENIZER
         import os
         script_dir = os.path.dirname(os.path.abspath(__file__))
         vocab_path = os.path.join(os.path.dirname(script_dir), "support", "rwkv_vocab_v20230424.txt")
         _rwkv_tokenizer = TRIE_TOKENIZER(vocab_path)
         rwkv_toks = get_tokens_for_range(start_byte, end_byte, rwkv_tokens)
         if re.search(r"\w", token_text, re.UNICODE):
+            tokens.append(
+                {
+                    "type": "word",
+                    "text": token_text,
+                    "byte_start": start_byte,
+                    "byte_end": end_byte,
+                    "word_lower": token_text.lower(),
+                    "qwen_tokens": qwen_toks,
+                    "rwkv_tokens": rwkv_toks,
+                }
+            )
         else:
+            tokens.append(
+                {
+                    "type": "non-word",
+                    "text": token_text,
+                    "byte_start": start_byte,
+                    "byte_end": end_byte,
+                    "qwen_tokens": qwen_toks,
+                    "rwkv_tokens": rwkv_toks,
+                }
+            )
     # Track word occurrences
     word_occurrences = {}
     def escape_for_attr(s):
         # Escape all characters that could break HTML attributes
         # Order matters: & must be escaped first
+        return (
+            s.replace("&", "&amp;")
+            .replace('"', "&quot;")
+            .replace("'", "&#39;")
+            .replace("<", "&lt;")
+            .replace(">", "&gt;")
+            .replace("\n", "&#10;")
+            .replace("\r", "&#13;")
+            .replace("\t", "&#9;")
+        )
     for token in tokens:
         token_text = token["text"]
                     ]
                     # Use base64 encoding to avoid escaping issues
                     import base64
+                    topk_a_json = base64.b64encode(json.dumps(decoded_pred, ensure_ascii=False).encode("utf-8")).decode("ascii")
                 except Exception as e:
                     pass
         if topk_predictions_b is not None and model_b_token_ranges:
                     decoded_pred = [pred[0], pred[1], [[tid, prob, decode_token(tid, tokenizer_b, model_type_b)] for tid, prob in pred[2]]]
                     # Use base64 encoding to avoid escaping issues
                     import base64
+                    topk_b_json = base64.b64encode(json.dumps(decoded_pred, ensure_ascii=False).encode("utf-8")).decode("ascii")
                 except Exception as e:
                     pass
         token_deltas = deltas[byte_start:byte_end]
         avg_token_delta = sum(token_deltas) / len(token_deltas) if token_deltas else 0
+        # Apply power transformation to enhance color differentiation
+        # Preserve the sign and apply power to the absolute value
+        power_n = 3
+        sign = 1 if avg_token_delta >= 0 else -1
+        avg_token_delta_powered = sign * (abs(avg_token_delta) ** power_n)
+        color = delta_to_color(avg_token_delta_powered, avg_delta, max_deviation)
         r, g, b = color
         token_html_parts = []
         const avgDelta = {avg_delta_compression};
         const slider = document.getElementById('saturation-slider');
         const saturationValue = document.getElementById('saturation-value');
+        const powerN = 3;  // Must match Python's power_n
         const allDeltas = [];
         tokenSpans.forEach(token => {{
             if (!isNaN(delta)) allDeltas.push(delta);
         }});
+        // Apply power transformation to delta values (matching Python's logic)
+        function applyPower(delta) {{
+            const sign = delta >= 0 ? 1 : -1;
+            return sign * Math.pow(Math.abs(delta), powerN);
+        }}
         function percentile(arr, p) {{
             const sorted = [...arr].sort((a, b) => a - b);
             const idx = (p / 100) * (sorted.length - 1);
             return sorted[lower] + (sorted[upper] - sorted[lower]) * (idx - lower);
         }}
+        function deltaToColor(deltaPowered, avgDeltaPowered, maxDeviation) {{
             if (maxDeviation === 0) return 'rgb(255, 255, 255)';
+            const deviation = deltaPowered - avgDeltaPowered;
             let normalized = Math.max(-1, Math.min(1, deviation / maxDeviation));
             let r, g, b;
             if (normalized < 0) {{
             return `rgb(${{r}}, ${{g}}, ${{b}})`;
         }}
+        // Pre-compute powered deltas and avgDeltaPowered
+        const allDeltasPowered = allDeltas.map(d => applyPower(d));
+        const avgDeltaPowered = applyPower(avgDelta);
+        // Pre-compute min and max deviations for logarithmic interpolation
+        const allDeviations = allDeltasPowered.map(d => Math.abs(d - avgDeltaPowered));
+        const minDeviation = Math.max(percentile(allDeviations, 1), 1e-9);  // Use 1st percentile to avoid extreme outliers
+        const maxDeviationFull = Math.max(percentile(allDeviations, 100), 1e-6);
+        function updateColors(sliderValue) {{
+            // Use logarithmic interpolation for smoother perceptual control
+            // sliderValue: 50-100, maps to maxDeviation from minDeviation to maxDeviationFull
+            // Lower slider value = lower maxDeviation = more saturation (more colors hit the clamp)
+            // Higher slider value = higher maxDeviation = less saturation (fewer colors hit the clamp)
+            const t = (sliderValue - 50) / 50;  // Normalize to 0-1
+            // Logarithmic interpolation: exp(lerp(log(min), log(max), t))
+            const logMin = Math.log(minDeviation);
+            const logMax = Math.log(maxDeviationFull);
+            const maxDeviation = Math.exp(logMin + t * (logMax - logMin));
+            tokenSpans.forEach((token, idx) => {{
                 const delta = parseFloat(token.getAttribute('data-delta'));
                 if (!isNaN(delta)) {{
+                    const deltaPowered = applyPower(delta);
+                    token.style.backgroundColor = deltaToColor(deltaPowered, avgDeltaPowered, maxDeviation);
                 }}
             }});
         }}