File size: 1,839 Bytes
28c5847
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import gradio as gr
from tokenizer import StockBPE
import json

# Build the tokenizer once at import time and try to restore its saved state.
tokenizer = StockBPE()
try:
    tokenizer.load("stock_bpe")
except Exception as e:
    # Deliberately non-fatal: the saved tokenizer files may not exist yet
    # during an initial build, so report the problem and keep going.
    print(f"Error loading tokenizer: {e}")
else:
    print("Tokenizer loaded successfully!")

def analyze_text(text):
    """Tokenize *text* and report the round trip and compression ratio.

    Args:
        text: Input string to analyze; may be empty or ``None``.

    Returns:
        A 3-tuple of strings, in this order:

        1. The token ID list rendered with ``str()``, cut to its first
           1000 characters plus a ``"... (truncated)"`` marker when longer.
        2. The text obtained by decoding those tokens again.
        3. UTF-8 byte length divided by token count, formatted as
           ``"<ratio>x"`` with two decimals (``"0.00x"`` when there are
           no tokens).

        For empty input the tuple is a prompt message, an empty decoded
        string, and ``"0.00x"``.
    """
    if not text:
        # The second slot is the decoded text, so it stays empty here;
        # a placeholder such as "0" would be shown as if it were the
        # decoded output.
        return "Please enter text", "", "0.00x"

    # Round trip: encode to token IDs, then decode back for verification.
    tokens = tokenizer.encode(text)
    decoded = tokenizer.decode(tokens)

    # Compression ratio = input bytes per token; guard against zero tokens.
    byte_count = len(text.encode("utf-8"))
    token_count = len(tokens)
    ratio = byte_count / token_count if token_count > 0 else 0

    # Keep the displayed token list to a manageable size.
    token_str = str(tokens)
    if len(token_str) > 1000:
        token_str = token_str[:1000] + "... (truncated)"

    return token_str, decoded, f"{ratio:.2f}x"

# Sample pipe-delimited stock records; each one is wrapped in its own
# single-element list.
examples = [
    [record]
    for record in (
        "TECH|AAPL|2020-11|MON|UNDER200|OPEN:113.9|HIGH:117.8|LOW:113.7|CLOSE:115.9|VOL:HIGH",
        "FIN|JPM|2023-05|FRI|UNDER150|OPEN:135.2|HIGH:136.5|LOW:134.8|CLOSE:135.9|VOL:MED",
        "TECH|MSFT|2024-01|WED|OVER300|OPEN:380.5|HIGH:385.2|LOW:379.0|CLOSE:384.8|VOL:HIGH",
    )
]

# Create Interface
# One text input is passed to analyze_text; its three returned strings
# fill the three output components below, in order.
iface = gr.Interface(
    fn=analyze_text,
    inputs=gr.Textbox(lines=3, placeholder="Enter stock data here...", label="Input Text"),
    outputs=[
        gr.Textbox(label="Tokens IDs"),
        gr.Textbox(label="Decoded Back (Verification)"),
        gr.Label(label="Compression Ratio")
    ],
    # The chart emoji (U+1F4C8) is written as an escape so the title cannot
    # be corrupted into mojibake if the file is re-encoded.
    title="\U0001F4C8 Stock Market BPE Tokenizer",
    description="A custom BPE tokenizer trained on financial time-series data. Enter stock data to see how it gets compressed!",
    examples=examples,
    theme="huggingface"
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()