Hugging Face Spaces listing — Space status: Sleeping
| import re | |
| import torch | |
| import gradio as gr | |
| from sentence_transformers import SentenceTransformer | |
# Sentence-embedding model used both for tokenization and for
# token-level embeddings throughout this app.
_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(_MODEL_NAME)
def process_text(text):
    """Tokenize *text* and return per-token embedding previews.

    The input is stripped of non-ASCII characters and lowercased, then
    encoded with the module-level SentenceTransformer ``model``.

    Parameters:
        text: raw user input from the UI textbox.

    Returns:
        A list of ``[token, truncated_embedding_string]`` rows, one per
        token of the model's encoding (including special tokens such as
        [CLS]/[SEP]); empty list for effectively empty input.
    """
    # Strip NON-ASCII characters (the regex matches everything outside
    # \x00-\x7F) and lowercase. The original comment claimed the opposite.
    cleaned = re.sub(r'[^\x00-\x7F]+', '', text).lower()
    # Robustness: don't encode an empty/whitespace-only string.
    if not cleaned.strip():
        return []
    # One embedding vector per input token, special tokens included.
    token_embeddings = model.encode(cleaned, output_value='token_embeddings', convert_to_tensor=True)
    # BUG FIX: derive the token strings from the same encoding that produced
    # the embeddings. The original used tokenizer.tokenize(), which omits
    # special tokens, so zip() paired every token with the PREVIOUS
    # position's embedding (off by one).
    input_ids = model.tokenizer(cleaned)['input_ids']
    tokens = model.tokenizer.convert_ids_to_tokens(input_ids)
    # NOTE(review): assumes tokenizer(cleaned) applies the same truncation
    # as model.encode(); for inputs beyond the model's max sequence length
    # the trailing rows may misalign — confirm against max_seq_length.
    result = []
    for token, emb in zip(tokens, token_embeddings):
        # Show only the first 5 dimensions of each vector for readability.
        result.append([token, str(emb[:5].tolist()) + '...'])
    return result
# Gradio UI: one textbox in, a token/embedding table out.
# FIX: the description previously said "Removes ASCII"; the code removes
# NON-ASCII characters and keeps ASCII, so the user-facing text was wrong.
gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=4, placeholder="Enter text here..."),
    outputs=gr.Dataframe(headers=["Token", "Embedding (truncated)"]),
    title="SentenceTransformer Token Embeddings",
    description="Removes non-ASCII characters, lowercases the input, then tokenizes and embeds it with SentenceTransformer.",
).launch()