# Hugging Face Space: Gradio demo for the Saminx22/text_embedding_model_14M embedding model.
| import gradio as gr | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from transformers import AutoTokenizer | |
| import json | |
| from huggingface_hub import hf_hub_download | |
class EmbeddingModel(nn.Module):
    """Transformer encoder that maps token-id sequences to L2-normalized embeddings.

    Pipeline: learned token embeddings -> TransformerEncoder -> mean pooling
    over the sequence dimension -> linear projection -> L2 normalization, so
    every output vector has unit length and dot products are cosine similarities.
    """

    def __init__(self, vocab_size, embedding_dim=256, output_dim=128, nhead=4, num_layers=2, ffn_dim=512, dropout=0.1):
        super().__init__()
        # Token id 0 is the padding id; its embedding row is kept at zero.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=nhead,
            dim_feedforward=ffn_dim,
            dropout=dropout,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer=layer, num_layers=num_layers)
        self.projection = nn.Linear(embedding_dim, output_dim)

    def forward(self, input_ids):
        """Return unit-length embeddings of shape (batch, output_dim)."""
        hidden = self.embedding(input_ids)
        # Padding positions are excluded as attention *keys* via the mask.
        pad_mask = input_ids.eq(0)
        encoded = self.encoder(hidden, src_key_padding_mask=pad_mask)
        # NOTE(review): the mean still includes encoder outputs at padded
        # positions — presumably this matches how the checkpoint was trained;
        # confirm before changing to a mask-aware mean.
        pooled = encoded.mean(dim=1)
        return F.normalize(self.projection(pooled), p=2, dim=1)
# --- Load model assets from the Hugging Face Hub (runs once at startup) ---
REPO_ID = "Saminx22/text_embedding_model_14M"

config_path = hf_hub_download(repo_id=REPO_ID, filename="embedding_model_config.json")
weights_path = hf_hub_download(repo_id=REPO_ID, filename="embedding_model_weights.pth")

with open(config_path, 'r') as f:
    config = json.load(f)

# Tokenizer files live in the repo's "tokenizer/" subfolder.
tokenizer = AutoTokenizer.from_pretrained(REPO_ID, subfolder="tokenizer")

model = EmbeddingModel(vocab_size=config['vocab_size'])
# weights_only=True restricts the unpickler to tensor/primitive data: the
# checkpoint is downloaded from the network, so arbitrary-object unpickling
# would be a remote-code-execution risk.
model.load_state_dict(torch.load(weights_path, map_location='cpu', weights_only=True))
model.eval()  # disable dropout for deterministic inference
def get_similarity(query, candidates_str):
    """Rank candidate sentences by cosine similarity to the query.

    Args:
        query: the query sentence.
        candidates_str: candidate sentences, one per line.

    Returns:
        One "score | sentence" line per candidate, highest similarity first,
        or a prompt message when no candidates were provided.
    """
    candidates = [line.strip() for line in candidates_str.split('\n') if line.strip()]
    if not candidates:
        return "Please enter some candidate sentences."
    with torch.no_grad():
        query_ids = tokenizer(
            query, padding='max_length', truncation=True, max_length=32, return_tensors='pt'
        )['input_ids']
        cand_ids = tokenizer(
            candidates, padding='max_length', truncation=True, max_length=32, return_tensors='pt'
        )['input_ids']
        query_emb = model(query_ids)
        cand_embs = model(cand_ids)
        # Embeddings are unit-length, so dot products are cosine similarities.
        scores = torch.matmul(query_emb, cand_embs.T).squeeze(0)
    ranked = sorted(zip(candidates, scores.tolist()), key=lambda pair: pair[1], reverse=True)
    return "\n".join(f"{score:.4f} | {text}" for text, score in ranked)
# --- Gradio UI: two text inputs, one ranked-scores output ---
query_box = gr.Textbox(label="Query Sentence")
candidates_box = gr.Textbox(label="Candidate Sentences (one per line)", lines=5)
scores_box = gr.Textbox(label="Similarity Scores")

demo = gr.Interface(
    fn=get_similarity,
    inputs=[query_box, candidates_box],
    outputs=scores_box,
    title="14M Parameter Text Embedding Demo",
    description="Using the custom-trained model from Saminx22/text_embedding_model_14M",
)

demo.launch()