# Hugging Face Space: Gradio demo for the Saminx22/text_embedding_model_14M embedding model.
| import gradio as gr | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from transformers import AutoTokenizer | |
| import json | |
| from huggingface_hub import hf_hub_download | |
class EmbeddingModel(nn.Module):
    """Transformer encoder that maps token-id sequences to L2-normalized embeddings.

    Pipeline: learned token embeddings -> TransformerEncoder -> mean pooling
    over the sequence dimension -> linear projection -> L2 normalization, so
    every output vector has unit length and dot products are cosine similarities.
    """

    def __init__(self, vocab_size, embedding_dim=256, output_dim=128, nhead=4, num_layers=2, ffn_dim=512, dropout=0.1):
        super().__init__()
        # Token id 0 is the padding id; its embedding row is kept at zero.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=nhead,
            dim_feedforward=ffn_dim,
            dropout=dropout,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer=layer, num_layers=num_layers)
        self.projection = nn.Linear(embedding_dim, output_dim)

    def forward(self, input_ids):
        """Return unit-length embeddings of shape (batch, output_dim)."""
        hidden = self.embedding(input_ids)
        # Padding positions are excluded as attention *keys* via the mask.
        pad_mask = input_ids.eq(0)
        encoded = self.encoder(hidden, src_key_padding_mask=pad_mask)
        # NOTE(review): the mean still includes encoder outputs at padded
        # positions — presumably this matches how the checkpoint was trained;
        # confirm before changing to a mask-aware mean.
        pooled = encoded.mean(dim=1)
        return F.normalize(self.projection(pooled), p=2, dim=1)
# --- Load model assets from the Hugging Face Hub (runs once at startup) ---
REPO_ID = "Saminx22/text_embedding_model_14M"

config_path = hf_hub_download(repo_id=REPO_ID, filename="embedding_model_config.json")
weights_path = hf_hub_download(repo_id=REPO_ID, filename="embedding_model_weights.pth")

with open(config_path, 'r') as f:
    config = json.load(f)

# Tokenizer files live in the repo's "tokenizer/" subfolder.
tokenizer = AutoTokenizer.from_pretrained(REPO_ID, subfolder="tokenizer")

model = EmbeddingModel(vocab_size=config['vocab_size'])
# weights_only=True restricts the unpickler to tensor/primitive data: the
# checkpoint is downloaded from the network, so arbitrary-object unpickling
# would be a remote-code-execution risk.
model.load_state_dict(torch.load(weights_path, map_location='cpu', weights_only=True))
model.eval()  # disable dropout for deterministic inference
def get_similarity(query, candidates_str):
    """Rank candidate sentences by cosine similarity to the query.

    Args:
        query: the query sentence.
        candidates_str: candidate sentences, one per line.

    Returns:
        One "score | sentence" line per candidate, highest similarity first,
        or a prompt message when no candidates were provided.
    """
    candidates = [line.strip() for line in candidates_str.split('\n') if line.strip()]
    if not candidates:
        return "Please enter some candidate sentences."
    with torch.no_grad():
        query_ids = tokenizer(
            query, padding='max_length', truncation=True, max_length=32, return_tensors='pt'
        )['input_ids']
        cand_ids = tokenizer(
            candidates, padding='max_length', truncation=True, max_length=32, return_tensors='pt'
        )['input_ids']
        query_emb = model(query_ids)
        cand_embs = model(cand_ids)
        # Embeddings are unit-length, so dot products are cosine similarities.
        scores = torch.matmul(query_emb, cand_embs.T).squeeze(0)
    ranked = sorted(zip(candidates, scores.tolist()), key=lambda pair: pair[1], reverse=True)
    return "\n".join(f"{score:.4f} | {text}" for text, score in ranked)
# --- Gradio UI: two text inputs, one ranked-scores output ---
query_box = gr.Textbox(label="Query Sentence")
candidates_box = gr.Textbox(label="Candidate Sentences (one per line)", lines=5)
scores_box = gr.Textbox(label="Similarity Scores")

demo = gr.Interface(
    fn=get_similarity,
    inputs=[query_box, candidates_box],
    outputs=scores_box,
    title="14M Parameter Text Embedding Demo",
    description="Using the custom-trained model from Saminx22/text_embedding_model_14M",
)

demo.launch()