| | from transformers import AutoTokenizer, AutoModelForMaskedLM |
| | import torch |
| | import gradio as gr |
| | import numpy as np |
| | import json |
| |
|
class BertEmbeddingsGenerator:
    """Turn text into fixed-size vectors using a BERT masked-LM checkpoint.

    The tokenizer and model are loaded once in ``__init__`` and reused for
    every ``get_embeddings`` call.
    """

    def __init__(self, model_name="tahrirchi/tahrirchi-bert-base"):
        """Initialize the BERT model and tokenizer.

        Args:
            model_name (str): Checkpoint identifier handed to
                ``from_pretrained`` for both the tokenizer and the model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name)
        # Evaluation mode: this object is only used for inference.
        self.model.eval()

    def get_embeddings(self, text):
        """
        Generate embeddings for the input text.

        The last hidden layer is averaged over the token axis. Positions
        whose attention mask is 0 (padding) are left out of the average, so
        padding added when several texts are batched together does not
        dilute the result. For a single text the mask is all ones and this
        is a plain mean over its tokens.

        Args:
            text (str | list[str]): Input text, or a batch of texts, to
                embed. Inputs are truncated to at most 512 tokens.

        Returns:
            np.ndarray: Text embeddings with size-1 axes squeezed out
                (a 1-D vector for a single text).
        """
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        )

        # Inference only: skip gradient bookkeeping.
        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)

        # The masked-LM head returns vocabulary logits; the encoder output is
        # taken from the last entry of the hidden-state tuple instead.
        last_hidden_state = outputs.hidden_states[-1]

        attention_mask = inputs.get("attention_mask")
        if attention_mask is None:
            # No mask available: fall back to an unweighted mean.
            embeddings = last_hidden_state.mean(dim=1)
        else:
            # Broadcast the mask over the hidden axis and average only the
            # positions it marks as real tokens.
            weights = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
            token_sum = (last_hidden_state * weights).sum(dim=1)
            # clamp guards against division by zero for an all-padding row.
            token_count = weights.sum(dim=1).clamp(min=1.0)
            embeddings = token_sum / token_count

        return embeddings.squeeze().cpu().numpy()
| |
|
def create_gradio_interface():
    """Create and configure the Gradio interface.

    Instantiates one ``BertEmbeddingsGenerator`` (which loads the model) and
    binds it to a text box whose result is rendered by a JSON component.

    Returns:
        gr.Interface: The configured interface; the caller launches it.
    """
    generator = BertEmbeddingsGenerator()

    def embed_text(input_text):
        """Gradio interface function.

        Args:
            input_text (str): Text typed into the input box.

        Returns:
            str: A JSON document. On success it holds ``embeddings``,
                ``dimensions`` and ``status: "success"``; on any failure it
                holds ``error`` and ``status: "error"``.
        """
        try:
            if not input_text or not input_text.strip():
                # Same shape as the exception payload below, so clients can
                # always branch on "status".
                return json.dumps(
                    {"error": "Matn kiritilmadi", "status": "error"},
                    ensure_ascii=False,
                )

            embeddings = generator.get_embeddings(input_text)

            # NaN and +/-inf are not valid JSON numbers; emit null for them.
            embeddings_list = np.where(
                np.isfinite(embeddings), embeddings, None
            ).tolist()

            output = {
                "embeddings": embeddings_list,
                "dimensions": len(embeddings_list),
                "status": "success",
            }
            return json.dumps(output, ensure_ascii=False)

        except Exception as e:
            # UI boundary: report the failure in the payload instead of
            # letting the exception propagate out of the handler.
            return json.dumps(
                {"error": str(e), "status": "error"},
                ensure_ascii=False,
            )

    iface = gr.Interface(
        fn=embed_text,
        inputs=gr.Textbox(
            lines=2,
            placeholder="Matn kiriting...",
            label="Input Text",
        ),
        outputs=gr.JSON(label="Embeddings"),
        title="O'zbek tili uchun embedding",
        description="O'zbek tili uchun Tahrirchi BERT Base modeli orqali embedding generatsiya qilish",
        examples=[
            ["Assalomu alaykum, men o'zbek tili bilan ishlayman"],
            ["O'zbek tili uchun Tahrirchi BERT Base modeli orqali embedding generatsiya qilish uchun namuna matn."],
        ],
    )
    return iface
| |
|
if __name__ == "__main__":
    # Script entry point: build the interface and start serving it.
    create_gradio_interface().launch()