from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import torch
import pandas as pd
import gradio as gr

# Load the sentence-embedding model used to encode both the docs and the queries.
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

# Run on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
def cls_pooling(model_output):
    # Use the hidden state of the [CLS] token as the sentence embedding,
    # which matches the pooling strategy of multi-qa-mpnet-base-dot-v1.
    return model_output.last_hidden_state[:, 0]
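
# A minimal sketch of mean pooling as an alternative, in case you swap in a
# checkpoint that was trained with mean pooling rather than CLS pooling. The
# attention mask keeps padding tokens out of the average. This helper is an
# illustration and not part of the original app.
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return (token_embeddings * mask).sum(1) / mask.sum(1).clamp(min=1e-9)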
def get_embeddings(text_list):
    # Tokenize the batch, move it to the model's device, and embed it.
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    with torch.no_grad():  # inference only, no gradients needed
        model_output = model(**encoded_input)
    return cls_pooling(model_output)
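
# Example usage (the query string is illustrative): the result is a tensor of
# shape (batch_size, hidden_size), i.e. (1, 768) for this MPNet checkpoint.
# embedding = get_embeddings(["How do I configure a data object class?"])
# print(embedding.shape)  # torch.Size([1, 768])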
# Load the precomputed documentation embeddings and add a FAISS index over the
# "embeddings" column so we can run fast nearest-neighbour search.
embeddings_doc_dataset = load_dataset("fashxp/pimcore-docs-embeddings")
embeddings_doc_dataset = embeddings_doc_dataset["train"]
embeddings_doc_dataset.add_faiss_index(column="embeddings")
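
# For reference, a dataset like this could have been produced by embedding each
# doc section and storing the vector alongside its metadata. A minimal sketch,
# assuming a source dataset with "heading", "url" and "text" columns (the
# dataset name and columns here are assumptions, not confirmed by this app):
#
# docs = load_dataset("some/docs-dataset", split="train")
# docs = docs.map(
#     lambda x: {"embeddings": get_embeddings([x["text"]]).cpu().numpy()[0]}
# )
# docs.push_to_hub("your-username/docs-embeddings")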
def find_in_docs(question):
    # Embed the question and search the FAISS index for the 10 closest sections.
    question_embedding = get_embeddings([question]).cpu().detach().numpy()
    scores, samples = embeddings_doc_dataset.get_nearest_examples(
        "embeddings", question_embedding, k=10
    )
    # Collect the hits in a DataFrame and sort by relevance, best match first.
    samples_df = pd.DataFrame.from_dict(samples)
    samples_df["scores"] = scores
    samples_df.sort_values("scores", ascending=False, inplace=True)
    result = ""
    for _, row in samples_df.iterrows():
        result += (
            f"HEADING: {row.heading}\n"
            f"SCORE: {row.scores}\n"
            f"URL: {row.url}\n"
            + "=" * 50 + "\n\n"
        )
    return result
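
# Example call (the question is illustrative, not from the original app):
# print(find_in_docs("How do I create a data object class in Pimcore?"))
# Each hit is rendered as HEADING / SCORE / URL, separated by a "=" rule.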
# Expose the search as a simple Gradio app: a text box in, matching doc
# sections out. share=True additionally creates a temporary public link.
demo = gr.Interface(fn=find_in_docs, inputs="text", outputs="text")
demo.launch(share=True)