Spaces:
Sleeping
Sleeping
| import torch | |
| from sentence_transformers import SentenceTransformer, util | |
| import pandas as pd | |
| import gradio as gr | |
def save_embeddings(sentences, filename):
    """Encode *sentences* with the module-level model and persist the tensor to *filename*."""
    encoded = model.encode(sentences, convert_to_tensor=True)
    torch.save(encoded, filename)
def load_embeddings(filename):
    """Load a saved embedding tensor from *filename*.

    Tensors are mapped onto the GPU when CUDA is available, otherwise
    onto the CPU, so files saved on one device type load on the other.
    """
    target = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    return torch.load(filename, map_location=target)
def preprocess_space_descriptions(file_path):
    """Read a two-column TSV of (space_id, description) rows.

    Tries each candidate text encoding in turn until one decodes
    cleanly. Rows with a missing description are dropped.

    Returns:
        (space_ids, descriptions): two parallel lists.

    Raises:
        UnicodeError: if none of the candidate encodings can decode the file.
    """
    encodings = ['utf-8', 'latin-1', 'utf-16']
    for encoding in encodings:
        try:
            # Bug fix: the candidate encoding is now actually passed to the
            # parser; previously the loop variable was unused and pandas
            # always read with its default encoding.
            df = pd.read_csv(file_path, sep='\t', header=None,
                             names=['space_id', 'description'],
                             encoding=encoding)
        except UnicodeDecodeError:
            continue
        df.dropna(subset=['description'], inplace=True)
        return df['space_id'].tolist(), df['description'].tolist()
    # Bug fix: UnicodeDecodeError takes exactly five positional arguments,
    # so constructing it with one message string raised TypeError instead.
    # UnicodeError (its base class) carries a plain message and is still
    # caught by any `except UnicodeError` handler.
    raise UnicodeError("Unable to decode the file using the available encodings.")
def perform_similarity_search(query_embeddings, embeddings, space_ids, descriptions, top_k=10):
    """Rank corpus entries by cosine similarity to each query.

    Args:
        query_embeddings: 2-D tensor of query vectors, one row per query.
        embeddings: 2-D tensor of corpus vectors (rows parallel to
            *space_ids* / *descriptions*).
        space_ids: list of ids, one per corpus row.
        descriptions: list of description strings, one per corpus row.
        top_k: number of best matches kept per query.

    Returns:
        DataFrame with columns ["space_id", "description", "score"],
        containing the top_k matches of every query concatenated in order.
    """
    # Cosine similarity = dot product of L2-normalised rows. This is the
    # same computation sentence_transformers.util.cos_sim performs, done
    # with the already-imported torch so the block has no extra dependency.
    # Assumes both inputs are 2-D float tensors (model.encode output).
    q_norm = torch.nn.functional.normalize(query_embeddings, p=2, dim=1)
    c_norm = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    similarity_scores = (q_norm @ c_norm.T).tolist()

    results = []
    for scores in similarity_scores:  # one score row per query (unused loop var removed)
        ranked = sorted(zip(space_ids, descriptions, scores),
                        key=lambda row: row[2], reverse=True)
        results.extend(ranked[:top_k])
    return pd.DataFrame(results, columns=["space_id", "description", "score"])
# --- Module-level setup (runs at import time; downloads the model and reads local files) ---
# Sentence encoder used for both the corpus embeddings and incoming queries.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Corpus: parallel lists of space ids and their text descriptions.
space_ids, descriptions = preprocess_space_descriptions('hf_spaces_descriptions.tsv')
# Precomputed description embeddings — presumably produced by save_embeddings
# from the same TSV; NOTE(review): verify this file stays in sync with the corpus,
# since a stale file would misalign scores with space_ids/descriptions.
embeddings = load_embeddings('embeddings_hf_spaces_descriptions.pt')
# Gradio UI: a query box, a result-count slider, and a results table.
with gr.Blocks() as demo:
    # Renamed from `input`, which shadowed the Python builtin of the same name.
    query_box = gr.Textbox(label="Enter your query")
    num_results = gr.Slider(10, 100, value=10, step=1, label="Number of results")
    df_output = gr.Dataframe(label="Similarity Results", wrap=True)

    def search(query, num_results):
        """Encode the query and return the top matching spaces as a DataFrame."""
        query_embedding = model.encode([query], convert_to_tensor=True)
        # Gradio may deliver the slider value as a float; slicing needs an int.
        return perform_similarity_search(query_embedding, embeddings, space_ids,
                                         descriptions, top_k=int(num_results))

    query_box.submit(search, inputs=[query_box, num_results],
                     outputs=df_output, api_name="search")

demo.launch()