Spaces:
Sleeping
Sleeping
| import json | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
# Read the annotated query dataset (JSON) into memory as `data`.
# The file is decoded explicitly as UTF-8 rather than the platform default.
with open('Datasets/Query/datasets_text.json', mode='r', encoding='utf-8') as file:
    data = json.load(file)
# Split every annotation into its sentence (element 0) and the list of
# entities stored under the 'entities' key of element 1.
sentences = [item[0] for item in data["annotations"]]
labels = [item[1]['entities'] for item in data["annotations"]]

# Tag names declared by the dataset. No '<pad>' tag is prepended.
tags = data["classes"]

# Count how often each declared tag occurs across all annotations.
# Despite its name, `tag2idx` maps tag -> occurrence count, not tag -> index.
# Every declared tag starts at 0 so unused tags still appear in the result.
tag2idx = dict.fromkeys(tags, 0)
for label in labels:
    for entity in label:
        # NOTE(review): entity[1] is treated as the tag name; a value that is
        # not in data["classes"] raises KeyError. Confirm against the dataset
        # schema that position 1 really holds the tag.
        tag2idx[entity[1]] += 1

# Tags ordered from most to least frequent. sorted() is stable, so tags with
# equal counts keep their relative order from data["classes"].
sorted_tags_dict = dict(
    sorted(tag2idx.items(), key=lambda item: item[1], reverse=True)
)

# Same order as above, with spaces in tag names replaced by underscores.
sorted_tags = [tag.replace(" ", "_") for tag in sorted_tags_dict]
# Destination table; its "tags" column supplies the text that gets vectorised.
destinations = pd.read_excel("Datasets/Places/des_retags_copilot.xlsx")

# Bag-of-words counts over the tag text: English stop words removed and the
# vocabulary capped at 10000 features.
vectorizer = CountVectorizer(stop_words="english", max_features=10000)

# Coerce the column values to Unicode strings before fitting, then densify.
# NOTE(review): the first row of the matrix is dropped, so row i of
# `tags_vector` corresponds to row i + 1 of `destinations` -- confirm that
# this offset is intended.
tag_documents = destinations["tags"].values.astype('U')
tags_vector = vectorizer.fit_transform(tag_documents).toarray()[1:]

# Vocabulary terms, aligned with the columns of `tags_vector`.
feature_names = vectorizer.get_feature_names_out()