Spaces:
Sleeping
Sleeping
import csv

import pandas as pd

# Export, for every source database column of the "likely_similar" sheet, the
# list of cell values that are not False, one CSV row per source database.

# Read the Excel sheet into a pandas DataFrame.
xls = pd.read_excel('description.xlsx', sheet_name='likely_similar', engine='openpyxl')

# Every column from the second one onwards is treated as a source database;
# the first column is skipped (presumably it holds row labels -- TODO confirm).
source_databases = xls.iloc[:, 1:].columns

# Map each source database name to the values kept from its column.
similarity_dict = {}
for source_database in source_databases:
    series = xls.loc[:, source_database]
    # Element-wise comparison, deliberately not an identity test: anything
    # equal to False (which includes a numeric 0) is dropped, while NaN cells
    # are kept because NaN != False.
    similar_databases = series[series.ne(False)].values.tolist()
    similarity_dict[source_database] = similar_databases

# Length of the longest list; default=0 keeps a sheet without any source
# database columns from raising ValueError on an empty sequence.
max_len = max((len(values) for values in similarity_dict.values()), default=0)

# Pad the shorter lists with NaN so that all lists share the same length.
# A new dict is built rather than reassigning entries while iterating.
similarity_dict = {
    name: values + [float('nan')] * (max_len - len(values))
    for name, values in similarity_dict.items()
}

# One column per source database, then transposed to one row per source database.
df = pd.DataFrame.from_dict(similarity_dict)
df = df.transpose()

# NOTE(review): after the transpose the source database names live in the
# index, so index=False leaves them out of the CSV. Kept as-is to preserve the
# existing output format -- confirm whether the names should be written too.
df.to_csv('similarity_dict.csv', index=False)