Spaces:

darylfunggg
/

text-analysis

Sleeping

text-analysis / read_sheet.py

Daryl Fung

initial commit

9b9ea2f over 2 years ago

1.22 kB

	import csv
	import pandas as pd

	# read an Excel file into a Pandas dataframe
	xls = pd.read_excel('description.xlsx', sheet_name='likely_similar', engine='openpyxl')
	xls.iloc[:, 1:] # select all rows and all columns starting from the second column

	# get the names of the source databases from the dataframe
	source_databases = xls.iloc[:, 1:].columns

	# create a dictionary to hold the similarity data
	similarity_dict = {}

	# loop through each source database and get the list of similar databases
	for source_database in source_databases:
	series = xls.loc[:, source_database]
	similar_databases = series[series != False].values.tolist()
	similarity_dict[source_database] = similar_databases

	# find the length of the longest list
	max_len = max(len(v) for v in similarity_dict.values())

	# pad the shorter lists with NaNs to make them the same length as the longest list
	for k, v in similarity_dict.items():
	if len(v) < max_len:
	similarity_dict[k] = v + [float('nan')] * (max_len - len(v))

	# convert the dictionary to a Pandas dataframe and transpose it
	df = pd.DataFrame.from_dict(similarity_dict)
	df = df.transpose()

	# write the dataframe to a CSV file
	df.to_csv('similarity_dict.csv', index=False)