Spaces:

seanpedrickcase
/

topic_modelling

Running

App Files Files Community

topic_modelling / funcs /helper_functions.py

Sonnyjim

first commit

9dbf344 almost 2 years ago

raw

history blame

2.82 kB

	import os
	import re
	import pandas as pd
	import gradio as gr
	import gzip
	import pickle


	def detect_file_type(filename):
	"""Detect the file type based on its extension."""
	if (filename.endswith('.csv')) \| (filename.endswith('.csv.gz')) \| (filename.endswith('.zip')):
	return 'csv'
	elif filename.endswith('.xlsx'):
	return 'xlsx'
	elif filename.endswith('.parquet'):
	return 'parquet'
	elif filename.endswith('.pkl.gz'):
	return 'pkl.gz'
	else:
	raise ValueError("Unsupported file type.")

	def read_file(filename):
	"""Read the file based on its detected type."""
	file_type = detect_file_type(filename)

	print("Loading in file")

	if file_type == 'csv':
	file = pd.read_csv(filename, low_memory=False).reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
	elif file_type == 'xlsx':
	file = pd.read_excel(filename).reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
	elif file_type == 'parquet':
	file = pd.read_parquet(filename).reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
	elif file_type == 'pkl.gz':
	with gzip.open(filename, 'rb') as file:
	file = pickle.load(file)
	#file = pd.read_pickle(filename)

	print("File load complete")

	return file

	def put_columns_in_df(in_file, in_bm25_column):
	'''
	When file is loaded, update the column dropdown choices and change 'clean data' dropdown option to 'no'.
	'''

	file_list = [string.name for string in in_file]

	data_file_names = [string.lower() for string in file_list if "npz" not in string.lower()]
	data_file_name = data_file_names[0]


	new_choices = []
	concat_choices = []


	df = read_file(data_file_name)

	new_choices = list(df.columns)


	concat_choices.extend(new_choices)

	return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df

	def get_file_path_end(file_path):
	# First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
	basename = os.path.basename(file_path)

	# Then, split the basename and its extension and return only the basename without the extension
	filename_without_extension, _ = os.path.splitext(basename)

	#print(filename_without_extension)

	return filename_without_extension

	def get_file_path_end_with_ext(file_path):
	match = re.search(r'(.*[\/\\])?(.+)$', file_path)

	filename_end = match.group(2) if match else ''

	return filename_end

	def dummy_function(in_colnames):
	"""
	A dummy function that exists just so that dropdown updates work correctly.
	"""
	return None