Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import os | |
| import requests | |
| from groq import Groq | |
| import streamlit as st | |
# Initialize the Groq client.
# SECURITY: the API key is read from the environment. The original code
# committed a literal "gsk_..." key to source control -- that key is leaked
# and must be revoked/rotated. Never hardcode secrets.
api = os.environ.get("GROQ_API_KEY", "")
client = Groq(api_key=api)
# Function to use Llama LLM for cleaning
def clean_with_llama(text):
    """Clean and simplify *text* for AI modeling via the Groq Llama model.

    Parameters
    ----------
    text : str
        Raw cell text. Empty/whitespace-only input is returned unchanged,
        avoiding a pointless API round-trip (process_data fills NaN cells
        with "" before applying this function, so blanks are common).

    Returns
    -------
    str
        The model's cleaned text, or an "Error: ..." string if the API call
        fails. Best-effort by design: this runs per-cell via Series.apply,
        and raising would abort the whole column.
    """
    # Short-circuit blank cells -- nothing to clean, no API call needed.
    if not str(text).strip():
        return text
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": f"Clean and simplify the following text for AI modeling: {text}",
                }
            ],
            model="llama-3.3-70b-versatile",
            stream=False,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:  # broad on purpose: surface the failure in the cell
        return f"Error: {str(e)}"
# Function to clean and process data
def process_data(df):
    """Clean a DataFrame and return it.

    Steps:
      1. Drop rows and columns that are entirely empty.
      2. Run every text (object-dtype) column through the LLM cleaner,
         filling NaN with "" first so the cleaner always receives a string.
      3. Fill remaining numeric NaNs with the column mean; fill NaNs in any
         other column with the string "Unknown".

    Parameters
    ----------
    df : pandas.DataFrame
        Mutated in place (dropna with inplace=True) and also returned.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    # Remove completely empty rows and columns.
    df.dropna(how="all", inplace=True)
    df.dropna(axis=1, how="all", inplace=True)
    # Clean text columns with the LLM.
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].fillna("").apply(clean_with_llama)
    # Fill remaining missing values. NOTE: plain assignment replaces the
    # original chained `df[col].fillna(..., inplace=True)`, which operates
    # on a potential intermediate object and is deprecated (FutureWarning;
    # a silent no-op under pandas copy-on-write).
    for col in df.columns:
        if df[col].dtype in ["float64", "int64"]:
            df[col] = df[col].fillna(df[col].mean())
        else:
            df[col] = df[col].fillna("Unknown")
    return df
# Function to chunk the dataset
def chunk_dataset(df, chunk_size=1000):
    """Split *df* into CSV files of at most *chunk_size* rows each.

    Chunks are written to the working directory as
    ``processed_chunk_<k>.csv`` (1-based numbering, no index column).

    Parameters
    ----------
    df : pandas.DataFrame
    chunk_size : int, default 1000
        Maximum number of rows per output file.

    Returns
    -------
    list of str
        The file names written, in order.
    """
    processed_files = []
    total_rows = df.shape[0]
    for number, start in enumerate(range(0, total_rows, chunk_size), start=1):
        file_name = f"processed_chunk_{number}.csv"
        df.iloc[start:start + chunk_size].to_csv(file_name, index=False)
        processed_files.append(file_name)
    return processed_files
# Function to download dataset from URL
def download_dataset(url, file_name="downloaded_dataset.csv"):
    """Download *url* to *file_name* and return the file name.

    Parameters
    ----------
    url : str
        The dataset URL (fetched with a plain GET).
    file_name : str, default "downloaded_dataset.csv"
        Destination path in the working directory.

    Raises
    ------
    requests.HTTPError
        On a 4xx/5xx response. The original silently wrote the server's
        error page to disk as if it were the dataset.
    requests.Timeout
        If the server does not respond within 30 seconds; the original had
        no timeout and could hang indefinitely.
    """
    response = requests.get(url, timeout=30)
    # Fail loudly instead of saving an HTML error page as a "CSV".
    response.raise_for_status()
    with open(file_name, "wb") as file:
        file.write(response.content)
    return file_name
# Streamlit interface
def main():
    """Render the Streamlit app: accept a CSV (upload or URL), clean it with
    the LLM pipeline, chunk it, and offer the chunks for download."""
    st.set_page_config(page_title="AI-Enhanced File Processor", layout="wide")
    st.title("AI-Enhanced Dataset Processor")
    st.sidebar.header("Features")
    st.sidebar.write("- Clean and process datasets")
    st.sidebar.write("- Download dataset from URL")
    st.sidebar.write("- Create chunks for AI models")
    st.sidebar.write("- Professional interface for Hugging Face deployment")
    # File upload section
    st.subheader("Upload Dataset")
    uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
    # URL download section
    st.subheader("OR Enter Dataset URL")
    dataset_url = st.text_input("Paste the URL of the dataset")
    if uploaded_file or dataset_url:
        try:
            # Resolve the input source; the URL wins when both are given.
            # The download now happens INSIDE the try block so a bad URL
            # surfaces as st.error instead of an unhandled crash.
            if dataset_url:
                st.info("Downloading dataset from URL...")
                file_path = download_dataset(dataset_url)
            else:
                file_path = uploaded_file  # Streamlit UploadedFile is file-like
            # pd.read_csv accepts both a path string and a file-like object,
            # so a single call covers uploads and downloaded files alike
            # (the original had two byte-identical if/else branches here).
            df = pd.read_csv(file_path)
            st.success("File uploaded successfully!")
            st.write("**Original Dataset**")
            st.dataframe(df)
            # Process data
            st.info("Cleaning and simplifying the dataset...")
            df_cleaned = process_data(df)
            # Display cleaned data
            st.write("**Cleaned Dataset**")
            st.dataframe(df_cleaned)
            # Chunk data
            st.info("Creating chunks for AI models...")
            processed_files = chunk_dataset(df_cleaned)
            # Allow download of processed chunks
            st.success(f"Processing complete! {len(processed_files)} chunk(s) created.")
            for file_name in processed_files:
                with open(file_name, 'rb') as file:
                    st.download_button(
                        label=f"Download {file_name}",
                        data=file,
                        file_name=file_name,
                        mime="text/csv",
                    )
            # Cleanup generated files
            for file_name in processed_files:
                os.remove(file_name)
        except Exception as e:
            st.error(f"An error occurred while processing the dataset: {e}")


if __name__ == "__main__":
    main()