Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import os | |
| import requests | |
| from groq import Groq | |
| import streamlit as st | |
# Initialize the Groq client.
# SECURITY: the API key is read from the environment. The original code
# committed a literal "gsk_..." key to source control -- that key is leaked
# and must be revoked/rotated. Never hardcode secrets.
api = os.environ.get("GROQ_API_KEY", "")
client = Groq(api_key=api)
# Function to use Llama LLM for cleaning
def clean_with_llama(text):
    """Clean and simplify *text* for AI modeling via the Groq Llama model.

    Parameters
    ----------
    text : str
        Raw cell text. Empty/whitespace-only input is returned unchanged,
        avoiding a pointless API round-trip (process_data fills NaN cells
        with "" before applying this function, so blanks are common).

    Returns
    -------
    str
        The model's cleaned text, or an "Error: ..." string if the API call
        fails. Best-effort by design: this runs per-cell via Series.apply,
        and raising would abort the whole column.
    """
    # Short-circuit blank cells -- nothing to clean, no API call needed.
    if not str(text).strip():
        return text
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": f"Clean and simplify the following text for AI modeling: {text}",
                }
            ],
            model="llama-3.3-70b-versatile",
            stream=False,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:  # broad on purpose: surface the failure in the cell
        return f"Error: {str(e)}"
# Function to clean and process data
def process_data(df):
    """Clean a DataFrame and return it.

    Steps:
      1. Drop rows and columns that are entirely empty.
      2. Run every text (object-dtype) column through the LLM cleaner,
         filling NaN with "" first so the cleaner always receives a string.
      3. Fill remaining numeric NaNs with the column mean; fill NaNs in any
         other column with the string "Unknown".

    Parameters
    ----------
    df : pandas.DataFrame
        Mutated in place (dropna with inplace=True) and also returned.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    # Remove completely empty rows and columns.
    df.dropna(how="all", inplace=True)
    df.dropna(axis=1, how="all", inplace=True)
    # Clean text columns with the LLM.
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].fillna("").apply(clean_with_llama)
    # Fill remaining missing values. NOTE: plain assignment replaces the
    # original chained `df[col].fillna(..., inplace=True)`, which operates
    # on a potential intermediate object and is deprecated (FutureWarning;
    # a silent no-op under pandas copy-on-write).
    for col in df.columns:
        if df[col].dtype in ["float64", "int64"]:
            df[col] = df[col].fillna(df[col].mean())
        else:
            df[col] = df[col].fillna("Unknown")
    return df
# Function to chunk the dataset
def chunk_dataset(df, chunk_size=1000):
    """Split *df* into CSV files of at most *chunk_size* rows each.

    Chunks are written to the working directory as
    ``processed_chunk_<k>.csv`` (1-based numbering, no index column).

    Parameters
    ----------
    df : pandas.DataFrame
    chunk_size : int, default 1000
        Maximum number of rows per output file.

    Returns
    -------
    list of str
        The file names written, in order.
    """
    processed_files = []
    total_rows = df.shape[0]
    for number, start in enumerate(range(0, total_rows, chunk_size), start=1):
        file_name = f"processed_chunk_{number}.csv"
        df.iloc[start:start + chunk_size].to_csv(file_name, index=False)
        processed_files.append(file_name)
    return processed_files
# Function to download dataset from URL
def download_dataset(url, file_name="downloaded_dataset.csv"):
    """Download *url* to *file_name* and return the file name.

    Parameters
    ----------
    url : str
        The dataset URL (fetched with a plain GET).
    file_name : str, default "downloaded_dataset.csv"
        Destination path in the working directory.

    Raises
    ------
    requests.HTTPError
        On a 4xx/5xx response. The original silently wrote the server's
        error page to disk as if it were the dataset.
    requests.Timeout
        If the server does not respond within 30 seconds; the original had
        no timeout and could hang indefinitely.
    """
    response = requests.get(url, timeout=30)
    # Fail loudly instead of saving an HTML error page as a "CSV".
    response.raise_for_status()
    with open(file_name, "wb") as file:
        file.write(response.content)
    return file_name
# Streamlit interface
def main():
    """Render the Streamlit app: accept a CSV (upload or URL), clean it with
    the LLM pipeline, chunk it, and offer the chunks for download."""
    st.set_page_config(page_title="AI-Enhanced File Processor", layout="wide")
    st.title("AI-Enhanced Dataset Processor")
    st.sidebar.header("Features")
    st.sidebar.write("- Clean and process datasets")
    st.sidebar.write("- Download dataset from URL")
    st.sidebar.write("- Create chunks for AI models")
    st.sidebar.write("- Professional interface for Hugging Face deployment")
    # File upload section
    st.subheader("Upload Dataset")
    uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
    # URL download section
    st.subheader("OR Enter Dataset URL")
    dataset_url = st.text_input("Paste the URL of the dataset")
    if uploaded_file or dataset_url:
        try:
            # Resolve the input source; the URL wins when both are given.
            # The download now happens INSIDE the try block so a bad URL
            # surfaces as st.error instead of an unhandled crash.
            if dataset_url:
                st.info("Downloading dataset from URL...")
                file_path = download_dataset(dataset_url)
            else:
                file_path = uploaded_file  # Streamlit UploadedFile is file-like
            # pd.read_csv accepts both a path string and a file-like object,
            # so a single call covers uploads and downloaded files alike
            # (the original had two byte-identical if/else branches here).
            df = pd.read_csv(file_path)
            st.success("File uploaded successfully!")
            st.write("**Original Dataset**")
            st.dataframe(df)
            # Process data
            st.info("Cleaning and simplifying the dataset...")
            df_cleaned = process_data(df)
            # Display cleaned data
            st.write("**Cleaned Dataset**")
            st.dataframe(df_cleaned)
            # Chunk data
            st.info("Creating chunks for AI models...")
            processed_files = chunk_dataset(df_cleaned)
            # Allow download of processed chunks
            st.success(f"Processing complete! {len(processed_files)} chunk(s) created.")
            for file_name in processed_files:
                with open(file_name, 'rb') as file:
                    st.download_button(
                        label=f"Download {file_name}",
                        data=file,
                        file_name=file_name,
                        mime="text/csv",
                    )
            # Cleanup generated files
            for file_name in processed_files:
                os.remove(file_name)
        except Exception as e:
            st.error(f"An error occurred while processing the dataset: {e}")


if __name__ == "__main__":
    main()