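"""AI-Enhanced Dataset Processor.

Streamlit app that loads a CSV (uploaded or fetched from a URL), cleans text
columns with a Groq-hosted Llama model, fills missing values, splits the result
into chunks, and offers each chunk as a downloadable CSV.
"""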
import os

import pandas as pd
import requests
import streamlit as st
from groq import Groq
# Initialize Groq client; read the API key from the environment rather than hardcoding it
api_key = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=api_key)
# Function to use a Llama LLM for cleaning
def clean_with_llama(text):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": f"Clean and simplify the following text for AI modeling: {text}",
                }
            ],
            model="llama-3.3-70b-versatile",
            stream=False,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error: {str(e)}"
# Function to clean and process data
def process_data(df):
    # Remove completely empty rows and columns
    df.dropna(how="all", inplace=True)
    df.dropna(axis=1, how="all", inplace=True)
    # Clean text columns with the LLM (one API call per cell, so this can be slow for large datasets)
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].fillna("").apply(clean_with_llama)
    # Fill missing numeric data with the mean and text with "Unknown"
    for col in df.columns:
        if df[col].dtype in ['float64', 'int64']:
            df[col] = df[col].fillna(df[col].mean())
        else:
            df[col] = df[col].fillna("Unknown")
    return df
# Function to chunk the dataset into smaller CSV files
def chunk_dataset(df, chunk_size=1000):
    chunks = [df.iloc[i:i + chunk_size] for i in range(0, df.shape[0], chunk_size)]
    processed_files = []
    for i, chunk in enumerate(chunks):
        file_name = f"processed_chunk_{i + 1}.csv"
        chunk.to_csv(file_name, index=False)
        processed_files.append(file_name)
    return processed_files
# Function to download a dataset from a URL
def download_dataset(url, file_name="downloaded_dataset.csv"):
    response = requests.get(url)
    response.raise_for_status()  # Fail early on HTTP errors instead of saving an error page
    with open(file_name, 'wb') as file:
        file.write(response.content)
    return file_name
# Streamlit interface
def main():
    st.set_page_config(page_title="AI-Enhanced File Processor", layout="wide")
    st.title("AI-Enhanced Dataset Processor")
    st.sidebar.header("Features")
    st.sidebar.write("- Clean and process datasets")
    st.sidebar.write("- Download dataset from URL")
    st.sidebar.write("- Create chunks for AI models")
    st.sidebar.write("- Professional interface for Hugging Face deployment")

    # File upload section
    st.subheader("Upload Dataset")
    uploaded_file = st.file_uploader("Upload a CSV file", type="csv")

    # URL download section
    st.subheader("OR Enter Dataset URL")
    dataset_url = st.text_input("Paste the URL of the dataset")

    if uploaded_file or dataset_url:
        if dataset_url:
            st.info("Downloading dataset from URL...")
            file_path = download_dataset(dataset_url)
        else:
            file_path = uploaded_file  # Use the uploaded file object directly
        try:
            # Load the dataset; pd.read_csv accepts both an uploaded file object and a file path
            df = pd.read_csv(file_path)
            st.success("Dataset loaded successfully!")
            st.write("**Original Dataset**")
            st.dataframe(df)
            # Process data
            st.info("Cleaning and simplifying the dataset...")
            df_cleaned = process_data(df)

            # Display cleaned data
            st.write("**Cleaned Dataset**")
            st.dataframe(df_cleaned)

            # Chunk data
            st.info("Creating chunks for AI models...")
            processed_files = chunk_dataset(df_cleaned)

            # Allow download of processed chunks
            st.success(f"Processing complete! {len(processed_files)} chunk(s) created.")
            for file_name in processed_files:
                with open(file_name, 'rb') as file:
                    st.download_button(
                        label=f"Download {file_name}",
                        data=file,
                        file_name=file_name,
                        mime="text/csv",
                    )

            # Clean up the generated chunk files
            for file_name in processed_files:
                os.remove(file_name)
        except Exception as e:
            st.error(f"An error occurred while processing the dataset: {e}")
if __name__ == "__main__":
    main()
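# Usage note: run locally with `streamlit run app.py` and GROQ_API_KEY set in the
# environment; a Hugging Face Streamlit Space launches app.py automatically.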