import os

import pandas as pd
import requests
import streamlit as st
from groq import Groq

# Initialize the Groq client. Read the API key from the environment rather than
# hardcoding it, so the secret is never committed alongside the source.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))


# Use the Llama LLM to clean a single text value
def clean_with_llama(text):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": f"Clean and simplify the following text for AI modeling: {text}",
                }
            ],
            model="llama-3.3-70b-versatile",
            stream=False,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error: {str(e)}"


# Clean and process the dataset
def process_data(df):
    # Remove completely empty rows and columns
    df = df.dropna(how="all").dropna(axis=1, how="all")

    # Clean text columns with the LLM (one API call per cell, so this can be
    # slow and costly on large datasets)
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].fillna("").apply(clean_with_llama)

    # Fill remaining missing values: numeric columns with the mean, text columns with "Unknown"
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].mean())
        else:
            df[col] = df[col].fillna("Unknown")

    return df


# Split the dataset into fixed-size chunks and write each chunk to a CSV file
def chunk_dataset(df, chunk_size=1000):
    chunks = [df.iloc[i:i + chunk_size] for i in range(0, df.shape[0], chunk_size)]
    processed_files = []
    for i, chunk in enumerate(chunks):
        file_name = f"processed_chunk_{i + 1}.csv"
        chunk.to_csv(file_name, index=False)
        processed_files.append(file_name)
    return processed_files


# Download a dataset from a URL and save it locally
def download_dataset(url, file_name="downloaded_dataset.csv"):
    response = requests.get(url, timeout=60)
    response.raise_for_status()  # Fail early on HTTP errors instead of saving an error page
    with open(file_name, "wb") as file:
        file.write(response.content)
    return file_name


# Streamlit interface
def main():
    st.set_page_config(page_title="AI-Enhanced File Processor", layout="wide")
    st.title("AI-Enhanced Dataset Processor")
    st.sidebar.header("Features")
    st.sidebar.write("- Clean and process datasets")
    st.sidebar.write("- Download dataset from URL")
    st.sidebar.write("- Create chunks for AI models")
    st.sidebar.write("- Professional interface for Hugging Face deployment")

    # File upload section
    st.subheader("Upload Dataset")
    uploaded_file = st.file_uploader("Upload a CSV file", type="csv")

    # URL download section
    st.subheader("OR Enter Dataset URL")
    dataset_url = st.text_input("Paste the URL of the dataset")

    if uploaded_file or dataset_url:
        if dataset_url:
            st.info("Downloading dataset from URL...")
            file_path = download_dataset(dataset_url)
        else:
            file_path = uploaded_file  # Use the uploaded file object directly

        try:
            # pd.read_csv accepts both a local file path and a file-like object
            # from st.file_uploader, so a single call covers both cases
            df = pd.read_csv(file_path)

            st.success("Dataset loaded successfully!")
            st.write("**Original Dataset**")
            st.dataframe(df)

            # Process data
            st.info("Cleaning and simplifying the dataset...")
            df_cleaned = process_data(df)

            # Display cleaned data
            st.write("**Cleaned Dataset**")
            st.dataframe(df_cleaned)

            # Chunk data
            st.info("Creating chunks for AI models...")
            processed_files = chunk_dataset(df_cleaned)

            # Offer each processed chunk as a download
            st.success(f"Processing complete! {len(processed_files)} chunk(s) created.")
            for file_name in processed_files:
                with open(file_name, "rb") as file:
                    st.download_button(
                        label=f"Download {file_name}",
                        data=file.read(),  # Read the bytes now, since the file is deleted below
                        file_name=file_name,
                        mime="text/csv",
                    )

            # Clean up the generated chunk files
            for file_name in processed_files:
                os.remove(file_name)

        except Exception as e:
            st.error(f"An error occurred while processing the dataset: {e}")


if __name__ == "__main__":
    main()
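# Usage sketch (assumptions: this script is saved as app.py and a valid Groq API
# key is exported in the environment as GROQ_API_KEY):
#
#   export GROQ_API_KEY="your-groq-api-key"
#   streamlit run app.py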