File size: 3,991 Bytes
779bb9b
62e465d
6d6de4b
 
62e465d
779bb9b
 
 
62e465d
 
 
779bb9b
62e465d
 
779bb9b
62e465d
779bb9b
 
 
62e465d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
779bb9b
 
62e465d
 
 
 
6d6de4b
50e1eaf
62e465d
 
 
 
 
779bb9b
62e465d
 
779bb9b
62e465d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
import time
import pandas as pd
import streamlit as st
from io import StringIO
from dotenv import load_dotenv
from huggingface_hub import InferenceClient, login

# ==========================================================
# Page configuration
# ==========================================================
# Streamlit documents set_page_config as having to be the first Streamlit
# command on a page, so it runs before the token check below, which may
# call st.error.
st.set_page_config(page_title="🧹 Smart Data Analysis", page_icon="πŸ“Š", layout="wide")

# ==========================================================
# Load environment + authenticate
# ==========================================================
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

if not HF_TOKEN:
    st.error("❌ Missing Hugging Face token. Please set HF_TOKEN in your .env file.")
    # End this script run here: the rest of the app needs authenticated
    # clients, and rendering it would only lead to failures on every button.
    st.stop()

login(token=HF_TOKEN)

# Create HF clients: one model for CSV cleaning, one for question answering.
cleaner_client = InferenceClient(model="Qwen/Qwen2.5-Coder-14B", token=HF_TOKEN)
analyst_client = InferenceClient(model="Qwen/Qwen2.5-14B-Instruct", token=HF_TOKEN)

# ==========================================================
# App Layout
# ==========================================================
st.title("πŸ“Š Smart Data Analysis Assistant")
st.caption("Clean messy data, then run AI-powered insights and statistical analysis β€” all locally with open-source models.")

# ==========================================================
# Helper: pull the CSV payload out of a model reply
# ==========================================================
def _extract_csv_text(response: str) -> str:
    """Return the CSV payload of a model reply, unwrapping Markdown fences.

    Args:
        response: Raw text returned by the cleaning model.

    Returns:
        The reply stripped of surrounding whitespace when it contains no
        ``` fence; otherwise the content of the first non-empty fenced
        block without its language tag. An empty string is returned when
        nothing but fences and whitespace is present.
    """
    text = response.strip()
    if "```" not in text:
        return text

    # Splitting on the fence marker leaves fenced content at the odd indices;
    # this also covers a final fence that was opened but never closed.
    for segment in text.split("```")[1::2]:
        info_line, newline, body = segment.partition("\n")
        # The remainder of the opening-fence line is a language tag such as
        # "csv". A segment without a line break has no tag line, so the whole
        # segment is treated as data.
        candidate = (body if newline else info_line).strip()
        if candidate:
            return candidate

    # Only empty fences were found: fall back to the text around them.
    return text.replace("```", "").strip()


# ==========================================================
# Upload CSV
# ==========================================================
uploaded_file = st.file_uploader("πŸ“€ Upload your CSV dataset", type=["csv"])
if uploaded_file is not None:
    df_raw = pd.read_csv(uploaded_file)
    st.subheader("πŸ“„ Raw Data Preview")
    st.dataframe(df_raw.head())

    # ==========================================================
    # Data Cleaning
    # ==========================================================
    if st.button("🧹 Clean Data using Qwen Coder 14B"):
        with st.spinner("Cleaning data... please wait ⏳"):
            try:
                # The whole dataset is serialised to CSV text and embedded
                # in the prompt sent to the cleaning model.
                csv_text = df_raw.to_csv(index=False)

                prompt = f"""
You are a Python data cleaning assistant. 
Clean this dataset and fix inconsistent column names, missing values, and formatting.
Return a clean CSV version that can be loaded into pandas directly.

Dataset:
{csv_text}
                """

                response = cleaner_client.text_generation(
                    prompt,
                    temperature=0.2,
                    max_new_tokens=2048,
                )

                # Models often wrap the CSV in a Markdown code fence; unwrap
                # it instead of taking whatever follows the last fence.
                cleaned_csv = _extract_csv_text(response)
                df_cleaned = pd.read_csv(StringIO(cleaned_csv))

                # Keep the result across reruns so the analysis section can
                # read it from session state.
                st.session_state.cleaned_df = df_cleaned
                st.success("βœ… Data cleaned successfully!")
                st.dataframe(df_cleaned.head())

            except Exception as exc:
                # UI boundary: report any failure (inference or parsing) to
                # the user rather than crashing the app.
                st.error(f"⚠️ Cleaning failed: {exc}")

# ==========================================================
# Data Analysis
# ==========================================================
# This section renders only once a cleaned DataFrame is in session state.
if "cleaned_df" in st.session_state:
    df = st.session_state.cleaned_df
    st.divider()
    st.subheader("πŸ“ˆ AI Data Analysis")

    user_query = st.text_area("Ask about your data:", placeholder="e.g., What is the correlation between experience and salary?")
    analyze_clicked = st.button("πŸ” Analyze")
    has_question = bool((user_query or "").strip())

    if analyze_clicked and not has_question:
        # Skip the inference call entirely when there is nothing to answer.
        st.warning("Please enter a question before running the analysis.")
    elif analyze_clicked:
        with st.spinner("Analyzing with Qwen 14B Instruct..."):
            try:
                # Only the first 30 rows are sent to the model as a sample.
                csv_excerpt = df.head(30).to_csv(index=False)
                analysis_prompt = f"""
You are a data analyst. Analyze this dataset and answer the question.

Data sample (CSV):
{csv_excerpt}

Question:
{user_query}

Instructions:
- Be accurate and concise.
- If numerical analysis is relevant, describe it.
- Use markdown for readability.
"""

                response = analyst_client.text_generation(
                    analysis_prompt,
                    temperature=0.5,
                    max_new_tokens=1024,
                )

                st.markdown("### 🧠 Analysis Result")
                st.write(response.strip())

            except Exception as exc:
                # UI boundary: surface any failure to the user instead of
                # crashing the app.
                st.error(f"⚠️ Analysis failed: {exc}")