data_analysis / src /streamlit_app.py
Starberry15's picture
Update src/streamlit_app.py
62e465d verified
raw
history blame
3.99 kB
import os
import time
import pandas as pd
import streamlit as st
from io import StringIO
from dotenv import load_dotenv
from huggingface_hub import InferenceClient, login
# ==========================================================
# Page setup, environment and authentication
# ==========================================================
# The page configuration is issued before any other Streamlit command
# (including the st.error below), as Streamlit's documentation asks for
# set_page_config to be the first command of a page.
st.set_page_config(page_title="🧹 Smart Data Analysis", page_icon="📊", layout="wide")
st.title("📊 Smart Data Analysis Assistant")
st.caption("Clean messy data, then run AI-powered insights and statistical analysis — all locally with open-source models.")

# Pull variables from a .env file (if any) into the environment, then read
# the Hugging Face token from it.
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    st.error("❌ Missing Hugging Face token. Please set HF_TOKEN in your .env file.")
    # End this script run here: the rest of the app builds clients with the
    # token and calls the models, which makes no sense without one.
    st.stop()

login(token=HF_TOKEN)

# Create HF clients: one model for the cleaning step, one for the analysis step.
cleaner_client = InferenceClient(model="Qwen/Qwen2.5-Coder-14B", token=HF_TOKEN)
analyst_client = InferenceClient(model="Qwen/Qwen2.5-14B-Instruct", token=HF_TOKEN)
# ==========================================================
# Upload CSV and clean it with the coder model
# ==========================================================
def _extract_csv_payload(response: str) -> str:
    """Return the CSV text carried by a model response.

    If the response contains a Markdown code fence, the contents of the
    first fenced block are returned, without the fence's opening line
    (which holds an optional language tag such as ``csv``). An unclosed
    fence is tolerated. A response without any fence is returned as is.
    Surrounding whitespace is stripped in every case.
    """
    text = response.strip()
    _, opening_fence, after_fence = text.partition("```")
    if not opening_fence:
        return text
    fenced, _, _ = after_fence.partition("```")
    _info_string, newline, body = fenced.partition("\n")
    # Without a newline the fence is inline ("```a,b```"): keep all of it.
    return body.strip() if newline else fenced.strip()


uploaded_file = st.file_uploader("📤 Upload your CSV dataset", type=["csv"])
if uploaded_file is not None:
    df_raw = pd.read_csv(uploaded_file)
    st.subheader("📄 Raw Data Preview")
    st.dataframe(df_raw.head())

    # ------------------------------------------------------
    # Data cleaning
    # ------------------------------------------------------
    if st.button("🧹 Clean Data using Qwen Coder 14B"):
        with st.spinner("Cleaning data... please wait ⏳"):
            try:
                # Convert DataFrame to text for cleaning.
                # NOTE(review): the whole dataset goes into the prompt and the
                # reply is capped at 2048 new tokens, so large files may come
                # back truncated — confirm acceptable input sizes.
                csv_text = df_raw.to_csv(index=False)
                prompt = f"""
You are a Python data cleaning assistant.
Clean this dataset and fix inconsistent column names, missing values, and formatting.
Return a clean CSV version that can be loaded into pandas directly.
Dataset:
{csv_text}
"""
                response = cleaner_client.text_generation(
                    prompt,
                    temperature=0.2,
                    max_new_tokens=2048,
                )
                # Take the fenced block's contents rather than whatever follows
                # the last fence (which is empty for a "```csv ... ```" reply).
                cleaned_csv = _extract_csv_payload(response)
                df_cleaned = pd.read_csv(StringIO(cleaned_csv))
                # Keep the result across reruns so the analysis section can use it.
                st.session_state.cleaned_df = df_cleaned
                st.success("✅ Data cleaned successfully!")
                st.dataframe(df_cleaned.head())
            except Exception as e:
                # UI boundary: report any request or parsing failure to the user.
                st.error(f"⚠️ Cleaning failed: {e}")
# ==========================================================
# Data Analysis
# ==========================================================
# Shown once a cleaned DataFrame has been stored in the session state.
if "cleaned_df" in st.session_state:
    df = st.session_state.cleaned_df
    st.divider()
    st.subheader("📈 AI Data Analysis")
    user_query = st.text_area("Ask about your data:", placeholder="e.g., What is the correlation between experience and salary?")
    if st.button("🔍 Analyze"):
        if not user_query.strip():
            # Do not spend a model call on an empty question.
            st.warning("Please enter a question about your data first.")
        else:
            with st.spinner("Analyzing with Qwen 14B Instruct..."):
                try:
                    # Only the first 30 rows are sent to the model as a sample.
                    csv_excerpt = df.head(30).to_csv(index=False)
                    analysis_prompt = f"""
You are a data analyst. Analyze this dataset and answer the question.
Data sample (CSV):
{csv_excerpt}
Question:
{user_query}
Instructions:
- Be accurate and concise.
- If numerical analysis is relevant, describe it.
- Use markdown for readability.
"""
                    response = analyst_client.text_generation(
                        analysis_prompt,
                        temperature=0.5,
                        max_new_tokens=1024,
                    )
                    st.markdown("### 🧠 Analysis Result")
                    st.write(response.strip())
                except Exception as e:
                    # UI boundary: report any request failure to the user.
                    st.error(f"⚠️ Analysis failed: {e}")