import streamlit as st
import pandas as pd
from pymongo import MongoClient
import os
from dotenv import load_dotenv
from sklearn.ensemble import RandomForestRegressor
import shap
import matplotlib.pyplot as plt
from langchain_groq import ChatGroq
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from io import BytesIO
from streamlit_autorefresh import st_autorefresh
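# Assumed third-party dependencies, inferred from the imports above (PyPI names;
# pin versions to match your own environment):
#   pip install streamlit pandas pymongo python-dotenv scikit-learn shap \
#       matplotlib langchain langchain-groq streamlit-autorefresh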
# Load environment variables
load_dotenv()
mongo_uri = os.getenv("MONGO_URI")
db_name = os.getenv("DB_NAME")
collection_name = os.getenv("COLLECTION_NAME")
groq_api_key = os.getenv("GROQ_API_KEY")
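# Example .env consumed by load_dotenv() above. The variable names match the
# os.getenv() calls; the values are placeholders, not real credentials:
#   MONGO_URI=mongodb://localhost:27017
#   DB_NAME=water_quality
#   COLLECTION_NAME=sensor_readings
#   GROQ_API_KEY=<your-groq-api-key>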
# MongoDB connection
def connect_mongo():
    client = MongoClient(mongo_uri)
    db = client[db_name]
    return db[collection_name]
# Fetch data from MongoDB
def get_data(collection):
    df = pd.DataFrame(list(collection.find()))
    if '_id' in df.columns:
        df.drop(columns=['_id'], inplace=True)
    return df
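# Expected document shape (illustrative values only): the code below reads the
# five sensor fields, the 'wqi' target, plus 'location' and 'timestamp', e.g.
#   {"pH": 7.2, "turbidity": 3.1, "dissolved_oxygen": 6.8, "conductivity": 410,
#    "temperature": 24.5, "wqi": 72.4, "location": "Station A",
#    "timestamp": "2024-01-01T10:00:00"}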
# Train the regression model
def train_model(X, y):
    model = RandomForestRegressor(random_state=42)
    model.fit(X, y)
    return model
# Generate AI Report using LangChain + Groq
def generate_report(feature_impact, predicted_wqi, location, timestamp, selected):
    param_info = "\n".join([f"- {param}: {selected[param]}" for param in feature_impact.keys()])
    prompt = PromptTemplate.from_template(
        """You are an expert environmental analyst.
The predicted Water Quality Index (WQI) is {predicted_wqi} at location \"{location}\" on {timestamp}.
The top contributing parameters with their actual sensor values are:
{param_info}
Write a report that includes:
1. Likely causes for this WQI
2. Why these parameters are significant
3. Practical recommendations to improve WQI"""
    )
    llm = ChatGroq(groq_api_key=groq_api_key, model_name="llama-3.3-70b-versatile")
    chain = LLMChain(llm=llm, prompt=prompt)
    report = chain.run(
        predicted_wqi=predicted_wqi,
        location=location,
        timestamp=timestamp,
        param_info=param_info
    )
    report_cleaned = report.replace("**", "")
    return report_cleaned
# Function to save report as TXT
def save_report_as_txt(text: str, filename: str) -> BytesIO:
    buffer = BytesIO()
    buffer.write(text.encode("utf-8"))
    buffer.seek(0)
    return buffer
# ---------- Streamlit UI ----------
st.set_page_config(page_title="Water Quality AI Analyzer", layout="wide")
st.title("Water Quality Index Prediction & AI-Powered Report")
# Add auto-refresh using Streamlit timer
st_autorefresh(interval=60 * 1000, key="datarefresh")
st.markdown("Auto-refreshing every 60 seconds to fetch latest data...")
# Real-time data load from MongoDB
collection = connect_mongo()
df = get_data(collection)
if df.empty:
    st.warning("No data found in MongoDB.")
    st.stop()
st.success("Data successfully loaded from MongoDB")
st.dataframe(df.head())
# Define features and target
feature_cols = ['pH', 'turbidity', 'dissolved_oxygen', 'conductivity', 'temperature']
target_col = 'wqi'
if not all(col in df.columns for col in feature_cols + [target_col]):
    st.error("Required columns are missing from the dataset.")
    st.stop()
# Train model
X = df[feature_cols]
y = df[target_col]
model = train_model(X, y)
# SHAP Explainer
explainer = shap.Explainer(model, X)
shap_values = explainer(X)
# Display SHAP feature importance with smaller size
st.subheader("Feature Impact on WQI (SHAP Values)")
fig, ax = plt.subplots(figsize=(6, 4))
shap.summary_plot(shap_values, X, plot_type="bar", show=False)
st.pyplot(fig)
# Select record
st.subheader("Select a Data Record for Detailed Analysis")
record_options = [f"{i}: {row.get('location', 'Unknown')} @ {row.get('timestamp', 'N/A')}" for i, row in df.iterrows()]
selected_label = st.selectbox("Select a Record by Location & Time", options=record_options)
selected_index = int(selected_label.split(":")[0])
selected = df.iloc[selected_index]
# Show selected record details
st.markdown(f"Selected Index: `{selected_index}`")
st.markdown(f"Location: `{selected.get('location', 'N/A')}`")
st.markdown(f"Timestamp: `{selected.get('timestamp', 'N/A')}`")
# Cast to float: slicing mixed-type columns from a single row yields an object-dtype frame
input_data = selected[feature_cols].to_frame().T.astype(float)
predicted_wqi = model.predict(input_data)[0]
# Display chosen parameter values
st.markdown("### π§ͺ Selected Sensor Parameters Used for WQI Prediction")
for param in feature_cols:
st.markdown(f"- **{param}**: `{selected[param]}`")
# SHAP for selected row
individual_shap = explainer(input_data)
impact = pd.Series(individual_shap.values[0], index=feature_cols).abs().sort_values(ascending=False)
top_impact = impact.head(3).to_dict()
# Show prediction
st.markdown(f"### π€ Predicted WQI: `{predicted_wqi:.2f}`")
# Generate AI report and download
if st.button("Generate AI Report"):
    location = selected.get("location", "Unknown")
    timestamp = selected.get("timestamp", "Unknown")
    report = generate_report(top_impact, predicted_wqi, location, timestamp, selected)
    st.subheader("AI-Generated Water Quality Report")
    st.markdown(report)
    # Save as TXT (str() guards against non-string timestamps coming back from MongoDB)
    txt_file_name = f"water_quality_report_{location.replace(' ', '_')}_{str(timestamp)[:10]}.txt"
    report_txt = save_report_as_txt(report, txt_file_name)
    st.download_button(
        label="Download Report (TXT)",
        data=report_txt,
        file_name=txt_file_name,
        mime="text/plain"
    )
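
# To run locally (assuming this script is saved as app.py and the .env file sits
# in the working directory):
#   streamlit run app.py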