File size: 5,416 Bytes
638266b
6f23def
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
638266b
6f23def
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import streamlit as st
import pandas as pd
from pymongo import MongoClient
import os
from dotenv import load_dotenv
from sklearn.ensemble import RandomForestRegressor
import shap
import matplotlib.pyplot as plt
from langchain_groq import ChatGroq
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from io import BytesIO
from streamlit_autorefresh import st_autorefresh






# Load environment variables
# load_dotenv() reads a local .env file (if present) into the process
# environment; each value below may be None if the variable is unset.
load_dotenv()
mongo_uri = os.getenv("MONGO_URI")  # MongoDB connection string
db_name = os.getenv("DB_NAME")  # database holding the sensor data
collection_name = os.getenv("COLLECTION_NAME")  # collection with WQI records
groq_api_key = os.getenv("GROQ_API_KEY")  # API key for the Groq LLM backend

# MongoDB connection
def connect_mongo():
    """Open a MongoDB client and return the configured collection.

    Reads the module-level ``mongo_uri``, ``db_name`` and
    ``collection_name`` values loaded from the environment at import time.
    """
    return MongoClient(mongo_uri)[db_name][collection_name]

# Fetch data from MongoDB
def get_data(collection):
    """Load every document in *collection* into a pandas DataFrame.

    The MongoDB-internal ``_id`` column is removed when present so the
    returned frame contains only sensor/measurement fields.
    """
    frame = pd.DataFrame(list(collection.find()))
    # errors='ignore' makes the drop a no-op when '_id' is absent
    # (e.g. an empty collection), matching the original guard.
    return frame.drop(columns=['_id'], errors='ignore')

# Train the regression model
def train_model(X, y):
    model = RandomForestRegressor(random_state=42)
    model.fit(X, y)
    return model

# Generate AI Report using LangChain + Groq
def generate_report(feature_impact, predicted_wqi, location, timestamp, selected):
    param_info = "\n".join([f"- {param}: {selected[param]}" for param in feature_impact.keys()])

    prompt = PromptTemplate.from_template(
        """You are an expert environmental analyst.

The predicted Water Quality Index (WQI) is {predicted_wqi} at location \"{location}\" on {timestamp}.
The top contributing parameters with their actual sensor values are:
{param_info}

Write a report that includes:
1. Likely causes for this WQI
2. Why these parameters are significant
3. Practical recommendations to improve WQI"""
    )

    llm = ChatGroq(groq_api_key=groq_api_key, model_name="llama-3.3-70b-versatile")
    chain = LLMChain(llm=llm, prompt=prompt)

    report = chain.run(
        predicted_wqi=predicted_wqi,
        location=location,
        timestamp=timestamp,
        param_info=param_info
    )

    report_cleaned = report.replace("**", "")
    return report_cleaned

# Function to save report as TXT
def save_report_as_txt(text: str, filename: str) -> BytesIO:
    buffer = BytesIO()
    buffer.write(text.encode("utf-8"))
    buffer.seek(0)
    return buffer

# ---------- Streamlit UI ----------
# Flat script: load data, train, explain with SHAP, then let the user pick
# a record and generate an LLM report for it.
st.set_page_config(page_title="Water Quality AI Analyzer", layout="wide")
st.title("πŸ’§ Water Quality Index Prediction & AI-Powered Report")

# Add auto-refresh using Streamlit timer (re-runs the whole script every 60 s)
st_autorefresh(interval=60 * 1000, key="datarefresh")
st.markdown("⏰ Auto-refreshing every 60 seconds to fetch latest data...")

# Real-time data load from MongoDB
collection = connect_mongo()
df = get_data(collection)

if df.empty:
    st.warning("No data found in MongoDB.")
    st.stop()  # abort this run; nothing below is meaningful without data

st.success("βœ… Data successfully loaded from MongoDB")
st.dataframe(df.head())

# Define features and target
feature_cols = ['pH', 'turbidity', 'dissolved_oxygen', 'conductivity', 'temperature']
target_col = 'wqi'

if not all(col in df.columns for col in feature_cols + [target_col]):
    st.error("❌ Required columns are missing from the dataset.")
    st.stop()


# Train model on every refresh so new documents are reflected immediately
X = df[feature_cols]
y = df[target_col]
model = train_model(X, y)

# SHAP Explainer over the full training set
explainer = shap.Explainer(model, X)
shap_values = explainer(X)

# Display SHAP feature importance with smaller size
st.subheader("πŸ“Š Feature Impact on WQI (SHAP Values)")
# summary_plot draws on the current matplotlib figure, which is `fig`
fig, ax = plt.subplots(figsize=(6, 4))
shap.summary_plot(shap_values, X, plot_type="bar", show=False)
st.pyplot(fig)

# Select record
st.subheader("πŸ” Select a Data Record for Detailed Analysis")
# Label format "index: location @ timestamp" — the index prefix is parsed
# back out below, so keep it first.
record_options = [f"{i}: {row.get('location', 'Unknown')} @ {row.get('timestamp', 'N/A')}" for i, row in df.iterrows()]
selected_label = st.selectbox("πŸ“‹ Select a Record by Location & Time", options=record_options)
selected_index = int(selected_label.split(":")[0])
selected = df.iloc[selected_index]

# Show selected record details
st.markdown(f"πŸ”’ Selected Index: `{selected_index}`")
st.markdown(f"πŸ“ Location: `{selected.get('location', 'N/A')}`")
st.markdown(f"⏰ Timestamp: `{selected.get('timestamp', 'N/A')}`")

# Single-row frame (1 x n_features) for prediction and per-row SHAP
input_data = selected[feature_cols].to_frame().T
predicted_wqi = model.predict(input_data)[0]

# Display chosen parameter values
st.markdown("### πŸ§ͺ Selected Sensor Parameters Used for WQI Prediction")
for param in feature_cols:
    st.markdown(f"- **{param}**: `{selected[param]}`")

# SHAP for selected row: rank features by absolute contribution, keep top 3
individual_shap = explainer(input_data)
impact = pd.Series(individual_shap.values[0], index=feature_cols).abs().sort_values(ascending=False)
top_impact = impact.head(3).to_dict()

# Show prediction
st.markdown(f"### πŸ€– Predicted WQI: `{predicted_wqi:.2f}`")

# Generate AI report and download
if st.button("πŸ“ Generate AI Report"):
    location = selected.get("location", "Unknown")
    timestamp = selected.get("timestamp", "Unknown")
    report = generate_report(top_impact, predicted_wqi, location, timestamp, selected)

    st.subheader("πŸ“ AI-Generated Water Quality Report")
    st.markdown(report)

    # Save as TXT. str() guards both fields: MongoDB commonly stores
    # timestamps as datetime objects, and slicing a datetime with [:10]
    # raises TypeError; likewise location may not be a str.
    txt_file_name = f"water_quality_report_{str(location).replace(' ', '_')}_{str(timestamp)[:10]}.txt"
    report_txt = save_report_as_txt(report, txt_file_name)

    st.download_button(
        label="πŸ“„ Download Report (TXT)",
        data=report_txt,
        file_name=txt_file_name,
        mime="text/plain"
    )