hydraadra112 committed on
Commit
fbd7328
·
1 Parent(s): e4fb332

Added data viz and details about the data

Browse files
Files changed (5) hide show
  1. .gitignore +1 -2
  2. app.py +107 -31
  3. model.ipynb +16 -16
  4. requirements.txt +2 -2
  5. yield_df.csv +0 -0
.gitignore CHANGED
@@ -1,3 +1,2 @@
1
  .venv
2
- .venv/
3
- yield_df.csv
 
1
  .venv
2
+ .venv/
 
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import streamlit as st
2
  import pandas as pd
3
  import joblib
 
4
 
5
  models = ["Linear Regression", "XGBoost", "Random Forests Regressor"]
6
 
@@ -71,50 +72,125 @@ def get_input_data(df: pd.core.frame.DataFrame):
71
  "avg_temp": [avg_temp]
72
  }), (item, area)
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  def main():
75
  st.title("Crop Yield Predictor")
76
- st.caption("The science of training machines to learn and produce models for future predictions is widely used, and not for nothing. Agriculture plays a critical role in the global economy. With the continuing expansion of the human population understanding worldwide crop yield is central to addressing food security challenges and reducing the impacts of climate change. Crop yield prediction is an important agricultural problem. The Agricultural yield primarily depends on weather conditions (rain, temperature, etc), pesticides and accurate information about history of crop yield is an important thing for making decisions related to agricultural risk management and future predictions.")
 
77
 
78
  df = load_dataset('./yield_df.csv')
79
  df = df.drop("Unnamed: 0", axis=1)
80
 
81
- st.dataframe(df, height=300, width=900)
82
-
83
- st.divider()
84
-
85
- input_data, (item, area) = get_input_data(df)
86
-
87
- selected_model = st.selectbox("Which model do you want to use?",
88
- tuple(models),
89
- placeholder="Select your model"
90
- )
91
-
92
- with st.expander(f"Click to see performance of {selected_model}"):
93
- if selected_model == models[0]:
94
- st.image("./plots/lr_plot.png", caption="Linear Regression Plot")
95
- elif selected_model == models[1]:
96
- st.image("./plots/xgb_plot.png", caption="XG Boost Plot")
97
- elif selected_model == models[2]:
98
- st.image("./plots/rf_plot.png", caption="Random Forests Regressor Plot")
99
-
100
- if st.button("Predict yield!"):
101
 
102
  col1, col2 = st.columns(2)
103
- col1.metric("Area", area, border=True)
104
- col2.metric("Item", item, border=True)
105
 
106
- col3, col4, col5 = st.columns(3)
107
- col3.metric("Average Rainfall", input_data['average_rain_fall_mm_per_year'], border=True)
108
- col4.metric("Pestiscide Usage (Tonne)", input_data['pesticides_tonnes'], border=True)
109
- col5.metric("Average Temperature (Celcius)", input_data['avg_temp'], border=True)
110
 
111
- model, scaler = load_model(selected_model)
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
- input_scaled = scaler.transform(input_data)
114
 
115
- pred = model.predict(input_scaled)
 
116
 
117
- st.header(f"Predicted Crop Yield: **{int(pred[0])}**")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  if __name__ == "__main__":
120
  main()
 
1
  import streamlit as st
2
  import pandas as pd
3
  import joblib
4
+ import plotly.express as px
5
 
6
  models = ["Linear Regression", "XGBoost", "Random Forests Regressor"]
7
 
 
72
  "avg_temp": [avg_temp]
73
  }), (item, area)
74
 
75
def plot_map(countries: pd.Series) -> None:
    """
    Draw a world choropleth shaded by how often each country occurs.

    The occurrences of every distinct value in ``countries`` are counted,
    and the resulting figure is rendered into the current Streamlit
    container.

    Args:
        countries (pd.Series): A pandas series of the countries
    """
    # One row per distinct country: first column is the name, second its
    # number of occurrences in the series.
    frequency = countries.value_counts().reset_index().set_axis(
        ['country', 'count'], axis=1
    )

    choropleth_options = {
        'locations': 'country',
        # Match rows to map regions by plain country name.
        'locationmode': 'country names',
        'color': 'count',
        'hover_name': 'country',
        'color_continuous_scale': 'Blues',
        'title': 'Countries in the Dataset',
    }

    # Build the figure and hand it straight to Streamlit for display.
    st.plotly_chart(px.choropleth(frequency, **choropleth_options))
99
+
100
  def main():
101
  st.title("Crop Yield Predictor")
102
+
103
+ tab1, tab2, tab3 = st.tabs(["About the Data", "Data Viz", "Model Inference"])
104
 
105
  df = load_dataset('./yield_df.csv')
106
  df = df.drop("Unnamed: 0", axis=1)
107
 
108
+ with tab1:
109
+ st.caption("The science of training machines to learn and produce models for future predictions is widely used, and not for nothing. Agriculture plays a critical role in the global economy. With the continuing expansion of the human population understanding worldwide crop yield is central to addressing food security challenges and reducing the impacts of climate change.")
110
+ st.caption(" Crop yield prediction is an important agricultural problem. The Agricultural yield primarily depends on weather conditions (rain, temperature, etc), pesticides and accurate information about history of crop yield is an important thing for making decisions related to agricultural risk management and future predictions.")
111
+
112
+ st.dataframe(df, height=300, width=900)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
  col1, col2 = st.columns(2)
 
 
115
 
116
+ col1.caption("**Area**: Geographic region or country where the crop is cultivated, serving as a key factor in yield variations due to climate, soil, and regional practices.")
117
+ col2.caption("**Item**: Type of crop grown (e.g., wheat, rice), essential for modeling yield patterns and crop-specific responses to environmental factors.")
 
 
118
 
119
+ col1.caption("**Year**: Time of harvest, helping analyze yield trends, seasonal patterns, and the impact of climate change over time.")
120
+ col2.caption("**hg/ha_yield**: Crop yield per hectare (hectograms per hectare), the target variable indicating agricultural productivity for each crop and region.")
121
+
122
+ col1.caption("**average_rain_fall_mm_per_year**: Annual rainfall measured in millimeters, a critical environmental factor influencing crop growth and yield.")
123
+ col2.caption("**pesticides_tonnes**: Total pesticides applied (in tonnes), providing insight into pest control measures and their impact on crop yield.")
124
+
125
+ col1.caption("**avg_temp**: Average annual temperature (°C), a vital climate factor affecting crop growth cycles, maturity rates, and overall yield.")
126
+
127
+ st.divider()
128
+
129
+ with tab2:
130
+ plot_map(df['Area'])
131
+ st.caption("The world map plot above showcases each country and its frequency in the dataset.")
132
 
133
+ st.divider()
134
 
135
+ x = st.selectbox("Choose X for plotting.", tuple(df.columns))
136
+ y = st.selectbox("Choose Y for plotting.", tuple(df.drop(x, axis=1).columns))
137
 
138
+ plot = st.selectbox("Select type of plot.", ("Scatter", "Bar", "Line"))
139
+
140
+ if st.button("Plot X and Y!"):
141
+ if plot == "Scatter":
142
+ st.scatter_chart(
143
+ data=df,
144
+ x=x,
145
+ y=y,
146
+ size='hg/ha_yield'
147
+ )
148
+ elif plot == "Bar":
149
+ st.bar_chart(
150
+ data=df,
151
+ x=x,
152
+ y=y
153
+ )
154
+ elif plot == "Line":
155
+ st.line_chart(
156
+ data=df,
157
+ x=x,
158
+ y=y
159
+ )
160
+ with tab3:
161
+ input_data, (item, area) = get_input_data(df)
162
+
163
+ selected_model = st.selectbox("Which model do you want to use?",
164
+ tuple(models),
165
+ placeholder="Select your model"
166
+ )
167
+
168
+ with st.expander(f"Click to see performance of {selected_model}"):
169
+ if selected_model == models[0]:
170
+ st.image("./plots/lr_plot.png", caption="Linear Regression Plot")
171
+ elif selected_model == models[1]:
172
+ st.image("./plots/xgb_plot.png", caption="XG Boost Plot")
173
+ elif selected_model == models[2]:
174
+ st.image("./plots/rf_plot.png", caption="Random Forests Regressor Plot")
175
+
176
+ if st.button("Predict yield!"):
177
+
178
+ col1, col2 = st.columns(2)
179
+ col1.metric("Area", area, border=True)
180
+ col2.metric("Item", item, border=True)
181
+
182
+ col3, col4, col5 = st.columns(3)
183
+ col3.metric("Average Rainfall", input_data['average_rain_fall_mm_per_year'], border=True)
184
+ col4.metric("Pestiscide Usage (Tonne)", input_data['pesticides_tonnes'], border=True)
185
+ col5.metric("Average Temperature (Celcius)", input_data['avg_temp'], border=True)
186
+
187
+ model, scaler = load_model(selected_model)
188
+
189
+ input_scaled = scaler.transform(input_data)
190
+
191
+ pred = model.predict(input_scaled)
192
+
193
+ st.header(f"Predicted Crop Yield: **{int(pred[0])}**")
194
 
195
  if __name__ == "__main__":
196
  main()
model.ipynb CHANGED
@@ -20,7 +20,7 @@
20
  },
21
  {
22
  "cell_type": "code",
23
- "execution_count": 33,
24
  "metadata": {},
25
  "outputs": [],
26
  "source": [
@@ -40,7 +40,7 @@
40
  },
41
  {
42
  "cell_type": "code",
43
- "execution_count": 34,
44
  "metadata": {},
45
  "outputs": [
46
  {
@@ -215,7 +215,7 @@
215
  "9 1485.0 121.0 15.36 "
216
  ]
217
  },
218
- "execution_count": 34,
219
  "metadata": {},
220
  "output_type": "execute_result"
221
  }
@@ -227,7 +227,7 @@
227
  },
228
  {
229
  "cell_type": "code",
230
- "execution_count": 35,
231
  "metadata": {},
232
  "outputs": [
233
  {
@@ -258,7 +258,7 @@
258
  },
259
  {
260
  "cell_type": "code",
261
- "execution_count": 36,
262
  "metadata": {},
263
  "outputs": [
264
  {
@@ -269,7 +269,7 @@
269
  " dtype='object')"
270
  ]
271
  },
272
- "execution_count": 36,
273
  "metadata": {},
274
  "output_type": "execute_result"
275
  }
@@ -280,7 +280,7 @@
280
  },
281
  {
282
  "cell_type": "code",
283
- "execution_count": 37,
284
  "metadata": {},
285
  "outputs": [],
286
  "source": [
@@ -304,7 +304,7 @@
304
  },
305
  {
306
  "cell_type": "code",
307
- "execution_count": 38,
308
  "metadata": {},
309
  "outputs": [
310
  {
@@ -350,7 +350,7 @@
350
  },
351
  {
352
  "cell_type": "code",
353
- "execution_count": 39,
354
  "metadata": {},
355
  "outputs": [
356
  {
@@ -381,7 +381,7 @@
381
  },
382
  {
383
  "cell_type": "code",
384
- "execution_count": 40,
385
  "metadata": {},
386
  "outputs": [
387
  {
@@ -425,7 +425,7 @@
425
  },
426
  {
427
  "cell_type": "code",
428
- "execution_count": 41,
429
  "metadata": {},
430
  "outputs": [
431
  {
@@ -478,7 +478,7 @@
478
  },
479
  {
480
  "cell_type": "code",
481
- "execution_count": 42,
482
  "metadata": {},
483
  "outputs": [
484
  {
@@ -588,7 +588,7 @@
588
  "4 1485.0 121.0 16.37 "
589
  ]
590
  },
591
- "execution_count": 42,
592
  "metadata": {},
593
  "output_type": "execute_result"
594
  }
@@ -599,7 +599,7 @@
599
  },
600
  {
601
  "cell_type": "code",
602
- "execution_count": 43,
603
  "metadata": {},
604
  "outputs": [],
605
  "source": [
@@ -608,7 +608,7 @@
608
  },
609
  {
610
  "cell_type": "code",
611
- "execution_count": 44,
612
  "metadata": {},
613
  "outputs": [
614
  {
@@ -619,7 +619,7 @@
619
  " dtype='object')"
620
  ]
621
  },
622
- "execution_count": 44,
623
  "metadata": {},
624
  "output_type": "execute_result"
625
  }
 
20
  },
21
  {
22
  "cell_type": "code",
23
+ "execution_count": 1,
24
  "metadata": {},
25
  "outputs": [],
26
  "source": [
 
40
  },
41
  {
42
  "cell_type": "code",
43
+ "execution_count": 2,
44
  "metadata": {},
45
  "outputs": [
46
  {
 
215
  "9 1485.0 121.0 15.36 "
216
  ]
217
  },
218
+ "execution_count": 2,
219
  "metadata": {},
220
  "output_type": "execute_result"
221
  }
 
227
  },
228
  {
229
  "cell_type": "code",
230
+ "execution_count": 3,
231
  "metadata": {},
232
  "outputs": [
233
  {
 
258
  },
259
  {
260
  "cell_type": "code",
261
+ "execution_count": 4,
262
  "metadata": {},
263
  "outputs": [
264
  {
 
269
  " dtype='object')"
270
  ]
271
  },
272
+ "execution_count": 4,
273
  "metadata": {},
274
  "output_type": "execute_result"
275
  }
 
280
  },
281
  {
282
  "cell_type": "code",
283
+ "execution_count": 5,
284
  "metadata": {},
285
  "outputs": [],
286
  "source": [
 
304
  },
305
  {
306
  "cell_type": "code",
307
+ "execution_count": 6,
308
  "metadata": {},
309
  "outputs": [
310
  {
 
350
  },
351
  {
352
  "cell_type": "code",
353
+ "execution_count": 7,
354
  "metadata": {},
355
  "outputs": [
356
  {
 
381
  },
382
  {
383
  "cell_type": "code",
384
+ "execution_count": 8,
385
  "metadata": {},
386
  "outputs": [
387
  {
 
425
  },
426
  {
427
  "cell_type": "code",
428
+ "execution_count": 9,
429
  "metadata": {},
430
  "outputs": [
431
  {
 
478
  },
479
  {
480
  "cell_type": "code",
481
+ "execution_count": 10,
482
  "metadata": {},
483
  "outputs": [
484
  {
 
588
  "4 1485.0 121.0 16.37 "
589
  ]
590
  },
591
+ "execution_count": 10,
592
  "metadata": {},
593
  "output_type": "execute_result"
594
  }
 
599
  },
600
  {
601
  "cell_type": "code",
602
+ "execution_count": 11,
603
  "metadata": {},
604
  "outputs": [],
605
  "source": [
 
608
  },
609
  {
610
  "cell_type": "code",
611
+ "execution_count": 12,
612
  "metadata": {},
613
  "outputs": [
614
  {
 
619
  " dtype='object')"
620
  ]
621
  },
622
+ "execution_count": 12,
623
  "metadata": {},
624
  "output_type": "execute_result"
625
  }
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
  pandas==2.2.3
2
  streamlit==1.42.0
3
  scikit-learn==1.6.1
4
- kagglehub==0.3.7
5
- matplotlib==3.10.0
 
1
  pandas==2.2.3
2
  streamlit==1.42.0
3
  scikit-learn==1.6.1
4
+ matplotlib==3.10.0
5
+ plotly
yield_df.csv ADDED
The diff for this file is too large to render. See raw diff