antitheft159 committed on
Commit
d5446c7
·
verified ·
1 Parent(s): 1567fe1

Upload 1094_871_252_511_.py

Browse files
Files changed (1) hide show
  1. 1094_871_252_511_.py +104 -0
1094_871_252_511_.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# -*- coding: utf-8 -*-
"""1094_871_252_511_

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1Mz5F8L08R3c_7vQ77xcKumhO9Q__9jAV

Exploratory analysis script. In order, it:

1. Loads the CSV at ``DATA_PATH`` and prints basic data-quality summaries.
2. Indexes the frame by ``Date``, drops ``Region`` and plots a correlation
   heatmap of the remaining columns.
3. Standardizes the columns, projects them onto two principal components
   and clusters the projection with KMeans (3 clusters).
4. Plots three price columns over time.
5. Fits a random-forest regressor for ``MedianListingPriceperSquareFeet``
   and plots its feature importances.
"""

import matplotlib.pyplot as plt
import numpy as np  # noqa: F401  (imported by the original notebook; kept)
import pandas as pd
import plotly.express as px
import plotly.io as pio
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Renderer used by every ``fig.show()`` call below.
pio.renderers.default = 'iframe_connected'

# Colab upload location of the input data set.
DATA_PATH = "/content/unified_monthly_data_interpolated_1990_20250101 (1).csv"

# ---------------------------------------------------------------------------
# Load the data and print quick data-quality summaries
# ---------------------------------------------------------------------------
df = pd.read_csv(DATA_PATH)

df.info()  # prints its own report
# Outside a notebook a bare expression displays nothing, so print explicitly.
print(df.shape)
print(df.isna().sum())
print(df.duplicated().sum())

# ---------------------------------------------------------------------------
# Index by date and drop the 'Region' column
# ---------------------------------------------------------------------------
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)
# NOTE(review): everything below (corr, scaling, PCA) presumably relies on
# all remaining columns being numeric -- confirm against the CSV schema.
df.drop(columns=['Region'], inplace=True)

# ---------------------------------------------------------------------------
# Correlation heatmap
# ---------------------------------------------------------------------------
plt.figure(figsize=(16, 12))
corr = df.corr()
sns.heatmap(corr, annot=True, fmt='.2f', cmap='Reds', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()

# ---------------------------------------------------------------------------
# PCA projection (2 components) of the standardized columns
# ---------------------------------------------------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)
df['PCA1'] = pca_result[:, 0]
df['PCA2'] = pca_result[:, 1]

# ---------------------------------------------------------------------------
# KMeans clustering on the two principal components
# ---------------------------------------------------------------------------
kmeans = KMeans(n_clusters=3, random_state=42)
df['Cluster'] = kmeans.fit_predict(pca_result)

# Cast the labels to str so Plotly colours them as discrete categories
# instead of mapping them onto a continuous colour scale.
fig = px.scatter(
    df,
    x='PCA1',
    y='PCA2',
    color=df['Cluster'].astype(str),
    title='KMeans Clustering on PCA Features',
)
fig.show()

# ---------------------------------------------------------------------------
# Price trends over time
# ---------------------------------------------------------------------------
plt.figure(figsize=(14, 6))
for col in ['AverageSalesPrice', 'MedianSalesPriceofHousesSold', 'MedianListingPriceperSquareFeet']:
    sns.lineplot(data=df[col], label=col)

plt.title('House Price Trends Over Time')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.tight_layout()
plt.show()

# ---------------------------------------------------------------------------
# Random-forest regression of MedianListingPriceperSquareFeet
# ---------------------------------------------------------------------------
# The derived PCA/cluster columns are excluded from the features because
# they were computed from a frame that still contained the target column.
X = df.drop(columns=['MedianListingPriceperSquareFeet', 'Cluster', 'PCA1', 'PCA2'])
y = df['MedianListingPriceperSquareFeet']
# NOTE(review): rows are indexed by date and this split is random, so the
# test set mixes months from the whole period with the training months.
# Consider a chronological split (shuffle=False) if out-of-time accuracy
# is what should be measured.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)

mse = mean_squared_error(y_test, preds)
r2 = r2_score(y_test, preds)
print(f"Mean Squared Error: {mse:.3f}")
print(f"R2 Score: {r2:.3f}")

# ---------------------------------------------------------------------------
# Feature importances
# ---------------------------------------------------------------------------
importances = model.feature_importances_
feat_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances}).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
# Passing `palette` without `hue` is deprecated since seaborn 0.13; mapping
# the y variable to `hue` with the legend disabled gives the same colours.
sns.barplot(
    data=feat_df,
    x='Importance',
    y='Feature',
    hue='Feature',
    palette='viridis',
    legend=False,
)
plt.title('Feature Importance')
plt.tight_layout()
plt.show()