# AUPPMSAI / app.py
# Hugging Face Space by limravy (commit fe95fab, verified); launched with share=True.
# --- Bootstrap: install third-party dependencies before they are imported. ---
import subprocess
import sys
import os

requirements_file = "requirements.txt"
if os.path.exists(requirements_file):
    try:
        # Use the running interpreter's pip so packages land in this environment.
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "-r", requirements_file]
        )
        print(f"Successfully installed requirements from {requirements_file}")
    except subprocess.CalledProcessError as e:
        # pip ran but returned a non-zero exit code (e.g. a bad requirement).
        print(f"Error installing requirements: {e}")
    except FileNotFoundError:
        print("pip not found. Ensure pip is installed and in your PATH.")
    except Exception as generic_exception:
        # Last-resort boundary handler; typo fixed ("occured" -> "occurred").
        print(f"An unexpected error occurred: {generic_exception}")
else:
    print(f"Requirements file not found: {requirements_file}")
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# --- Load the dataset and carve out train/test partitions. ---
# filepath = "/content/data.txt"  # original Colab path, kept for reference
filepath = "data.txt"
test_size = 0.2
random_state = 0

# Comma-separated numeric file: every column but the last is a feature;
# the final column is the regression target.
data = np.loadtxt(filepath, delimiter=",")
X, y = data[:, :-1], data[:, -1]

# Fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)

print("Data loaded and split successfully.")
for name, arr in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{name} shape: {arr.shape}")
# --- Exploratory analysis: one scatter plot + correlation per feature. ---
# NOTE(review): np.loadtxt raises on failure rather than returning None, so
# this guard can never take the else branch; kept to preserve original flow.
if X is not None:
    num_inputs = X.shape[1]
    for i in range(num_inputs):
        plt.figure(figsize=(8, 6))
        plt.scatter(X[:, i], y)
        plt.xlabel(f"Input {i+1}")
        plt.ylabel("Target")
        plt.title(f"Scatter Plot: Input {i+1} vs. Target")
        plt.grid(True)
        plt.show()

        # Pearson correlation between this feature and the target.
        correlation = np.corrcoef(X[:, i], y)[0, 1]
        print(f"Correlation between Input {i+1} and Target: {correlation:.4f}")
        # Rough strength buckets: |r| > 0.7 strong, > 0.3 moderate, else weak.
        if abs(correlation) > 0.7:
            print("Strong relationship detected.")
        elif abs(correlation) > 0.3:
            print("Moderate relationship detected.")
        else:
            print("Weak or no linear relationship detected.")
        print("-" * 30)
else:
    print("Data loading or splitting failed.")
# --- Fit OLS with statsmodels and drop statistically insignificant inputs. ---
# Add a constant (intercept) column to the training data.
X_train_const = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train_const).fit()
print(model.summary())

# p-values of the feature coefficients; index 0 (the intercept) is excluded,
# so these indices line up with the original feature columns of X.
p_values = model.pvalues[1:]
significant_inputs = np.where(p_values < 0.05)[0]   # 0.05 significance level
insignificant_inputs = np.where(p_values >= 0.05)[0]
print("\nSignificant Inputs (indices):", significant_inputs)
print("Insignificant Inputs (indices):", insignificant_inputs)

if len(insignificant_inputs) > 0:
    # +1 shifts feature indices past the intercept column; 0 keeps the intercept.
    significant_columns = np.concatenate(([0], significant_inputs + 1))
    Xs_train_const = X_train_const[:, significant_columns]
    Xs_test = X_test[:, significant_inputs]
    Xs_test_const = sm.add_constant(Xs_test)
    model_reduced = sm.OLS(y_train, Xs_train_const).fit()
    print("\nModel Summary after dropping insignificant inputs:")
    print(model_reduced.summary())
else:
    # Bug fix: the code below uses Xs_train_const and Xs_test unconditionally,
    # so when nothing is dropped they must still be defined — otherwise the
    # script dies with a NameError on a fully-significant dataset.
    Xs_train_const = X_train_const
    Xs_test = X_test
    print("\nNo insignificant inputs found. No model retraining needed.")

print("\nUnderstanding Insignificant Inputs:")
print("Insignificant inputs, as indicated by their high p-values (typically > 0.05), suggest that they do not have a statistically significant linear relationship with the target variable, given the other inputs in the model. In other words, their coefficients are not reliably different from zero. Removing them can simplify the model and potentially improve its generalization performance by reducing noise. They contribute little to explaining the variance in the target variable.")
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# --- Refit with scikit-learn on the significant features and check fit quality. ---
# sklearn adds its own intercept, so slice off the statsmodels constant column.
model_sklearn = LinearRegression()
model_sklearn.fit(Xs_train_const[:, 1:], y_train)

# Predictions on both partitions.
y_train_pred = model_sklearn.predict(Xs_train_const[:, 1:])
y_test_pred = model_sklearn.predict(Xs_test)

# R-squared scores measure the fraction of target variance explained.
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
print(f"\nR-squared score on the training set: {r2_train:.4f}")
print(f"R-squared score on the test set: {r2_test:.4f}")

# Compare R-squared scores and comment.
if r2_train > r2_test and (r2_train - r2_test) > 0.1:  # gap threshold flags overfitting
    print("\nOverfitting: The model performs significantly better on the training data than on the test data.")
elif r2_train < 0.5 and r2_test < 0.5:
    # Bug fix: underfitting means LOW scores on both sets; the original
    # condition (r2_train < r2_test) said nothing about absolute performance
    # and contradicted the message it printed.
    print("\nUnderfitting: The model performs poorly on both the training and test data.")
else:
    print("\nGood fit: The model performs well on both the training and test data, indicating good generalization.")
## YOUR CODES need to screenshot app
# !pip install gradio   (handled by the requirements.txt bootstrap above)
import gradio as gr

# --- Gradio UI: serve predictions from the reduced sklearn model. ---
# One numeric field per feature that survived the significance screen; the
# i+1 label matches the "Input {i+1}" naming used in the scatter plots.
input_components = [gr.Number(label=f"Input {i+1}") for i in significant_inputs]


def predict(*inputs):
    """Return the model's prediction for one row of feature values.

    Gradio passes each Number field as a positional argument in the same
    order as `input_components`, which is the feature order the reduced
    model was trained on, so a plain reshape to (1, n_features) suffices.
    """
    input_array = np.array(inputs).reshape(1, -1)
    prediction = model_sklearn.predict(input_array)[0]
    return prediction


iface = gr.Interface(
    fn=predict,
    inputs=input_components,
    outputs=gr.Number(label="Prediction"),
    title="Linear Regression Prediction App",
    description="Enter the input values to get the predicted target value.",
)

# share=True opens a public tunnel link (useful when not on a hosted Space).
iface.launch(share=True)