"""End-to-end linear-regression workflow.

Installs requirements, loads comma-separated data from ``data.txt`` (features
in every column but the last, target in the last column), explores the
feature/target correlations, fits a statsmodels OLS model, drops statistically
insignificant inputs, re-evaluates with scikit-learn, and finally serves a
Gradio prediction app.
"""

import os
import subprocess
import sys

import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Significance level for keeping an input in the reduced model.
ALPHA = 0.05


def install_requirements(requirements_file="requirements.txt"):
    """Best-effort ``pip install -r requirements_file``; reports errors instead of raising."""
    if not os.path.exists(requirements_file):
        print(f"Requirements file not found: {requirements_file}")
        return
    try:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "-r", requirements_file]
        )
        print(f"Successfully installed requirements from {requirements_file}")
    except subprocess.CalledProcessError as e:
        print(f"Error installing requirements: {e}")
    except FileNotFoundError:
        print("pip not found. Ensure pip is installed and in your PATH.")
    except Exception as generic_exception:  # boundary: report and continue
        # BUG FIX: message typo "occured" -> "occurred".
        print(f"An unexpected error occurred: {generic_exception}")


def load_and_split(filepath="data.txt", test_size=0.2, random_state=0):
    """Load a comma-separated data file and return ``(X, y)`` plus a train/test split.

    The last column is the target; all other columns are features.

    Returns:
        Tuple ``(X, y, X_train, X_test, y_train, y_test)``.

    Raises:
        OSError: if ``filepath`` does not exist or cannot be read.
    """
    data = np.loadtxt(filepath, delimiter=",")
    X = data[:, :-1]  # all columns except the last one are features
    y = data[:, -1]  # the last column is the target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    print("Data loaded and split successfully.")
    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"y_test shape: {y_test.shape}")
    return X, y, X_train, X_test, y_train, y_test


def explore_relationships(X, y):
    """Scatter-plot each feature against the target and report its Pearson correlation."""
    for i in range(X.shape[1]):
        plt.figure(figsize=(8, 6))
        plt.scatter(X[:, i], y)
        plt.xlabel(f"Input {i+1}")
        plt.ylabel("Target")
        plt.title(f"Scatter Plot: Input {i+1} vs. Target")
        plt.grid(True)
        plt.show()
        # Comment on the relationship strength via the correlation coefficient.
        correlation = np.corrcoef(X[:, i], y)[0, 1]
        print(f"Correlation between Input {i+1} and Target: {correlation:.4f}")
        if abs(correlation) > 0.7:
            print("Strong relationship detected.")
        elif abs(correlation) > 0.3:
            print("Moderate relationship detected.")
        else:
            print("Weak or no linear relationship detected.")
        print("-" * 30)


def select_significant_inputs(X_train, X_test, y_train):
    """Fit OLS on the training data and reduce both sets to the significant inputs.

    Fits ``sm.OLS`` with an intercept, prints the summary, then partitions the
    inputs by p-value at ``ALPHA``. When insignificant inputs exist, a reduced
    OLS model is fitted and summarized.

    Returns:
        Tuple ``(significant_inputs, Xs_train, Xs_test)`` where ``Xs_*`` hold
        only the significant columns (or all columns when every input is
        significant).
    """
    X_train_const = sm.add_constant(X_train)  # add the intercept column
    model = sm.OLS(y_train, X_train_const).fit()
    print(model.summary())

    p_values = model.pvalues[1:]  # exclude the constant's p-value
    significant_inputs = np.where(p_values < ALPHA)[0]
    insignificant_inputs = np.where(p_values >= ALPHA)[0]
    print("\nSignificant Inputs (indices):", significant_inputs)
    print("Insignificant Inputs (indices):", insignificant_inputs)

    if len(insignificant_inputs) > 0:
        Xs_train = X_train[:, significant_inputs]
        Xs_test = X_test[:, significant_inputs]
        model_reduced = sm.OLS(y_train, sm.add_constant(Xs_train)).fit()
        print("\nModel Summary after dropping insignificant inputs:")
        print(model_reduced.summary())
    else:
        # BUG FIX: the original left the reduced matrices undefined in this
        # branch, so the later scikit-learn step crashed with a NameError
        # whenever every input was significant.
        Xs_train, Xs_test = X_train, X_test
        print("\nNo insignificant inputs found. No model retraining needed.")

    print("\nUnderstanding Insignificant Inputs:")
    print("Insignificant inputs, as indicated by their high p-values (typically > 0.05), suggest that they do not have a statistically significant linear relationship with the target variable, given the other inputs in the model. In other words, their coefficients are not reliably different from zero. Removing them can simplify the model and potentially improve its generalization performance by reducing noise. They contribute little to explaining the variance in the target variable.")
    return significant_inputs, Xs_train, Xs_test


def evaluate_sklearn(Xs_train, Xs_test, y_train, y_test):
    """Fit scikit-learn ``LinearRegression`` on the selected inputs and compare R² scores.

    Returns:
        The fitted ``LinearRegression`` model.
    """
    model_sklearn = LinearRegression()
    model_sklearn.fit(Xs_train, y_train)
    y_train_pred = model_sklearn.predict(Xs_train)
    y_test_pred = model_sklearn.predict(Xs_test)
    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)
    print(f"\nR-squared score on the training set: {r2_train:.4f}")
    print(f"R-squared score on the test set: {r2_test:.4f}")

    # 0.1 gap threshold to flag overfitting; 0.5 floor to flag underfitting.
    if r2_train - r2_test > 0.1:
        print("\nOverfitting: The model performs significantly better on the training data than on the test data.")
    elif r2_train < 0.5 and r2_test < 0.5:
        # BUG FIX: the original printed "performs poorly on both" whenever
        # r2_train < r2_test, even when both scores were excellent; true
        # underfitting means both scores are low.
        print("\nUnderfitting: The model performs poorly on both the training and test data.")
    else:
        print("\nGood fit: The model performs well on both the training and test data, indicating good generalization.")
    return model_sklearn


def launch_app(model_sklearn, significant_inputs):
    """Serve a Gradio app that predicts the target from the significant inputs."""
    # Imported here because gradio may only become available after
    # install_requirements() has run.
    import gradio as gr

    input_components = [gr.Number(label=f"Input {i+1}") for i in significant_inputs]

    def predict(*inputs):
        # Reshape the flat input tuple into a single-row feature matrix.
        input_array = np.array(inputs).reshape(1, -1)
        return model_sklearn.predict(input_array)[0]

    iface = gr.Interface(
        fn=predict,
        inputs=input_components,
        outputs=gr.Number(label="Prediction"),
        title="Linear Regression Prediction App",
        description="Enter the input values to get the predicted target value.",
    )
    iface.launch(share=True)


def main():
    """Run the full pipeline: install, load, explore, select, evaluate, serve."""
    install_requirements()
    X, y, X_train, X_test, y_train, y_test = load_and_split()
    explore_relationships(X, y)
    significant_inputs, Xs_train, Xs_test = select_significant_inputs(
        X_train, X_test, y_train
    )
    model_sklearn = evaluate_sklearn(Xs_train, Xs_test, y_train, y_test)
    launch_app(model_sklearn, significant_inputs)


if __name__ == "__main__":
    main()