"""End-to-end linear-regression workflow.

Installs requirements, loads comma-separated data from ``data.txt`` (features
in every column but the last, target in the last column), explores the
feature/target correlations, fits a statsmodels OLS model, drops statistically
insignificant inputs, re-evaluates with scikit-learn, and finally serves a
Gradio prediction app.
"""

import os
import subprocess
import sys

import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Significance level for keeping an input in the reduced model.
ALPHA = 0.05


def install_requirements(requirements_file="requirements.txt"):
    """Best-effort ``pip install -r requirements_file``; reports errors instead of raising."""
    if not os.path.exists(requirements_file):
        print(f"Requirements file not found: {requirements_file}")
        return
    try:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "-r", requirements_file]
        )
        print(f"Successfully installed requirements from {requirements_file}")
    except subprocess.CalledProcessError as e:
        print(f"Error installing requirements: {e}")
    except FileNotFoundError:
        print("pip not found. Ensure pip is installed and in your PATH.")
    except Exception as generic_exception:  # boundary: report and continue
        # BUG FIX: message typo "occured" -> "occurred".
        print(f"An unexpected error occurred: {generic_exception}")


def load_and_split(filepath="data.txt", test_size=0.2, random_state=0):
    """Load a comma-separated data file and return ``(X, y)`` plus a train/test split.

    The last column is the target; all other columns are features.

    Returns:
        Tuple ``(X, y, X_train, X_test, y_train, y_test)``.

    Raises:
        OSError: if ``filepath`` does not exist or cannot be read.
    """
    data = np.loadtxt(filepath, delimiter=",")
    X = data[:, :-1]  # all columns except the last one are features
    y = data[:, -1]  # the last column is the target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    print("Data loaded and split successfully.")
    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"y_test shape: {y_test.shape}")
    return X, y, X_train, X_test, y_train, y_test


def explore_relationships(X, y):
    """Scatter-plot each feature against the target and report its Pearson correlation."""
    for i in range(X.shape[1]):
        plt.figure(figsize=(8, 6))
        plt.scatter(X[:, i], y)
        plt.xlabel(f"Input {i+1}")
        plt.ylabel("Target")
        plt.title(f"Scatter Plot: Input {i+1} vs. Target")
        plt.grid(True)
        plt.show()
        # Comment on the relationship strength via the correlation coefficient.
        correlation = np.corrcoef(X[:, i], y)[0, 1]
        print(f"Correlation between Input {i+1} and Target: {correlation:.4f}")
        if abs(correlation) > 0.7:
            print("Strong relationship detected.")
        elif abs(correlation) > 0.3:
            print("Moderate relationship detected.")
        else:
            print("Weak or no linear relationship detected.")
        print("-" * 30)


def select_significant_inputs(X_train, X_test, y_train):
    """Fit OLS on the training data and reduce both sets to the significant inputs.

    Fits ``sm.OLS`` with an intercept, prints the summary, then partitions the
    inputs by p-value at ``ALPHA``. When insignificant inputs exist, a reduced
    OLS model is fitted and summarized.

    Returns:
        Tuple ``(significant_inputs, Xs_train, Xs_test)`` where ``Xs_*`` hold
        only the significant columns (or all columns when every input is
        significant).
    """
    X_train_const = sm.add_constant(X_train)  # add the intercept column
    model = sm.OLS(y_train, X_train_const).fit()
    print(model.summary())

    p_values = model.pvalues[1:]  # exclude the constant's p-value
    significant_inputs = np.where(p_values < ALPHA)[0]
    insignificant_inputs = np.where(p_values >= ALPHA)[0]
    print("\nSignificant Inputs (indices):", significant_inputs)
    print("Insignificant Inputs (indices):", insignificant_inputs)

    if len(insignificant_inputs) > 0:
        Xs_train = X_train[:, significant_inputs]
        Xs_test = X_test[:, significant_inputs]
        model_reduced = sm.OLS(y_train, sm.add_constant(Xs_train)).fit()
        print("\nModel Summary after dropping insignificant inputs:")
        print(model_reduced.summary())
    else:
        # BUG FIX: the original left the reduced matrices undefined in this
        # branch, so the later scikit-learn step crashed with a NameError
        # whenever every input was significant.
        Xs_train, Xs_test = X_train, X_test
        print("\nNo insignificant inputs found. No model retraining needed.")

    print("\nUnderstanding Insignificant Inputs:")
    print("Insignificant inputs, as indicated by their high p-values (typically > 0.05), suggest that they do not have a statistically significant linear relationship with the target variable, given the other inputs in the model. In other words, their coefficients are not reliably different from zero. Removing them can simplify the model and potentially improve its generalization performance by reducing noise. They contribute little to explaining the variance in the target variable.")
    return significant_inputs, Xs_train, Xs_test


def evaluate_sklearn(Xs_train, Xs_test, y_train, y_test):
    """Fit scikit-learn ``LinearRegression`` on the selected inputs and compare R² scores.

    Returns:
        The fitted ``LinearRegression`` model.
    """
    model_sklearn = LinearRegression()
    model_sklearn.fit(Xs_train, y_train)
    y_train_pred = model_sklearn.predict(Xs_train)
    y_test_pred = model_sklearn.predict(Xs_test)
    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)
    print(f"\nR-squared score on the training set: {r2_train:.4f}")
    print(f"R-squared score on the test set: {r2_test:.4f}")

    # 0.1 gap threshold to flag overfitting; 0.5 floor to flag underfitting.
    if r2_train - r2_test > 0.1:
        print("\nOverfitting: The model performs significantly better on the training data than on the test data.")
    elif r2_train < 0.5 and r2_test < 0.5:
        # BUG FIX: the original printed "performs poorly on both" whenever
        # r2_train < r2_test, even when both scores were excellent; true
        # underfitting means both scores are low.
        print("\nUnderfitting: The model performs poorly on both the training and test data.")
    else:
        print("\nGood fit: The model performs well on both the training and test data, indicating good generalization.")
    return model_sklearn


def launch_app(model_sklearn, significant_inputs):
    """Serve a Gradio app that predicts the target from the significant inputs."""
    # Imported here because gradio may only become available after
    # install_requirements() has run.
    import gradio as gr

    input_components = [gr.Number(label=f"Input {i+1}") for i in significant_inputs]

    def predict(*inputs):
        # Reshape the flat input tuple into a single-row feature matrix.
        input_array = np.array(inputs).reshape(1, -1)
        return model_sklearn.predict(input_array)[0]

    iface = gr.Interface(
        fn=predict,
        inputs=input_components,
        outputs=gr.Number(label="Prediction"),
        title="Linear Regression Prediction App",
        description="Enter the input values to get the predicted target value.",
    )
    iface.launch(share=True)


def main():
    """Run the full pipeline: install, load, explore, select, evaluate, serve."""
    install_requirements()
    X, y, X_train, X_test, y_train, y_test = load_and_split()
    explore_relationships(X, y)
    significant_inputs, Xs_train, Xs_test = select_significant_inputs(
        X_train, X_test, y_train
    )
    model_sklearn = evaluate_sklearn(Xs_train, Xs_test, y_train, y_test)
    launch_app(model_sklearn, significant_inputs)


if __name__ == "__main__":
    main()