Spaces:

wayne-chi
/

Testing

Sleeping

App Files Files Community

wayne-chi committed on Aug 12, 2025

Commit

78575a4

verified ·

1 Parent(s): 55c27cb

Upload 6 files

Browse files

Files changed (6) hide show

download_models.py +28 -0
inference.py +242 -0
predictor.py +640 -0
requirements.txt +13 -3
setup.sh +2 -0
streamlit_app.py +15 -0

download_models.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import os
+from huggingface_hub import hf_hub_download, snapshot_download
+# Target directory for models
+target_dir = "Models"
+os.makedirs(target_dir, exist_ok=True)
+# Download specific files (Folds 1–5) from willieseun/Eagle-Team-TabPFN
+print("Downloading fold models from willieseun/Eagle-Team-TabPFN...")
+for i in range(1, 6):
+    file_name = f"Fold_{i}_best_model.tabpfn_fit"
+    model_path = hf_hub_download(
+        repo_id="willieseun/Eagle-Team-TabPFN",
+        filename=file_name,
+        local_dir=target_dir
+    )
+    print(f"Downloaded: {model_path}")
+# Download full snapshot from wayne-chi/Eagle_Team
+print("\nDownloading snapshot from wayne-chi/Eagle_Team...")
+snapshot_download(
+    repo_id="wayne-chi/Eagle_Team",
+    revision="main",  # Optional, default is "main"
+    local_dir=target_dir,
+    local_dir_use_symlinks=False
+)
+print("\n✅ All models downloaded successfully to:", target_dir)

inference.py ADDED Viewed

	@@ -0,0 +1,242 @@

+import pandas as pd
+import numpy as np
+import torch
+import joblib
+import argparse
+import os
+import glob
+from sklearn.multioutput import MultiOutputRegressor
+from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import AutoTabPFNRegressor
+from tabpfn import TabPFNRegressor
+os.environ["TABPFN_ALLOW_CPU_LARGE_DATASET"] = "true"
+def joblib_load_cpu(path):
+	# Patch torch.load globally inside joblib to always load on CPU
+	original_load = torch.load
+	def cpu_loader(*args, **kwargs):
+		kwargs['map_location'] = torch.device('cpu')
+		return original_load(*args, **kwargs)
+	torch.load = cpu_loader
+	try:
+		model = joblib.load(path)
+	finally:
+		torch.load = original_load  # Restore original torch.load
+	return model
+class TabPFNEnsemblePredictor:
+	"""
+	A class to load an ensemble of TabPFN models and generate averaged predictions.
+	This class finds and loads all k-fold models from a specified directory, applies the
+	feature engineering they need, and produces a single averaged prediction from
+	several input types (DataFrame, numpy array, or CSV file path).
+	Attributes:
+		model_paths (list): Sorted file paths that matched the glob pattern.
+		models (list): The model objects that loaded successfully; files that failed to
+			load are skipped, so this can be shorter than ``model_paths`` (or empty).
+		target_cols (list): Output column names, 'BlendProperty1' .. 'BlendProperty10'.
+	"""
+	def __init__(self, model_dir: str, model_pattern: str = "Fold_*_best_model.tabpfn_fit*"):
+		"""
+		Initializes the predictor by finding and loading the ensemble of models.
+		Args:
+			model_dir (str): The directory containing the saved .tabpfn_fit model files.
+			model_pattern (str, optional): The glob pattern to find model files.
+				Defaults to "Fold_*_best_model.tabpfn_fit*" (note the trailing wildcard).
+		Raises:
+			FileNotFoundError: If no models matching the pattern are found in the directory.
+		"""
+		print("Initializing the TabPFN Ensemble Predictor...")
+		# Sorting keeps the fold order stable regardless of filesystem listing order.
+		self.model_paths = sorted(glob.glob(os.path.join(model_dir, model_pattern)))
+		if not self.model_paths:
+			raise FileNotFoundError(
+				f"Error: No models found in '{model_dir}' matching the pattern '{model_pattern}'"
+			)
+		print(f"Found {len(self.model_paths)} models to form the ensemble.")
+		self.models = self._load_models()
+		self.target_cols = [f"BlendProperty{i}" for i in range(1, 11)]
+	def _load_models(self) -> list:
+		"""
+		Loads every model listed in ``self.model_paths``.
+		Without CUDA, each file is read through ``joblib_load_cpu`` and every entry of
+		``model.estimators_`` gets ``device = "cpu"`` and ``max_time = 40``. With CUDA, the
+		file is read with plain ``joblib.load`` and each estimator's predictors (when the
+		attributes exist) are moved to "cuda".
+		This is a private method called during initialization.
+		Returns:
+			list: The models that loaded. A file that raises while loading is reported with
+			a printed warning and skipped, so the list may be empty.
+		"""
+		loaded_models = []
+		for model_path in self.model_paths:
+			print(f"Loading model: {os.path.basename(model_path)}...")
+			try:
+				# Move model components to CPU for inference to avoid potential CUDA errors
+				# and ensure compatibility on machines without a GPU.
+				if not torch.cuda.is_available():
+					#torch.device("cpu")  # Force default
+					#os.environ["PYTORCH_NO_CUDA_MEMORY_CACHING"] = "1"
+					#os.environ["CUDA_VISIBLE_DEVICES"] = ""
+					#os.environ["HSA_OVERRIDE_GFX_VERSION"] = "0"
+					model = joblib_load_cpu(model_path)
+					for estimator in model.estimators_:
+						estimator.device = "cpu"
+						# NOTE(review): presumably a per-estimator time budget; unit not shown here — confirm.
+						estimator.max_time = 40
+					print("Cuda not available using cpu")
+					#for estimator in model.estimators_:
+					#	if hasattr(estimator, "predictor_") and hasattr(estimator.predictor_, "predictors"):
+					#		for p in estimator.predictor_.predictors:
+					#			p.to("cpu")
+					#	if hasattr(estimator.predictor_, 'to'):
+					#		estimator.predictor_.to('cpu')
+				else:
+					print("Cuda is available")
+					model = joblib.load(model_path)
+					for estimator in model.estimators_:
+						if hasattr(estimator, "predictor_") and hasattr(estimator.predictor_, "predictors"):
+							for p in estimator.predictor_.predictors:
+								p.to("cuda")
+				loaded_models.append(model)
+				print(f"Successfully loaded {os.path.basename(model_path)}")
+			except Exception as e:
+				# Best effort: one unreadable fold must not prevent the others from loading.
+				print(f"Warning: Could not load model from {model_path}. Skipping. Error: {e}")
+		return loaded_models
+	@staticmethod
+	def _feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
+		"""
+		Applies feature engineering to a copy of the input dataframe. This is a static
+		method as it does not depend on the state of the class instance.
+		For each PropertyN (N = 1..10) three columns are added:
+		``Weighted_PropertyN`` (sum over Component1..5 of fraction * property),
+		``PropertyN_variance`` (row-wise ``.var`` of the five component columns) and
+		``PropertyN_range`` (row-wise max minus min of the same columns).
+		Args:
+			df (pd.DataFrame): Must contain ``ComponentK_fraction`` and
+				``ComponentK_PropertyN`` columns for K = 1..5, N = 1..10.
+		Returns:
+			pd.DataFrame: A copy of ``df`` with the engineered columns appended.
+		"""
+		components = ['Component1', 'Component2', 'Component3', 'Component4', 'Component5']
+		properties = [f'Property{i}' for i in range(1, 11)]
+		df_featured = df.copy()
+		for prop in properties:
+			df_featured[f'Weighted_{prop}'] = sum(
+				df_featured[f'{comp}_fraction'] * df_featured[f'{comp}_{prop}'] for comp in components
+			)
+			cols = [f'{comp}_{prop}' for comp in components]
+			df_featured[f'{prop}_variance'] = df_featured[cols].var(axis=1)
+			df_featured[f'{prop}_range'] = df_featured[cols].max(axis=1) - df_featured[cols].min(axis=1)
+		return df_featured
+	def custom_predict(self, input_data: "pd.DataFrame | np.ndarray | str") -> "tuple[np.ndarray | None, pd.DataFrame | None]":
+		"""
+		Generates ensembled predictions for the given input data.
+		This method takes input data, preprocesses it if necessary, generates a
+		prediction from each model in the ensemble, and returns the averaged result.
+		Args:
+			input_data (pd.DataFrame or np.ndarray or str): The input data for prediction.
+				A DataFrame or a path to an existing CSV file is feature-engineered and its
+				"ID" column (if present) is dropped; a numpy array is passed to the models
+				unchanged and must already be pre-processed.
+		Returns:
+			tuple: A tuple containing:
+				- np.ndarray: The averaged predictions as a numpy array.
+				- pd.DataFrame: The same values with ``self.target_cols`` as columns.
+			``(None, None)`` is returned when no models were loaded or when every model
+			failed to predict.
+		Raises:
+			TypeError: If ``input_data`` is none of the supported types, including a
+				string that is not the path of an existing file.
+		"""
+		if not self.models:
+			print("Error: No models were loaded. Cannot make predictions.")
+			return None, None
+		# --- Data Preparation ---
+		if isinstance(input_data, str) and os.path.isfile(input_data):
+			print(f"Loading and processing data from CSV: {input_data}")
+			test_df = pd.read_csv(input_data)
+			processed_df = self._feature_engineering(test_df)
+		elif isinstance(input_data, pd.DataFrame):
+			print("Processing input DataFrame...")
+			processed_df = self._feature_engineering(input_data)
+		elif isinstance(input_data, np.ndarray):
+			print("Using input numpy array directly (assuming it's pre-processed).")
+			sub = input_data
+		else:
+			raise TypeError("Input data must be a pandas DataFrame, a numpy array, or a path to a CSV file.")
+		# Only the CSV/DataFrame branches define processed_df; the array branch set `sub` already.
+		if isinstance(input_data, (str, pd.DataFrame)):
+			if "ID" in processed_df.columns:
+				sub = processed_df.drop(columns=["ID"]).values
+			else:
+				sub = processed_df.values
+		# --- Prediction Loop ---
+		all_fold_predictions = []
+		print("\nGenerating predictions from the model ensemble...")
+		for i, model in enumerate(self.models):
+			try:
+				y_sub = model.predict(sub)
+				all_fold_predictions.append(y_sub)
+				print(f"  - Prediction from model {i+1} completed.")
+			except Exception as e:
+				# Best effort: a failing fold is left out of the average instead of aborting.
+				print(f"  - Warning: Could not predict with model {i+1}. Skipping. Error: {e}")
+		if not all_fold_predictions:
+			print("\nError: No predictions were generated from any model.")
+			return None, None
+		# --- Averaging ---
+		print("\nAveraging predictions from all models...")
+		# axis=0 averages across models, keeping the per-row / per-target layout.
+		averaged_preds_array = np.mean(all_fold_predictions, axis=0)
+		averaged_preds_df = pd.DataFrame(averaged_preds_array, columns=self.target_cols)
+		print("Ensemble prediction complete.")
+		return averaged_preds_array, averaged_preds_df
+# This block allows the script to be run directly from the command line
+if __name__ == "__main__":
+	parser = argparse.ArgumentParser(
+		description="""
+		Command-line interface for the TabPFNEnsemblePredictor.
+		Example Usage:
+		python inference.py --model_dir ./saved_models/ --input_path ./test_data.csv --output_path ./final_preds.csv
+		""",
+		formatter_class=argparse.RawTextHelpFormatter
+	)
+	parser.add_argument("--model_dir", type=str, required=True,
+						help="Directory containing the saved .tabpfn_fit model files.")
+	parser.add_argument("--input_path", type=str, required=True,
+						help="Path to the input CSV file for prediction.")
+	parser.add_argument("--output_path", type=str, default="predictions_ensembled.csv",
+						help="Path to save the final ensembled predictions CSV file.")
+	args = parser.parse_args()
+	if not os.path.isdir(args.model_dir):
+		print(f"Error: Model directory not found at {args.model_dir}")
+	elif not os.path.exists(args.input_path):
+		print(f"Error: Input file not found at {args.input_path}")
+	else:
+		try:
+			# 1. Instantiate the predictor class
+			predictor = TabPFNEnsemblePredictor(model_dir=args.model_dir)
+			# 2. Call the predict method
+			preds_array, preds_df = predictor.predict(args.input_path)
+			# 3. Save the results
+			if preds_df is not None:
+				preds_df.to_csv(args.output_path, index=False)
+				print(f"\nEnsembled predictions successfully saved to {args.output_path}")
+				print("\n--- Sample of Final Averaged Predictions ---")
+				print(preds_df.head())
+				print("------------------------------------------")
+		except Exception as e:
+			print(f"\nAn error occurred during the process: {e}")

predictor.py ADDED Viewed

	@@ -0,0 +1,640 @@

+# prompt: import pandas and basic machine learning models for regression
+import pandas as pd
+from sklearn.linear_model import LinearRegression
+from sklearn.tree import DecisionTreeRegressor
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.svm import SVR
+from sklearn.model_selection import train_test_split
+import itertools
+import random
+import torch
+import random
+import numpy as np
+import os
+import joblib
+import matplotlib.pyplot as plt
+from tabpfn import TabPFNRegressor
+from sklearn.model_selection import KFold
+from sklearn.multioutput import MultiOutputRegressor
+from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import PolynomialFeatures
+from sklearn.metrics import mean_absolute_percentage_error
+from sklearn.linear_model import LinearRegression
+from inference import TabPFNEnsemblePredictor  # import inference.py
+# from sklearn.metrics import mean_absolute_percentage_error
+# from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import AutoTabPFNRegressor
+from itertools import combinations
+from scipy.special import comb
+# from tabpfn.model.loading import (
+#     load_fitted_tabpfn_model,
+#     save_fitted_tabpfn_model,
+# )
+class EagleBlendPredictor:
+    def __init__(self, model_sources = './Models'):
+        """
+        model_sources: Dict[str, Any]
+            A dictionary where keys are 'BlendProperty1', ..., 'BlendProperty10'
+            and values are:
+              - loaded model objects, or
+              - callables returning models, or
+              - custom loading logic (you will supply these)
+        """
+        self.home = model_sources
+        self.saved_files_map = {
+                      1: {
+                          "model": 'linear_model_poly_target_1.joblib',
+                          "transform": 'poly1_features.joblib'
+                      },
+                      2: {
+                          "model": 'linear_model_poly_target_2.joblib',
+                          "transform": 'poly2_features.joblib'
+                      },
+                      5: {
+                          "model": 'tabpfn_model_target_5.joblib', #tabpfn_model_target_5_cpu.tabpfn_fit,'tabpfn_model_target_5_cpu.tabpfn_fit'
+                          "transform": 'poly5_features.joblib'
+                      },
+                      6: {
+                          "model": 'linear_model_poly_target_6.joblib',
+                          "transform": 'poly6_features.joblib'
+                      },
+                      7: {
+                          "model": 'tabpfn_model_target_7.joblib',
+                          # For Property 7, the transformation is the mixture feature generation,
+                          # which is not a saved object like PolynomialFeatures.
+                          # You would need to apply the generate_mixture_features function.
+                          "transform_function": "generate_mixture_features"
+                      },
+                      8: {
+                          # For Property 8, the "model" is the initial prediction model (not explicitly saved in this workflow)
+                          # and the correction is the piecewise function defined by parameters and threshold.
+                          "params": 'piecewise_params_prop8.joblib',
+                          "threshold": 'piecewise_threshold_prop8.joblib',
+                          "correction_function": "piecewise_model" # Reference the function name
+                      },
+                      10: {
+                          "model": 'linear_model_poly_target_10.joblib',
+                          "transform": 'poly10_features.joblib'
+                      }
+                  }
+        self.models = {}
+        # Load models and transformers manually
+        self.model_1 = joblib.load(os.path.join(self.home, self.saved_files_map[1]["model"]))
+        self.poly_1 = joblib.load(os.path.join(self.home, self.saved_files_map[1]["transform"]))
+        self.model_2 = joblib.load(os.path.join(self.home, self.saved_files_map[2]["model"]))
+        self.poly_2 = joblib.load(os.path.join(self.home, self.saved_files_map[2]["transform"]))
+        self.model_5 = joblib.load(
+            os.path.join(self.home, self.saved_files_map[5]["model"]), #device="cpu"
+        )
+        self.poly_5 = joblib.load(os.path.join(self.home, self.saved_files_map[5]["transform"]))
+        self.model_6 = joblib.load(os.path.join(self.home, self.saved_files_map[6]["model"]))
+        self.poly_6 = joblib.load(os.path.join(self.home, self.saved_files_map[6]["transform"]))
+        self.model_7 = joblib.load(
+            os.path.join(self.home, self.saved_files_map[7]["model"]), #device="cpu"
+        )
+        # No saved transform for model_7 — use generate_mixture_features later in prediction
+        self.piecewise_params_8 = joblib.load(os.path.join(self.home, self.saved_files_map[8]["params"]))
+        self.piecewise_threshold_8 = joblib.load(os.path.join(self.home, self.saved_files_map[8]["threshold"]))
+        # Use piecewise_model function later
+        self.model_10 = joblib.load(os.path.join(self.home, self.saved_files_map[10]["model"]))
+        self.poly_10 = joblib.load(os.path.join(self.home, self.saved_files_map[10]["transform"]))
+        self.model_3489 = TabPFNEnsemblePredictor(model_dir="Models")
+        pass
+    def piecewise_model(self, x, boundaries=np.linspace(-0.2, 0.2, 10+1)[1:-1]):
+        """
+        x: a single float value
+        params: list of 20 parameters [A1, B1, A2, B2, ..., A10, B10]
+        boundaries: 9 values that divide x into 10 regions
+        """
+        params = self.piecewise_params_8
+        # Unpack parameters
+        segments = [(params[i], params[i+1]) for i in range(0, 20, 2)]
+        # Piecewise logic using boundaries
+        for i, bound in enumerate(boundaries):
+            if x < bound:
+                A, B = segments[i]
+                return A * x + B
+        # If x is greater than all boundaries, use the last segment
+        A, B = segments[-1]
+        return A * x + B
+    def predict_BlendProperty1(self, data, full = True):
+        # Dummy custom transformation and prediction for BlendProperty1
+        if full:
+            features = self._transform1(data)
+            features = self.poly_1.transform(features)
+        else:
+            features = self.poly_1.transform(data)
+        res_df = self.model_1.predict(features)
+        return pd.DataFrame(res_df, columns=['BlendProperty1'])
+    def predict_BlendProperty2(self, data, full = True):
+        if full:
+            features = self._transform2(data)
+            features = self.poly_2.transform(features)
+        else:
+            features = self.poly_2.transform(data)
+        res_df = self.model_2.predict(features)
+        return pd.DataFrame(res_df, columns=['BlendProperty2'])
+    def predict_BlendProperty3489(self, df):
+        arrray,result_df = self.model_3489.custom_predict(df)
+        ans_df= result_df[['BlendProperty3','BlendProperty4','BlendProperty8','BlendProperty9']].copy() # Explicitly create a copy
+        ans_df.loc[ans_df['BlendProperty8'].abs()<0.2,'BlendProperty8'] = ans_df[ans_df['BlendProperty8'].abs()<0.2]['BlendProperty8'].apply(self.piecewise_model)
+        ans_df.loc[ans_df['BlendProperty9'].abs()<0.1,'BlendProperty9'] = 0 #ans_df[ans_df['BlendProperty8'].abs()<0.2]['BlendProperty8'].apply(self.piecewise_model)
+        return ans_df
+        # ndf.loc[ndf[pred_col].abs() < threshold_8, pred_col] = ndf[ndf[pred_col].abs() < threshold_8][pred_col].apply(func8)
+    def predict_BlendProperty5(self, data, full =True ):
+        if full:
+            features = self._transform5(data)
+            features = self.poly_5.transform(features)
+        else:
+            features = self.poly_5.transform(data)
+        res_df = self.model_5.predict(features)
+        return pd.DataFrame(res_df, columns=['BlendProperty5'])
+    def predict_BlendProperty6(self, data, full=True):
+        if full:
+            features = self._transform6(data)
+            features = self.poly_6.transform(features)
+        else:
+            features = self.poly_6.transform(data)
+        res_df = self.model_6.predict(features)
+        return pd.DataFrame(res_df, columns=['BlendProperty6'])
+    def predict_BlendProperty7(self, data, full =True)-> pd.DataFrame:
+        if full:
+            features = self._transform7(data)
+        else:
+            raise ValueError("BlendProperty7 prediction requires full data.")
+        res_df = self.model_7.predict(features)
+        return pd.DataFrame(res_df, columns=['BlendProperty7'])
+    def predict_BlendProperty10(self, data, full = False)-> pd.DataFrame:
+        if full:
+            features = self._transform10(data)
+            features = self.poly_10.transform(features)
+        else:
+            features = self.poly_10.transform(data)
+        res_df = self.model_10.predict(features)
+        return pd.DataFrame(res_df, columns=['BlendProperty10'])
+    def predict_all(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Generates predictions for all blend properties using the individual prediction methods.
+        Args:
+            df: Input DataFrame containing the features.
+        Returns:
+            DataFrame with predicted blend properties from 'BlendProperty1' to 'BlendProperty10'.
+        """
+        predictions_list = []
+        # Predict individual properties
+        predictions_list.append(self.predict_BlendProperty1(df, full=True))
+        predictions_list.append(self.predict_BlendProperty2(df, full=True))
+        # Predict BlendProperty3, 4, 8, and 9 together using predict_BlendProperty3489
+        # Assuming predict_BlendProperty3489 returns a DataFrame with columns for these properties.
+        predictions_3489_df = self.predict_BlendProperty3489(df)
+        predictions_list.append(predictions_3489_df[['BlendProperty3']])
+        predictions_list.append(predictions_3489_df[['BlendProperty4']])
+        predictions_list.append(predictions_3489_df[['BlendProperty8']])
+        predictions_list.append(predictions_3489_df[['BlendProperty9']])
+        predictions_list.append(self.predict_BlendProperty5(df, full=True))
+        predictions_list.append(self.predict_BlendProperty6(df, full=True))
+        predictions_list.append(self.predict_BlendProperty7(df, full=True))
+        predictions_list.append(self.predict_BlendProperty10(df, full=True))
+        # Concatenate the list of single-column DataFrames into a single DataFrame
+        predictions_df = pd.concat(predictions_list, axis=1)
+        # Ensure columns are in the desired order
+        ordered_cols = [f'BlendProperty{i}' for i in range(1, 11)]
+        # Reindex to ensure columns are in order, dropping any not generated (though all should be)
+        predictions_df = predictions_df.reindex(columns=ordered_cols)
+        return predictions_df
+    # Dummy transformation functions (replace with your actual logic later)
+    def _transform1(self, data):
+        """
+        Transforms input data (DataFrame or NumPy array) to features for BlendProperty1 prediction.
+        If input is a DataFrame, selects 'ComponentX_fraction' (X=1-5) and 'ComponentX_Property1' (X=1-5).
+        If input is a NumPy array, assumes the columns are already in the correct order:
+        Component1-5_fraction, Component1-5_Property1, Component1-5_Property2, ..., Component1-5_Property10
+        and selects the relevant columns for Property1.
+        Args:
+            data: pandas DataFrame or numpy array.
+        Returns:
+            numpy array of transformed features.
+        """
+        fraction_cols = [f'Component{i+1}_fraction' for i in range(5)]
+        property_cols = [f'Component{i+1}_Property1' for i in range(5)]
+        required_cols = fraction_cols + property_cols
+        if isinstance(data, pd.DataFrame):
+            # Select the required columns from the DataFrame
+            # Ensure columns exist to avoid KeyError
+            try:
+                features = data[required_cols]
+            except KeyError as e:
+                missing_col = str(e).split("'")[1]
+                raise ValueError(f"Input DataFrame is missing required column: {missing_col}") from e
+        elif isinstance(data, np.ndarray):
+            # Assume the NumPy array has columns in the specified order
+            # Select the first 5 columns (fractions) and columns for Property1 (indices 5 to 9)
+            if data.shape[1] < 10: # Need at least 5 fractions and 5 properties
+                raise ValueError(f"Input NumPy array must have at least 10 columns for this transformation.")
+            # Selecting columns based on the assumed order: fractions (0-4), Property1 (5-9)
+            features = data[:, :10] # Select first 10 columns: 5 fractions + 5 Property1
+        else:
+            raise TypeError("Input data must be a pandas DataFrame or a numpy array.")
+        # Return as numpy array, as expected by PolynomialFeatures.transform
+        return features
+    def _transform2(self, data):
+        """
+        Transforms input data (DataFrame or NumPy array) to features for BlendProperty2 prediction.
+        """
+        fraction_cols = [f'Component{i+1}_fraction' for i in range(5)]
+        property_cols = [f'Component{i+1}_Property2' for i in range(5)]
+        required_cols = fraction_cols + property_cols
+        if isinstance(data, pd.DataFrame):
+            try:
+                features = data[required_cols]
+            except KeyError as e:
+                missing_col = str(e).split("'")[1]
+                raise ValueError(f"Input DataFrame is missing required column: {missing_col}") from e
+        elif isinstance(data, np.ndarray):
+            # Assume the NumPy array has columns in the specified order
+            # Select the first 5 columns (fractions) and columns for Property2 (indices 10 to 14)
+            if data.shape[1] < 15: # Need at least 5 fractions, 5 Property1, and 5 Property2
+                raise ValueError(f"Input NumPy array must have at least 15 columns for this transformation.")
+            # Selecting columns based on the assumed order: fractions (0-4), Property1 (5-9), Property2 (10-14)
+            features = np.concatenate([data[:, :5], data[:, 10:15]], axis=1)
+        else:
+            raise TypeError("Input data must be a pandas DataFrame or a numpy array.")
+        return features.values if isinstance(features, pd.DataFrame) else features
+    def _transform3(self, data): return None
+    def _transform4(self, data): return None
+    def _transform5(self, data):
+        """
+        Transforms input data (DataFrame or NumPy array) to features for BlendProperty5 prediction.
+        Args:
+            data: pandas DataFrame or numpy array.
+        Returns:
+            numpy array of transformed features.
+        """
+        fraction_cols = [f'Component{i+1}_fraction' for i in range(5)]
+        property_cols = [f'Component{i+1}_Property5' for i in range(5)]
+        required_cols = fraction_cols + property_cols
+        if isinstance(data, pd.DataFrame):
+            try:
+                features = data[required_cols]
+            except KeyError as e:
+                missing_col = str(e).split("'")[1]
+                raise ValueError(f"Input DataFrame is missing required column: {missing_col}") from e
+        elif isinstance(data, np.ndarray):
+            # Assume the NumPy array has columns in the specified order
+            # Select the first 5 columns (fractions) and columns for Property5 (indices 25 to 29)
+            if data.shape[1] < 30: # Need at least 5 fractions and 5 properties for each of Property1-5
+                raise ValueError(f"Input NumPy array must have at least 30 columns for this transformation.")
+            # Selecting columns based on the assumed order: fractions (0-4), properties (5-9) for P1, (10-14) for P2, ..., (25-29) for P5
+            features = np.concatenate([data[:, :5], data[:, 25:30]], axis=1)
+        else:
+            raise TypeError("Input data must be a pandas DataFrame or a numpy array.")
+        return features
+    def _transform6(self, data):
+        """
+        Transforms input data (DataFrame or NumPy array) to features for BlendProperty6 prediction.
+        Args:
+            data: pandas DataFrame or numpy array.
+        Returns:
+            numpy array of transformed features.
+        """
+        fraction_cols = [f'Component{i+1}_fraction' for i in range(5)]
+        property_cols = [f'Component{i+1}_Property6' for i in range(5)]
+        required_cols = fraction_cols + property_cols
+        if isinstance(data, pd.DataFrame):
+            try:
+                features = data[required_cols]
+            except KeyError as e:
+                missing_col = str(e).split("'")[1]
+                raise ValueError(f"Input DataFrame is missing required column: {missing_col}") from e
+        elif isinstance(data, np.ndarray):
+            # Assume the NumPy array has columns in the specified order
+            # Select the first 5 columns (fractions) and columns for Property6 (indices 30 to 34)
+            if data.shape[1] < 35: # Need at least 5 fractions and 5 properties for each of Property1-6
+                raise ValueError(f"Input NumPy array must have at least 35 columns for this transformation.")
+            # Selecting columns based on the assumed order: fractions (0-4), properties (5-9) for P1, ..., (30-34) for P6
+            features = np.concatenate([data[:, :5], data[:, 30:35]], axis=1)
+        else:
+            raise TypeError("Input data must be a pandas DataFrame or a numpy array.")
+        return features
+    def _transform7(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Corrected transformation function for BlendProperty7 prediction.
+        Args:
+            df: Input DataFrame containing the features.
+        Returns:
+            DataFrame with generated features for BlendProperty7 prediction.
+        """
+        tn = 7
+        fn = tn
+        property_tn = [f'Component{i+1}_Property{fn}' for i in range(5)]
+        fraction_cols = [f'Component{i+1}_fraction' for i in range(5)]
+        # Generate mixture features
+        df_prop7 = df[fraction_cols + property_tn].reset_index(drop=True) # Reset index here
+        # Call the class's generate_mixture_features method
+        mixture_features = self.generate_mixture_features(df_prop7)
+        # Identify columns to concatenate (all ComponentX_PropertyY where Y != 7)
+        other_property_cols = [f"Component{i}_Property{j}" for j in range(1,11) for i in range(1,6) if j!= 7]
+        # Select these columns from the input DataFrame
+        try:
+            # Use .loc to preserve the original index when selecting columns, then reset index
+            other_features_df = df.loc[:, other_property_cols].reset_index(drop=True) # Reset index here
+        except KeyError as e:
+            missing_col = str(e).split("'")[1]
+            raise ValueError(f"Input DataFrame for _transform7 is missing required column: {missing_col}") from e
+        # Concatenate along columns (axis=1). Indices should now be aligned after resetting.
+        combined_features = pd.concat([mixture_features, other_features_df], axis=1)
+        return combined_features
+    def _transform8(self, row): return None
+    def _transform9(self, row): return None
+    def _transform10(self, data):
+        """
+        Transforms input data (DataFrame or NumPy array) to features for BlendProperty10 prediction.
+        If input is a DataFrame, selects 'ComponentX_fraction' (X=1-5) and 'ComponentX_Property10' (X=1-5).
+        If input is a NumPy array, assumes the columns are already in the correct order:
+        Component1-5_fraction, Component1-5_Property1, Component1-5_Property2, ..., Component1-5_Property10
+        and selects the relevant columns for Property10.
+        Args:
+            data: pandas DataFrame or numpy array.
+        Returns:
+            numpy array of transformed features.
+        """
+        fraction_cols = [f'Component{i+1}_fraction' for i in range(5)]
+        property_cols = [f'Component{i+1}_Property10' for i in range(5)]
+        required_cols = fraction_cols + property_cols
+        if isinstance(data, pd.DataFrame):
+            try:
+                features = data[required_cols]
+            except KeyError as e:
+                missing_col = str(e).split("'")[1]
+                raise ValueError(f"Input DataFrame is missing required column: {missing_col}") from e
+        elif isinstance(data, np.ndarray):
+            # Assume the NumPy array has columns in the specified order
+            # Select the first 5 columns (fractions) and columns for Property10 (indices 50 to 54)
+            if data.shape[1] < 55: # Need at least 5 fractions and 5 properties for each of Property1-10
+                raise ValueError(f"Input NumPy array must have at least 55 columns for this transformation.")
+            # Selecting columns based on the assumed order: fractions (0-4), properties (5-9) for P1, ..., (50-54) for P10
+            features = np.concatenate([data[:, :5], data[:, 50:55]], axis=1)
+        else:
+            raise TypeError("Input data must be a pandas DataFrame or a numpy array.")
+        return features
+    def generate_mixture_features(self,data):
+        """
+        Generate symmetric and weighted nonlinear interactions between fuel weights and properties.
+        The input 'data' should contain weights in the first 5 columns/elements and properties in the next 5.
+        :param data: np.ndarray, pd.DataFrame, or list of shape (n_samples, 10) or (10,)
+        :return: pd.DataFrame with generated features.
+        """
+        # Convert input to numpy array and handle single row/list input
+        if isinstance(data, pd.DataFrame):
+            data_array = data.values
+        elif isinstance(data, list):
+            data_array = np.array(data)
+        elif isinstance(data, np.ndarray):
+            data_array = data
+        else:
+            raise TypeError("Input data must be a pandas DataFrame, numpy array, or list.")
+        # Reshape single row/list input to 2D array
+        if data_array.ndim == 1:
+            data_array = data_array.reshape(1, -1)
+        # Ensure the input has 10 columns (5 weights + 5 properties)
+        if data_array.shape[1] != 10:
+            raise ValueError("Input data must have 10 columns/elements (5 weights and 5 properties).")
+        # Separate weights and properties
+        W = data_array[:, :5]
+        P = data_array[:, 5:]
+        n_samples, n_fuels = W.shape
+        features = {}
+        # Original weights and properties
+        for i in range(n_fuels):
+            features[f'w{i+1}'] = W[:, i]
+            features[f'p{i+1}'] = P[:, i]
+            features[f'w{i+1}_p{i+1}'] = W[:, i] * P[:, i]  # weighted property
+        # --- 1. Weighted sum of properties ---
+        features['weighted_sum'] = np.sum(W * P, axis=1)
+        # --- 2. Weighted square of properties ---
+        features['weighted_sum_sq'] = np.sum(W * P**2, axis=1)
+        # --- 3. Weighted tanh of properties ---
+        features['weighted_tanh'] = np.sum(W * np.tanh(P), axis=1)
+        # --- 4. Weighted exponential ---
+        # features['weighted_exp'] = np.sum(W * np.exp(P), axis=1)
+        # Clip P before exponential to avoid overflow
+        safe_exp = np.exp(np.clip(P, a_min=None, a_max=50))  # 50 is safe upper bound
+        features['weighted_exp'] = np.sum(W * safe_exp, axis=1)
+        # --- 5. Weighted logarithm (clip to avoid -inf) ---
+        # features['weighted_log'] = np.sum(W * np.log(np.clip(P, 1e-6, None)), axis=1)
+        features['weighted_log'] = np.sum(W * np.log(np.clip(P, 1e-6, None)), axis=1)
+        # --- 6. Pairwise interactions (symmetric, weighted) ---
+        for i, j in combinations(range(n_fuels), 2):
+            pij = P[:, i] * P[:, j]
+            wij = W[:, i] * W[:, j]
+            features[f'pair_p{i+1}p{j+1}'] = pij
+            features[f'weighted_pair_p{i+1}p{j+1}'] = pij * wij
+        # --- 7. Triple interactions (weighted & symmetric) ---
+        for i, j, k in combinations(range(n_fuels), 3):
+            pij = P[:, i] * P[:, j] * P[:, k]
+            wij = W[:, i] * W[:, j] * W[:, k]
+            features[f'triplet_p{i+1}{j+1}{k+1}'] = pij
+            features[f'weighted_triplet_p{i+1}{j+1}{k+1}'] = pij * wij
+        # --- 8. Power series + weight modulated ---
+        for power in [2, 3, 4]:
+            features[f'power_sum_{power}'] = np.sum(W * P**power, axis=1)
+        # --- 9. Log-weighted property (prevent log(0)) ---
+        logW = np.log(np.clip(W, 1e-6, None))
+        features['log_weighted_p'] = np.sum(logW * P, axis=1)
+        # --- 10. Symmetric polynomial combinations (elementary symmetric) ---
+        # Up to degree 5 (since you have 5 fuels)
+        for r in range(1, 6):
+            key = f'e_sym_poly_r{r}'
+            val = np.zeros(n_samples)
+            for idx in combinations(range(n_fuels), r):
+                prod_p = np.prod(P[:, idx], axis=1)
+                val += prod_p
+            features[key] = val
+        # --- 11. Weighted interaction difference (symmetry in differences) ---
+        for i, j in combinations(range(n_fuels), 2):
+            diff = P[:, i] - P[:, j]
+            wdiff = W[:, i] * W[:, j]
+            features[f'weighted_diff_p{i+1}{j+1}'] = diff * wdiff
+        # --- 12. Mean, max, min (weighted) ---
+        total_weight = np.sum(W, axis=1, keepdims=True)
+        weighted_mean = np.sum(W * P, axis=1) / np.clip(total_weight.squeeze(), 1e-6, None)
+        features['weighted_mean'] = weighted_mean
+        features['max_prop'] = np.max(P, axis=1)
+        features['min_prop'] = np.min(P, axis=1)
+        # --- 13. Weighted cross-log terms ---
+        for i, j in combinations(range(n_fuels), 2):
+            log_mix = np.log(np.clip(P[:, i] + P[:, j], 1e-6, None))
+            wij = W[:, i] * W[:, j]
+            features[f'logsum_p{i+1}{j+1}'] = log_mix * wij
+        # --- 14. Inverse + weighted inverse ---
+        # features['inv_prop_sum'] = np.sum(W / np.clip(P, 1e-6, None), axis=1)
+        features['inv_prop_sum'] = np.sum(W / np.clip(P, 1e-6, None), axis=1)
+        # --- 15. Weighted relu (max(p, 0)) ---
+        relu = np.maximum(P, 0)
+        features['weighted_relu'] = np.sum(W * relu, axis=1)
+        # --- 16. Weighted sin/cos transforms ---
+        features['weighted_sin'] = np.sum(W * np.sin(P), axis=1)
+        features['weighted_cos'] = np.sum(W * np.cos(P), axis=1)
+        # --- 17. Normalized properties ---
+        prop_sum = np.sum(P, axis=1, keepdims=True)
+        normalized_P = P / np.clip(prop_sum, 1e-6, None)
+        for i in range(n_fuels):
+            features[f'norm_p{i+1}'] = normalized_P[:, i]
+        # --- 18. Product of all p's and all w's ---
+        features['total_product_p'] = np.prod(P, axis=1)
+        features['total_product_w'] = np.prod(W, axis=1)
+        # --- 19. Mixed entropic form ---
+        # entropy_like = -np.sum(W * np.log(np.clip(W, 1e-6, None)), axis=1)
+        # features['entropy_weights'] = entropy_like
+        # Convert to DataFrame
+        df = pd.DataFrame(features)
+        return df

requirements.txt CHANGED Viewed

@@ -1,3 +1,13 @@
-altair
-pandas
-streamlit

+tabpfn-extensions @ git+https://github.com/PriorLabs/tabpfn-extensions.git@16e0e4f4305a3546eab5be6ebf163ff41bd3843d
+scikit-learn==1.5.1
+huggingface_hub
+autogluon
+tabpfn==2.0.9
+streamlit==1.43.0
+numpy==1.26.4
+pandas==2.2.3
+matplotlib==3.10.0
+matplotlib-inline==0.1.7
+seaborn==0.13.2
+torch @ https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl
+setuptools

setup.sh ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ #!/bin/bash
2	+ python download_models.py  # runs the model download script; presumably a pre-start setup step for the app — confirm how this script is invoked

streamlit_app.py ADDED Viewed

	@@ -0,0 +1,15 @@

+# app.py
+import streamlit as st
+import pkg_resources
+st.title("📦 Installed Python Modules")
+# Get all installed packages
+packages = sorted(
+    [(d.project_name, d.version) for d in pkg_resources.working_set],
+    key=lambda x: x[0].lower()
+)
+# Display them
+for name, version in packages:
+    st.write(f"{name} — {version}")