# Source snapshot b55453c (file size: 2,134 bytes)
import pandas as pd
from app.core.config import get_settings
from app.infrastructure.model_loader import XGBoostModelLoader
def run_test():
    """Score the rows of the parquet file and print the 30 best unique wells.

    Steps, in order:

    1. Load ``data/trident_clean_top3000.parquet`` into a DataFrame.
    2. Build an ``XGBoostModelLoader`` from ``get_settings()`` and resolve
       the feature columns via ``loader.resolve_feature_columns``.
    3. Zero-fill any resolved feature column that the data does not have.
    4. Run ``loader.predict`` and store the result as ``predicted_score``.
    5. Keep the highest-scoring row for each (WELL_NAME, FIELD_NAME) pair
       and print the first 30 of those rows as a fixed-width table.

    A failure while loading the file, while predicting, or because an
    identifier column is absent is reported on stdout and ends the run
    early; it is not raised to the caller.

    Returns:
        None. All results are written to stdout.
    """
    file_path = "data/trident_clean_top3000.parquet"
    # Rows are deduplicated on these columns; FID is only displayed.
    dedup_cols = ["WELL_NAME", "FIELD_NAME"]
    display_cols = ["FID", *dedup_cols]
    top_n = 30

    print(f"Loading data from {file_path}...")
    try:
        df = pd.read_parquet(file_path)
    except Exception as e:
        print(f"Error loading parquet: {e}")
        return
    print(f"Data loaded, shape: {df.shape}")

    settings = get_settings()
    loader = XGBoostModelLoader(settings)

    print("Resolving feature columns...")
    feature_cols = loader.resolve_feature_columns(list(df.columns))
    print(f"Model requires {len(feature_cols)} features")

    missing_cols = set(feature_cols) - set(df.columns)
    if missing_cols:
        print(f"WARNING: The following required features are missing in the data: {missing_cols}")
        # Zero is only a placeholder so that prediction can run at all;
        # treat scores produced from zero-filled features with caution.
        for col in missing_cols:
            df[col] = 0

    print("Running predictions...")
    try:
        preds = loader.predict(df[feature_cols])
    except Exception as e:
        print(f"Error during prediction: {e}")
        return
    df["predicted_score"] = preds

    # Report absent identifier columns the same way as the other failures
    # instead of letting a KeyError escape from the steps below.
    missing_ids = [col for col in display_cols if col not in df.columns]
    if missing_ids:
        print(f"Error: identifier columns missing from the data: {missing_ids}")
        return

    # Sorting by score (descending) and then dropping duplicates keeps the
    # first, i.e. highest-scoring, row of every (WELL_NAME, FIELD_NAME) pair.
    well_groups = df.sort_values("predicted_score", ascending=False).drop_duplicates(subset=dedup_cols)
    top_wells = well_groups.head(top_n)

    # Output display
    rule = "=" * 80
    print("\n" + rule)
    print(f"{'FID':<8} {'WELL_NAME':<30} {'FIELD_NAME':<20} {'predicted_score'}")
    print("-" * 80)
    for _, row in top_wells.iterrows():
        fid = row["FID"]
        # int(NaN) raises ValueError; show a placeholder for a missing FID
        # rather than aborting the table halfway through.
        fid_text = "N/A" if pd.isna(fid) else str(int(fid))
        print(f"{fid_text:<8} {str(row['WELL_NAME']):<30} {str(row['FIELD_NAME']):<20} {row['predicted_score']:.2f}")
    print(rule)
    print(f"Total unique wells identified: {len(well_groups)}")
# Script entry point: run the scoring check only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    run_test()
|