File size: 2,134 Bytes
b55453c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import pandas as pd
from app.core.config import get_settings
from app.infrastructure.model_loader import XGBoostModelLoader

def _format_fid(value) -> str:
    """Render a FID cell for display, tolerating missing or non-numeric values."""
    try:
        return str(int(value))
    except (TypeError, ValueError, OverflowError):
        # NaN, pd.NA, None, infinities or free-text identifiers cannot be cast
        # to int; show them verbatim instead of aborting the whole report.
        return str(value)


def _rank_unique_wells(df: pd.DataFrame) -> pd.DataFrame:
    """Return one row per (WELL_NAME, FIELD_NAME), highest predicted_score first."""
    # A stable sort keeps the choice between equally scored duplicates
    # reproducible from run to run (the default quicksort is not stable).
    ranked = df.sort_values("predicted_score", ascending=False, kind="mergesort")
    # After the descending sort, the first occurrence of each well is its
    # best-scoring record, which is the one drop_duplicates keeps by default.
    return ranked.drop_duplicates(subset=["WELL_NAME", "FIELD_NAME"])


def run_test(
    file_path: str = "data/trident_clean_top3000.parquet",
    top_n: int = 30,
) -> None:
    """Score a parquet dataset with the XGBoost loader and print the best wells.

    The function loads ``file_path`` into a DataFrame, asks
    ``XGBoostModelLoader`` which feature columns to use, zero-fills any of
    those columns that are absent from the data, runs ``loader.predict`` and
    stores the result in a ``predicted_score`` column.  Rows are then
    de-duplicated on ``WELL_NAME`` + ``FIELD_NAME`` (keeping the highest score
    per pair) and the first ``top_n`` rows are printed as a table.

    Failures while reading the file or predicting are reported on stdout and
    end the run early; nothing is raised for them.

    Args:
        file_path: Path of the parquet file to score.
        top_n: Number of unique wells to print (expected to be non-negative).

    Returns:
        None. All results are written to stdout.
    """
    print(f"Loading data from {file_path}...")
    try:
        df = pd.read_parquet(file_path)
    except Exception as e:
        # Script-level boundary: a missing file, a missing parquet engine or a
        # corrupt file should all end the run with a message, not a traceback.
        print(f"Error loading parquet: {e}")
        return

    print(f"Data loaded, shape: {df.shape}")

    # Columns needed for de-duplication and for the printed table.  Check them
    # before the (potentially slow) model load so a wrong dataset fails fast
    # with a clear message instead of a KeyError after prediction.
    display_cols = ("FID", "WELL_NAME", "FIELD_NAME")
    absent_display = [col for col in display_cols if col not in df.columns]
    if absent_display:
        print(f"Error: required identifier columns are missing in the data: {absent_display}")
        return

    settings = get_settings()
    loader = XGBoostModelLoader(settings)

    print("Resolving feature columns...")
    feature_cols = loader.resolve_feature_columns(list(df.columns))
    print(f"Model requires {len(feature_cols)} features")

    missing_cols = set(feature_cols) - set(df.columns)
    if missing_cols:
        print(f"WARNING: The following required features are missing in the data: {missing_cols}")
        # Zero-fill so that df[feature_cols] can still be built; the warning
        # above is the only signal that scores may be affected by this.
        for col in missing_cols:
            df[col] = 0

    print("Running predictions...")
    try:
        preds = loader.predict(df[feature_cols])
    except Exception as e:
        print(f"Error during prediction: {e}")
        return

    df["predicted_score"] = preds

    # In this dataset, WELL_NAME + FIELD_NAME is usually unique to a well, so
    # that pair is used as the well identity when collapsing duplicates.
    well_groups = _rank_unique_wells(df)

    # Keep only the best-scoring unique wells for display.
    top_wells = well_groups.head(top_n)

    # Output display
    print("\n" + "="*80)
    print(f"{'FID':<8} {'WELL_NAME':<30} {'FIELD_NAME':<20} {'predicted_score'}")
    print("-" * 80)

    for _, row in top_wells.iterrows():
        print(f"{_format_fid(row['FID']):<8} {str(row['WELL_NAME']):<30} {str(row['FIELD_NAME']):<20} {row['predicted_score']:.2f}")
    print("="*80)
    print(f"Total unique wells identified: {len(well_groups)}")

# Run the check only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    run_test()