# well_log_x / app.py
# Author: pvyas96 — commit 1d3e733 ("Update app.py")
import streamlit as st
import streamlit.components.v1 as components
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import io
import lasio
import os
from typing import Optional, Tuple, Dict, Any, List
import datetime
import copy
# ------------------------------------
# CONFIGURATION & CONSTANTS
# ------------------------------------
# Streamlit page setup. NOTE: st.set_page_config must be the first
# Streamlit command executed in the script.
st.set_page_config(
    page_title="Well Log Analysis Pro",
    layout="wide",
    page_icon="🛢️",
    initial_sidebar_state="expanded"
)
# ------------------------------------
# CLASS: DATA MANAGER
# ------------------------------------
class DataManager:
    """Handles data ingestion, cleaning, and export."""

    @staticmethod
    @st.cache_data(show_spinner=False)
    def load_las(file_bytes: bytes) -> Tuple[pd.DataFrame, lasio.LASFile]:
        """Parse raw LAS bytes into a DataFrame plus the raw LAS object.

        Tries several common text encodings, then standardizes the depth
        column name to "Depth".

        Args:
            file_bytes: Raw bytes of the uploaded .las file.

        Returns:
            Tuple of (cleaned dataframe with a "Depth" column, lasio LASFile).

        Raises:
            ValueError: if the bytes cannot be decoded with any supported
                encoding.
        """
        str_data = None
        # Try common encodings in order of likelihood.
        for encoding in ["utf-8", "latin-1", "cp1252"]:
            try:
                str_data = file_bytes.decode(encoding)
                break
            except UnicodeDecodeError:
                continue
        if str_data is None:
            raise ValueError("Unable to decode LAS file with supported encodings.")
        file_buffer = io.StringIO(str_data)
        las = lasio.read(file_buffer)
        df = las.df().reset_index()
        # Standardize Depth Column by recognized mnemonic.
        found_depth = False
        for col in df.columns:
            if col.lower() in ["depth", "dept", "depth.m", "depth.ft", "depta"]:
                df = df.rename(columns={col: "Depth"})
                found_depth = True
                break
        if not found_depth:
            # FIX: the LAS index (depth values) becomes the FIRST column
            # after reset_index(). The previous fallback assigned df.index,
            # which at this point is just 0..n-1 row numbers, not depths.
            df = df.rename(columns={df.columns[0]: "Depth"})
        return df, las

    @staticmethod
    @st.cache_data(show_spinner=False)
    def clean_data(df: pd.DataFrame) -> pd.DataFrame:
        """Replace infinite values with NaN so downstream dropna() catches them."""
        df = df.replace([np.inf, -np.inf], np.nan)
        # Rows with NaNs are intentionally kept here; for ML they are
        # dropped per-feature at training time (see ModelManager.train).
        return df

    @staticmethod
    def export_to_las(df: pd.DataFrame, original_las: Optional[lasio.LASFile] = None) -> bytes:
        """Convert a DataFrame back to LAS-format bytes for download.

        If original_las is provided, its WELL/PARAMETER/OTHER header
        sections and curve units are preserved in the output.
        """
        new_las = lasio.LASFile()
        if original_las:
            # lasio's .header is a read-only property, so copy sections
            # item by item. deepcopy avoids aliasing the session-state object.
            for item in original_las.well:
                new_las.well[item.mnemonic] = copy.deepcopy(item)
            for item in original_las.params:
                new_las.params[item.mnemonic] = copy.deepcopy(item)
            # "Other" is typically free text; safe to share.
            new_las.other = original_las.other
        else:
            # No original header: at least stamp a creation date.
            new_las.well.DATE = str(datetime.datetime.now())
        # Build the unit lookup once instead of scanning curves per column.
        units: Dict[str, str] = {}
        if original_las:
            units = {curve.mnemonic: curve.unit for curve in original_las.curves}
        for col in df.columns:
            new_las.append_curve(col, df[col].values, unit=units.get(col, "UNKNOWN"))
        # Write to a string buffer, then encode for the download button.
        output = io.StringIO()
        new_las.write(output)
        return output.getvalue().encode("utf-8")
# ------------------------------------
# CLASS: MODEL MANAGER
# ------------------------------------
class ModelManager:
    """Handles Machine Learning operations."""

    # Registry of supported regressors, keyed by the label shown in the UI.
    MODELS = {
        "Linear Regression": LinearRegression,
        "Decision Tree": DecisionTreeRegressor,
        "Random Forest": RandomForestRegressor,
        "Gradient Boosting": GradientBoostingRegressor,
        "SVR": SVR
    }

    @staticmethod
    def train(
        df: pd.DataFrame,
        target: str,
        features: List[str],
        algorithm: str,
        params: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Train a regressor and return a serializable artifact dictionary.

        Args:
            df: Source data; rows with NaN in target or features are dropped.
            target: Column name to predict.
            features: Input column names.
            algorithm: Key into ModelManager.MODELS.
            params: Keyword arguments forwarded to the estimator constructor.

        Returns:
            Dict containing the fitted model, feature/target names,
            algorithm label, training R² and a timestamp.

        Raises:
            ValueError: if fewer than 50 complete rows remain after dropping NaNs.
        """
        # Prepare data: drop rows where target or any feature is NaN.
        train_df = df.dropna(subset=[target] + features)
        if len(train_df) < 50:
            # FIX: the old message said "requires > 50" although the check
            # accepts exactly 50 complete rows.
            raise ValueError("Not enough data points to train (requires at least 50).")
        X = train_df[features]
        y = train_df[target]
        model_class = ModelManager.MODELS[algorithm]
        model = model_class(**params)
        model.fit(X, y)
        # NOTE: this R² is computed on the TRAINING data (no hold-out
        # split), so it is an optimistic estimate of generalization.
        score = model.score(X, y)
        return {
            "model": model,
            "features": features,
            "target": target,
            "algorithm": algorithm,
            "r2_score": score,
            "training_date": str(datetime.datetime.now())
        }
# ------------------------------------
# CLASS: VISUALIZER
# ------------------------------------
class Visualizer:
    """Plotly figure builders: log-track viewer and QC crossplot."""

    @staticmethod
    def plot_well_logs(df: pd.DataFrame, x_cols: List[str], depth_col: str) -> go.Figure:
        """Render one subplot track per curve, sharing a reversed depth axis."""
        n_tracks = len(x_cols)
        fig = make_subplots(rows=1, cols=n_tracks, shared_yaxes=True, subplot_titles=x_cols)
        for track, curve in enumerate(x_cols, start=1):
            trace = go.Scatter(x=df[curve], y=df[depth_col], mode="lines", name=curve)
            fig.add_trace(trace, row=1, col=track)
        # Depth increases downward, so reverse the shared y-axis.
        fig.update_yaxes(title_text=depth_col, autorange="reversed", row=1, col=1)
        fig.update_layout(height=800, showlegend=False, title_text="Well Log Viewer", template="plotly_white")
        return fig

    @staticmethod
    def plot_crossplot(y_true: pd.Series, y_pred: pd.Series, title: str) -> go.Figure:
        """Actual-vs-predicted scatter with a dashed 1:1 reference line."""
        fig = go.Figure()
        scatter = go.Scatter(
            x=y_true,
            y=y_pred,
            mode='markers',
            marker=dict(color='blue', opacity=0.5, size=6),
            name='Data'
        )
        fig.add_trace(scatter)
        # The 1:1 line spans the combined data range of both series.
        low = min(y_true.min(), y_pred.min())
        high = max(y_true.max(), y_pred.max())
        fig.add_shape(
            type="line", x0=low, y0=low, x1=high, y1=high,
            line=dict(color="red", dash="dash", width=2)
        )
        fig.update_layout(
            title=title,
            xaxis_title="Actual",
            yaxis_title="Predicted",
            height=600,
            template="plotly_white"
        )
        return fig
# ------------------------------------
# MAIN APPLICATION LOGIC
# ------------------------------------
def main():
    """App entry point: session-state init, sidebar navigation, page routing."""
    # Make sure the keys shared across pages always exist.
    for key in ("data", "las_object"):
        if key not in st.session_state:
            st.session_state[key] = None
    # Page label -> renderer dispatch table (insertion order drives the radio).
    pages = {
        "Home": render_home,
        "Visualizer": render_visualizer,
        "Trainer": render_trainer,
        "Prediction": render_prediction,
    }
    with st.sidebar:
        st.title("Navigation")
        page = st.radio("Go to", list(pages))
        st.markdown("---")
        st.caption("Pro Version 2.0")
    pages[page]()
# ------------------------------------
# PAGE RENDERERS
# ------------------------------------
def render_home():
    """Landing page: file upload on the left, a short usage guide on the right."""
    st.title("🛢️ Well Log Analysis Pro")
    st.markdown("### Production-Grade Petrophysics ML Tool")
    upload_area, guide_area = st.columns([2, 1])
    with upload_area:
        st.info("Upload standard `.LAS` files or `.CSV` data to begin.")
        uploaded_file = st.file_uploader("Upload File", type=["csv", "las"])
        if uploaded_file:
            try:
                if uploaded_file.name.lower().endswith(".las"):
                    raw = uploaded_file.read()
                    df, las_obj = DataManager.load_las(raw)
                    st.session_state.las_object = las_obj
                else:
                    df = pd.read_csv(uploaded_file)
                    st.session_state.las_object = None  # CSVs carry no LAS header
                # Normalize infinities/NaNs before storing in session state.
                df = DataManager.clean_data(df)
                st.session_state.data = df
                st.success(f"Loaded **{uploaded_file.name}** successfully!")
                st.write(f"**Shape:** {df.shape[0]} rows, {df.shape[1]} columns")
                st.dataframe(df.head(), use_container_width=True)
            except Exception as e:
                st.error(f"Failed to load file: {str(e)}")
    with guide_area:
        st.subheader("Quick Guide")
        st.markdown("""
1. **Upload** data here.
2. **Visualize** logs to check quality.
3. **Train** a model on existing curves.
4. **Predict** missing curves or export synthetic logs.
""")
def render_visualizer():
    """Log viewer page: pick the depth column and curves, render the tracks."""
    if st.session_state.data is None:
        st.warning("Please upload data on the Home page first.")
        return
    st.title("📊 Log Visualizer")
    df = st.session_state.data
    columns = df.columns.tolist()
    c1, c2 = st.columns([1, 3])
    with c1:
        depth_col = st.selectbox("Depth Column", columns, index=0)
        plottable = [c for c in columns if c != depth_col]
        # FIX: defaults were sliced from `columns` and could include the
        # selected depth column, which is excluded from the options list —
        # st.multiselect raises when a default is not among its options.
        # Slicing the filtered list keeps the "up to 3 defaults" behavior
        # and is always valid.
        x_cols = st.multiselect("Logs to Plot", plottable, default=plottable[:3])
    with c2:
        if x_cols:
            fig = Visualizer.plot_well_logs(df, x_cols, depth_col)
            st.plotly_chart(fig, use_container_width=True)
def render_trainer():
    """Model training page: choose target/features/algorithm, train, download."""
    if st.session_state.data is None:
        st.warning("Please upload data first.")
        return
    st.title("🤖 Model Trainer")
    df = st.session_state.data
    columns = df.columns.tolist()
    c1, c2, c3 = st.columns(3)
    target = c1.selectbox("Target (Output)", columns)
    features = c2.multiselect("Features (Input)", [c for c in columns if c != target])
    algo = c3.selectbox("Algorithm", list(ModelManager.MODELS.keys()), index=2)
    with st.expander("Advanced Hyperparameters"):
        n_estimators = st.slider("Trees (RF/GB)", 10, 200, 100)
        max_depth = st.slider("Max Depth", 3, 20, 10)
    if st.button("Train Model", type="primary"):
        if not features:
            st.error("Select features!")
            return
        # Map UI hyperparameters onto the chosen estimator.
        # FIX: the Max Depth slider previously applied only to the ensemble
        # models; Decision Tree silently ignored it (and had no fixed seed).
        params = {}
        if algo in ["Random Forest", "Gradient Boosting"]:
            params = {"n_estimators": n_estimators, "max_depth": max_depth, "random_state": 42}
        elif algo == "Decision Tree":
            params = {"max_depth": max_depth, "random_state": 42}
        with st.spinner("Training model..."):
            try:
                artifact = ModelManager.train(df, target, features, algo, params)
                # FIX: serialize once and reuse the bytes for both the
                # on-disk copy (used by the Prediction page) and the
                # download button, instead of dumping twice.
                model_bytes = io.BytesIO()
                joblib.dump(artifact, model_bytes)
                with open("well_model.pkl", "wb") as fh:
                    fh.write(model_bytes.getvalue())
                st.success(f"Trained {algo} with R²: {artifact['r2_score']:.4f}")
                st.download_button(
                    label="⬇️ Download Trained Model (.pkl)",
                    data=model_bytes.getvalue(),
                    file_name="well_log_model.pkl",
                    mime="application/octet-stream"
                )
            except Exception as e:
                st.error(f"Training failed: {e}")
def render_prediction():
    """Prediction page: load a trained artifact, generate a synthetic curve
    (or fill gaps in an existing one), preview it against depth, and export
    the results as CSV and/or LAS.
    """
    st.title("🔮 Prediction & Export")
    if st.session_state.data is None:
        st.warning("Upload data first.")
        return
    if not os.path.exists("well_model.pkl"):
        st.warning("No model found. Train one in the 'Trainer' tab or upload a .pkl file.")
        uploaded_model = st.file_uploader("Upload Pre-trained Model (.pkl)", type="pkl")
        if uploaded_model:
            # SECURITY NOTE(review): joblib.load unpickles the upload, which
            # can execute arbitrary code — only accept trusted model files.
            artifact = joblib.load(uploaded_model)
            # Persist to disk so the rerun below finds it via the path check.
            joblib.dump(artifact, "well_model.pkl")
            st.rerun()
        return
    # Load the artifact produced by the Trainer page (or uploaded above).
    artifact = joblib.load("well_model.pkl")
    model = artifact['model']
    feats = artifact['features']
    target = artifact['target']
    st.info(f"Loaded Model: **{artifact['algorithm']}** (Predicting: `{target}`)")
    # Work on a copy so session-state data is never mutated by predictions.
    df = st.session_state.data.copy()
    # The current dataset must provide every feature the model was trained on.
    missing = [f for f in feats if f not in df.columns]
    if missing:
        st.error(f"Missing features in current dataset: {missing}")
        return
    mode = st.radio("Prediction Mode", ["Predict New Curve (Overwrite)", "Fill Gaps (Imputation)"])
    if st.button("Generate Prediction", type="primary"):
        X = df[feats]
        # Handle NaNs in input features for prediction.
        # Option 1: Drop rows (simple)
        # Option 2: Simple Impute (mean) — using simple drop for safety in production.
        valid_indices = X.dropna().index
        if len(valid_indices) == 0:
            st.error("Input features contain too many NaNs. Cannot predict.")
            return
        X_valid = X.loc[valid_indices]
        preds = model.predict(X_valid)
        output_col = f"PRED_{target}"
        if mode == "Predict New Curve (Overwrite)":
            # Overwrite mode: predictions wherever features are complete,
            # NaN elsewhere.
            df[output_col] = np.nan
            df.loc[valid_indices, output_col] = preds
            display_col = output_col
        else:
            # Imputation mode: keep measured values, predict only the gaps.
            if target not in df.columns:
                st.error(f"Target column {target} not found for imputation.")
                return
            df[output_col] = df[target].copy()
            # Gaps = rows where the target is NaN but all features are valid.
            target_nans = df[df[target].isna()].index
            impute_indices = list(set(target_nans) & set(valid_indices))
            if impute_indices:
                X_impute = df.loc[impute_indices, feats]
                impute_preds = model.predict(X_impute)
                df.loc[impute_indices, output_col] = impute_preds
                st.success(f"Filled {len(impute_indices)} gaps.")
            else:
                st.warning("No gaps found that could be filled with available features.")
            display_col = output_col
        # Visual comparison of predicted vs. original curve.
        st.subheader("Result Preview")
        fig = go.Figure()
        if "Depth" in df.columns:
            fig.add_trace(go.Scatter(x=df[display_col], y=df["Depth"], name="Prediction", line=dict(color='red', dash='dot')))
            if target in df.columns:
                fig.add_trace(go.Scatter(x=df[target], y=df["Depth"], name="Original", line=dict(color='black', width=1)))
            fig.update_yaxes(autorange="reversed", title="Depth")
        else:
            # No depth column available — fall back to sample-index plotting.
            fig.add_trace(go.Scatter(y=df[display_col], mode='lines', name="Prediction"))
        st.plotly_chart(fig, use_container_width=True)
        # Export options.
        st.divider()
        st.subheader("Export Data")
        c1, c2 = st.columns(2)
        with c1:
            csv_data = df.to_csv(index=False).encode('utf-8')
            st.download_button("Download CSV", csv_data, "results.csv", "text/csv")
        with c2:
            try:
                # Reuses the original LAS header when the source file was LAS.
                las_bytes = DataManager.export_to_las(df, st.session_state.las_object)
                st.download_button("Download LAS", las_bytes, "results.las", "application/octet-stream")
            except Exception as e:
                st.error(f"LAS export unavailable: {e}")
# Script entry point guard.
if __name__ == "__main__":
    main()