# Spaces: Sleeping
| import os, io, json, requests | |
| from typing import Optional, List, Dict | |
| import numpy as np | |
| import pandas as pd | |
| import joblib | |
| import tensorflow as tf | |
| import gradio as gr | |
| # ===== Artifacts ===== | |
# Paths of the trained artifacts, relative to the working directory.
MODEL_PATH = "modelo_tabular.h5"
SCALER_PATH = "scaler.pkl"
ENCODER_PATH = "label_encoder.pkl"
STATS_PATH = "feature_stats.json"

# Fail fast with an explicit exception instead of `assert`: assertions are
# stripped when Python runs with -O, which would silently skip these checks.
for _artifact in (MODEL_PATH, SCALER_PATH, ENCODER_PATH, STATS_PATH):
    if not os.path.exists(_artifact):
        raise FileNotFoundError(f"Falta {_artifact}")

model = tf.keras.models.load_model(MODEL_PATH)
# NOTE: joblib.load unpickles its input; only load artifacts shipped with the app.
scaler = joblib.load(SCALER_PATH)
label_encoder = joblib.load(ENCODER_PATH)

# Read the JSON explicitly as UTF-8 rather than with the locale default encoding.
with open(STATS_PATH, encoding="utf-8") as f:
    stats = json.load(f)

# Ordered feature names and the per-feature fill values used for imputation.
FEATURE_COLUMNS: List[str] = stats["feature_columns"]
MEDIANS: Dict[str, float] = stats["medians"]
# Class labels as stored in the fitted label encoder.
CLASSES = list(label_encoder.classes_)
| # ===== Helpers ===== | |
def first_present(candidates, cols_set):
    """Return the column name in ``cols_set`` that best matches ``candidates``.

    Matching is done in two passes:

    1. Exact match: the first candidate (in the given order) that is itself
       a member of ``cols_set`` is returned.
    2. Substring match: for the first candidate contained in at least one
       column name, the shortest such column name is returned, with ties
       broken alphabetically.

    The explicit tie-break in pass 2 keeps the result deterministic:
    ``cols_set`` is normally a ``set`` of strings, whose iteration order is
    not stable between interpreter runs.

    Args:
        candidates: Iterable of candidate names, highest priority first.
        cols_set: Collection of available column names (strings).

    Returns:
        The matching column name, or ``None`` when nothing matches.
    """
    # Pass 1: exact names win over any partial match.
    for name in candidates:
        if name in cols_set:
            return name

    # Pass 2: fall back to substring containment.
    for name in candidates:
        matches = [col for col in cols_set if name in col]
        if matches:
            # Shortest name = fewest extra characters around the candidate.
            return min(matches, key=lambda col: (len(col), col))
    return None
| CANDIDATES_MAP = { | |
| "koi_period": ["pl_orbper","tce_period","orbper","period"], | |
| "koi_duration": ["pl_trandurh","tce_duration","trandur","duration","dur"], | |
| "koi_depth": ["pl_trandep","tce_depth","depth","trandep"], | |
| "koi_prad": ["pl_rade","prad","rade","planet_radius"], | |
| "koi_srad": ["st_rad","srad","stellar_radius","star_radius"], | |
| "koi_teq": ["pl_eqt","teq","equilibrium_temp"], | |
| "koi_steff": ["st_teff","teff","stellar_teff","effective_temp"], | |
| "koi_slogg": ["st_logg","logg","slogg"], | |
| "koi_smet": ["st_met","feh","metallicity","smet"], | |
| "koi_kepmag": ["st_tmag","tmag","kepmag","koi_kepmag"], | |
| "koi_model_snr": ["tce_model_snr","model_snr","snr"], | |
| "koi_num_transits": ["tce_num_transits","num_transits","ntransits","tran_count"] | |
| } | |
def impute_and_scale(df: pd.DataFrame) -> np.ndarray:
    """Align ``df`` to ``FEATURE_COLUMNS``, fill gaps and apply the scaler.

    The input frame is left untouched: ``reindex`` builds a new frame that
    contains exactly the model's feature columns, in order, with NaN for any
    column missing from ``df``.

    Args:
        df: Frame holding some or all of the feature columns. Column labels
            are expected to be unique.

    Returns:
        The result of ``scaler.transform`` on the imputed feature values.
    """
    # reindex returns a new object, so columns are never added to the
    # caller's DataFrame as a side effect.
    frame = df.reindex(columns=FEATURE_COLUMNS)

    # Missing values take the stored fill value of their column, or 0.0 when
    # none is recorded for that column.
    fill_values = {col: MEDIANS.get(col, 0.0) for col in FEATURE_COLUMNS}
    frame = frame.fillna(value=fill_values)

    return scaler.transform(frame.values)
def predict_proba_from_df(df: pd.DataFrame):
    """Run the model on ``df`` and return ``(probs, classes)``.

    ``probs`` is the raw output of ``model.predict`` on the imputed and scaled
    features; ``classes`` is the label encoder's class list.
    Callers index ``probs[i][j]`` with ``j`` running over ``classes``, so the
    output is presumably one row per input row and one column per class.
    """
    return (
        model.predict(impute_and_scale(df), verbose=0),
        list(label_encoder.classes_),
    )
# ===== Endpoint 1: Try N TOI/TCE objects fetched from the API =====
def predict_toi_samples(n=2, table="tce"):
    """Fetch sample objects from the Exoplanet Archive and classify them.

    Args:
        n: Number of objects to classify. Coerced to an integer and clamped
            to at least 1, since UI sliders may deliver floats.
        table: ``"tce"`` or ``"toi"`` (case-insensitive); anything else falls
            back to ``"tce"``.

    Returns:
        ``(res, csv_path)``: ``res`` is a DataFrame with a ``prediction``
        column and one ``P(<class>)`` column per class; ``csv_path`` is the
        path of the CSV file the same table was written to.

    Raises:
        requests.HTTPError: If the archive answers with an error status.
    """
    n = max(1, int(n))
    table = str(table).strip().lower()
    if table not in {"tce", "toi"}:
        table = "tce"

    if table == "tce":
        TAP_URL = "https://exoplanetarchive.ipac.caltech.edu/TAP/sync"
        query = f"""
        SELECT TOP {n}
        kepid, tce_plnt_num, tce_period, tce_duration, tce_depth, tce_model_snr
        FROM q1_q17_dr25_tce
        WHERE tce_period > 0 AND tce_duration > 0 AND tce_depth > 0
        ORDER BY tce_model_snr DESC
        """
        r = requests.get(TAP_URL, params={"query": query, "format": "csv"}, timeout=90)
    else:
        BASE = "https://exoplanetarchive.ipac.caltech.edu/cgi-bin/nstedAPI/nph-nstedAPI"
        # NOTE(review): the filter mentions `tce_period`, which looks like a
        # TCE-table column; confirm the `toi` table accepts it.
        where = ("(tfopwg_disp like 'PC' or tfopwg_disp like 'APC') and "
                 "(pl_orbper is not null or tce_period is not null)")
        r = requests.get(BASE, params={"table": "toi", "where": where, "format": "csv"}, timeout=90)
    r.raise_for_status()

    df = pd.read_csv(io.StringIO(r.text))
    df.columns = [str(c).strip().lower() for c in df.columns]
    # Fixed seed keeps the sampled subset reproducible between calls.
    df = df.sample(min(n, len(df)), random_state=7).reset_index(drop=True)

    # Flexible mapping of the API columns onto FEATURE_COLUMNS: use a column
    # with the feature's own name when present, otherwise try its synonyms.
    cols_set = set(df.columns)
    cases = pd.DataFrame(index=df.index, columns=FEATURE_COLUMNS, dtype="float64")
    for feat in FEATURE_COLUMNS:
        src = feat if feat in cols_set else first_present(CANDIDATES_MAP.get(feat, []), cols_set)
        cases[feat] = pd.to_numeric(df[src], errors="coerce") if src is not None else np.nan

    classes = list(label_encoder.classes_)
    if len(cases) == 0:
        # Nothing came back from the API: return an empty, correctly shaped
        # table instead of calling the model with zero rows.
        res = pd.DataFrame(columns=["prediction", *[f"P({cls})" for cls in classes]])
    else:
        probs, classes = predict_proba_from_df(cases)
        probs = np.asarray(probs)
        preds = label_encoder.inverse_transform(np.argmax(probs, axis=1))
        # Build the output table column by column.
        res = pd.DataFrame({"prediction": preds})
        for j, cls in enumerate(classes):
            res[f"P({cls})"] = probs[:, j].astype("float64")

    csv_path = "pred_toi_samples.csv"
    res.to_csv(csv_path, index=False)
    return res, csv_path
# ===== Endpoint 2: Predict from a manually supplied JSON object (POST) =====
def predict_from_json(json_text: str, threshold: float = 0.5):
    """Classify a single object described by a JSON document.

    Args:
        json_text: JSON text holding one object whose keys are feature names
            (``koi_*``) or any of their synonyms in ``CANDIDATES_MAP``.
            Keys are matched case-insensitively.
        threshold: Minimum probability of the CONFIRMED class required for
            ``is_exoplanet`` to be true.

    Returns:
        On success, a dict with ``prediction``, ``probabilities``,
        ``is_exoplanet`` and ``p_confirmed``. When the input is not valid
        JSON, or is valid JSON but not an object, a dict with a single
        ``error`` key.
    """
    try:
        payload = json.loads(json_text)
    except (TypeError, ValueError) as e:
        # json.JSONDecodeError is a ValueError; TypeError covers None input.
        return {"error": f"JSON inválido: {e}"}
    if not isinstance(payload, dict):
        # Lists, numbers and strings are valid JSON but carry no feature names.
        return {"error": "JSON inválido: se esperaba un objeto (pares clave/valor)"}

    df = pd.DataFrame([payload])
    # Normalise key names.
    df.columns = [str(c).strip().lower() for c in df.columns]

    # Map onto FEATURE_COLUMNS: the koi_* name wins, then its synonyms.
    cols_set = set(df.columns)
    cases = pd.DataFrame(index=df.index, columns=FEATURE_COLUMNS, dtype="float64")
    for feat in FEATURE_COLUMNS:
        src = feat if feat in cols_set else first_present(CANDIDATES_MAP.get(feat, []), cols_set)
        cases[feat] = pd.to_numeric(df[src], errors="coerce") if src is not None else np.nan

    probs, classes = predict_proba_from_df(cases)
    p = probs[0]
    idx = int(np.argmax(p))
    pred = label_encoder.inverse_transform([idx])[0]

    # Locate the CONFIRMED class case-insensitively, so this lookup agrees
    # with the case-insensitive comparison on the predicted label below.
    confirmed_idx = next(
        (i for i, cls in enumerate(classes) if str(cls).upper() == "CONFIRMED"),
        None,
    )
    p_confirmed = float(p[confirmed_idx]) if confirmed_idx is not None else 0.0

    return {
        "prediction": pred,
        "probabilities": {cls: float(prob) for cls, prob in zip(classes, p)},
        "is_exoplanet": bool(str(pred).upper() == "CONFIRMED" and p_confirmed >= float(threshold)),
        "p_confirmed": p_confirmed,
    }
# ===== Endpoint 3: Download the CSV of a specific TOI/TCE =====
def download_object_csv(identifier: str, table: str = "toi"):
    """Download the archive row(s) of one object and save them as CSV.

    Args:
        identifier: For ``table="toi"``, the value matched against the ``toi``
            column with ``like``. For ``table="tce"``, a Kepler id optionally
            followed by the planet number, e.g. ``"KIC 11446443 1"``,
            ``"KIC11446443,1"`` or ``"11446443"``; a leading ``KIC`` marker
            is ignored.
        table: ``"toi"`` or ``"tce"`` (case-insensitive); anything else falls
            back to ``"toi"``.

    Returns:
        Path of the CSV file that was written.

    Raises:
        requests.HTTPError: If the archive answers with an error status.
    """

    def quote(value: str) -> str:
        # Double embedded single quotes so user-supplied text cannot close the
        # SQL/ADQL string literal it is interpolated into.
        return value.replace("'", "''")

    identifier = (identifier or "").strip()
    table = (table or "").strip().lower()
    if table not in {"toi", "tce"}:
        table = "toi"

    if table == "toi":
        BASE = "https://exoplanetarchive.ipac.caltech.edu/cgi-bin/nstedAPI/nph-nstedAPI"
        # NOTE(review): the UI suggests ids like "TOI-1234.01"; confirm the
        # `toi` column stores that form rather than a bare number.
        where = f"toi like '{quote(identifier)}'"
        r = requests.get(BASE, params={"table": "toi", "where": where, "format": "csv"}, timeout=60)
    else:
        # For TCE we query TAP by kepid + tce_plnt_num.
        TAP_URL = "https://exoplanetarchive.ipac.caltech.edu/TAP/sync"

        # Tokenise and drop the "KIC" marker, whether it is its own token
        # ("KIC 11446443") or glued to the number ("KIC11446443"). Without
        # this, "KIC" itself would be taken as the kepid.
        tokens = []
        for tok in identifier.replace(",", " ").split():
            if tok.upper().startswith("KIC"):
                tok = tok[3:]
            if tok:
                tokens.append(tok)

        if len(tokens) >= 2:
            kep, num = tokens[0], tokens[1]
            query = f"""
            SELECT *
            FROM q1_q17_dr25_tce
            WHERE CAST(kepid AS VARCHAR) like '{quote(kep)}'
            AND CAST(tce_plnt_num AS VARCHAR) like '{quote(num)}'
            """
        else:
            kep = tokens[0] if tokens else ""
            query = (
                "SELECT TOP 1 * FROM q1_q17_dr25_tce "
                f"WHERE CAST(kepid AS VARCHAR) like '{quote(kep)}'"
            )
        r = requests.get(TAP_URL, params={"query": query, "format": "csv"}, timeout=90)
    r.raise_for_status()

    path = "object.csv"
    # Explicit UTF-8 avoids encode errors under non-UTF-8 locales; newline=""
    # keeps the line endings exactly as received.
    with open(path, "w", encoding="utf-8", newline="") as f:
        f.write(r.text)
    return path
# ===== Endpoint 4: Upload a CSV and classify its rows =====
def predict_from_csv(file_obj, threshold: float = 0.5):
    """Classify every row of an uploaded CSV file.

    Args:
        file_obj: The upload, either a filesystem path (``str`` or path-like)
            or an object exposing the path as ``.name``. ``None`` means
            nothing was uploaded.
        threshold: Minimum probability of the CONFIRMED class for a row to be
            flagged in the ``is_exoplanet`` column. This is the same rule
            ``predict_from_json`` uses for its ``is_exoplanet`` field.

    Returns:
        ``(res, out_path)``: ``res`` has a ``prediction`` column, one
        ``P(<class>)`` column per class and a trailing ``is_exoplanet``
        column; ``out_path`` is the CSV file the table was written to.
        Returns ``(empty DataFrame, None)`` when ``file_obj`` is ``None``.
    """
    if file_obj is None:
        return pd.DataFrame(), None

    # Accept both a plain path and a file wrapper carrying the path in .name.
    csv_source = file_obj if isinstance(file_obj, (str, os.PathLike)) else file_obj.name
    df = pd.read_csv(csv_source)

    # Normalise column names.
    df.columns = [str(c).strip().lower() for c in df.columns]
    cols_set = set(df.columns)
    cases = pd.DataFrame(index=df.index, columns=FEATURE_COLUMNS, dtype="float64")
    for feat in FEATURE_COLUMNS:
        src = feat if feat in cols_set else first_present(CANDIDATES_MAP.get(feat, []), cols_set)
        cases[feat] = pd.to_numeric(df[src], errors="coerce") if src is not None else np.nan

    classes = list(label_encoder.classes_)
    if len(cases) == 0:
        # Header-only CSV: return an empty, correctly shaped table instead of
        # calling the model with zero rows.
        res = pd.DataFrame(
            columns=["prediction", *[f"P({cls})" for cls in classes], "is_exoplanet"]
        )
    else:
        probs, classes = predict_proba_from_df(cases)
        probs = np.asarray(probs)
        idx = np.argmax(probs, axis=1)
        preds = label_encoder.inverse_transform(idx)

        res = pd.DataFrame({"prediction": preds})
        for j, cls in enumerate(classes):
            res[f"P({cls})"] = probs[:, j].astype("float64")

        # Apply the threshold: a row is flagged when CONFIRMED is the
        # predicted class and its probability reaches the threshold.
        confirmed_idx = next(
            (i for i, cls in enumerate(classes) if str(cls).upper() == "CONFIRMED"),
            None,
        )
        if confirmed_idx is None:
            res["is_exoplanet"] = False
        else:
            res["is_exoplanet"] = (idx == confirmed_idx) & (
                probs[:, confirmed_idx] >= float(threshold)
            )

    out_path = "predicciones.csv"
    res.to_csv(out_path, index=False)
    return res, out_path
| # ===== Gradio UI ===== | |
# NOTE(review): the source lost its indentation, so the nesting below is a
# reconstruction. Sections 1 and 2 are placed side by side in one row and
# sections 3 and 4 below it at full width. Confirm against the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("# 🔭 Exoplanet Classifier — API + UI (Gradio)")

    with gr.Row():
        with gr.Column():
            # Section 1: classify N objects fetched from the archive API.
            gr.Markdown("### 1) Probar con 2 objetos de la API (TOI o TCE)")
            table_dd = gr.Dropdown(choices=["toi", "tce"], value="tce", label="Tabla")
            n_objs = gr.Slider(1, 10, value=2, step=1, label="N objetos")
            out_df1 = gr.Dataframe(label="Resultados")
            out_file1 = gr.File(label="Descargar CSV")
            gr.Button("Probar API").click(
                predict_toi_samples,
                inputs=[n_objs, table_dd],
                outputs=[out_df1, out_file1],
                api_name="predict_toi_samples",
            )
        with gr.Column():
            # Section 2: classify one object given as JSON text.
            gr.Markdown("### 2) JSON manual (POST)")
            jt = gr.Textbox(lines=12, label="JSON de entrada (TOI/TCE-like o koi_* )")
            thr_json = gr.Slider(0, 1, value=0.5, step=0.01, label="Umbral P(CONFIRMED)")
            out_json = gr.JSON(label="Respuesta")
            gr.Button("Predecir JSON").click(
                predict_from_json,
                inputs=[jt, thr_json],
                outputs=out_json,
                api_name="predict_json",
            )

    # Section 3: download the archive CSV of a single object.
    gr.Markdown("### 3) Descargar CSV de un objeto (por id)")
    ident = gr.Textbox(
        label="Identificador (ej: TOI-1234.01 o 'KIC 11446443 1')",
        placeholder="TOI-xxx.yy ó KIC ###### <planet_num>",
    )
    table2 = gr.Dropdown(choices=["toi", "tce"], value="toi", label="Tabla")
    out_csv = gr.File(label="CSV del objeto")
    gr.Button("Descargar CSV").click(
        download_object_csv,
        inputs=[ident, table2],
        outputs=out_csv,
        api_name="toi_csv",
    )

    # Section 4: upload a CSV and classify each row.
    gr.Markdown("### 4) Subir CSV y clasificar")
    f_in = gr.File(label="CSV subida", file_types=[".csv"])
    thr = gr.Slider(0, 1, value=0.5, step=0.01, label="Umbral P(CONFIRMED)")
    out_df2 = gr.Dataframe(label="Resultados")
    out_file2 = gr.File(label="Descargar predicciones")
    gr.Button("Predecir CSV").click(
        predict_from_csv,
        inputs=[f_in, thr],
        outputs=[out_df2, out_file2],
        api_name="predict_csv",
    )

# Start the server only when the file is executed as a script, so importing
# the module (tests, tooling) does not launch a web server.
if __name__ == "__main__":
    demo.queue().launch()