Spaces:

UFOSINT
/

UAP-Data-Analysis-Tool

Sleeping

App Files Files Community

UAP-Data-Analysis-Tool / api /main.py

Ashoka74

Deploy: SCU_v1 default, Sankey, filter reconciliation, nested-expander fixes, deps

0e62cc9 29 days ago

Raw

History Blame Contribute Delete

48.7 kB

	import sys
	import os
	import io
	import time
	import logging
	import traceback
	from datetime import datetime, timezone
	import uuid

	import numpy as np
	import pandas as pd
	from fastapi import FastAPI, UploadFile, File, HTTPException, Query, Header, Depends
	from fastapi.middleware.cors import CORSMiddleware
	from pydantic import BaseModel

	# Add parent directory to path for importing uap_analyzer
	sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

	# Suppress expected outputs to only have clean API routes
	logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
	logger = logging.getLogger(__name__)

	app = FastAPI(title="UAP Analysis API", version="1.0.0")

	# CORS Improvement (Codex #6): Explicit origin allowlist, credentials disabled unless required
	origins = os.getenv("UAP_API_CORS_ORIGINS", "http://localhost:5173,http://127.0.0.1:5173")
	allow_origins = [origin.strip() for origin in origins.split(",") if origin.strip()]

	app.add_middleware(
	CORSMiddleware,
	allow_origins=allow_origins,
	allow_credentials=False,
	allow_methods=["*"],
	allow_headers=["*"],
	)

	# ---------------------------------------------------------------------------
	# State Management (Codex #1): Session isolation
	# ---------------------------------------------------------------------------
	class SessionState:
	def __init__(self):
	self.dataset = None
	self.filtered_data = None
	self.analyzers = []
	self.col_names = []
	self.new_data = None
	self.data_processed = False
	self.analysis_results = {}
	self.cluster_viz = {}
	self.cramers_v = None
	self.analysis_runs = 0
	self.last_analysis_at = None
	self.analysis_mode = os.getenv("UAP_ANALYSIS_MODE", "production") # demo vs production
	# Parsing / SCU state (mirrors the Streamlit parsing.py session keys)
	self.parse_source_df = None # raw uploaded reports awaiting extraction
	self.parsed_responses = None # {description: parsed JSON dict}
	self.parsed_df = None # flat parsed-responses DataFrame
	self.scu_normalized_df = None # scu_normalizer.normalize() output
	self.last_accessed = time.time()

	sessions = {}
	SESSION_TTL = 3600 * 24 # 24 hours

	def get_session(session_id: str = Header(None, alias="X-Session-ID")) -> SessionState:
	if not session_id:
	session_id = "default"

	# Cleanup expired sessions
	current_time = time.time()
	expired = [k for k, v in sessions.items() if current_time - v.last_accessed > SESSION_TTL]
	for k in expired:
	del sessions[k]

	if session_id not in sessions:
	sessions[session_id] = SessionState()

	sessions[session_id].last_accessed = current_time
	return sessions[session_id]

	# Dataset paths
	DATA_PATH_WEST = os.path.join(os.path.dirname(__file__), "..", "parsed_files_distance_embeds.h5")
	DATA_PATH_EAST = os.path.join(os.path.dirname(__file__), "..", "final_ufoseti_dataset.h5")
	# Default for backward compatibility
	DATA_PATH = DATA_PATH_WEST
	MAX_QUERY_CONTEXT_ROWS = int(os.getenv("UAP_QUERY_MAX_ROWS", "250"))
	MAX_QUERY_CONTEXT_CHARS = int(os.getenv("UAP_QUERY_MAX_CHARS", "24000"))

	# ---------------------------------------------------------------------------
	# Pydantic models
	# ---------------------------------------------------------------------------
	class FilterSpec(BaseModel):
	column: str
	type: str # "categorical", "numeric", "text"
	values: list = None
	min_val: float = None
	max_val: float = None
	pattern: str = None


	class AnalysisRequest(BaseModel):
	columns: list[str]
	# Cluster pipeline tuning (mirrors analyzing.py controls). When enable_tfidf
	# is False (the analyzing.py default), clusters keep numeric "Cluster N"
	# names and raw HDBSCAN labels are passed straight to XGBoost.
	enable_tfidf: bool = False
	min_cluster_size: int = 15
	n_neighbors: int = 15
	min_dist: float = 0.1
	top_n: int = 32


	class QueryRequest(BaseModel):
	question: str
	columns: list[str]
	gemini_key: str


	# ── Parsing ────────────────────────────────────────────────────────────────
	class SchemaMergeRequest(BaseModel):
	labels: list[str]
	custom_fields: dict \| None = None


	class SchemaCoverageRequest(BaseModel):
	labels: list[str]
	custom_fields: dict \| None = None
	# Dataset columns to diff against; defaults to the uploaded parse source.
	columns: list[str] \| None = None


	class ParseEstimateRequest(BaseModel):
	columns: list[str]
	format_json: str # merged schema template (JSON string)
	model: str = "gpt-4o-mini"
	use_cache: bool = True
	use_batch: bool = False


	class ParseRunRequest(BaseModel):
	columns: list[str]
	format_json: str # merged schema template (JSON string)
	provider: str = "openai" # "openai" \| "deepseek"
	model: str = "gpt-4o-mini"
	api_key: str
	max_workers: int = 10
	keep_columns: list[str] = [] # carry-through source columns


	# ── SCU ────────────────────────────────────────────────────────────────────
	class ScuFilterRequest(BaseModel):
	criterion_keys: list[str]


	# ── RAG ────────────────────────────────────────────────────────────────────
	class RagSearchRequest(BaseModel):
	columns: list[str]
	question: str
	cohere_key: str
	top_n: int = 50


	# ── Cramér's V explorer ────────────────────────────────────────────────────
	class CramersVRequest(BaseModel):
	columns: list[str] \| None = None
	drop_missing: bool = False
	exclude_trivial: bool = True
	strong_threshold: float = 0.30
	high_threshold: int = 30
	source: str = "dataset" # "dataset" \| "parsed"


	class ContingencyRequest(BaseModel):
	col1: str
	col2: str
	drop_missing: bool = False
	source: str = "dataset"


	class ColumnGroupsRequest(BaseModel):
	source: str = "dataset" # "dataset" \| "parsed"
	high_threshold: int = 30


	class XgboostRequest(BaseModel):
	columns: list[str]
	source: str = "dataset" # "dataset" \| "parsed"

	# ---------------------------------------------------------------------------
	# Helpers
	# ---------------------------------------------------------------------------
	def df_to_json(df: pd.DataFrame, max_rows: int = 50000, total_rows: int \| None = None) -> dict:
	df_subset = df.head(max_rows).copy()
	for col in df_subset.columns:
	# Convert categories and datetimes to strings
	if df_subset[col].dtype.name == "category" or pd.api.types.is_datetime64_any_dtype(df_subset[col]):
	df_subset[col] = df_subset[col].astype(str)
	else:
	# For object columns, we need to be careful with fillna if they contain dicts
	# We use a custom fillna for objects
	if df_subset[col].dtype == object:
	# Fill missing values without triggering hash checks on the content
	mask = df_subset[col].isna()
	df_subset.loc[mask, col] = ""
	else:
	df_subset[col] = df_subset[col].fillna("")
	return {
	"columns": list(df_subset.columns),
	"rows": df_subset.to_dict(orient="records"),
	# total_rows reflects the full dataset; pass it explicitly when df has
	# already been truncated upstream so the count is not misreported.
	"total_rows": total_rows if total_rows is not None else len(df),
	"returned_rows": len(df_subset),
	}

	def get_column_stats(df: pd.DataFrame) -> list[dict]:
	stats = []
	for col in df.columns:
	# Standard info
	info = {
	"name": col,
	"dtype": str(df[col].dtype),
	"non_null": int(df[col].notna().sum())
	}

	# Safe nunique
	try:
	info["unique"] = int(df[col].nunique())
	except (TypeError, ValueError):
	# For unhashable types (dicts/lists), we count unique by converting to string first
	try:
	info["unique"] = int(df[col].astype(str).nunique())
	except Exception:
	info["unique"] = 0

	# Numerical stats
	if pd.api.types.is_numeric_dtype(df[col]):
	try:
	if df[col].notna().any():
	info["min"] = float(df[col].min())
	info["max"] = float(df[col].max())
	info["mean"] = float(df[col].mean())
	else:
	info["min"] = info["max"] = info["mean"] = None
	except Exception:
	pass
	# Categorical / Object stats
	elif pd.api.types.is_object_dtype(df[col]) or df[col].dtype.name == "category":
	try:
	top = df[col].value_counts().head(10)
	info["top_values"] = [{"value": str(k), "count": int(v)} for k, v in top.items()]
	except (TypeError, ValueError):
	# Handle unhashable types in value_counts by converting to string
	try:
	top = df[col].astype(str).value_counts().head(10)
	info["top_values"] = [{"value": str(k), "count": int(v)} for k, v in top.items()]
	except Exception:
	info["top_values"] = []
	except Exception:
	info["top_values"] = []

	stats.append(info)
	return stats

	def truncate_context(items: list[str], max_chars: int) -> tuple[str, int]:
	context_parts = []
	used = 0
	chars = 0
	for item in items:
	to_add = item if not context_parts else f"\\n{item}"
	if chars + len(to_add) > max_chars:
	break
	context_parts.append(item)
	chars += len(to_add)
	used += 1
	return "\\n".join(context_parts), used

	# ---------------------------------------------------------------------------
	# Data endpoints
	# ---------------------------------------------------------------------------
	@app.get("/api/data/load")
	def load_data(
	rows: int = Query(default=15000, le=50000),
	type: str = Query(default="west"),
	state: SessionState = Depends(get_session)
	):
	path = DATA_PATH_EAST if type == "east" else DATA_PATH_WEST

	if not os.path.exists(path):
	raise HTTPException(status_code=404, detail=f"Dataset file not found: {os.path.basename(path)}")
	try:
	df = pd.read_hdf(path, key="df")
	if "embeddings" in df.columns:
	df = df.drop(columns=["embeddings"])
	full_row_count = len(df)
	df = df.head(rows)
	state.dataset = df
	state.filtered_data = df
	state.data_processed = False
	state.col_names = []
	return {
	"status": "ok",
	"data": df_to_json(df, total_rows=full_row_count),
	"column_stats": get_column_stats(df),
	}
	except Exception as e:
	logger.error(traceback.format_exc())
	raise HTTPException(status_code=500, detail=f"Error loading {type} dataset: {e}")

	@app.get("/api/analysis/clusters")
	def get_clusters_viz():
	from fastapi.responses import FileResponse
	path = os.path.join(os.path.dirname(__file__), "..", "frontend", "uap_clusters_llm.html")
	if not os.path.exists(path):
	# Fallback to current dir or project root
	path = os.path.join(os.path.dirname(__file__), "..", "uap_clusters_llm.html")

	if not os.path.exists(path):
	raise HTTPException(status_code=404, detail="Cluster visualization not found")
	return FileResponse(path)

	@app.post("/api/data/upload")
	async def upload_data(file: UploadFile = File(...), state: SessionState = Depends(get_session)):
	try:
	contents = await file.read()
	name = (file.filename or "").lower()
	if name.endswith(".csv"):
	df = pd.read_csv(io.BytesIO(contents))
	elif name.endswith((".xlsx", ".xls")):
	df = pd.read_excel(io.BytesIO(contents))
	elif name.endswith(".json"):
	import json as _json
	obj = _json.loads(contents.decode("utf-8"))
	df = pd.json_normalize(obj if isinstance(obj, list) else [obj])
	else:
	raise HTTPException(status_code=400, detail="Unsupported file type (CSV / XLSX / JSON).")
	state.dataset = df
	state.filtered_data = df
	state.data_processed = False
	return {"status": "ok", "filename": file.filename, "data": df_to_json(df), "column_stats": get_column_stats(df)}
	except HTTPException:
	raise
	except Exception as e:
	raise HTTPException(status_code=500, detail=str(e))

	@app.post("/api/data/filter")
	def filter_data(filters: list[FilterSpec], state: SessionState = Depends(get_session)):
	if state.dataset is None:
	raise HTTPException(status_code=400, detail="No dataset loaded")
	df = state.dataset.copy()
	try:
	for f in filters:
	if f.column not in df.columns:
	continue
	if f.type == "categorical" and f.values:
	df = df[df[f.column].astype(str).isin([str(v) for v in f.values])]
	elif f.type == "numeric":
	if f.min_val is not None and f.max_val is not None:
	if f.min_val > f.max_val:
	raise ValueError(f"Invalid range: min {f.min_val} > max {f.max_val}")
	df = df[df[f.column].between(f.min_val, f.max_val)]
	elif f.min_val is not None:
	df = df[df[f.column] >= f.min_val]
	elif f.max_val is not None:
	df = df[df[f.column] <= f.max_val]
	elif f.type == "text" and f.pattern:
	import re
	df = df[df[f.column].astype(str).str.contains(re.escape(f.pattern), case=False, na=False)]
	state.filtered_data = df
	return {"status": "ok", "data": df_to_json(df), "column_stats": get_column_stats(df)}
	except Exception as e:
	raise HTTPException(status_code=500, detail=f"Filtering failed: {e}")

	@app.get("/api/data/columns")
	def get_columns(state: SessionState = Depends(get_session)):
	if state.dataset is None:
	raise HTTPException(status_code=400, detail="No dataset loaded")
	df = state.dataset
	columns = []
	for col in df.columns:
	columns.append({
	"name": col, "dtype": str(df[col].dtype),
	"unique": int(df[col].nunique()), "non_null": int(df[col].notna().sum()),
	})
	return {"columns": columns}

	@app.get("/api/data/column-values")
	def get_column_values(
	column: str = Query(...),
	search: str = Query(default=""),
	limit: int = Query(default=50, le=500),
	state: SessionState = Depends(get_session),
	):
	"""Return unique values of a column, optionally filtered by a search term.

	Backs the searchable categorical filter so any value can be selected,
	not just the precomputed top-N from column statistics.
	"""
	if state.dataset is None:
	raise HTTPException(status_code=400, detail="No dataset loaded")
	if column not in state.dataset.columns:
	raise HTTPException(status_code=404, detail=f"Column '{column}' not found")

	import re
	series = state.dataset[column].astype(str)
	counts = series.value_counts()
	if search:
	mask = counts.index.str.contains(re.escape(search), case=False, na=False)
	counts = counts[mask]
	total_matches = int(len(counts))
	counts = counts.head(limit)
	return {
	"column": column,
	"values": [{"value": str(k), "count": int(v)} for k, v in counts.items()],
	"total_matches": total_matches,
	}

	@app.post("/api/analyze/run")
	def run_analysis(req: AnalysisRequest, state: SessionState = Depends(get_session)):
	if state.filtered_data is None or state.filtered_data.empty:
	raise HTTPException(status_code=400, detail="No data available")

	df = state.filtered_data
	valid_columns = [c for c in req.columns if c in df.columns]
	if not valid_columns:
	raise HTTPException(status_code=400, detail="No valid columns selected")

	results, cluster_viz, xgboost_results = {}, {}, {}
	state.analyzers = []
	new_data = pd.DataFrame()

	if state.analysis_mode == "production":
	# Codex #7: Run real embedding, UMAP, HDBSCAN cluster pipeline instead of generic mock
	from sklearn.model_selection import train_test_split
	from uap_analyzer import UAPAnalyzer, train_xgboost

	for col in valid_columns:
	analyzer = UAPAnalyzer(df, column=col)
	try:
	analyzer.preprocess_data(top_n=req.top_n)
	analyzer.reduce_dimensionality(
	method="UMAP", n_components=2,
	n_neighbors=req.n_neighbors, min_dist=req.min_dist,
	)
	analyzer.cluster_data(method="HDBSCAN", min_cluster_size=req.min_cluster_size)
	_labels = analyzer.__dict__["cluster_labels"]
	row_terms = None
	if req.enable_tfidf:
	# TF-IDF naming + near-duplicate cluster merging. The merge
	# path re-encodes on GPU and is best-effort; fall back to
	# numeric labels if anything in it fails.
	try:
	analyzer.get_tf_idf_clusters(top_n=3)
	row_terms = analyzer.merge_similar_clusters(
	cluster_terms=analyzer.__dict__["cluster_terms"],
	cluster_labels=analyzer.__dict__["cluster_labels"],
	)
	except Exception as merge_err:
	logger.warning(f"TF-IDF naming failed for {col}, using raw labels: {merge_err}")
	row_terms = None
	if row_terms is None:
	# Numeric placeholder names; raw HDBSCAN labels flow to XGBoost.
	row_terms = [f"Cluster {cid}" for cid in _labels]
	# cluster_terms is per-row here (length == len(df)) so the
	# cluster_viz masks below index reduced_embeddings correctly.
	analyzer.cluster_terms = row_terms

	new_data[f"Analyzer_{col}"] = analyzer.cluster_terms
	state.analyzers.append(analyzer)

	traces = []
	unique_terms = np.unique(analyzer.cluster_terms)
	for term in unique_terms:
	mask = (np.array(analyzer.cluster_terms) == term)
	if not mask.any(): continue
	traces.append({
	"name": term,
	"x": analyzer.reduced_embeddings[mask, 0].tolist(),
	"y": analyzer.reduced_embeddings[mask, 1].tolist(),
	"text": df[col].iloc[mask].astype(str).tolist(),
	"count": int(mask.sum()),
	})
	cluster_viz[col] = {"traces": traces, "title": f"{col} Clusters (HDBSCAN)"}
	_dist = pd.Series(analyzer.cluster_terms).astype(str).value_counts().head(20)
	results[col] = {
	"cluster_count": len(unique_terms),
	"total_points": len(df),
	"distribution": [{"label": str(k), "count": int(v)} for k, v in _dist.items()],
	}
	except Exception as e:
	logger.error(f"Error analyzing {col}: {e}")

	if len(state.analyzers) >= 2:
	new_data_cat = new_data.fillna('null').astype('category')
	data_nums = new_data_cat.apply(lambda x: x.cat.codes)
	for col in data_nums.columns:
	try:
	categories = new_data_cat[col].cat.categories
	x_train, x_test, y_train, y_test = train_test_split(data_nums.drop(columns=[col]), data_nums[col], test_size=0.2, random_state=42)
	bst, accuracy, preds = train_xgboost(x_train, y_train, x_test, y_test, len(categories))

	importances_dict = bst.get_score(importance_type="gain")
	importances = {k.replace("Analyzer_", ""): float(v) for k, v in importances_dict.items()}
	importances = dict(sorted(importances.items(), key=lambda x: x[1], reverse=True))
	xgboost_results[col.replace("Analyzer_", "")] = {
	"feature_importance": importances,
	"accuracy": round(accuracy, 3),
	}
	except Exception as e:
	logger.error(f"Error in xgboost for {col}: {e}")

	else:
	# Mock mode
	for col in valid_columns:
	col_data = df[col].fillna("").astype(str)
	vc = col_data.value_counts()
	top_labels = vc.head(32).index.tolist()
	cmap = {l: i for i, l in enumerate(top_labels)}
	clabels = col_data.apply(lambda x: cmap.get(x, -1)).values

	n = len(col_data)
	np.random.seed(42)
	red_embeds = np.random.randn(n, 2)

	cluster_terms = []
	for i, label in enumerate(top_labels[:20]):
	mask = col_data == label
	red_embeds[mask.values] = (np.random.randn(2) * 3) + np.random.randn(mask.sum(), 2) * 0.5

	new_data[f"Analyzer_{col}"] = [top_labels[c] if 0 <= c < len(top_labels) else "Other" for c in clabels]

	traces = []
	for i, term in enumerate(top_labels[:20]):
	mask = (clabels == i)
	if mask.sum() == 0: continue
	traces.append({
	"name": term,
	"x": red_embeds[mask, 0].tolist(),
	"y": red_embeds[mask, 1].tolist(),
	"text": col_data[mask].tolist(),
	"count": int(mask.sum()),
	})
	cluster_viz[col] = {"traces": traces, "title": f"{col} Clusters (Mock)"}
	results[col] = {"cluster_count": len(top_labels), "distribution": [{"label": str(k), "count": int(v)} for k, v in vc.head(20).items()], "total_points": n}

	cramers_data = None
	if len(valid_columns) >= 2:
	new_data_cat = new_data.fillna("null").astype("category")
	cols = list(new_data_cat.columns)
	from scipy.stats import chi2_contingency
	matrix = []
	for c1 in cols:
	row = []
	for c2 in cols:
	if c1 == c2:
	row.append(1.0)
	else:
	try:
	ct = pd.crosstab(new_data_cat[c1], new_data_cat[c2])
	chi2 = chi2_contingency(ct)[0]
	n_obs = ct.sum().sum()
	phi2 = chi2 / n_obs
	r, k = ct.shape
	phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n_obs-1))
	rcorr, kcorr = r - ((r-1)2)/(n_obs-1), k - ((k-1)2)/(n_obs-1)
	denom = min(kcorr-1, rcorr-1)
	v = np.sqrt(phi2corr / denom) if denom > 0 else 0.0
	row.append(round(float(v), 3))
	except Exception: row.append(0.0)
	matrix.append(row)
	cramers_data = {"labels": [c.replace("Analyzer_", "") for c in cols], "matrix": matrix}

	state.new_data = new_data
	state.data_processed = True
	state.analysis_results = results
	state.cluster_viz = cluster_viz
	state.cramers_v = cramers_data
	state.col_names = valid_columns
	state.analysis_runs += 1
	state.last_analysis_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

	return {
	"status": "ok",
	"analysis_mode": state.analysis_mode,
	"mock_mode": state.analysis_mode == "mock",
	"warnings": ["Results generated by mock pipeline for demo purposes."] if state.analysis_mode == "mock" else [],
	"results": results,
	"cluster_viz": cluster_viz,
	"cramers_v": cramers_data,
	"xgboost": xgboost_results if xgboost_results else {},
	"processed_data": df_to_json(new_data),
	}

	@app.get("/api/analyze/results")
	def get_analysis_results(state: SessionState = Depends(get_session)):
	if not state.data_processed:
	raise HTTPException(status_code=400, detail="No analysis run yet")
	return {"results": state.analysis_results, "cluster_viz": state.cluster_viz, "cramers_v": state.cramers_v}

	@app.post("/api/query/gemini")
	def query_gemini(req: QueryRequest, state: SessionState = Depends(get_session)):
	if state.filtered_data is None:
	raise HTTPException(status_code=400, detail="No data")
	valid_cols = [c for c in req.columns if c in state.filtered_data.columns]
	if not valid_cols:
	raise HTTPException(status_code=400, detail="No valid columns selected")
	try:
	import google.generativeai as genai
	# Combine the selected columns per row with " - " (mirrors rag_search.py
	# and the parsing _build_text_series), then drop empty rows.
	text_series = _build_text_series(state.filtered_data, valid_cols)
	filtered = [t for t in text_series.dropna().tolist() if t.strip()]
	context_seed = filtered[:MAX_QUERY_CONTEXT_ROWS]
	# Token aware chunking
	context, rows_used = truncate_context(context_seed, MAX_QUERY_CONTEXT_CHARS)
	if not context:
	raise HTTPException(status_code=400, detail="No text available for querying")

	genai.configure(api_key=req.gemini_key)
	model = genai.GenerativeModel("models/gemini-3.1-pro-preview")
	response = model.generate_content([f"{req.question or 'Summarize'}\\nContext: {context}\\n\\n"])
	return {"status": "ok", "response": response.text,
	"context_rows_used": rows_used, "columns_used": valid_cols}
	except HTTPException:
	raise
	except Exception as e:
	raise HTTPException(status_code=500, detail=str(e))

	@app.get("/api/dashboard/summary")
	def get_dashboard_summary(state: SessionState = Depends(get_session)):
	if state.dataset is None:
	return {"loaded": False, "analysis_runs": state.analysis_runs, "last_analysis_at": state.last_analysis_at}
	df = state.dataset
	return {
	"loaded": True,
	"total_rows": len(df),
	"total_columns": len(df.columns),
	"columns": list(df.columns),
	"analyzed": state.data_processed,
	"analyzed_columns": len(state.col_names),
	"analysis_runs": state.analysis_runs,
	"last_analysis_at": state.last_analysis_at,
	"analysis_mode": state.analysis_mode,
	"null_counts": {col: int(df[col].isna().sum()) for col in df.columns}
	}

	# ---------------------------------------------------------------------------
	# Parsing — LLM feature extraction (parsing.py parity, core tier)
	# ---------------------------------------------------------------------------
	from api.services import parsing_service, scu_service, rag_service, analysis_service


	def _build_text_series(df: pd.DataFrame, columns: list[str]) -> pd.Series:
	"""Concatenate the selected raw-text columns per row with ' - ' (mirrors
	parsing.py's _build_text), dropping empty cells."""
	def _row(row):
	parts = [
	str(row[c]).strip() for c in columns
	if c in row and pd.notna(row[c]) and str(row[c]).strip()
	]
	return " - ".join(parts) if parts else None
	return df.apply(_row, axis=1)


	@app.get("/api/parse/schemas")
	def parse_schemas():
	try:
	return {**parsing_service.list_schemas(), "models": parsing_service.available_models()}
	except Exception as e:
	logger.error(traceback.format_exc())
	raise HTTPException(status_code=500, detail=f"Could not load schemas: {e}")


	@app.post("/api/parse/schema-merge")
	def parse_schema_merge(req: SchemaMergeRequest):
	try:
	return parsing_service.merge_schema(req.labels, req.custom_fields)
	except ValueError as e:
	raise HTTPException(status_code=400, detail=str(e))
	except Exception as e:
	raise HTTPException(status_code=500, detail=str(e))


	@app.post("/api/parse/schema-coverage")
	def parse_schema_coverage(req: SchemaCoverageRequest, state: SessionState = Depends(get_session)):
	"""Diff the merged schema's leaf fields against the dataset columns (🟢/🔴
	coverage) and return per-mode extraction schemas (all / missing / database)."""
	cols = req.columns
	if cols is None:
	if state.parse_source_df is None:
	raise HTTPException(
	status_code=400,
	detail="No raw dataset uploaded and no columns provided.",
	)
	cols = list(state.parse_source_df.columns)
	try:
	return parsing_service.schema_coverage_report(req.labels, cols, req.custom_fields)
	except ValueError as e:
	raise HTTPException(status_code=400, detail=str(e))
	except Exception as e:
	logger.error(traceback.format_exc())
	raise HTTPException(status_code=500, detail=f"Coverage diff failed: {e}")


	@app.post("/api/parse/upload")
	async def parse_upload(file: UploadFile = File(...), state: SessionState = Depends(get_session)):
	"""Upload a raw report dataset to be fed through the LLM extractor."""
	try:
	contents = await file.read()
	name = (file.filename or "").lower()
	if name.endswith(".csv"):
	df = pd.read_csv(io.BytesIO(contents))
	elif name.endswith((".xlsx", ".xls")):
	df = pd.read_excel(io.BytesIO(contents))
	elif name.endswith(".json"):
	import json as _json
	raw = contents.decode("utf-8")
	obj = _json.loads(raw)
	df = pd.json_normalize(obj if isinstance(obj, list) else [obj])
	else:
	raise HTTPException(status_code=400, detail="Unsupported file type (CSV / XLSX / JSON).")
	state.parse_source_df = df
	return {
	"status": "ok",
	"filename": file.filename,
	"data": df_to_json(df, max_rows=200),
	"columns": list(df.columns),
	"total_rows": len(df),
	}
	except HTTPException:
	raise
	except Exception as e:
	raise HTTPException(status_code=500, detail=str(e))


	@app.post("/api/parse/estimate")
	def parse_estimate(req: ParseEstimateRequest, state: SessionState = Depends(get_session)):
	if state.parse_source_df is None:
	raise HTTPException(status_code=400, detail="No raw dataset uploaded. Upload reports first.")
	valid = [c for c in req.columns if c in state.parse_source_df.columns]
	if not valid:
	raise HTTPException(status_code=400, detail="No valid text columns selected.")
	texts = _build_text_series(state.parse_source_df, valid).dropna().tolist()
	if not texts:
	raise HTTPException(status_code=400, detail="Selected columns produced no text.")
	try:
	return parsing_service.estimate(texts, req.format_json, req.model,
	use_cache=req.use_cache, use_batch=req.use_batch)
	except Exception as e:
	raise HTTPException(status_code=400, detail=f"Could not estimate cost: {e}")


	@app.post("/api/parse/run")
	def parse_run(req: ParseRunRequest, state: SessionState = Depends(get_session)):
	if state.parse_source_df is None:
	raise HTTPException(status_code=400, detail="No raw dataset uploaded. Upload reports first.")
	if not req.api_key:
	raise HTTPException(status_code=400, detail="An API key is required to run extraction.")
	src = state.parse_source_df
	valid = [c for c in req.columns if c in src.columns]
	if not valid:
	raise HTTPException(status_code=400, detail="No valid text columns selected.")

	text_series = _build_text_series(src, valid)
	texts = text_series.dropna().tolist()
	if not texts:
	raise HTTPException(status_code=400, detail="Selected columns produced no text to parse.")

	try:
	result = parsing_service.run_parse(
	texts, req.format_json, provider=req.provider, model=req.model,
	api_key=req.api_key, max_workers=req.max_workers,
	)
	except ValueError as e:
	raise HTTPException(status_code=400, detail=str(e))
	except Exception as e:
	logger.error(traceback.format_exc())
	raise HTTPException(status_code=500, detail=f"Parsing failed: {e}")

	df = result["df"]
	# Carry-through columns: align on the raw-text input, like parsing.py.
	if req.keep_columns and not df.empty:
	keep = [c for c in req.keep_columns if c in src.columns]
	if keep:
	keep_src = src[keep].copy()
	keep_src.index = text_series.values
	keep_src = keep_src[keep_src.index.notna()]
	keep_src = keep_src[~keep_src.index.duplicated(keep="first")]
	df = df.join(keep_src, how="left")

	state.parsed_responses = result["parsed_responses"]
	state.parsed_df = df
	return {
	"status": "ok",
	"n_ok": result["n_ok"],
	"n_total": result["n_total"],
	"n_failed": len(result["errors"]),
	"errors": result["errors"][:10],
	"data": df_to_json(df, max_rows=2000),
	}


	@app.get("/api/parse/result")
	def parse_result(state: SessionState = Depends(get_session)):
	if state.parsed_df is None:
	raise HTTPException(status_code=400, detail="No parsed data in session.")
	return {"status": "ok", "data": df_to_json(state.parsed_df, max_rows=2000),
	"n_records": len(state.parsed_df)}


	# ---------------------------------------------------------------------------
	# SCU normalization (scu_normalizer parity)
	# ---------------------------------------------------------------------------
	@app.get("/api/scu/criteria")
	def scu_criteria():
	try:
	return scu_service.criteria_info()
	except Exception as e:
	raise HTTPException(status_code=500, detail=str(e))


	@app.post("/api/scu/normalize")
	def scu_normalize(state: SessionState = Depends(get_session)):
	if not state.parsed_responses:
	raise HTTPException(
	status_code=400,
	detail="No parsed responses in session. Run the Parsing step first.",
	)
	try:
	result = scu_service.normalize(state.parsed_responses)
	except ValueError as e:
	raise HTTPException(status_code=400, detail=str(e))
	except Exception as e:
	logger.error(traceback.format_exc())
	raise HTTPException(status_code=500, detail=f"Normalization failed: {e}")
	state.scu_normalized_df = result["df"]
	return {
	"status": "ok",
	"metrics": result["metrics"],
	"audit_markdown": result["audit_markdown"],
	"data": df_to_json(result["df"], max_rows=5000),
	}


	@app.post("/api/scu/filter")
	def scu_filter(req: ScuFilterRequest, state: SessionState = Depends(get_session)):
	if state.scu_normalized_df is None:
	raise HTTPException(status_code=400, detail="No normalized data. Run SCU normalization first.")
	try:
	result = scu_service.filter_eligibility(state.scu_normalized_df, req.criterion_keys)
	except Exception as e:
	raise HTTPException(status_code=500, detail=f"Filter failed: {e}")
	return {
	"status": "ok",
	"funnel": result["funnel"],
	"n_passed": result["n_passed"],
	"data": df_to_json(result["df"], max_rows=5000),
	}


	# ---------------------------------------------------------------------------
	# RAG search — Cohere rerank (rag_search.py Dataset RAG parity)
	# ---------------------------------------------------------------------------
	@app.post("/api/rag/search")
	def rag_search(req: RagSearchRequest, state: SessionState = Depends(get_session)):
	df = state.filtered_data if state.filtered_data is not None else state.dataset
	if df is None or df.empty:
	raise HTTPException(status_code=400, detail="No dataset loaded. Load data first.")
	if not req.cohere_key:
	raise HTTPException(status_code=400, detail="A Cohere API key is required.")
	try:
	result = rag_service.rerank(df, req.columns, req.question, req.cohere_key, top_n=req.top_n)
	except ValueError as e:
	raise HTTPException(status_code=400, detail=str(e))
	except Exception as e:
	logger.error(traceback.format_exc())
	raise HTTPException(status_code=502, detail=f"Cohere rerank failed: {e}")
	return {
	"status": "ok",
	"n_results": result["n_results"],
	"searched_columns": result["searched_columns"],
	"data": df_to_json(result["df"], max_rows=req.top_n),
	}


	# ---------------------------------------------------------------------------
	# Cramér's V Categorical Association Explorer (analyzing.py parity)
	# ---------------------------------------------------------------------------
	def _assoc_source(state: SessionState, source: str) -> pd.DataFrame:
	if source == "parsed":
	if state.parsed_df is None:
	raise HTTPException(status_code=400, detail="No parsed data in session.")
	return state.parsed_df
	df = state.filtered_data if state.filtered_data is not None else state.dataset
	if df is None or df.empty:
	raise HTTPException(status_code=400, detail="No dataset loaded.")
	return df


	@app.post("/api/analysis/column-groups")
	def analysis_column_groups(req: ColumnGroupsRequest, state: SessionState = Depends(get_session)):
	"""Eligible categorical columns grouped by dotted parent, for the explorer's
	group selector. Cheap (cardinality only) so it loads before any matrix compute."""
	df = _assoc_source(state, req.source)
	try:
	return analysis_service.column_groups(df, high_threshold=req.high_threshold)
	except Exception as e:
	logger.error(traceback.format_exc())
	raise HTTPException(status_code=500, detail=f"Column grouping failed: {e}")


	@app.post("/api/analysis/xgboost")
	def analysis_xgboost(req: XgboostRequest, state: SessionState = Depends(get_session)):
	"""XGBoost feature importance computed directly on the selected categorical
	columns (no cluster pipeline) — fed by the Cramér's V explorer selection."""
	df = _assoc_source(state, req.source)
	try:
	return analysis_service.xgboost_importance(df, req.columns)
	except Exception as e:
	logger.error(traceback.format_exc())
	raise HTTPException(status_code=500, detail=f"XGBoost feature importance failed: {e}")


	@app.post("/api/analysis/cramers-v")
	def analysis_cramers_v(req: CramersVRequest, state: SessionState = Depends(get_session)):
	df = _assoc_source(state, req.source)
	try:
	return analysis_service.cramers_v_report(
	df, req.columns, drop_missing=req.drop_missing,
	exclude_trivial=req.exclude_trivial, strong_threshold=req.strong_threshold,
	high_threshold=req.high_threshold,
	)
	except Exception as e:
	logger.error(traceback.format_exc())
	raise HTTPException(status_code=500, detail=f"Cramér's V failed: {e}")


	@app.post("/api/analysis/contingency")
	def analysis_contingency(req: ContingencyRequest, state: SessionState = Depends(get_session)):
	df = _assoc_source(state, req.source)
	try:
	return analysis_service.contingency(df, req.col1, req.col2, drop_missing=req.drop_missing)
	except ValueError as e:
	raise HTTPException(status_code=400, detail=str(e))
	except Exception as e:
	raise HTTPException(status_code=500, detail=f"Contingency failed: {e}")


	# ---------------------------------------------------------------------------
	# Map and Magnetic Features
	# ---------------------------------------------------------------------------
	from fastapi.responses import HTMLResponse
	from api.services.map_service import MapService

	@app.get("/api/map/html")
	def get_map_html(state: SessionState = Depends(get_session)):
	if state.filtered_data is None or state.filtered_data.empty:
	return HTMLResponse(
	"<html><body style='background:#121212;color:white;display:flex;"
	"justify-content:center;align-items:center;height:100vh;font-family:sans-serif;'>"
	"<h3>No data to display. Please load and filter your dataset first.</h3>"
	"</body></html>"
	)
	try:
	html, from_cache = MapService.get_or_generate(state.filtered_data)
	headers = {"X-Map-Cache": "HIT" if from_cache else "MISS"}
	return HTMLResponse(content=html, headers=headers)
	except Exception as e:
	logger.error(f"Map Error: {traceback.format_exc()}")
	return HTMLResponse(
	f"<html><body style='background:#121212;color:red;'>"
	f"<h3>Map generation failed: {e}</h3></body></html>"
	)


	class MagneticRequest(BaseModel):
	lat_col: str
	lon_col: str
	date_col: str
	distance: int = 100

	def _fig_to_data_uri(fig, dpi: int = 80) -> str:
	"""Render a matplotlib figure to a base64-encoded PNG data URI on a dark background."""
	import io, base64
	import matplotlib.pyplot as plt
	buf = io.BytesIO()
	fig.savefig(buf, format="png", dpi=dpi, bbox_inches="tight", facecolor="#0d0d0d")
	plt.close(fig)
	buf.seek(0)
	return "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode("ascii")


	# Max events scanned per request (each match is a live BGS API call). Synchronous
	# endpoint, so this is kept small to bound request duration.
	MAGNETIC_MAX_EVENTS = 25
	MAGNETIC_TIME_BUDGET_S = 240


	@app.post("/api/magnetic/run")
	def run_magnetic(req: MagneticRequest, state: SessionState = Depends(get_session)):
	"""Cross-reference UAP events with geomagnetic observatory data.

	For each event with valid coordinates and a post-1995 date, find the nearest
	INTERMAGNET/BGS observatory within ``distance`` km, fetch its X/Y/Z/S minute
	data around the event, and render a time-series graph. Also produces a
	FastDTW-aligned aggregate across every matched event.
	"""
	if state.filtered_data is None or state.filtered_data.empty:
	raise HTTPException(status_code=400, detail="No data loaded. Load a dataset in the Data Explorer first.")

	df = state.filtered_data
	for label, col in (("Latitude", req.lat_col), ("Longitude", req.lon_col), ("Date", req.date_col)):
	if col not in df.columns:
	raise HTTPException(status_code=400, detail=f"{label} column '{col}' not found in dataset.")

	import sys
	proj_root = os.path.dirname(os.path.dirname(__file__))
	if proj_root not in sys.path:
	sys.path.insert(0, proj_root)
	try:
	import matplotlib
	matplotlib.use("Agg")
	except Exception:
	pass
	import magnetic

	started = time.time()

	# 1. Observatory list from the BGS API
	try:
	stations = magnetic.get_stations()
	st_lat = pd.to_numeric(stations["Latitude"], errors="coerce").to_numpy()
	st_lon = pd.to_numeric(stations["Longitude"], errors="coerce").to_numpy()
	except Exception as e:
	logger.error(f"Magnetic: station fetch failed: {traceback.format_exc()}")
	raise HTTPException(status_code=502, detail=f"Could not reach the BGS observatory API: {e}")

	# 2. Candidate events: numeric coordinates + parseable, post-1995 dates
	work = pd.DataFrame({
	"lat": pd.to_numeric(df[req.lat_col], errors="coerce"),
	"lon": pd.to_numeric(df[req.lon_col], errors="coerce"),
	"date": df[req.date_col].apply(magnetic.parse_uap_date),
	}).dropna(subset=["lat", "lon", "date"])
	work = work[work["lat"].between(-90, 90) & work["lon"].between(-180, 180)]
	work = work[work["date"].dt.year >= 1995]
	candidates = len(work)
	if candidates == 0:
	raise HTTPException(
	status_code=400,
	detail="No events with valid coordinates and a post-1995 date. Check the selected columns.",
	)

	# 3. Per-event: nearest observatory -> fetch data -> render graph
	graphs, agg_data, agg_times = [], [], []
	scanned = skipped_no_station = skipped_no_data = 0

	for lat_, lon_, date_ in work[["lat", "lon", "date"]].itertuples(index=False):
	if scanned >= MAGNETIC_MAX_EVENTS or (time.time() - started) > MAGNETIC_TIME_BUDGET_S:
	break
	scanned += 1

	dists = np.array([
	magnetic.get_haversine_distance(lat_, lon_, a, b)
	for a, b in zip(st_lat, st_lon)
	])
	nearest = int(np.nanargmin(dists))
	if not np.isfinite(dists[nearest]) or dists[nearest] > req.distance:
	skipped_no_station += 1
	continue
	station = stations.iloc[nearest]
	iaga = str(station["IagaCode"])

	d = pd.Timestamp(date_)
	if d.tz is not None:
	d = d.tz_localize(None)
	start = (d - pd.Timedelta(hours=12)).strftime("%Y-%m-%d")
	end = (d + pd.Timedelta(hours=48)).strftime("%Y-%m-%d")

	try:
	res = magnetic.get_data(iaga, start, end)
	except Exception:
	res = None
	dt = (res or {}).get("datetime") or []
	if not dt:
	skipped_no_data += 1
	continue

	n = len(dt)
	comp = {}
	for k in ("X", "Y", "Z", "S"):
	vals = res.get(k) or []
	comp[k] = (
	pd.to_numeric(pd.Series(vals), errors="coerce").to_numpy()
	if len(vals) == n else np.full(n, np.nan)
	)
	if not any(np.isfinite(v).any() for v in comp.values()):
	skipped_no_data += 1
	continue

	dt_naive = pd.to_datetime(pd.Series(dt), utc=True, errors="coerce").dt.tz_localize(None)
	plotted = pd.DataFrame({"datetime": dt_naive, **comp})

	subtitle = (
	f"{d.date()} \| {lat_:.3f}, {lon_:.3f} \| "
	f"{station['Name']} ({iaga}) \| {dists[nearest]:.0f} km"
	)
	try:
	fig = magnetic.plot_data_custom(plotted.copy(), date=d, save_path=None, subtitle=subtitle)
	graphs.append({
	"title": f"{d.date()} — {station['Name']}",
	"station": str(station["Name"]),
	"iaga": iaga,
	"distance_km": round(float(dists[nearest]), 1),
	"event_date": str(d),
	"image": _fig_to_data_uri(fig),
	})
	except Exception:
	logger.error(f"Magnetic: plot failed for {iaga} {d}: {traceback.format_exc()}")
	skipped_no_data += 1
	continue

	agg = plotted.copy()
	agg["datetime"] = agg["datetime"].dt.tz_localize("UTC")
	agg_data.append(agg)
	agg_times.append(d.tz_localize("UTC"))

	# 4. FastDTW-aligned aggregate across every matched event
	aggregate = None
	if len(agg_data) >= 2:
	try:
	fig = magnetic.plot_average_timeseries_with_dtw(agg_data, agg_times, window_hours=12, save_path=None)
	aggregate = {
	"title": f"FastDTW-aligned average across {len(agg_data)} events",
	"image": _fig_to_data_uri(fig),
	}
	except Exception:
	logger.error(f"Magnetic: DTW aggregate failed: {traceback.format_exc()}")

	return {
	"status": "ok",
	"candidates": candidates,
	"scanned": scanned,
	"matched": len(graphs),
	"skipped_no_station": skipped_no_station,
	"skipped_no_data": skipped_no_data,
	"distance_km": req.distance,
	"elapsed_s": round(time.time() - started, 1),
	"graphs": graphs,
	"aggregate": aggregate,
	"events": len(graphs),
	"message": (
	f"Scanned {scanned} of {candidates} candidate events within {req.distance} km — "
	f"{len(graphs)} produced graphs, {skipped_no_station} had no observatory in range, "
	f"{skipped_no_data} returned no usable data."
	),
	}