Spaces:
Sleeping
Sleeping
# streamlit_app.py
"""
Streamlit app: preprocessing (3.2), descriptive analysis (3.3), stationarity tests (3.4),
lag / rolling-feature generation (3.5), ACF/PACF (3.6), decomposition (3.7) and
export / web interface (3.8).

Usage:
    pip install pandas numpy streamlit pytz plotly matplotlib statsmodels scikit-learn
    streamlit run streamlit_app.py

Written for Dmitry: results are kept in st.session_state so that they do not
disappear when widgets change.
"""
| import os | |
| import io | |
| import base64 | |
| from typing import Optional, List, Tuple, Dict | |
| import numpy as np | |
| import pandas as pd | |
| import pytz | |
| import streamlit as st | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| import matplotlib.pyplot as plt | |
| from statsmodels.tsa.stattools import adfuller, kpss, acf as sm_acf, pacf as sm_pacf | |
| from statsmodels.graphics.tsaplots import plot_acf, plot_pacf | |
| from statsmodels.tsa.seasonal import seasonal_decompose | |
| from statsmodels.stats.outliers_influence import variance_inflation_factor | |
| from statsmodels.tools import add_constant | |
# Page-wide Streamlit settings: browser-tab title and wide layout.
st.set_page_config(page_title="TS Preprocess & EDA (3.2–3.8)", layout="wide")
# pytz timezone object for Europe/Moscow.
MOSCOW = pytz.timezone("Europe/Moscow")
| # ---------------- Utilities ---------------- | |
def detect_date_column(df: pd.DataFrame) -> Optional[str]:
    """Guess which column of *df* holds the time stamps.

    A column whose label contains one of the keywords ``date``, ``time``,
    ``timestamp``, ``dt`` or ``day`` wins (labels containing ``date`` are
    preferred).  Otherwise every non-numeric column is parsed as day-first
    dates and the one with the highest share of parsable values is returned,
    provided that share exceeds 50 %.

    Returns:
        The column label, or ``None`` when nothing looks like a date column
        (including a frame without columns).
    """
    keywords = ("date", "time", "timestamp", "dt", "day")
    # Labels are not guaranteed to be strings (e.g. integer headers).
    candidates = [c for c in df.columns if any(k in str(c).lower() for k in keywords)]
    if candidates:
        preferred = [c for c in candidates if "date" in str(c).lower()]
        return preferred[0] if preferred else candidates[0]

    scores = {}
    for c in df.columns:
        # Plain numbers always "parse" (as epoch offsets) and would win the vote.
        if pd.api.types.is_numeric_dtype(df[c]):
            continue
        parsed = pd.to_datetime(df[c], errors="coerce", dayfirst=True)
        scores[c] = parsed.notna().mean()
    if not scores:
        return None
    best, score = max(scores.items(), key=lambda item: item[1])
    return best if score > 0.5 else None
def try_parse_dates(series: pd.Series) -> pd.Series:
    """Parse *series* into datetimes, returning NaT where parsing fails.

    The two explicit formats used by the datasets (``YYYY-MM-DD`` and
    ``DD.MM.YYYY``) are tried first so that dotted dates are never read
    month-first; anything still unparsed goes through pandas' generic parser
    with ``dayfirst=True``, matching ``detect_date_column``.
    """
    if pd.api.types.is_datetime64_any_dtype(series):
        return series

    text = series.astype(str)
    parsed = pd.to_datetime(text, format="%Y-%m-%d", errors="coerce")
    parsed = parsed.fillna(pd.to_datetime(text, format="%d.%m.%Y", errors="coerce"))

    missing = parsed.isna()
    if missing.any():
        # Only the leftovers are handed to the generic parser, so its format
        # inference is not driven by values that were already parsed.
        generic = pd.to_datetime(text.where(missing), errors="coerce", dayfirst=True)
        if missing.all():
            # Nothing matched the explicit formats: keep the generic result as is
            # (it may be timezone-aware, which fillna could not merge).
            return generic
        parsed = parsed.fillna(generic)
    return parsed
def localize_to_moscow(ts: pd.Series, assume_tz: str = 'local') -> pd.Series:
    """Express *ts* in the Europe/Moscow timezone.

    Args:
        ts: Values convertible by ``pd.to_datetime`` (unparsable ones become NaT).
        assume_tz: How to interpret tz-naive values: ``'utc'`` treats them as
            UTC and converts, ``'local'`` treats them as Moscow wall-clock
            time, any other value leaves them tz-naive.

    Returns:
        The converted series.  Already tz-aware input is always converted.

    Wall-clock times that never existed (spring-forward gap) are shifted
    forward to the next valid instant and ambiguous ones (autumn fold) become
    NaT instead of raising.
    """
    ts = pd.to_datetime(ts, errors='coerce')
    if ts.dt.tz is not None:
        return ts.dt.tz_convert('Europe/Moscow')
    if assume_tz == 'utc':
        return ts.dt.tz_localize('UTC').dt.tz_convert('Europe/Moscow')
    if assume_tz == 'local':
        return ts.dt.tz_localize(
            'Europe/Moscow', ambiguous='NaT', nonexistent='shift_forward'
        )
    return ts
def detect_outliers_iqr(col: pd.Series) -> pd.Series:
    """Return a boolean mask of values outside the 1.5 * IQR fences of *col*."""
    first_quartile = col.quantile(0.25)
    third_quartile = col.quantile(0.75)
    margin = 1.5 * (third_quartile - first_quartile)
    too_low = col < first_quartile - margin
    too_high = col > third_quartile + margin
    return too_low | too_high
def winsorize_series(col: pd.Series, lower_q: float = 0.01, upper_q: float = 0.99) -> pd.Series:
    """Clip *col* to the values found at its *lower_q* and *upper_q* quantiles."""
    bounds = col.quantile([lower_q, upper_q]).to_numpy()
    return col.clip(lower=bounds[0], upper=bounds[1])
| # ---------------- Preprocessing (3.2) ---------------- | |
def _fill_numeric_gaps(frame: pd.DataFrame, num_cols: List[str], strategy: str) -> pd.DataFrame:
    """Drop or fill missing numeric values of *frame* according to *strategy*."""
    if strategy == 'drop':
        return frame.dropna(subset=num_cols).reset_index(drop=True)
    if strategy == 'interpolate':
        indexed = frame.set_index('timestamp')
        indexed[num_cols] = indexed[num_cols].interpolate(method='time', limit_direction='both')
        return indexed.reset_index()
    if strategy == 'rolling':
        for c in num_cols:
            frame[c] = frame[c].fillna(frame[c].rolling(window=7, min_periods=1).mean())
        return frame
    raise ValueError('unknown numeric_missing_strategy')


def _handle_outliers(frame: pd.DataFrame, num_cols: List[str], strategy: str) -> pd.DataFrame:
    """Treat IQR outliers in the numeric columns of *frame* according to *strategy*."""
    if strategy == 'mark':
        # Outliers are only counted in the info dict; the data is left untouched.
        return frame
    if strategy == 'interpolate':
        indexed = frame.set_index('timestamp')
        for c in num_cols:
            # Series.mask upcasts integer columns cleanly when NaN is inserted.
            indexed[c] = indexed[c].mask(detect_outliers_iqr(indexed[c]))
        indexed[num_cols] = indexed[num_cols].interpolate(method='time', limit_direction='both')
        return indexed.reset_index()
    if strategy == 'winsorize':
        for c in num_cols:
            frame[c] = winsorize_series(frame[c])
        return frame
    if strategy == 'drop':
        # All fences are computed on the same frame so the outcome does not
        # depend on the order of the columns.
        flagged = pd.Series(False, index=frame.index)
        for c in num_cols:
            flagged |= detect_outliers_iqr(frame[c])
        return frame.loc[~flagged].reset_index(drop=True)
    raise ValueError('unknown outlier_strategy')


def _resample_frame(frame: pd.DataFrame, num_cols: List[str], cat_cols: List[str], freq: str) -> pd.DataFrame:
    """Resample *frame* on its 'timestamp' column: count-like columns are summed, others averaged."""
    sum_markers = ('case', 'count', 'death', 'new', 'confirmed', 'positive', 'tests')
    indexed = frame.set_index('timestamp')
    agg = {
        c: 'sum' if any(k in str(c).lower() for k in sum_markers) else 'mean'
        for c in num_cols
    }
    resampler = indexed.resample(freq)
    # agg({}) is not usable, so fall back to an empty frame on the resampled index.
    res = resampler.agg(agg) if agg else pd.DataFrame(index=resampler.size().index)
    for c in cat_cols:
        res[c] = indexed[c].resample(freq).first()
    return res.reset_index()


def preprocess_timeseries(
    df: pd.DataFrame,
    date_col: str,
    tz_assume: str = 'local',
    numeric_missing_strategy: str = 'interpolate',
    cat_missing_strategy: str = 'mode',
    outlier_strategy: str = 'interpolate',
    resample_freq: Optional[str] = None,
) -> Tuple[pd.DataFrame, Dict]:
    """Clean a raw table into a time-ordered dataset with a 'timestamp' column.

    Steps: parse *date_col*, localise to Moscow time, drop rows without a
    timestamp, sort and de-duplicate by timestamp, fill missing values, treat
    IQR outliers and optionally resample.

    Args:
        df: Input table; it is not modified.
        date_col: Column holding the raw time stamps.
        tz_assume: Passed to ``localize_to_moscow`` for tz-naive stamps.
        numeric_missing_strategy: ``'interpolate'``, ``'drop'`` or ``'rolling'``.
        cat_missing_strategy: ``'mode'`` fills with the most frequent value,
            anything else fills with ``'unknown'``.
        outlier_strategy: ``'interpolate'``, ``'winsorize'``, ``'drop'`` or ``'mark'``.
        resample_freq: pandas offset alias, or ``None`` to keep the original grid.

    Returns:
        ``(clean_frame, info)`` where *info* reports parse success, dropped
        rows, column roles, missing counts, outlier counts and the final shape.

    Raises:
        KeyError: *date_col* is not a column of *df*.
        ValueError: unknown missing-value or outlier strategy.

    Note:
        The final 'timestamp' column is always localised/converted to
        Europe/Moscow, also when *tz_assume* left it tz-naive earlier.
    """
    if date_col not in df.columns:
        raise KeyError(f"date column {date_col!r} not found in the input frame")

    info: Dict = {}
    df2 = df.copy()
    parsed = try_parse_dates(df2[date_col])
    info['parse_success'] = float(parsed.notna().mean())
    df2['timestamp'] = localize_to_moscow(parsed, assume_tz=tz_assume)

    before = len(df2)
    df2 = df2.dropna(subset=['timestamp']).reset_index(drop=True)
    info['dropped_no_timestamp'] = before - len(df2)
    # A stable sort makes "keep the first duplicate" deterministic.
    df2 = (
        df2.sort_values('timestamp', kind='mergesort')
        .drop_duplicates(subset=['timestamp'])
        .reset_index(drop=True)
    )

    num_cols = df2.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = [c for c in df2.columns if c not in num_cols and c != 'timestamp' and c != date_col]
    info['num_cols'] = num_cols
    info['cat_cols'] = cat_cols
    info['missing_before'] = df2[num_cols].isna().sum().to_dict()

    df2 = _fill_numeric_gaps(df2, num_cols, numeric_missing_strategy)
    for c in cat_cols:
        if cat_missing_strategy == 'mode':
            mode = df2[c].mode()
            fill = mode[0] if not mode.empty else 'unknown'
            df2[c] = df2[c].fillna(fill)
        else:
            df2[c] = df2[c].fillna('unknown')
    info['missing_after'] = df2[num_cols].isna().sum().to_dict()

    info['outlier_summary'] = [
        {'column': c, 'iqr_count': int(detect_outliers_iqr(df2[c]).sum())}
        for c in num_cols
    ]
    df2 = _handle_outliers(df2, num_cols, outlier_strategy)

    if resample_freq is not None:
        df2 = _resample_frame(df2, num_cols, cat_cols, resample_freq)

    if 'timestamp' in df2.columns:
        ts = pd.to_datetime(df2['timestamp'])
        if ts.dt.tz is None:
            df2['timestamp'] = ts.dt.tz_localize('Europe/Moscow')
        else:
            df2['timestamp'] = ts.dt.tz_convert('Europe/Moscow')
    info['final_shape'] = df2.shape
    return df2, info
| # ---------------- Descriptive (3.3) ---------------- | |
def descriptive_statistics(df: pd.DataFrame, numeric_cols: List[str]) -> pd.DataFrame:
    """Summarise the listed columns of *df*.

    Returns one row per column (indexed by ``column``) with count, mean,
    median, std, min, quartiles, max, skew, kurtosis and the share of missing
    values.  Statistics of a column without any non-missing value are ``None``.
    An empty *numeric_cols* yields an empty table with the same columns.
    """
    value_stats = ('mean', 'median', 'std', 'min', 'q1', 'q3', 'max', 'skew', 'kurtosis')
    rows = []
    for c in numeric_cols:
        s = df[c].dropna()
        row = {'column': c, 'count': int(s.count())}
        if s.empty:
            row.update(dict.fromkeys(value_stats))
        else:
            row.update(
                mean=float(s.mean()),
                median=float(s.median()),
                std=float(s.std()),
                min=float(s.min()),
                q1=float(s.quantile(0.25)),
                q3=float(s.quantile(0.75)),
                max=float(s.max()),
                skew=float(s.skew()),
                kurtosis=float(s.kurtosis()),
            )
        row['missing_pct'] = float(df[c].isna().mean())
        rows.append(row)
    # Explicit columns keep set_index working when there are no rows at all.
    columns = ['column', 'count', *value_stats, 'missing_pct']
    return pd.DataFrame(rows, columns=columns).set_index('column')
| # ---------------- Stationarity (3.4) helpers ---------------- | |
def run_adf(series: pd.Series) -> Dict:
    """Run ``adfuller`` (autolag='AIC') on the non-missing values of *series*.

    Returns a dict with ``statistic``, ``pvalue``, ``usedlag`` and ``nobs``,
    or ``{'error': message}`` when the test raises.
    """
    field_names = ('statistic', 'pvalue', 'usedlag', 'nobs')
    try:
        outcome = adfuller(series.dropna().values, autolag='AIC')
        return {name: outcome[pos] for pos, name in enumerate(field_names)}
    except Exception as exc:
        return {'error': str(exc)}
def run_kpss(series: pd.Series) -> Dict:
    """Run ``kpss`` (nlags='auto') on the non-missing values of *series*.

    Returns a dict with ``statistic``, ``pvalue`` and ``nlags``, or
    ``{'error': message}`` when the test raises.
    """
    field_names = ('statistic', 'pvalue', 'nlags')
    try:
        outcome = kpss(series.dropna().values, nlags='auto')
        return {name: outcome[pos] for pos, name in enumerate(field_names)}
    except Exception as exc:
        return {'error': str(exc)}
| # ---------------- Lag & Rolling (3.5) ---------------- | |
def create_lags_and_rolls(df: pd.DataFrame, target: str, lags: List[int], roll_windows: List[int], extra_features: List[str] = None) -> pd.DataFrame:
    """Add lagged copies and rolling statistics to a copy of *df*.

    The frame is ordered by its 'timestamp' column.  For every lag a
    ``<col>_lag_<lag>`` column is added for *target* and then for each extra
    feature; for every window ``<target>_roll_mean_<w>`` and
    ``<target>_roll_std_<w>`` are added (``min_periods=1``).
    """
    frame = df.copy().set_index('timestamp').sort_index()

    # Target first, then the optional extra features, in the order given.
    lag_sources = [target]
    if extra_features:
        lag_sources.extend(extra_features)
    for source in lag_sources:
        for step in lags:
            frame[f'{source}_lag_{step}'] = frame[source].shift(step)

    for width in roll_windows:
        window = frame[target].rolling(window=width, min_periods=1)
        frame[f'{target}_roll_mean_{width}'] = window.mean()
        frame[f'{target}_roll_std_{width}'] = window.std()
    return frame.reset_index()
def compute_lag_correlations(df: pd.DataFrame, target: str, lags: List[int]) -> pd.DataFrame:
    """Correlate *target* with each of its ``<target>_lag_<lag>`` columns.

    Lags whose column is absent from *df* are skipped.  The result is indexed
    by ``lag_col`` with one column ``corr_with_target`` (``None`` when the
    correlation is undefined); it is empty when *target* or all lag columns
    are missing.
    """
    lag_cols = []
    if target in df.columns:
        lag_cols = [f'{target}_lag_{l}' for l in lags if f'{target}_lag_{l}' in df.columns]

    corr_rows = []
    for c in lag_cols:
        corr = df[[target, c]].dropna().corr().iloc[0, 1]
        corr_rows.append({'lag_col': c, 'corr_with_target': float(corr) if pd.notna(corr) else None})
    # Explicit columns keep set_index working when no lag column matched.
    return pd.DataFrame(corr_rows, columns=['lag_col', 'corr_with_target']).set_index('lag_col')
def compute_vif(df: pd.DataFrame, features: List[str]) -> pd.DataFrame:
    """Compute the variance inflation factor of each feature.

    Rows with a missing value in any feature are dropped first.  The result is
    indexed by ``feature`` with a single ``VIF`` column; the value is ``None``
    when no complete row exists or when the computation fails for a feature.
    """
    if not features:
        return pd.DataFrame({'feature': [], 'VIF': []}).set_index('feature')

    X = df[features].dropna()
    if X.shape[0] == 0:
        return pd.DataFrame({'feature': features, 'VIF': [None] * len(features)}).set_index('feature')

    # has_constant='add' guarantees the intercept is prepended at position 0 even
    # when a feature is itself constant; with the default 'skip' nothing would be
    # added and the i + 1 offset below would address the wrong column.
    design = add_constant(X, has_constant='add').values
    vif_vals = []
    for i, col in enumerate(X.columns):
        try:
            v = variance_inflation_factor(design, i + 1)
        except Exception:
            # Best effort: a feature that cannot be regressed gets no VIF.
            v = np.nan
        vif_vals.append({'feature': col, 'VIF': float(v) if pd.notna(v) else None})
    return pd.DataFrame(vif_vals).set_index('feature')
| # ---------------- ACF/PACF helpers (3.6) ---------------- | |
def get_acf_pacf_with_conf(series: pd.Series, nlags: int = 40, alpha: float = 0.05):
    """Return ``(acf, acf_confint, pacf, pacf_confint)`` for *series*.

    Missing values are dropped.  Because the partial autocorrelation is only
    defined for lags below half the sample size, *nlags* is reduced to the
    largest supported value when necessary, and the same lag count is used for
    both estimators so that the outputs line up.

    Raises:
        ValueError: the series is too short for even a single lag.
    """
    values = series.dropna().values
    max_supported = len(values) // 2 - 1
    if max_supported < 1:
        raise ValueError(
            f'need at least 4 observations for ACF/PACF, got {len(values)}'
        )
    nlags = min(int(nlags), max_supported)
    acf_vals, acf_confint = sm_acf(values, nlags=nlags, alpha=alpha)
    pacf_vals, pacf_confint = sm_pacf(values, nlags=nlags, alpha=alpha)
    return acf_vals, acf_confint, pacf_vals, pacf_confint
def significant_lags_from_conf(vals: np.ndarray, confint: np.ndarray) -> List[int]:
    """List the lags (>= 1) whose confidence interval excludes zero.

    *confint* holds one ``(lower, upper)`` pair per lag, centred on the
    estimate itself (the layout returned by statsmodels' ``acf``/``pacf``), so
    an estimate can never fall outside its own interval; a lag is significant
    when the whole interval lies on one side of zero.
    """
    usable = min(len(vals), len(confint))
    return [
        lag
        for lag in range(1, usable)
        if confint[lag][0] > 0 or confint[lag][1] < 0
    ]
def _correlogram_figure(values, conf, max_lag, trace_name, title):
    """Build one bar chart of *values* per lag with a zero-centred confidence band."""
    lags = list(range(len(values)))[: int(max_lag) + 1]
    fig = go.Figure()
    fig.add_trace(go.Bar(x=lags, y=values[:len(lags)], name=trace_name))
    if conf is not None and len(conf) >= len(lags):
        # The intervals are centred on the estimates; subtracting the estimate
        # re-centres the band on zero so bars crossing it are the significant ones.
        lower = [conf[i][0] - values[i] for i in lags]
        upper = [conf[i][1] - values[i] for i in lags]
        fig.add_trace(go.Scatter(x=lags, y=upper, mode='lines', line=dict(width=1), name='conf_upper'))
        fig.add_trace(go.Scatter(x=lags, y=lower, mode='lines', line=dict(width=1), name='conf_lower'))
    fig.update_layout(title=title, xaxis_title='lag')
    return fig


def plotly_acf_pacf(acf_vals, acf_conf, pacf_vals, pacf_conf, max_lag, title_prefix=''):
    """Return ``(acf_figure, pacf_figure)`` as Plotly bar charts.

    Lags 0..*max_lag* are shown.  When confidence intervals are supplied for
    all plotted lags they are drawn as two lines forming a band around zero.
    """
    acf_fig = _correlogram_figure(acf_vals, acf_conf, max_lag, 'ACF', f'{title_prefix} ACF')
    pacf_fig = _correlogram_figure(pacf_vals, pacf_conf, max_lag, 'PACF', f'{title_prefix} PACF')
    return acf_fig, pacf_fig
| # ---------------- Report generation (3.8 helpers) ---------------- | |
def generate_html_report(
    df: pd.DataFrame,
    target: str,
    features: List[str],
    params: Dict,
    figs: Dict[str, object],
    tables: Dict[str, pd.DataFrame]
) -> str:
    """Assemble a standalone HTML report from figures and tables.

    Args:
        df: Accepted for interface compatibility; not used here.
        target: Name of the analysed column, shown in the heading.
        features: Accepted for interface compatibility; not used here.
        params: Settings echoed in the report.
        figs: Objects exposing ``to_html(full_html=..., include_plotlyjs=...)``.
            Recognised keys: ``series``, ``decomp`` and ``decomp_observed`` /
            ``decomp_trend`` / ``decomp_seasonal`` / ``decomp_resid``, ``corr``,
            ``acf`` together with ``pacf``.
        tables: Heading -> table, rendered in insertion order.

    Returns:
        The HTML document as a string.

    Text originating from the data (target, params, table names) is
    HTML-escaped.  Each decomposition figure is optional and rendered only
    when present.
    """
    # Local import keeps this block self-contained.
    from html import escape

    def render(key: str) -> str:
        return figs[key].to_html(full_html=False, include_plotlyjs='cdn')

    parts = [
        f"<h1>Отчёт по временным рядам — target: {escape(str(target))}</h1>",
        f"<p>Параметры: {escape(str(params))}</p>",
    ]
    # include time series fig
    if 'series' in figs:
        parts.append('<h2>Временной ряд</h2>')
        parts.append(render('series'))

    decomp_order = ('decomp', 'decomp_observed', 'decomp_trend', 'decomp_seasonal', 'decomp_resid')
    # A key counts only if it really holds a figure (a bare flag is ignored).
    decomp_keys = [k for k in decomp_order if hasattr(figs.get(k), 'to_html')]
    if decomp_keys:
        parts.append('<h2>Декомпозиция</h2>')
        parts.extend(render(k) for k in decomp_keys)

    if 'corr' in figs:
        parts.append('<h2>Матрица корреляций</h2>')
        parts.append(render('corr'))
    if 'acf' in figs and 'pacf' in figs:
        parts.append('<h2>ACF / PACF</h2>')
        parts.append(render('acf'))
        parts.append(render('pacf'))

    # tables (DataFrame.to_html escapes cell contents itself)
    for name, table in tables.items():
        parts.append(f'<h3>{escape(str(name))}</h3>')
        parts.append(table.to_html(classes="table table-striped", index=True))

    return '<html><head><meta charset="utf-8"></head><body>' + ''.join(parts) + '</body></html>'
| # ---------------- Streamlit UI ---------------- | |
# Page heading.
st.title("Временные ряды — предобработка, EDA, стационарность, лаги, ACF/PACF, декомпозиция и экспорт (3.2–3.8)")
# Sidebar: data source and preprocessing options. The selected values are
# passed to preprocess_timeseries further down.
st.sidebar.header("Настройки")
uploaded_file = st.sidebar.file_uploader("Загрузите CSV/Parquet", type=['csv', 'parquet'])
# small built-in example option (uses local file if present)
sample_option = None
if os.path.exists('russia_covid_dataset.csv'):
    sample_option = 'russia_covid_dataset.csv'
sample_choice = st.sidebar.selectbox('Или выбрать предзагруженный пример', options=[None, sample_option] if sample_option else [None])
# How tz-naive timestamps are interpreted; format_func maps the internal codes
# to the labels shown to the user.
tz_assume = st.sidebar.selectbox("Как трактовать tz-naive метки?",
                                 options=['local', 'utc', 'keep'], index=0,
                                 format_func=lambda x: {'local': 'локально (Europe/Moscow)', 'utc': 'UTC->Moscow', 'keep': 'не трогать'}[x])
numeric_missing_strategy = st.sidebar.selectbox("Заполнение пропусков (числ.)", options=['interpolate', 'drop', 'rolling'], index=0)
cat_missing_strategy = st.sidebar.selectbox("Заполнение пропусков (категор.)", options=['mode', 'unknown'], index=0)
outlier_strategy = st.sidebar.selectbox("Обработка выбросов", options=['interpolate', 'winsorize', 'drop', 'mark'], index=0)
# index=1 makes daily resampling ('D') the default; None disables resampling.
resample_freq = st.sidebar.selectbox("Ресемплить к частоте (если нужно)", options=[None, 'D', 'W', 'M'], index=1)
# load dataset and persist
if 'df_in' not in st.session_state:
    st.session_state['df_in'] = None


def _store_dataset(frame, source_id):
    """Remember *frame* and discard results computed from a previously loaded dataset."""
    st.session_state['df_in'] = frame
    st.session_state['df_source'] = source_id
    # Resetting these makes the preprocessing step run again for the new data.
    st.session_state['preprocessed'] = False
    st.session_state['df_clean'] = None
    st.session_state['df_lags'] = None


if uploaded_file is not None:
    # Name + size identify the upload; the file is parsed only when it changes
    # instead of on every rerun.
    upload_id = ('upload', uploaded_file.name, uploaded_file.size)
    if st.session_state['df_in'] is None or st.session_state.get('df_source') != upload_id:
        try:
            uploaded_file.seek(0)
            if uploaded_file.name.lower().endswith('.parquet'):
                df_in = pd.read_parquet(uploaded_file)
            else:
                df_in = pd.read_csv(uploaded_file, low_memory=False)
        except Exception as e:
            st.error(f"Ошибка загрузки: {e}")
            st.stop()
        _store_dataset(df_in, upload_id)
    df_in = st.session_state['df_in']
    st.success(f"Загружен файл: {uploaded_file.name} ({df_in.shape[0]}×{df_in.shape[1]})")
elif sample_choice:
    sample_id = ('sample', sample_choice)
    if st.session_state['df_in'] is None or st.session_state.get('df_source') != sample_id:
        _store_dataset(pd.read_csv(sample_choice, low_memory=False), sample_id)
    st.info(f"Выбран пример: {sample_choice}")
else:
    local_path = 'russia_covid_dataset.csv'
    if st.session_state['df_in'] is None and os.path.exists(local_path):
        _store_dataset(pd.read_csv(local_path, low_memory=False), ('local', local_path))
        st.info(f"Авто-загружен локальный файл {local_path}")
    elif st.session_state['df_in'] is None:
        st.info("Загрузите файл или поместите russia_covid_dataset.csv в рабочую папку.")
        st.stop()
df_in = st.session_state['df_in']
st.subheader("Preview входного датасета")
st.dataframe(df_in.head(8))
# detect date column (the user can override the guess)
detected = detect_date_column(df_in)
col_for_date = st.text_input("Колонка с временной меткой", value=detected if detected else "")
if not col_for_date:
    st.error("Укажите колонку с временной меткой.")
    st.stop()
# Run buttons
col1, col2 = st.columns([1, 1])
with col1:
    run_btn = st.button("Run Preprocessing")
with col2:
    force_btn = st.button("Force Recompute (пересчитать)")
# session keys
st.session_state.setdefault('preprocessed', False)
st.session_state.setdefault('df_clean', None)
st.session_state.setdefault('info', {})
st.session_state.setdefault('df_lags', None)
# Preprocess on request, or automatically when nothing has been computed yet.
if run_btn or force_btn or (not st.session_state['preprocessed'] and st.session_state['df_clean'] is None):
    # A mistyped column name would otherwise surface as a raw KeyError traceback.
    if col_for_date not in df_in.columns:
        st.error(f"Колонка «{col_for_date}» не найдена в датасете.")
        st.stop()
    df_clean, info = preprocess_timeseries(
        df_in,
        date_col=col_for_date,
        tz_assume=tz_assume,
        numeric_missing_strategy=numeric_missing_strategy,
        cat_missing_strategy=cat_missing_strategy,
        outlier_strategy=outlier_strategy,
        resample_freq=resample_freq,
    )
    st.session_state['df_clean'] = df_clean
    st.session_state['info'] = info
    st.session_state['preprocessed'] = True
# Main UI after preprocess: nothing below makes sense without a cleaned dataset.
if not st.session_state.get('preprocessed'):
    st.stop()
df_clean = st.session_state['df_clean']
info = st.session_state['info']
st.subheader("Финальный датасет (первые строки)")
st.dataframe(df_clean.head(10))
# Summary figures come from the info dict produced by the preprocessing step.
final_shape = info.get('final_shape')
parse_share = info.get('parse_success', 0)
dropped_rows = info.get('dropped_no_timestamp', 0)
st.markdown(f"**Размер до/после:** {df_in.shape} → {final_shape}")
st.markdown(f"**Доля распарсенных дат:** {parse_share:.2%}")
st.markdown(f"**Удалено строк без даты:** {dropped_rows}")
st.download_button(
    "Скачать final_dataset.csv",
    data=df_clean.to_csv(index=False).encode('utf-8'),
    file_name='final_dataset.csv',
    mime='text/csv',
)
# 3.3 Descriptive statistics and plots for the cleaned dataset.
# NOTE(review): the source's indentation was lost; the nesting below is
# reconstructed from the control flow - confirm against the original file.
st.header("Этап 3.3 — Описательная статистика и визуализация")
# numeric_cols is reused by the later stages (3.4 - 3.7).
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
if not numeric_cols:
    st.warning("Нет числовых колонок для анализа.")
else:
    stats_df = descriptive_statistics(df_clean, numeric_cols)
    st.subheader("Дескриптивная статистика")
    st.dataframe(stats_df)
    st.subheader("Гистограммы / Boxplot / Pairwise")
    # The first three numeric columns are preselected.
    sel = st.multiselect("Выбрать колонки для графиков", numeric_cols, default=numeric_cols[:3])
    for c in sel:
        # Histogram on the left, box plot on the right.
        c1, c2 = st.columns(2)
        with c1:
            fig = px.histogram(df_clean, x=c, nbins=60, title=f'Histogram: {c}')
            st.plotly_chart(fig, use_container_width=True)
        with c2:
            figb = go.Figure()
            figb.add_trace(go.Box(y=df_clean[c], name=c))
            st.plotly_chart(figb, use_container_width=True)
    if len(sel) >= 2:
        st.subheader("Scatter matrix")
        # At most six of the selected columns are shown.
        figm = px.scatter_matrix(df_clean, dimensions=sel[:6], title='Scatter matrix (часть признаков)')
        st.plotly_chart(figm, use_container_width=True)
    st.subheader("Матрица корреляций")
    corr_method = st.selectbox("Тип корреляции", options=['pearson', 'spearman'], index=0)
    corr = df_clean[numeric_cols].corr(method=corr_method)
    figc = px.imshow(corr, text_auto=True, title=f'Correlation ({corr_method})')
    st.plotly_chart(figc, use_container_width=True)
# 3.4 Stationarity: rolling diagnostics, ADF/KPSS tests and differencing.
# NOTE(review): the source's indentation was lost; nesting is reconstructed.
st.header("Этап 3.4 — Проверка на стационарность (ADF/KPSS) и визуальная диагностика")
if not numeric_cols:
    st.info("Нет числовых колонок для тестов.")
else:
    station_target = st.selectbox("Выберите колонку для тестов", options=numeric_cols, index=0, key='station_target')
    window1 = st.number_input("Окно rolling mean/std (точки)", min_value=3, max_value=365, value=30)
    s = df_clean.set_index('timestamp')[station_target].dropna()
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=s.index, y=s.values, name='series'))
    roll_mean = s.rolling(window=window1, min_periods=1).mean()
    roll_std = s.rolling(window=window1, min_periods=1).std()
    fig.add_trace(go.Scatter(x=roll_mean.index, y=roll_mean.values, name=f'rolling_mean_{window1}'))
    fig.update_layout(title=f'Series & rolling mean ({station_target})', height=400)
    st.plotly_chart(fig, use_container_width=True)
    fig2 = go.Figure()
    fig2.add_trace(go.Scatter(x=roll_std.index, y=roll_std.values, name=f'rolling_std_{window1}'))
    fig2.update_layout(title=f'Rolling std ({station_target})', height=300)
    st.plotly_chart(fig2, use_container_width=True)

    if st.button("Run stationarity tests"):
        adf_res = run_adf(s)
        kpss_res = run_kpss(s)
        alpha = 0.05
        # ADF: H0 = unit root, so a small p-value indicates stationarity.
        # KPSS: H0 = stationarity, so a large p-value indicates stationarity.
        adf_stationary = ('pvalue' in adf_res) and (adf_res['pvalue'] < alpha)
        kpss_stationary = ('pvalue' in kpss_res) and (kpss_res['pvalue'] > alpha)
        st.subheader("Результаты тестов")
        st.write("ADF:", adf_res)
        st.write("KPSS:", kpss_res)
        st.markdown(f"Интерпретация при α={alpha}: ")
        st.write(f"- ADF говорит, что ряд {'стационарен' if adf_stationary else 'НЕ стационарен'} (p={adf_res.get('pvalue','?')})")
        st.write(f"- KPSS говорит, что ряд {'стационарен' if kpss_stationary else 'НЕ стационарен'} (p={kpss_res.get('pvalue','?')})")
        if adf_stationary and kpss_stationary:
            st.success("Оба теста согласны: ряд, скорее всего, стационарен.")
        elif (not adf_stationary) and (not kpss_stationary):
            st.warning("Оба теста указывают на нестационарность → рекомендуем дифференцирование / детренд / лог-трансформацию.")
        else:
            st.info("Тесты противоречат друг другу — смотрите графики rolling mean/std и пробуйте трансформации (log/diff).")

    st.subheader("Применить дифференцирование и повторить тесты")
    diff_order = st.number_input("Порядок дифференцирования (целое >=1)", min_value=1, max_value=5, value=1, step=1)
    if st.button("Apply diff & Re-test"):
        # Differencing of order d means applying the first difference d times;
        # Series.diff(periods=d) would be a single lag-d difference instead.
        s_diff = s
        for _ in range(int(diff_order)):
            s_diff = s_diff.diff()
        s_diff = s_diff.dropna()
        adf_res = run_adf(s_diff)
        kpss_res = run_kpss(s_diff)
        st.write(f"Результаты для {diff_order}-го диффа:")
        st.write("ADF:", adf_res)
        st.write("KPSS:", kpss_res)
        figd = px.line(x=s_diff.index, y=s_diff.values, title=f'Differenced series (order={diff_order})')
        st.plotly_chart(figd, use_container_width=True)

    # Saving mutates the session dataset, so it is a one-shot button: a
    # checkbox stays ticked across reruns and would either never fire (when
    # nested under a button) or difference the stored data again on every rerun.
    if st.button("Сохранить дифференцированный ряд в session (переопределит final_dataset)"):
        df_store = df_clean.copy()
        for _ in range(int(diff_order)):
            df_store[station_target] = df_store[station_target].diff()
        df_store = df_store.dropna(subset=[station_target]).reset_index(drop=True)
        st.session_state['df_clean'] = df_store
        st.success("Дифференцированный ряд сохранён в final_dataset (session).")
# 3.5 Lag & Rolling features
# NOTE(review): the source's indentation was lost; nesting is reconstructed.
def _parse_positive_ints(raw, fallback, label):
    """Parse a comma-separated list of positive integers typed by the user.

    Unparsable input falls back to *fallback* and non-positive entries are
    dropped (a zero or negative lag would copy current or future values into
    the features); in both cases the user is told instead of being silently
    overridden.
    """
    try:
        values = [int(tok.strip()) for tok in raw.split(',') if tok.strip()]
    except ValueError:
        st.warning(f"Не удалось разобрать «{label}» — используются значения по умолчанию {list(fallback)}.")
        return list(fallback)
    positive = [v for v in values if v > 0]
    if len(positive) != len(values):
        st.warning(f"Неположительные значения в «{label}» пропущены.")
    return positive


st.header("Этап 3.5 — Создание лагов и скользящих статистик")
if not numeric_cols:
    st.info("Нет числовых колонок для создания лагов.")
else:
    st.subheader("Параметры генерации лагов/скользящих")
    target_col = st.selectbox("Выберите целевую колонку (target)", options=numeric_cols, index=0, key='lag_target')
    default_lags = st.text_input("Список лагов через запятую (напр. 1,7,30)", value='1,7,30')
    default_rolls = st.text_input("Список окон для скользящих через запятую (напр. 7,30)", value='7,30')
    extra_feats_raw = st.text_input("Доп. признаки для лагов (через запятую), необязательно", value='')
    lags = _parse_positive_ints(default_lags, [1, 7, 30], 'лаги')
    rolls = _parse_positive_ints(default_rolls, [7, 30], 'окна')
    # Unknown column names are ignored.
    extra_feats = [x.strip() for x in extra_feats_raw.split(',') if x.strip()]
    extra_feats = [f for f in extra_feats if f in df_clean.columns]

    if st.button('Generate lags & rolls'):
        df_lags = create_lags_and_rolls(df_clean, target_col, lags, rolls, extra_features=extra_feats)
        st.session_state['df_lags'] = df_lags
        st.success(f'Создан датасет с лагами: shape={df_lags.shape}')

    if st.session_state.get('df_lags') is not None:
        df_lags = st.session_state['df_lags']
        st.subheader('Первые строки с лагами')
        st.dataframe(df_lags.head(10))

        # The stored frame may have been generated for another target or lag
        # list, so only columns that actually exist are analysed.
        lag_cols = [f'{target_col}_lag_{l}' for l in lags if f'{target_col}_lag_{l}' in df_lags.columns]
        st.subheader('Корреляция лагов с target')
        if target_col in df_lags.columns and lag_cols:
            corr_lags = compute_lag_correlations(df_lags, target_col, lags)
            st.dataframe(corr_lags)
        else:
            st.info('В сохранённом датасете нет лагов для выбранного target — сгенерируйте их заново.')

        st.subheader('Heatmap корреляций (лаги + target + дополнительные фичи)')
        numeric_in_lags = df_lags.select_dtypes(include=[np.number]).columns
        numeric_subset = ([target_col] if target_col in df_lags.columns else []) + lag_cols + [c for c in extra_feats if c in numeric_in_lags]
        if len(numeric_subset) >= 2:
            corr2 = df_lags[numeric_subset].corr()
            figh = px.imshow(corr2, text_auto=True, title='Lag correlations heatmap')
            st.plotly_chart(figh, use_container_width=True)

        st.subheader('Проверка мультиколлинеарности (VIF) для признаков с лагами')
        candidate_feats = st.multiselect('Выберите признаки для VIF (по умолчанию lag-колонки)', options=numeric_subset, default=lag_cols)
        if candidate_feats:
            vif_df = compute_vif(df_lags, candidate_feats)
            st.dataframe(vif_df)

        st.download_button('Скачать датасет с лагами (CSV)', data=df_lags.to_csv(index=False).encode('utf-8'), file_name='dataset_with_lags.csv', mime='text/csv')
        # One-shot button: a ticked checkbox would overwrite freshly
        # recomputed data with the old lag frame on every rerun.
        if st.button('Сохранить датасет с лагами в session (df_clean <- df_lags конвертировать)'):
            st.session_state['df_clean'] = df_lags
            st.success('final_dataset в session заменён на датасет с лагами.')
# 3.6 ACF / PACF
# NOTE(review): the source's indentation was lost; nesting is reconstructed.
st.header("Этап 3.6 — Анализ автокорреляции: ACF и PACF")
if not numeric_cols:
    st.info('Нет числовых колонок для ACF/PACF.')
else:
    acf_target = st.selectbox('Выберите колонку для ACF/PACF', options=numeric_cols, index=0, key='acf_target')
    max_lag = st.number_input('Максимальный лаг (nlags)', min_value=10, max_value=500, value=40, step=1)
    alpha = st.slider('Уровень значимости для доверительного интервала (alpha)', min_value=0.01, max_value=0.2, value=0.05, step=0.01)
    s_acf = df_clean.set_index('timestamp')[acf_target].dropna()
    # PACF is only available for lags below half the sample size, so the
    # requested lag count is capped instead of letting the plot call crash.
    usable_lag = min(int(max_lag), len(s_acf) // 2 - 1)
    if usable_lag < 1:
        st.warning('Недостаточно наблюдений для ACF/PACF.')
    else:
        if usable_lag < int(max_lag):
            st.info(f'Максимальный лаг уменьшен до {usable_lag}: для PACF требуется лаг меньше половины числа наблюдений ({len(s_acf)}).')
        acf_vals = pacf_vals = np.array([])
        acf_conf = pacf_conf = np.array([])
        fig_acf = fig_pacf = None
        try:
            acf_vals, acf_conf, pacf_vals, pacf_conf = get_acf_pacf_with_conf(s_acf, nlags=usable_lag, alpha=float(alpha))
            fig_acf = plt.figure(figsize=(10, 4))
            plot_acf(s_acf.values, lags=usable_lag, alpha=alpha, zero=True, title=f'ACF: {acf_target}', ax=fig_acf.gca())
            st.pyplot(fig_acf)
            fig_pacf = plt.figure(figsize=(10, 4))
            plot_pacf(s_acf.values, lags=usable_lag, alpha=alpha, method='ywm', title=f'PACF: {acf_target}', ax=fig_pacf.gca())
            st.pyplot(fig_pacf)
        except Exception as e:
            st.error(f'Ошибка при вычислении ACF/PACF: {e}')
            acf_vals = pacf_vals = np.array([])
            acf_conf = pacf_conf = np.array([])
        finally:
            # pyplot keeps every figure alive until it is closed explicitly;
            # without this each rerun leaks two figures.
            for _fig in (fig_acf, fig_pacf):
                if _fig is not None:
                    plt.close(_fig)

        sig_acf = significant_lags_from_conf(acf_vals, acf_conf) if acf_vals.size else []
        sig_pacf = significant_lags_from_conf(pacf_vals, pacf_conf) if pacf_vals.size else []
        st.subheader('Статистически значимые лаги (по доверительным интервалам)')
        st.write('ACF значимые лаги:', sig_acf)
        st.write('PACF значимые лаги:', sig_pacf)

        acf_rows = []
        for i in range(min(len(acf_vals), int(max_lag) + 1)):
            lower, upper = acf_conf[i] if acf_conf.size else (None, None)
            acf_rows.append({'lag': i, 'acf': float(acf_vals[i]), 'conf_low': float(lower) if lower is not None else None, 'conf_high': float(upper) if upper is not None else None})
        pacf_rows = []
        for i in range(min(len(pacf_vals), int(max_lag) + 1)):
            lower, upper = pacf_conf[i] if pacf_conf.size else (None, None)
            pacf_rows.append({'lag': i, 'pacf': float(pacf_vals[i]), 'conf_low': float(lower) if lower is not None else None, 'conf_high': float(upper) if upper is not None else None})
        # An empty row list has no 'lag' column to index by, so the tables
        # are shown only when values were actually computed.
        if acf_rows:
            st.subheader('ACF values (таблица)')
            st.dataframe(pd.DataFrame(acf_rows).set_index('lag'))
        if pacf_rows:
            st.subheader('PACF values (таблица)')
            st.dataframe(pd.DataFrame(pacf_rows).set_index('lag'))
        # Text corrected: an MA(q) order shows up as a sharp cut-off in the
        # ACF at lag q, not as a smooth decay.
        st.markdown('**Интерпретация (упрощённо):** - Резкий обрыв в PACF на лаге p → возможный порядок AR(p). - Резкий обрыв в ACF на лаге q → возможный порядок MA(q). - Лаги, доверительный интервал которых не включает ноль — статистически значимы.')
# 3.7 Decomposition
# NOTE(review): the source's indentation was lost; nesting is reconstructed.
st.header("Этап 3.7 — Декомпозиция временного ряда")
if not numeric_cols:
    st.info('Нет числовых колонок для декомпозиции.')
else:
    decomp_target = st.selectbox('Выберите колонку для декомпозиции', options=numeric_cols, index=0, key='decomp_target')
    model_choice = st.radio('Модель декомпозиции', options=['additive', 'multiplicative'], index=0)
    period_option = st.selectbox('Период сезонности (если известен)', options=['auto', '7', '30', '365', 'custom'], index=0)
    custom_period = None
    if period_option == 'custom':
        custom_period = st.number_input('Введите период (целое >1)', min_value=2, value=30, step=1)
    if period_option == 'auto':
        inferred = None
        try:
            tmp = df_clean.set_index('timestamp')[decomp_target].dropna()
            inferred_freq = pd.infer_freq(tmp.index)
            # infer_freq reports anchored codes such as 'W-SUN', so only the
            # part before the dash identifies the base frequency.
            base_freq = inferred_freq.split('-')[0] if inferred_freq else None
            if base_freq in ('D', 'B'):
                suggested = 7
            elif base_freq == 'W':
                suggested = 52
            elif base_freq in ('M', 'MS', 'ME'):
                suggested = 12
            else:
                suggested = None
            inferred = suggested
        except Exception:
            # Best effort: an index whose frequency cannot be inferred simply
            # leaves the period undefined.
            inferred = None
    else:
        inferred = int(period_option) if period_option in ('7', '30', '365') else None
    period = custom_period if custom_period is not None else inferred
    st.write(f'Выбранная модель: {model_choice}. Период: {period if period is not None else "не задан (нужен для правильной декомпозиции)"}.')

    if st.button('Run decomposition'):
        s = df_clean.set_index('timestamp')[decomp_target].dropna()
        if period is None:
            st.error('Период не определён. Укажите период (например 7 для недельной сезонности) или используйте custom.')
        elif len(s) < period * 2:
            st.error(f'Недостаточно точек для надёжной декомпозиции при периоде={period}. Нужно >= 2*period наблюдений. У вас {len(s)}.')
        else:
            try:
                decomp = seasonal_decompose(s, period=int(period), model=model_choice, extrapolate_trend='freq')
                st.session_state['decomp'] = decomp
                comp_df = pd.DataFrame({'timestamp': s.index, 'observed': decomp.observed, 'trend': decomp.trend, 'seasonal': decomp.seasonal, 'resid': decomp.resid}).reset_index(drop=True)
                st.session_state['decomp_df'] = comp_df
                st.subheader('Графики компонентов')
                st.plotly_chart(px.line(comp_df, x='timestamp', y='observed', title='Observed'), use_container_width=True)
                st.plotly_chart(px.line(comp_df, x='timestamp', y='trend', title='Trend'), use_container_width=True)
                st.plotly_chart(px.line(comp_df, x='timestamp', y='seasonal', title='Seasonal'), use_container_width=True)
                st.plotly_chart(px.line(comp_df, x='timestamp', y='resid', title='Residuals'), use_container_width=True)
                st.success('Декомпозиция выполнена и сохранена в сессии (decomp, decomp_df).')

                st.subheader('Анализ компонентов')
                trend_nonnull = comp_df['trend'].dropna()
                if len(trend_nonnull) > 2:
                    # Slope of a straight line fitted to the trend component.
                    xnum = np.arange(len(trend_nonnull))
                    coef = np.polyfit(xnum, trend_nonnull.values, 1)
                    slope = coef[0]
                    st.write(f'- Приблизительный линейный наклон тренда: {slope:.6f} ({"вырос" if slope>0 else "упал"}).')
                else:
                    st.write('- Слишком мало данных в компоненте trend для оценки наклона.')
                seasonal = comp_df['seasonal'].dropna()
                if not seasonal.empty:
                    amp = seasonal.max() - seasonal.min()
                    st.write(f'- Амплитуда сезонной компоненты: {amp:.4f} (max={seasonal.max():.4f}, min={seasonal.min():.4f}).')

                resid = comp_df['resid'].dropna()
                st.subheader('Диагностика остатков')
                st.write(f'- Длина остатков: {len(resid)}')
                if len(resid) > 3:
                    adf_r = run_adf(resid)
                    kpss_r = run_kpss(resid)
                    st.write('ADF (resid):', adf_r)
                    st.write('KPSS (resid):', kpss_r)
                    a_stat = ('pvalue' in adf_r) and (adf_r['pvalue'] < 0.05)
                    k_stat = ('pvalue' in kpss_r) and (kpss_r['pvalue'] > 0.05)
                    if a_stat and k_stat:
                        st.success('Остатки выглядят стационарными по ADF и KPSS — декомпозиция адекватна.')
                    else:
                        st.warning('Остатки, возможно, нестационарны. Посмотрите на график остатков и подумайте о дополнительных преобразованиях или изменении периода/модели.')
                else:
                    st.info('Недостаточно данных для тестов остатков.')
                st.download_button('Скачать компоненты (CSV)', data=comp_df.to_csv(index=False).encode('utf-8'), file_name='decomposition_components.csv', mime='text/csv')
            except Exception as e:
                st.error(f'Ошибка при декомпозиции: {e}')
    st.info('Этап 3.7 завершён. Дальше можно делать ACF/PACF на остатках, моделирование или формирование отчёта.')
# ---------------- 3.8 Web interface & report export ----------------
st.header('Этап 3.8 — Веб-интерфейс, конфигурация и экспорт отчёта')
st.markdown('Здесь собраны управляющие элементы для быстрой генерации HTML-отчёта и экспорта результатов. Отчёт включает: график ряда, скользящее среднее, матрицу корреляций, ACF/PACF и декомпозицию.')

# Unified controls for the report: target column, extra features, rolling
# window, ACF/PACF lag count and decomposition period.
with st.expander('Параметры для отчёта'):
    report_target = st.selectbox('Target для отчёта', options=numeric_cols, index=0)
    # The target itself is not offered as an extra feature: selecting it would
    # put the same column twice into the correlation matrix.
    feature_options = [c for c in numeric_cols if c != report_target]
    report_features = st.multiselect('Доп. признаки для отчёта (включаются в корреляции)', options=feature_options, default=feature_options[:2])
    report_roll = st.number_input('Окно для скользящего среднего в отчёте', min_value=2, max_value=365, value=30)
    report_acf_lags = st.number_input('nlags для ACF/PACF в отчёте', min_value=10, max_value=500, value=40)
    report_period = st.selectbox('Период для декомпозиции в отчёте', options=[None, 7, 30, 365], index=1)
if st.button('Сгенерировать и показать отчёт (вкладки ниже)'):
    # Figures are collected by name; a value of None means "not available".
    figs = {}

    # --- Time series with its rolling mean (computed once, reused for x and y) ---
    s = df_clean.set_index('timestamp')[report_target].dropna()
    roll_mean = s.rolling(window=report_roll, min_periods=1).mean()
    fig_series = go.Figure()
    fig_series.add_trace(go.Scatter(x=s.index, y=s.values, mode='lines', name='observed'))
    fig_series.add_trace(go.Scatter(x=roll_mean.index, y=roll_mean.values, mode='lines', name=f'roll_mean_{report_roll}'))
    fig_series.update_layout(title=f'Series: {report_target}', height=350)
    figs['series'] = fig_series

    # --- Correlation matrix ---
    # Drop the target from the extra features so it is not selected twice.
    extra_features = [c for c in report_features if c != report_target]
    corr_cols = [report_target] + extra_features
    corr_df = df_clean[corr_cols].corr()
    figs['corr'] = px.imshow(corr_df, text_auto=True, title='Correlation matrix')

    # --- Decomposition ---
    # NOTE: the components are taken as-is from the session (stored under
    # 'decomp_df'); they are not recomputed here for report_target / report_period.
    decomp_parts = (
        ('observed', 'Observed'),
        ('trend', 'Trend'),
        ('seasonal', 'Seasonal'),
        ('resid', 'Residuals'),
    )
    if 'decomp_df' in st.session_state:
        comp_df = st.session_state['decomp_df']
        for column, chart_title in decomp_parts:
            figs[f'decomp_{column}'] = px.line(comp_df, x='timestamp', y=column, title=chart_title)
    else:
        for column, _ in decomp_parts:
            figs[f'decomp_{column}'] = None

    # --- ACF / PACF (plotly version) ---
    # statsmodels' pacf only accepts nlags < nobs // 2, so the requested lag
    # count is clamped to what the series can support instead of failing.
    figs['acf'] = figs['pacf'] = None
    max_lag = min(int(report_acf_lags), len(s) // 2 - 1)
    if max_lag >= 1:
        try:
            acf_vals, acf_conf, pacf_vals, pacf_conf = get_acf_pacf_with_conf(s, nlags=max_lag, alpha=0.05)
            acf_fig, pacf_fig = plotly_acf_pacf(acf_vals, acf_conf, pacf_vals, pacf_conf, max_lag=max_lag, title_prefix=report_target)
            figs['acf'] = acf_fig
            figs['pacf'] = pacf_fig
        except Exception as exc:
            # Still best-effort (the report is built without ACF/PACF), but the
            # reason is shown instead of being silently discarded.
            st.warning(f'ACF/PACF не построены: {exc}')

    # --- Tables ---
    tables = {'Descriptive': descriptive_statistics(df_clean, corr_cols), 'Correlation': corr_df}

    # --- Presentation ---
    tab1, tab2, tab3 = st.tabs(['Графики', 'Таблицы', 'Экспорт'])
    with tab1:
        st.subheader('Временной ряд и rolling')
        st.plotly_chart(figs['series'], use_container_width=True)
        st.subheader('Матрица корреляций')
        st.plotly_chart(figs['corr'], use_container_width=True)
        if figs.get('decomp_observed') is not None:
            st.subheader('Декомпозиция')
            for column, _ in decomp_parts:
                st.plotly_chart(figs[f'decomp_{column}'], use_container_width=True)
        if figs.get('acf') is not None:
            st.subheader('ACF / PACF')
            st.plotly_chart(figs['acf'], use_container_width=True)
            st.plotly_chart(figs['pacf'], use_container_width=True)
    with tab2:
        st.subheader('Таблицы')
        for name, table in tables.items():
            st.write(name)
            st.dataframe(table)
    with tab3:
        st.subheader('Экспорт отчёта')
        params = {'roll': int(report_roll), 'acf_lags': int(report_acf_lags), 'period': report_period}
        html = generate_html_report(df_clean, report_target, extra_features, params, figs, tables)
        html_bytes = html.encode('utf-8')
        st.download_button('Скачать HTML-отчёт', data=html_bytes, file_name='ts_report.html', mime='text/html')
        # Optional PDF export: needs the pdfkit package and the wkhtmltopdf binary.
        try:
            import pdfkit
            # Passing False as the output path makes pdfkit return the PDF bytes.
            pdf_bytes = pdfkit.from_string(html, False)
            st.download_button('Скачать PDF-отчёт', data=pdf_bytes, file_name='ts_report.pdf',
                               mime='application/pdf')
        except Exception as exc:
            st.info(
                'PDF-конверсия недоступна (pdfkit/wkhtmltopdf не установлены). Скачайте HTML и конвертируйте локально, если нужно.')
            # Show the underlying cause too: the failure is not always a missing install.
            st.caption(f'{type(exc).__name__}: {exc}')