Commit ·
a84c47e
0
Parent(s):
Initial clean deploy
Browse files- .env.example +2 -0
- .gitignore +13 -0
- .python-version +1 -0
- Dockerfile +46 -0
- README.md +0 -0
- backtester.py +246 -0
- create_mock_data.py +44 -0
- dashboard.py +310 -0
- hello.py +6 -0
- pyproject.toml +17 -0
- test_conversion.py +22 -0
- utils.py +86 -0
- uv.lock +0 -0
.env.example
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
HF_TOKEN=your_huggingface_token_here
|
| 2 |
+
TARGET_FILE=marketsession_post_polygon_2020-01-01_2025-12-01.parquet_with_premarketvolume900K_marketcap1B.parquet
|
.gitignore
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python-generated files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[oc]
|
| 4 |
+
build/
|
| 5 |
+
dist/
|
| 6 |
+
wheels/
|
| 7 |
+
*.egg-info
|
| 8 |
+
|
| 9 |
+
# Virtual environments
|
| 10 |
+
.venv
|
| 11 |
+
# Data files
|
| 12 |
+
*.parquet
|
| 13 |
+
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.12
|
Dockerfile
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use a specialized UV image for building
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim AS builder

# Enable bytecode compilation and set UV options
# (copy link mode avoids hardlink warnings when the cache mount is on
# a different filesystem than /app)
ENV UV_COMPILE_BYTECODE=1
ENV UV_LINK_MODE=copy

WORKDIR /app

# Install dependencies separately to leverage Docker layer caching
# This uses cache mounts for the uv cache and binds for configuration files
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync --frozen --no-install-project --no-dev

# Copy the rest of the application code
COPY . /app

# Sync the project (installs the current project package)
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --frozen --no-dev


# --- Final Stage ---
FROM python:3.12-slim-bookworm

# Set environment variables
ENV PYTHONUNBUFFERED=1
# Put the builder's virtualenv first on PATH so 'panel' below resolves to it.
ENV PATH="/app/.venv/bin:$PATH"

WORKDIR /app

# Copy the synced environment from the builder
# (copies source code AND the .venv created in the builder stage)
COPY --from=builder /app /app

# Create necessary cache directories with appropriate permissions
# (Useful for certain cloud environments or local dockers)
RUN mkdir -p /.cache && chmod 777 /.cache

# Expose the dashboard port
EXPOSE 5010

# Run the Panel dashboard
# Using direct 'panel serve' as it's more robust in container environments
# NOTE(review): '--allow-websocket-origin *' accepts connections from any
# origin — acceptable for a demo deployment, lock down for production.
CMD ["panel", "serve", "dashboard.py", "--address", "0.0.0.0", "--port", "5010", "--allow-websocket-origin", "*"]
|
README.md
ADDED
|
File without changes
|
backtester.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def run_backtest(
    df,
    risk_per_trade,
    stop_loss_pct,
    take_profit_pct,
    initial_capital,
    start_date,
    end_date,
    max_trades_per_day,
    commission_amount=2.0,
):
    """Run the short-gap-up backtest on ``df`` and return a trade log.

    Each row of ``df`` is one candidate short entered at ``premarket_close``.
    The intraday ``marketsession_*`` checkpoint columns are scanned in
    chronological order for a stop-loss or take-profit hit; a trade still
    open after the last checkpoint exits at ``marketsession_close``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``datetime``, ``date``, ``premarket_close``,
        ``marketsession_close`` and (some of) the ``marketsession_*`` columns.
    risk_per_trade : float
        Fraction of current net capital committed (in dollars) per trade.
    stop_loss_pct, take_profit_pct : float
        Stop / target distances as fractions of the entry price.
    initial_capital : float
        Starting account balance in dollars.
    start_date, end_date : date-like
        Inclusive window applied to ``df['datetime']``.
    max_trades_per_day : int
        Hard cap on trades taken per calendar day.
    commission_amount : float, optional
        Commission in dollars per 200 shares, charged per side.

    Returns
    -------
    pandas.DataFrame
        One row per executed trade; empty if nothing matches the window.
    """
    # Restrict to the requested date window.
    start_ts = pd.Timestamp(start_date)
    end_ts = pd.Timestamp(end_date)

    mask = (df["datetime"] >= start_ts) & (df["datetime"] <= end_ts)
    sub_df = df[mask].copy()

    if sub_df.empty:
        return pd.DataFrame()

    dates = sorted(sub_df["date"].unique())

    trades = []
    capital_net = initial_capital
    capital_gross = initial_capital
    total_comm_accum = 0

    # Intraday checkpoint price columns, in chronological order.
    ms_columns = [
        "marketsession_1min",
        "marketsession_3min",
        "marketsession_5min",
        "marketsession_10min",
        "marketsession_15min",
        "marketsession_30min",
        "marketsession_60min",
        "marketsession_120min",
        "marketsession_high",
    ]

    for current_date in dates:
        countertradesperday = 0
        day_df = sub_df[sub_df["date"] == current_date]

        for _, row in day_df.iterrows():
            if row.get("Ticker") == "QMMM":  # specific exclusion from user script
                continue

            entry_price = row["premarket_close"]
            # Dollar position size, scaled to the current net capital.
            size = capital_net * risk_per_trade

            # Short setup: stop sits above the entry, target below it.
            stop_price = entry_price * (1 + stop_loss_pct)
            target_price = entry_price * (1 - take_profit_pct)

            exit_price = None
            exit_type = None

            for col in ms_columns:
                if col not in row or pd.isna(row[col]):
                    continue

                price = row[col]
                # Stop-loss: price moved against the short.
                if price >= stop_price:
                    exit_price = stop_price
                    exit_type = "stop"
                    break
                # Take-profit: price fell to the target.
                if price <= target_price:
                    exit_price = target_price
                    exit_type = "target"
                    break

            if exit_price is None:
                # Neither level was touched intraday: flatten at session close.
                exit_price = row["marketsession_close"]
                exit_type = "close"

            # P&L for a short position, proportional to the dollar size.
            pnl_gross = (entry_price - exit_price) / entry_price * size

            # Commission: commission_amount dollars per 200 shares, per side.
            # (size / entry_price approximates the share count.)
            commission_entry = commission_amount * size / entry_price / 200
            commission_exit = commission_entry  # same share count on exit

            total_comm = commission_entry + commission_exit
            pnl_net = pnl_gross - total_comm

            # Update running capital series.
            capital_net += pnl_net
            capital_gross += pnl_gross
            total_comm_accum += total_comm

            pnl_perc = (
                pnl_net / (capital_net - pnl_net) * 100
                if (capital_net - pnl_net) != 0
                else 0
            )

            trades.append(
                {
                    "date": current_date,
                    "ticker": row.get("Ticker"),
                    "entry_price": entry_price,
                    "exit_price": exit_price,
                    "exit_type": exit_type,
                    "size": size,
                    "pnl": pnl_net,
                    "pnl_gross": pnl_gross,
                    "pnl_perc": pnl_perc,
                    "capital_net": capital_net,
                    "capital_gross": capital_gross,
                    "comm": total_comm,
                    "cumulative_comm": total_comm_accum,
                }
            )

            # Account blow-up guard: stop trading once net capital falls
            # below half the starting balance.
            # BUGFIX: this check previously ran BEFORE the append above, so
            # the trade that breached the limit updated capital_net but never
            # appeared in the returned log, leaving the log inconsistent with
            # the capital series.
            if capital_net < initial_capital / 2:
                break

            countertradesperday += 1
            if countertradesperday >= max_trades_per_day:
                break

        if capital_net < initial_capital / 2:
            break

    return pd.DataFrame(trades)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def analyze_day_trading(trades_df):
    """
    Analyze day-trading performance from a ``run_backtest`` trade log.

    Parameters
    ----------
    trades_df : pandas.DataFrame
        Trade log with at least ``pnl``, ``pnl_gross``, ``pnl_perc``,
        ``capital_net``, ``capital_gross`` and ``comm`` columns.

    Returns
    -------
    tuple
        ``(results, df)`` — a dict of summary metrics and the trade log
        enriched with cumulative/drawdown columns.  For an empty log,
        returns ``({}, trades_df)``.
    """
    if trades_df.empty:
        return {}, trades_df

    df = trades_df.copy()

    # Per-trade outcome flag and cumulative equity curves.
    df["is_win"] = df["pnl"] > 0
    df["cumulative_pnl"] = df["pnl"].cumsum()
    df["cumulative_pnl_gross"] = df["pnl_gross"].cumsum()

    # Drawdown in dollars relative to the running peak of cumulative net P&L.
    df["running_max"] = df["cumulative_pnl"].cummax()
    df["drawdown"] = df["running_max"] - df["cumulative_pnl"]
    # NOTE(review): despite its name this is the per-trade gross return in
    # percent, not a peak-to-trough drawdown; the column name is kept because
    # the dashboard plots it under this key.
    df["drawdown_pct"] = (df["pnl_gross"] / df["capital_gross"]) * 100

    # Per-trade fractional return (pnl relative to capital before the trade).
    df["return"] = df["pnl_perc"] / 100

    total_trades = len(df)
    profitable_trades = sum(df["is_win"])
    losing_trades = total_trades - profitable_trades
    win_rate = profitable_trades / total_trades if total_trades > 0 else 0

    total_pnl = df["pnl"].sum()
    avg_pnl = df["pnl"].mean()
    max_pnl = df["pnl"].max()
    min_pnl = df["pnl"].min()

    avg_pnl_perc = df["pnl_perc"].mean()

    avg_win = df.loc[df["is_win"], "pnl"].mean() if profitable_trades > 0 else 0
    avg_loss = df.loc[~df["is_win"], "pnl"].mean() if losing_trades > 0 else 0

    risk_reward_ratio = abs(avg_win / avg_loss) if avg_loss != 0 else float("inf")

    max_drawdown = df["drawdown"].max()
    max_drawdown_perc = (
        max_drawdown / df["running_max"].max() * 100
        if df["running_max"].max() > 0
        else 0
    )

    # Annualized Sharpe-style ratio over per-trade returns (scales by
    # sqrt(252); no risk-free rate is subtracted).
    mean_return = df["return"].mean()
    std_return = df["return"].std()
    sharpe_ratio = (mean_return * 252**0.5) / std_return if std_return > 0 else 0

    expectancy = (win_rate * avg_win) + ((1 - win_rate) * avg_loss)

    total_profit = df.loc[df["is_win"], "pnl"].sum() if profitable_trades > 0 else 0
    # Fall back to 1 to avoid division by zero when there are no losers.
    total_loss = abs(df.loc[~df["is_win"], "pnl"].sum()) if losing_trades > 0 else 1
    profit_factor = total_profit / total_loss if total_loss > 0 else float("inf")

    # Starting capital reconstructed from the first trade's post-trade balance.
    initial_capital_inferred = (
        df.iloc[0]["capital_net"] - df.iloc[0]["pnl"] if not df.empty else 0
    )
    return_on_initial_capital = (
        (total_pnl / initial_capital_inferred * 100)
        if initial_capital_inferred != 0
        else 0
    )

    total_commissions = df["comm"].sum() if not df.empty else 0
    commission_impact_pct = (
        (total_commissions / total_pnl * 100) if total_pnl != 0 else 0
    )

    # NOTE: the previous version also built per-date and per-ticker groupby
    # tables here that were never used (commented out of the results dict);
    # that dead computation has been removed.
    results = {
        "total_trades": total_trades,
        "profitable_trades": profitable_trades,
        "losing_trades": losing_trades,
        "win_rate": win_rate,
        "total_pnl": total_pnl,
        "return_on_init_cap_pct": return_on_initial_capital,
        "total_commissions": total_commissions,
        "comm_to_pnl_pct": commission_impact_pct,
        "avg_pnl": avg_pnl,
        "max_pnl": max_pnl,
        "min_pnl": min_pnl,
        "avg_pnl_perc": avg_pnl_perc,
        "avg_win": avg_win,
        "avg_loss": avg_loss,
        "risk_reward_ratio": risk_reward_ratio,
        "max_drawdown": max_drawdown,
        "max_drawdown_perc": max_drawdown_perc,
        "sharpe_ratio": sharpe_ratio,
        "expectancy": expectancy,
        "profit_factor": profit_factor,
    }

    return results, df
|
create_mock_data.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import datetime
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def create_mock_data():
    """Write a small synthetic penny-stock parquet file for local testing.

    Generates one row per (day, ticker) pair with values chosen so that the
    dashboard's default filter query usually passes, then saves the frame
    under the exact file name ``utils.load_data`` looks for.
    """
    dates = pd.date_range(start="2024-10-01", end="2024-11-01", freq="D")
    tickers = ["AAPL", "TSLA", "AMZN", "GOOG", "MSFT"]

    rows = []
    for d in dates:
        for t in tickers:
            # Randomize values to meet filters most of the time
            entry = 10.0 + np.random.randn()
            # columns: premarket_change_from_perviousday_perc > 8 and premarket_close > 2 and `Shares Float`>1e6 and `Market Capitalization`<100e6
            # ("perviousday" is the dataset's actual, misspelled column name)
            rows.append(
                {
                    "datetime": d,
                    "Ticker": t,
                    "premarket_change_from_perviousday_perc": 10.0 + np.random.randn(),
                    "premarket_close": entry,
                    "Shares Float": 2e6,
                    "Market Capitalization": 50e6,
                    "marketsession_1min": entry * (1 - 0.01 * np.random.randn()),
                    "marketsession_3min": entry * (1 - 0.02 * np.random.randn()),
                    "marketsession_5min": entry * (1 - 0.03 * np.random.randn()),
                    "marketsession_10min": entry * (1 - 0.04 * np.random.randn()),
                    "marketsession_15min": entry * (1 - 0.05 * np.random.randn()),
                    "marketsession_30min": entry * (1 - 0.06 * np.random.randn()),
                    "marketsession_60min": entry * (1 - 0.07 * np.random.randn()),
                    "marketsession_120min": entry * (1 - 0.08 * np.random.randn()),
                    "marketsession_high": entry * 1.1,
                    "marketsession_close": entry * 0.9,
                }
            )

    df = pd.DataFrame(rows)
    filename = "marketsession_post_polygon_2020-01-01_2025-12-01.parquet_with_premarketvolume900K_marketcap1B.parquet"
    df.to_parquet(filename)
    # BUGFIX: the message previously printed a literal placeholder instead of
    # the generated file name.
    print(f"Mock data created: {filename}")


if __name__ == "__main__":
    create_mock_data()
|
dashboard.py
ADDED
|
@@ -0,0 +1,310 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import panel as pn
|
| 2 |
+
import hvplot.pandas
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
from utils import load_data, DEFAULT_FILTER_QUERY
|
| 6 |
+
from backtester import run_backtest, analyze_day_trading
|
| 7 |
+
|
| 8 |
+
# Enable the Tabulator extension used by the metrics and trade-log tables.
pn.extension("tabulator")

# --- 1. Load Data ---
# Eager load at import time so the UI starts with data available.
print("Loading data... (this might take a moment if downloading)")
try:
    # Cache the data in memory so we don't reload on every callback
    # In a production app you might handle this differently
    GLOBAL_DF = load_data()
    print(f"Data loaded. Rows: {len(GLOBAL_DF)}")
except Exception as e:
    # Fall back to an empty frame so the dashboard still renders; the error
    # will surface again when the user presses "Run Backtest".
    GLOBAL_DF = pd.DataFrame()
    print(f"Error loading data: {e}")

# --- 2. Widgets ---
# Free-form pandas ``DataFrame.query`` string applied to the raw dataset.
query_input = pn.widgets.TextAreaInput(
    name="Filter Query (Pandas Syntax)",
    value=DEFAULT_FILTER_QUERY,
    height=100,
    sizing_mode="stretch_width",
)

# NOTE(review): despite the "%" labels, slider values are fractions
# (0.15 == 15%) and are passed straight through to run_backtest.
risk_per_trade_input = pn.widgets.FloatSlider(
    name="Risk Per Trade (%)", start=0.01, end=1.00, step=0.01, value=0.15
)
stop_loss_input = pn.widgets.FloatSlider(
    name="Stop Loss (%)", start=0.01, end=1.00, step=0.01, value=0.35
)
take_profit_input = pn.widgets.FloatSlider(
    name="Take Profit (%)", start=0.01, end=1.00, step=0.01, value=0.55
)
initial_capital_input = pn.widgets.FloatInput(
    name="Initial Capital ($)", value=10000.0, step=100
)
max_trades_input = pn.widgets.IntSlider(
    name="Max Trades Per Day", start=1, end=20, value=6
)
# Dollar commission per 200 shares, per side (see run_backtest).
commission_amount_input = pn.widgets.FloatInput(
    name="Commission Amount ($ per 200 shares)", value=2.0, step=0.1
)

# Date Range (default based on user script)
default_start = pd.Timestamp("2024-10-07").date()
default_end = pd.Timestamp("2025-12-01").date()  # Future date from user script
date_range_input = pn.widgets.DateRangeSlider(
    name="Date Range",
    start=pd.Timestamp("2020-01-01").date(),
    end=pd.Timestamp("2026-01-01").date(),
    value=(default_start, default_end),
)

# Primary action button; wired to on_click further down the module.
run_button = pn.widgets.Button(name="Run Backtest", button_type="primary")
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# --- 3. Callbacks & Logic ---
|
| 63 |
+
def execute_backtest(event=None):
    """Reload data with the current filter query, run the backtest with the
    widget settings, and return a Panel layout (metrics table, plots, trade
    log) — or a Markdown pane describing the failure.

    ``event`` is unused; it exists so the function can also be wired
    directly as a widget callback.
    """
    # Determine if we need to reload data based on query
    # To be safe and simple, we reload if the user changed the query or if we just want to ensure consistency.
    # Given the file is small (parquet), we can reload or filter.
    # Note: load_data() handles reading and filtering.

    current_query = query_input.value

    # We will reload the data with the specific query
    # If this becomes slow, we can optimize to cache the unfiltered raw data and filter here.
    try:
        current_df = load_data(filter_query=current_query)
    except Exception as e:
        return pn.pane.Markdown(f"## Error loading data/applying query: {e}")

    if current_df.empty:
        return pn.pane.Markdown("## Error: No Data Loaded (Empty after filter)")

    # Get values from the sidebar widgets.
    rpt = risk_per_trade_input.value
    sl = stop_loss_input.value
    tp = take_profit_input.value
    init_cap = initial_capital_input.value
    max_trades = max_trades_input.value
    comm_amt = commission_amount_input.value
    start_date, end_date = date_range_input.value

    trades_df = run_backtest(
        current_df,
        rpt,
        sl,
        tp,
        init_cap,
        start_date,
        end_date,
        max_trades,
        commission_amount=comm_amt,
    )

    if trades_df.empty:
        return pn.pane.Markdown("## No trades found for this configuration")

    # Analyze: summary metrics dict + trade log enriched with cumulative cols.
    results, analysis_df = analyze_day_trading(trades_df)

    # --- Visuals ---

    # 1. Equity Curve (Net vs Gross)
    # NOTE(review): x="index" plots against the frame's RangeIndex, i.e. the
    # trade number — confirm this renders as intended in the hvplot version
    # pinned by the lockfile.
    equity_plot = analysis_df.hvplot.line(
        x="index",
        y=["capital_net", "capital_gross"],
        value_label="Capital ($)",
        title="Account Growth (Net vs Gross)",
        ylabel="Capital ($)",
        xlabel="Trade #",
        grid=True,
        height=400,
        responsive=True,
        color=["#4CAF50", "#2196F3"],
        hover_cols=["ticker", "pnl"],
    )

    # 1b. Capital & Profit over Days
    # End-of-day capital (last trade of the day) and summed daily P&L.
    daily_stats = analysis_df.groupby("date").agg({
        "capital_net": "last",
        "capital_gross": "last",
        "pnl": "sum",
        "pnl_gross": "sum"
    }).reset_index()

    capital_days_plot = daily_stats.hvplot.line(
        x="date",
        y=["capital_net", "capital_gross"],
        title="Capital over Days",
        ylabel="Capital ($)",
        grid=True,
        height=300,
        responsive=True,
        color=["#4CAF50", "#2196F3"],
    )

    profit_days_plot = daily_stats.hvplot.bar(
        x="date",
        y=["pnl", "pnl_gross"],
        title="Daily Profit (Net vs Gross)",
        ylabel="Profit ($)",
        grid=True,
        height=300,
        responsive=True,
        alpha=0.6,
        color=["#4CAF50", "#2196F3"],
        yformatter="%.0f",
    )

    # 2. Cumulative Commission
    comm_plot = analysis_df.hvplot.line(
        x="index",
        y="cumulative_comm",
        title="Cumulative Commissions Paid",
        ylabel="Total Commission ($)",
        xlabel="Trade #",
        grid=True,
        height=200,
        responsive=True,
        color="#FF9800",
    )

    # 2. Drawdown (dollar distance from the running equity peak)
    drawdown_plot = analysis_df.hvplot.area(
        y="drawdown",
        title="Drawdown",
        ylabel="Drawdown ($)",
        grid=True,
        height=200,
        responsive=True,
        color="red",
        alpha=0.3,
    )

    # 2b. Drawdown %
    # NOTE(review): the "drawdown_pct" column is actually per-trade gross
    # return in percent (see analyze_day_trading) — the chart title may be
    # misleading; confirm intent.
    drawdown_pct_plot = analysis_df.hvplot.area(
        y="drawdown_pct",
        title="Drawdown %",
        ylabel="Drawdown (%)",
        grid=True,
        height=200,
        responsive=True,
        color="red",
        alpha=0.3,
    )

    # 3. P&L Distribution
    pnl_dist_plot = analysis_df.hvplot.hist(
        y="pnl", title="P&L Distribution", bins=30, height=300, responsive=True
    )

    # 4. Ticker Performance (Top/Bottom 10)
    ticker_stats = analysis_df.groupby("ticker")["pnl"].sum().sort_values()
    if len(ticker_stats) > 20:
        # Show top 10 and bottom 10
        top = ticker_stats.tail(10)
        bottom = ticker_stats.head(10)
        subset = pd.concat([bottom, top])
    else:
        subset = ticker_stats

    ticker_plot = subset.hvplot.bar(
        title="P&L by Ticker (Best/Worst)", rot=45, height=400, responsive=True
    )

    # 5. Metrics Table
    # Format metrics for display (floats rounded to 2 decimals; any
    # DataFrame-valued entries are skipped).
    metrics_df = pd.DataFrame(
        [
            {"Metric": k, "Value": f"{v:.2f}" if isinstance(v, float) else v}
            for k, v in results.items()
            if not isinstance(v, pd.DataFrame)
        ]
    )

    metrics_table = pn.widgets.Tabulator(metrics_df, disabled=True, show_index=False)

    # 6. Trades Table (Paginated)
    # Display copy: float columns are zero-filled and truncated to ints for a
    # compact table (deliberate — see test_conversion.py).
    display_trades_df = trades_df.copy()
    for col in display_trades_df.select_dtypes(include=['float', 'float64']).columns:
        display_trades_df[col] = display_trades_df[col].fillna(0).astype(int)

    trades_table = pn.widgets.Tabulator(
        display_trades_df,
        pagination="local",
        page_size=10,
        sizing_mode="stretch_width",
    )

    # Layout: metrics on the left, stacked charts on the right, then the
    # distribution/ticker row and the paginated trade log.
    dashboard = pn.Column(
        pn.Row(
            pn.Column(metrics_table, width=300),
            pn.Column(
                equity_plot,
                drawdown_plot,
                drawdown_pct_plot,
                comm_plot,
                capital_days_plot,
                profit_days_plot,
            ),
        ),
        pn.Row(pnl_dist_plot, ticker_plot),
        pn.layout.Divider(),
        "### Trade Log",
        trades_table,
    )

    return dashboard
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
# Bind the function to the button
|
| 260 |
+
# We effectively want to replace the main content when button is clicked
|
| 261 |
+
# pn.bind is one way, or just updating a dynamic map.
|
| 262 |
+
# Simplest: use a Column that we clear and append to.
|
| 263 |
+
|
| 264 |
+
# Dynamic output container: cleared and repopulated on every button press.
output_area = pn.Column()


def on_click(event):
    """Button handler: show a spinner, run the backtest, render the result.

    Any exception raised while building the result is caught and rendered
    as a Markdown error pane instead of crashing the server callback.
    """
    output_area.clear()
    spinner = pn.indicators.LoadingSpinner(value=True, width=50, height=50)
    output_area.append(spinner)
    try:
        content = execute_backtest()
        output_area.clear()
        output_area.append(content)
    except Exception as err:
        output_area.clear()
        error_pane = pn.pane.Markdown(f"## Error during execution: {err}")
        output_area.append(error_pane)
| 277 |
+
|
| 278 |
+
|
| 279 |
+
# Wire the sidebar button to the backtest handler defined above.
run_button.on_click(on_click)

# --- Layout ---
# All configuration widgets live in the sidebar; results render in `main`.
sidebar = pn.Column(
    "## Configuration",
    query_input,
    risk_per_trade_input,
    stop_loss_input,
    take_profit_input,
    initial_capital_input,
    max_trades_input,
    commission_amount_input,
    date_range_input,
    run_button,
    pn.layout.Divider(),
    "**Note**: Ensure `HF_TOKEN` is set in `.env` to download data.",
)

template = pn.template.FastListTemplate(
    title="Penny Stock Short GAP UP Strategy Backtester",
    sidebar=[sidebar],
    main=[output_area],
    accent_base_color="#1f77b4",
    header_background="#1f77b4",
)

# Servable: `panel serve dashboard.py` picks this up (also the Docker CMD).
template.servable()

if __name__ == "__main__":
    # If run as script, start a local Bokeh server on the same port the
    # Dockerfile exposes.
    pn.serve(template, show=False, port=5010)
|
hello.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def main():
    """Entry point: print the project greeting."""
    greeting = "Hello from myantigravity1!"
    print(greeting)


if __name__ == "__main__":
    main()
|
pyproject.toml
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "backtesting-penny-short"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Penny Stock Strategy Backtester with Panel Dashboard"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.12"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"datasets>=4.4.2",
|
| 9 |
+
"huggingface-hub>=1.2.3",
|
| 10 |
+
"matplotlib>=3.10.8",
|
| 11 |
+
"pandas>=2.3.3",
|
| 12 |
+
"python-dotenv>=1.2.1",
|
| 13 |
+
"panel>=1.3.8",
|
| 14 |
+
"hvplot>=0.9.2",
|
| 15 |
+
"pyarrow>=15.0.0",
|
| 16 |
+
"fastparquet>=2024.2.0"
|
| 17 |
+
]
|
test_conversion.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
import numpy as np

# Quick manual check of the float->int display conversion used by the
# dashboard's trade table: NaNs become 0, floats are truncated.
print("Creating mock dataframe...")
trades_df = pd.DataFrame(
    {
        "float_col": [1.1, 2.9, 3.5, np.nan],
        "int_col": [10, 20, 30, 40],
        "str_col": ["a", "b", "c", "d"],
    }
)

print("Original Types:")
print(trades_df.dtypes)
print(trades_df)

print("\nConverting...")
display_trades_df = trades_df.copy()
float_columns = display_trades_df.select_dtypes(include=["float", "float64"]).columns
for column in float_columns:
    converted = display_trades_df[column].fillna(0).astype(int)
    display_trades_df[column] = converted

print("\nResult Types:")
print(display_trades_df.dtypes)
print(display_trades_df)
|
utils.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from huggingface_hub import HfApi, hf_hub_download
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
|
| 6 |
+
load_dotenv()
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def download_from_hf_dataset(file_path, dataset_name, token=None, repo_type="dataset"):
    """
    Download a file from a Hugging Face dataset repository.

    Uses ``HF_TOKEN`` from the environment when no token is supplied.  On
    any download failure this is best-effort: it falls back to an existing
    local copy of ``file_path`` if one is present, and returns ``None``
    when nothing is available.
    """
    if token is None:
        token = os.getenv("HF_TOKEN")

    try:
        local_path = hf_hub_download(
            repo_id=dataset_name,
            filename=file_path,
            repo_type=repo_type,
            local_dir=".",
            token=token,
        )
    except Exception as e:
        # Deliberately broad: any failure (network, auth, missing file)
        # triggers the local-copy fallback.
        print(f"Error downloading file: {str(e)}")
        if os.path.exists(file_path):
            print(f"Found local copy of {file_path}, using that.")
            return file_path
        return None

    print(
        f"Successfully downloaded {file_path} from {dataset_name} to {local_path}"
    )
    return local_path
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# Default stock-universe filter in pandas ``DataFrame.query`` syntax:
# premarket gap up of more than 8%, price above $2, float above 1M shares,
# market cap under $100M.  Backticks quote column names containing spaces.
# NOTE: "perviousday" is the dataset's actual (misspelled) column name.
DEFAULT_FILTER_QUERY = (
    "premarket_change_from_perviousday_perc > 8 and "
    "premarket_close > 2 and "
    "`Shares Float`>1e6 and "
    "`Market Capitalization`<100e6"
)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def load_data(filter_query=DEFAULT_FILTER_QUERY):
    """
    Load and preprocess the penny-stock dataset.

    Downloads the parquet file from the Hugging Face dataset repository
    (falling back to a local copy), optionally narrows the universe with
    ``filter_query`` (pandas ``DataFrame.query`` syntax), and derives
    ``datetime``/``date`` columns.

    Raises
    ------
    FileNotFoundError
        If the file can neither be downloaded nor found locally.
    Exception
        Re-raised from ``DataFrame.query`` when ``filter_query`` is invalid,
        so the dashboard can surface the message to the user.
    """
    token = os.getenv("HF_TOKEN")
    dataset_name = "AmirTrader/PennyStocks"

    # File name is taken from the environment when set, else the known default.
    default_file = "marketsession_post_polygon_2020-01-01_2025-12-01.parquet_with_premarketvolume900K_marketcap1B.parquet"
    target_file = os.getenv("TARGET_FILE", default_file)

    local_path = download_from_hf_dataset(
        file_path=target_file, dataset_name=dataset_name, token=token
    )

    if not local_path or not os.path.exists(local_path):
        raise FileNotFoundError(f"Could not find or download dataset: {target_file}")

    df = pd.read_parquet(local_path)

    # Narrow the tradable universe; bad queries are surfaced to the caller.
    if not filter_query:
        df = df.copy()
    else:
        try:
            df = df.query(filter_query).copy()
        except Exception as e:
            print(f"Error applying query '{filter_query}': {e}")
            # Fallback or re-raise? Let's re-raise to notify user in dashboard
            raise e

    # Normalize the timestamp column and derive a calendar-date column used
    # by the backtester's per-day grouping.
    if "datetime" in df.columns:
        df["datetime"] = pd.to_datetime(df["datetime"])
        df["date"] = df["datetime"].dt.date

    return df
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|