File size: 2,714 Bytes
a84c47e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download
from dotenv import load_dotenv

load_dotenv()


def download_from_hf_dataset(file_path, dataset_name, token=None, repo_type="dataset"):
    """
    Download a file from a Hugging Face dataset repository.

    If *token* is None, the HF_TOKEN environment variable is used instead.
    On download failure, a pre-existing local copy of *file_path* (if any)
    is used as a fallback; returns the local path, or None when no copy
    can be obtained.
    """
    auth_token = os.getenv("HF_TOKEN") if token is None else token

    try:
        resolved = hf_hub_download(
            repo_id=dataset_name,
            filename=file_path,
            repo_type=repo_type,
            local_dir=".",
            token=auth_token,
        )
        print(
            f"Successfully downloaded {file_path} from {dataset_name} to {resolved}"
        )
        return resolved
    except Exception as e:
        print(f"Error downloading file: {str(e)}")
        # Best-effort fallback: reuse a local copy if one already exists.
        if os.path.exists(file_path):
            print(f"Found local copy of {file_path}, using that.")
            return file_path
        return None


# Default pandas `DataFrame.query` expression used by `load_data` to select
# the tradable "universe": big premarket gap (>8%), price above $2, at least
# 1M float shares, and a market cap under $100M.
# NOTE(review): "perviousday" is misspelled, but it must match the actual
# column name in the parquet dataset — do not "fix" the spelling here.
DEFAULT_FILTER_QUERY = (
    "premarket_change_from_perviousday_perc > 8 and "
    "premarket_close > 2 and "
    "`Shares Float`>1e6 and "
    "`Market Capitalization`<100e6"
)


def load_data(filter_query=DEFAULT_FILTER_QUERY):
    """
    Load and preprocess the penny-stock dataset from the Hugging Face Hub.

    Parameters
    ----------
    filter_query : str or None
        A pandas ``DataFrame.query`` expression applied after loading.
        Pass a falsy value (e.g. ``""`` or ``None``) to skip filtering.

    Returns
    -------
    pandas.DataFrame
        The (optionally filtered) dataset. When a ``"datetime"`` column is
        present it is converted to datetime64 and a ``"date"`` column
        (calendar dates) is derived from it.

    Raises
    ------
    FileNotFoundError
        If the parquet file can neither be downloaded nor found locally.
    Exception
        Re-raised from ``DataFrame.query`` when *filter_query* is invalid,
        so the error surfaces in the dashboard.
    """
    token = os.getenv("HF_TOKEN")
    dataset_name = "AmirTrader/PennyStocks"

    # File name may be overridden via the TARGET_FILE environment variable.
    default_file = "marketsession_post_polygon_2020-01-01_2025-12-01.parquet_with_premarketvolume900K_marketcap1B.parquet"
    target_file = os.getenv("TARGET_FILE", default_file)

    # Attempt download (the helper falls back to a local copy on failure).
    local_path = download_from_hf_dataset(
        file_path=target_file, dataset_name=dataset_name, token=token
    )

    if not local_path or not os.path.exists(local_path):
        raise FileNotFoundError(f"Could not find or download dataset: {target_file}")

    df = pd.read_parquet(local_path)

    # Pre-filter the "universe" of stocks before any further processing.
    if filter_query:
        try:
            df = df.query(filter_query).copy()
        except Exception as e:
            print(f"Error applying query '{filter_query}': {e}")
            # Bare `raise` (not `raise e`) preserves the original traceback
            # while still notifying the user in the dashboard.
            raise
    else:
        df = df.copy()

    # Normalize the timestamp column; parquet usually stores it as datetime
    # already, in which case to_datetime is a cheap no-op pass-through.
    if "datetime" in df.columns:
        df["datetime"] = pd.to_datetime(df["datetime"])
        df["date"] = df["datetime"].dt.date

    return df