File size: 6,239 Bytes
2491dcf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import gradio as gr
import duckdb
import pandas as pd
import json
import os
from huggingface_hub import HfApi, hf_hub_download
import time

# --- Configuration ---
SOURCE_REPO = "SII-WANGZJ/Polymarket_data"
DEST_REPO = "Vycka12/Base" # User's target dataset
TRADES_FILE = "trades.parquet"
MARKETS_FILE = "markets.parquet"

def run_extraction():
    try:
        token = os.environ.get("HF_TOKEN")
        if not token:
            return "❌ Missing HF_TOKEN! Please add it to Space Secrets (Settings -> Secrets)."

        api = HfApi()
        
        # 1. Initialize DuckDB with HTTP extension
        print("Initializing DuckDB with HTTPFS...")
        con = duckdb.connect(database=':memory:')
        con.execute("INSTALL httpfs; LOAD httpfs;")
        
        # Get direct download URLs (requires token)
        trades_url = f"https://huggingface.co/datasets/{SOURCE_REPO}/resolve/main/{TRADES_FILE}"
        markets_url = f"https://huggingface.co/datasets/{SOURCE_REPO}/resolve/main/{MARKETS_FILE}"
        
        # 2. Identify winning assets (Reading markets.parquet remotely)
        print("Mapping winning assets from cloud...")
        winning_mapping_query = f"""
        CREATE TABLE winning_assets AS 
        WITH cleaned_markets AS (
            SELECT 
                token1, token2, 
                CAST(replace(replace(outcome_prices, '''', '"'), ' ', '') AS DOUBLE[]) as prices
            FROM read_parquet('{markets_url}')
            WHERE closed = 1 AND outcome_prices IS NOT NULL AND outcome_prices != ''
        )
        SELECT token1 as asset_id FROM cleaned_markets WHERE prices[1] = 1.0
        UNION
        SELECT token2 as asset_id FROM cleaned_markets WHERE prices[2] = 1.0
        """
        con.execute(winning_mapping_query)
        
        losing_mapping_query = """
        CREATE TABLE losing_assets AS 
        SELECT token2 as asset_id FROM winning_assets -- Simplify for this batch
        UNION
        SELECT token1 as asset_id FROM winning_assets
        """
        # (Better to just reuse logic but let's keep it simple for now)
        
        # 3. Analyze trades (32GB JOIN remotely over HTTP)
        # DuckDB will only fetch the needed columns and rows via range requests
        print("Calculating wallet stats from 32GB trades cloud file...")
        stats_query = f"""
        CREATE TABLE wallet_stats AS
        WITH all_trades AS (
            SELECT maker as wallet, asset_id FROM read_parquet('{trades_url}') WHERE maker_direction = 'BUY'
            UNION ALL
            SELECT taker as wallet, asset_id FROM read_parquet('{trades_url}') WHERE taker_direction = 'BUY'
        ),
        wins AS (
            SELECT wallet, count(*) as win_count FROM all_trades 
            WHERE asset_id IN (SELECT asset_id FROM winning_assets)
            GROUP BY wallet
        ),
        losses AS (
            SELECT wallet, count(*) as loss_count FROM all_trades 
            WHERE asset_id IN (SELECT asset_id FROM (
                SELECT token2 as asset_id FROM (
                    SELECT token1, token2, CAST(replace(replace(outcome_prices, '''', '"'), ' ', '') AS DOUBLE[]) as prices
                    FROM read_parquet('{markets_url}')
                    WHERE closed = 1 AND (token1 IS NOT NULL OR token2 IS NOT NULL)
                ) WHERE prices[1] = 1.0
                UNION
                SELECT token1 as asset_id FROM (
                     SELECT token1, token2, CAST(replace(replace(outcome_prices, '''', '"'), ' ', '') AS DOUBLE[]) as prices
                    FROM read_parquet('{markets_url}')
                    WHERE closed = 1
                ) WHERE prices[2] = 1.0
            ))
            GROUP BY wallet
        )
        SELECT 
            COALESCE(w.wallet, l.wallet) as wallet,
            COALESCE(w.win_count, 0) as wins,
            COALESCE(l.loss_count, 0) as losses,
            (COALESCE(w.win_count, 0) + COALESCE(l.loss_count, 0)) as total_resolved_trades,
            CAST(COALESCE(w.win_count, 0) AS DOUBLE) / (COALESCE(w.win_count, 0) + COALESCE(l.loss_count, 0)) as win_rate
        FROM wins w
        FULL OUTER JOIN losses l ON w.wallet = l.wallet
        WHERE (COALESCE(w.win_count, 0) + COALESCE(l.loss_count, 0)) >= 100
        """
        con.execute(stats_query)
        
        # 4. Filter Results
        print("Filtering final results...")
        df = con.execute("SELECT * FROM wallet_stats").df()
        
        whales = df[(df['total_resolved_trades'] >= 400) & (df['win_rate'] >= 0.70)].to_dict('records')
        dumb = df[(df['total_resolved_trades'] >= 100) & (df['win_rate'] <= 0.40)].to_dict('records')
        
        # 5. Save and Upload to HF
        os.makedirs("results", exist_ok=True)
        whale_path = "results/mega_whales.json"
        dumb_path = "results/mega_dumb.json"
        
        with open(whale_path, "w") as f:
            json.dump(whales, f, indent=2)
        with open(dumb_path, "w") as f:
            json.dump(dumb, f, indent=2)
            
        print(f"Uploading Results to {DEST_REPO}...")
        api.upload_file(
            path_or_fileobj=whale_path,
            path_in_repo="results/mega_whales.json",
            repo_id=DEST_REPO,
            repo_type="dataset",
            token=token
        )
        api.upload_file(
            path_or_fileobj=dumb_path,
            path_in_repo="results/mega_dumb.json",
            repo_id=DEST_REPO,
            repo_type="dataset",
            token=token
        )
        
        return f"βœ… SUCCESS!\n- Found {len(whales)} Mega Whales\n- Found {len(dumb)} Mega Dumb\n- Results uploaded to {DEST_REPO}/results/"
        
    except Exception as e:
        return f"❌ ERROR: {str(e)}"

# --- Gradio UI ---
with gr.Blocks(title="Mega Extraction Space") as demo:
    gr.Markdown("# πŸ‹ Mega Wallet Extractor (DuckDB Cloud)")
    gr.Markdown("Click the button below to process 32GB of Polymarket data directly from Hugging Face.")
    
    with gr.Row():
        run_btn = gr.Button("πŸš€ Run Full Extraction", variant="primary")
    
    status_box = gr.Textbox(label="Status Log", lines=10)
    
    run_btn.click(fn=run_extraction, outputs=status_box)

if __name__ == "__main__":
    demo.launch()