Vycka12 committed on
Commit
2491dcf
·
verified ·
1 Parent(s): 06d91fe

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +152 -0
app.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import duckdb
3
+ import pandas as pd
4
+ import json
5
+ import os
6
+ from huggingface_hub import HfApi, hf_hub_download
7
+ import time
8
+
9
# --- Configuration ---
# Hugging Face dataset repo the parquet files are read from.
SOURCE_REPO = "SII-WANGZJ/Polymarket_data"
# Hugging Face dataset repo the JSON results are uploaded to (user's target dataset).
DEST_REPO = "Vycka12/Base"
# File names inside SOURCE_REPO, resolved against its `main` revision.
TRADES_FILE = "trades.parquet"
MARKETS_FILE = "markets.parquet"
14
+
15
# SQL expression turning the textual `outcome_prices` column into DOUBLE[]:
# single quotes become double quotes and spaces are stripped before the cast.
# (Presumably the column holds text like "['0', '1']" -- confirm against the dataset.)
_PRICES_EXPR = """CAST(replace(replace(outcome_prices, '''', '"'), ' ', '') AS DOUBLE[])"""


def _build_wallet_stats(con, trades_url, markets_url):
    """Compute per-wallet win/loss statistics and return them as a DataFrame.

    Args:
        con: Open DuckDB connection with the httpfs extension loaded.
        trades_url: HTTP URL of the trades parquet file.
        markets_url: HTTP URL of the markets parquet file.

    Returns:
        DataFrame with columns ``wallet``, ``wins``, ``losses``,
        ``total_resolved_trades`` and ``win_rate``, restricted to wallets
        with at least 100 resolved BUY trades.
    """
    # 2. Identify winning / losing assets (reading markets.parquet remotely).
    print("Mapping winning assets from cloud...")

    # Read the remote markets file once and reuse it for both asset tables.
    # The NULL / empty-string guard keeps unparsable values away from the
    # DOUBLE[] cast for the losing side as well as the winning side.
    con.execute(f"""
        CREATE TABLE resolved_markets AS
        SELECT token1, token2, {_PRICES_EXPR} AS prices
        FROM read_parquet('{markets_url}')
        WHERE closed = 1 AND outcome_prices IS NOT NULL AND outcome_prices != ''
    """)

    # DuckDB list indexing is 1-based: prices[1] pairs with token1, prices[2] with token2.
    con.execute("""
        CREATE TABLE winning_assets AS
        SELECT token1 AS asset_id FROM resolved_markets WHERE prices[1] = 1.0
        UNION
        SELECT token2 AS asset_id FROM resolved_markets WHERE prices[2] = 1.0
    """)

    # The losing asset is the opposite token of each resolved market.
    con.execute("""
        CREATE TABLE losing_assets AS
        SELECT token2 AS asset_id FROM resolved_markets WHERE prices[1] = 1.0
        UNION
        SELECT token1 AS asset_id FROM resolved_markets WHERE prices[2] = 1.0
    """)

    # 3. Analyze trades (32GB JOIN remotely over HTTP).
    # Only the referenced columns are selected from the remote parquet file.
    print("Calculating wallet stats from 32GB trades cloud file...")
    stats_query = f"""
        WITH all_trades AS (
            SELECT maker AS wallet, asset_id
            FROM read_parquet('{trades_url}') WHERE maker_direction = 'BUY'
            UNION ALL
            SELECT taker AS wallet, asset_id
            FROM read_parquet('{trades_url}') WHERE taker_direction = 'BUY'
        ),
        wins AS (
            SELECT wallet, count(*) AS win_count FROM all_trades
            WHERE asset_id IN (SELECT asset_id FROM winning_assets)
            GROUP BY wallet
        ),
        losses AS (
            SELECT wallet, count(*) AS loss_count FROM all_trades
            WHERE asset_id IN (SELECT asset_id FROM losing_assets)
            GROUP BY wallet
        )
        SELECT
            COALESCE(w.wallet, l.wallet) AS wallet,
            COALESCE(w.win_count, 0) AS wins,
            COALESCE(l.loss_count, 0) AS losses,
            (COALESCE(w.win_count, 0) + COALESCE(l.loss_count, 0)) AS total_resolved_trades,
            CAST(COALESCE(w.win_count, 0) AS DOUBLE)
                / (COALESCE(w.win_count, 0) + COALESCE(l.loss_count, 0)) AS win_rate
        FROM wins w
        FULL OUTER JOIN losses l ON w.wallet = l.wallet
        WHERE (COALESCE(w.win_count, 0) + COALESCE(l.loss_count, 0)) >= 100
    """
    return con.execute(stats_query).df()


def _write_and_upload(api, records, local_path, token):
    """Dump ``records`` as JSON to ``local_path`` and upload it to DEST_REPO.

    The file is uploaded under the same relative path it has locally.
    """
    with open(local_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)
    api.upload_file(
        path_or_fileobj=local_path,
        path_in_repo=local_path,
        repo_id=DEST_REPO,
        repo_type="dataset",
        token=token,
    )


def run_extraction():
    """Run the full wallet extraction and upload the results.

    Reads the markets and trades parquet files of SOURCE_REPO over HTTP with
    DuckDB, derives per-wallet win rates on resolved markets, splits the
    wallets into "whales" and "dumb" groups, writes both groups to JSON and
    uploads them to DEST_REPO.

    Returns:
        A human-readable status string for the UI. Failures are reported in
        the returned string rather than raised.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        return "❌ Missing HF_TOKEN! Please add it to Space Secrets (Settings -> Secrets)."

    try:
        api = HfApi()

        # NOTE: these URLs are fetched by DuckDB without any auth header; the
        # token is only used for the uploads below. This presumably relies on
        # the source dataset being publicly readable -- confirm.
        trades_url = f"https://huggingface.co/datasets/{SOURCE_REPO}/resolve/main/{TRADES_FILE}"
        markets_url = f"https://huggingface.co/datasets/{SOURCE_REPO}/resolve/main/{MARKETS_FILE}"

        # 1. Initialize DuckDB with HTTP extension
        print("Initializing DuckDB with HTTPFS...")
        con = duckdb.connect(database=':memory:')
        try:
            con.execute("INSTALL httpfs; LOAD httpfs;")
            df = _build_wallet_stats(con, trades_url, markets_url)
        finally:
            # Always release the in-memory database, even when a query fails.
            con.close()

        # 4. Filter Results
        print("Filtering final results...")
        whales = df[(df['total_resolved_trades'] >= 400) & (df['win_rate'] >= 0.70)].to_dict('records')
        dumb = df[(df['total_resolved_trades'] >= 100) & (df['win_rate'] <= 0.40)].to_dict('records')

        # 5. Save and Upload to HF
        os.makedirs("results", exist_ok=True)
        print(f"Uploading Results to {DEST_REPO}...")
        _write_and_upload(api, whales, "results/mega_whales.json", token)
        _write_and_upload(api, dumb, "results/mega_dumb.json", token)

        return f"✅ SUCCESS!\n- Found {len(whales)} Mega Whales\n- Found {len(dumb)} Mega Dumb\n- Results uploaded to {DEST_REPO}/results/"

    except Exception as e:
        # Top-level UI boundary: surface the message to the user, but keep the
        # full traceback in the process logs for debugging.
        import traceback
        traceback.print_exc()
        return f"❌ ERROR: {str(e)}"
138
+
139
# --- Gradio UI ---
# Single-page app: one button that triggers run_extraction and a textbox
# that displays the status string it returns.
with gr.Blocks(title="Mega Extraction Space") as demo:
    gr.Markdown("# 🐋 Mega Wallet Extractor (DuckDB Cloud)")
    gr.Markdown("Click the button below to process 32GB of Polymarket data directly from Hugging Face.")

    with gr.Row():
        run_btn = gr.Button("🚀 Run Full Extraction", variant="primary")

    status_box = gr.Textbox(label="Status Log", lines=10)

    # run_extraction takes no inputs; its return value fills the status box.
    run_btn.click(fn=run_extraction, outputs=status_box)

if __name__ == "__main__":
    demo.launch()