Spaces:
Runtime error
Runtime error
File size: 6,239 Bytes
2491dcf | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 | import gradio as gr
import duckdb
import pandas as pd
import json
import os
from huggingface_hub import HfApi, hf_hub_download
import time
# --- Configuration ---
SOURCE_REPO = "SII-WANGZJ/Polymarket_data"
DEST_REPO = "Vycka12/Base" # User's target dataset
TRADES_FILE = "trades.parquet"
MARKETS_FILE = "markets.parquet"
def run_extraction():
try:
token = os.environ.get("HF_TOKEN")
if not token:
return "β Missing HF_TOKEN! Please add it to Space Secrets (Settings -> Secrets)."
api = HfApi()
# 1. Initialize DuckDB with HTTP extension
print("Initializing DuckDB with HTTPFS...")
con = duckdb.connect(database=':memory:')
con.execute("INSTALL httpfs; LOAD httpfs;")
# Get direct download URLs (requires token)
trades_url = f"https://huggingface.co/datasets/{SOURCE_REPO}/resolve/main/{TRADES_FILE}"
markets_url = f"https://huggingface.co/datasets/{SOURCE_REPO}/resolve/main/{MARKETS_FILE}"
# 2. Identify winning assets (Reading markets.parquet remotely)
print("Mapping winning assets from cloud...")
winning_mapping_query = f"""
CREATE TABLE winning_assets AS
WITH cleaned_markets AS (
SELECT
token1, token2,
CAST(replace(replace(outcome_prices, '''', '"'), ' ', '') AS DOUBLE[]) as prices
FROM read_parquet('{markets_url}')
WHERE closed = 1 AND outcome_prices IS NOT NULL AND outcome_prices != ''
)
SELECT token1 as asset_id FROM cleaned_markets WHERE prices[1] = 1.0
UNION
SELECT token2 as asset_id FROM cleaned_markets WHERE prices[2] = 1.0
"""
con.execute(winning_mapping_query)
losing_mapping_query = """
CREATE TABLE losing_assets AS
SELECT token2 as asset_id FROM winning_assets -- Simplify for this batch
UNION
SELECT token1 as asset_id FROM winning_assets
"""
# (Better to just reuse logic but let's keep it simple for now)
# 3. Analyze trades (32GB JOIN remotely over HTTP)
# DuckDB will only fetch the needed columns and rows via range requests
print("Calculating wallet stats from 32GB trades cloud file...")
stats_query = f"""
CREATE TABLE wallet_stats AS
WITH all_trades AS (
SELECT maker as wallet, asset_id FROM read_parquet('{trades_url}') WHERE maker_direction = 'BUY'
UNION ALL
SELECT taker as wallet, asset_id FROM read_parquet('{trades_url}') WHERE taker_direction = 'BUY'
),
wins AS (
SELECT wallet, count(*) as win_count FROM all_trades
WHERE asset_id IN (SELECT asset_id FROM winning_assets)
GROUP BY wallet
),
losses AS (
SELECT wallet, count(*) as loss_count FROM all_trades
WHERE asset_id IN (SELECT asset_id FROM (
SELECT token2 as asset_id FROM (
SELECT token1, token2, CAST(replace(replace(outcome_prices, '''', '"'), ' ', '') AS DOUBLE[]) as prices
FROM read_parquet('{markets_url}')
WHERE closed = 1 AND (token1 IS NOT NULL OR token2 IS NOT NULL)
) WHERE prices[1] = 1.0
UNION
SELECT token1 as asset_id FROM (
SELECT token1, token2, CAST(replace(replace(outcome_prices, '''', '"'), ' ', '') AS DOUBLE[]) as prices
FROM read_parquet('{markets_url}')
WHERE closed = 1
) WHERE prices[2] = 1.0
))
GROUP BY wallet
)
SELECT
COALESCE(w.wallet, l.wallet) as wallet,
COALESCE(w.win_count, 0) as wins,
COALESCE(l.loss_count, 0) as losses,
(COALESCE(w.win_count, 0) + COALESCE(l.loss_count, 0)) as total_resolved_trades,
CAST(COALESCE(w.win_count, 0) AS DOUBLE) / (COALESCE(w.win_count, 0) + COALESCE(l.loss_count, 0)) as win_rate
FROM wins w
FULL OUTER JOIN losses l ON w.wallet = l.wallet
WHERE (COALESCE(w.win_count, 0) + COALESCE(l.loss_count, 0)) >= 100
"""
con.execute(stats_query)
# 4. Filter Results
print("Filtering final results...")
df = con.execute("SELECT * FROM wallet_stats").df()
whales = df[(df['total_resolved_trades'] >= 400) & (df['win_rate'] >= 0.70)].to_dict('records')
dumb = df[(df['total_resolved_trades'] >= 100) & (df['win_rate'] <= 0.40)].to_dict('records')
# 5. Save and Upload to HF
os.makedirs("results", exist_ok=True)
whale_path = "results/mega_whales.json"
dumb_path = "results/mega_dumb.json"
with open(whale_path, "w") as f:
json.dump(whales, f, indent=2)
with open(dumb_path, "w") as f:
json.dump(dumb, f, indent=2)
print(f"Uploading Results to {DEST_REPO}...")
api.upload_file(
path_or_fileobj=whale_path,
path_in_repo="results/mega_whales.json",
repo_id=DEST_REPO,
repo_type="dataset",
token=token
)
api.upload_file(
path_or_fileobj=dumb_path,
path_in_repo="results/mega_dumb.json",
repo_id=DEST_REPO,
repo_type="dataset",
token=token
)
return f"β
SUCCESS!\n- Found {len(whales)} Mega Whales\n- Found {len(dumb)} Mega Dumb\n- Results uploaded to {DEST_REPO}/results/"
except Exception as e:
return f"β ERROR: {str(e)}"
# --- Gradio UI ---
with gr.Blocks(title="Mega Extraction Space") as demo:
gr.Markdown("# π Mega Wallet Extractor (DuckDB Cloud)")
gr.Markdown("Click the button below to process 32GB of Polymarket data directly from Hugging Face.")
with gr.Row():
run_btn = gr.Button("π Run Full Extraction", variant="primary")
status_box = gr.Textbox(label="Status Log", lines=10)
run_btn.click(fn=run_extraction, outputs=status_box)
if __name__ == "__main__":
demo.launch()
|