import json
import os
import time

import duckdb
import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download

# --- Configuration ---
SOURCE_REPO = "SII-WANGZJ/Polymarket_data"
DEST_REPO = "Vycka12/Base"  # User's target dataset
TRADES_FILE = "trades.parquet"
MARKETS_FILE = "markets.parquet"


def run_extraction():
    """Compute per-wallet win/loss stats over the Polymarket trade archive.

    Reads the (large) trades/markets parquet files remotely from SOURCE_REPO
    via DuckDB's httpfs extension, classifies each resolved market's tokens
    into winners and losers, aggregates BUY trades per wallet, then uploads
    two JSON leaderboards ("mega whales" and "mega dumb") to DEST_REPO.

    Returns:
        A human-readable status string for the Gradio textbox. All failures
        are caught and reported as a "❌ ERROR" string (Space UI boundary).
    """
    con = None
    try:
        token = os.environ.get("HF_TOKEN")
        if not token:
            return "❌ Missing HF_TOKEN! Please add it to Space Secrets (Settings -> Secrets)."

        api = HfApi()

        # 1. Initialize DuckDB with the HTTP extension so parquet files are
        # read remotely via range requests instead of downloading ~32GB.
        print("Initializing DuckDB with HTTPFS...")
        con = duckdb.connect(database=':memory:')
        con.execute("INSTALL httpfs; LOAD httpfs;")

        trades_url = f"https://huggingface.co/datasets/{SOURCE_REPO}/resolve/main/{TRADES_FILE}"
        markets_url = f"https://huggingface.co/datasets/{SOURCE_REPO}/resolve/main/{MARKETS_FILE}"

        # 2. Classify each resolved market's tokens into winners and losers.
        # outcome_prices arrives as a stringified Python list (single quotes,
        # spaces); rewrite it into JSON form and cast to DOUBLE[].
        # prices[1] corresponds to token1 and prices[2] to token2 (DuckDB
        # arrays are 1-indexed); a settled price of 1.0 marks the winner and
        # the opposite token of that market is the loser.
        # Scanning markets.parquet once into cleaned_markets keeps the
        # winner/loser definitions consistent (same NULL/empty guards).
        print("Mapping winning assets from cloud...")
        con.execute(f"""
            CREATE TABLE cleaned_markets AS
            SELECT
                token1,
                token2,
                CAST(replace(replace(outcome_prices, '''', '"'), ' ', '') AS DOUBLE[]) AS prices
            FROM read_parquet('{markets_url}')
            WHERE closed = 1
              AND outcome_prices IS NOT NULL
              AND outcome_prices != ''
        """)
        con.execute("""
            CREATE TABLE winning_assets AS
            SELECT token1 AS asset_id FROM cleaned_markets WHERE prices[1] = 1.0
            UNION
            SELECT token2 AS asset_id FROM cleaned_markets WHERE prices[2] = 1.0
        """)
        con.execute("""
            CREATE TABLE losing_assets AS
            SELECT token2 AS asset_id FROM cleaned_markets WHERE prices[1] = 1.0
            UNION
            SELECT token1 AS asset_id FROM cleaned_markets WHERE prices[2] = 1.0
        """)

        # 3. Analyze trades (large remote scan over HTTP). DuckDB only
        # fetches the needed columns/row groups via range requests. A wallet
        # is credited with a trade when it is the BUY side, whether it acted
        # as maker or taker.
        print("Calculating wallet stats from 32GB trades cloud file...")
        con.execute(f"""
            CREATE TABLE wallet_stats AS
            WITH all_trades AS (
                SELECT maker AS wallet, asset_id
                FROM read_parquet('{trades_url}')
                WHERE maker_direction = 'BUY'
                UNION ALL
                SELECT taker AS wallet, asset_id
                FROM read_parquet('{trades_url}')
                WHERE taker_direction = 'BUY'
            ),
            wins AS (
                SELECT wallet, count(*) AS win_count
                FROM all_trades
                WHERE asset_id IN (SELECT asset_id FROM winning_assets)
                GROUP BY wallet
            ),
            losses AS (
                SELECT wallet, count(*) AS loss_count
                FROM all_trades
                WHERE asset_id IN (SELECT asset_id FROM losing_assets)
                GROUP BY wallet
            )
            SELECT
                COALESCE(w.wallet, l.wallet) AS wallet,
                COALESCE(w.win_count, 0) AS wins,
                COALESCE(l.loss_count, 0) AS losses,
                (COALESCE(w.win_count, 0) + COALESCE(l.loss_count, 0)) AS total_resolved_trades,
                CAST(COALESCE(w.win_count, 0) AS DOUBLE)
                    / (COALESCE(w.win_count, 0) + COALESCE(l.loss_count, 0)) AS win_rate
            FROM wins w
            FULL OUTER JOIN losses l ON w.wallet = l.wallet
            WHERE (COALESCE(w.win_count, 0) + COALESCE(l.loss_count, 0)) >= 100
        """)

        # 4. Filter results into the two leaderboards.
        print("Filtering final results...")
        df = con.execute("SELECT * FROM wallet_stats").df()
        whales = df[(df['total_resolved_trades'] >= 400) & (df['win_rate'] >= 0.70)].to_dict('records')
        dumb = df[(df['total_resolved_trades'] >= 100) & (df['win_rate'] <= 0.40)].to_dict('records')

        # 5. Save locally and upload both JSON files to the destination dataset.
        os.makedirs("results", exist_ok=True)
        whale_path = "results/mega_whales.json"
        dumb_path = "results/mega_dumb.json"
        with open(whale_path, "w") as f:
            json.dump(whales, f, indent=2)
        with open(dumb_path, "w") as f:
            json.dump(dumb, f, indent=2)

        print(f"Uploading Results to {DEST_REPO}...")
        api.upload_file(
            path_or_fileobj=whale_path,
            path_in_repo="results/mega_whales.json",
            repo_id=DEST_REPO,
            repo_type="dataset",
            token=token,
        )
        api.upload_file(
            path_or_fileobj=dumb_path,
            path_in_repo="results/mega_dumb.json",
            repo_id=DEST_REPO,
            repo_type="dataset",
            token=token,
        )

        return (
            f"✅ SUCCESS!\n- Found {len(whales)} Mega Whales"
            f"\n- Found {len(dumb)} Mega Dumb"
            f"\n- Results uploaded to {DEST_REPO}/results/"
        )
    except Exception as e:
        # UI boundary: surface any failure as a status string rather than
        # crashing the Space.
        return f"❌ ERROR: {str(e)}"
    finally:
        # Release the in-memory DuckDB connection even on failure.
        if con is not None:
            con.close()


# --- Gradio UI ---
with gr.Blocks(title="Mega Extraction Space") as demo:
    gr.Markdown("# 🐋 Mega Wallet Extractor (DuckDB Cloud)")
    gr.Markdown("Click the button below to process 32GB of Polymarket data directly from Hugging Face.")
    with gr.Row():
        run_btn = gr.Button("🚀 Run Full Extraction", variant="primary")
    status_box = gr.Textbox(label="Status Log", lines=10)
    run_btn.click(fn=run_extraction, outputs=status_box)

if __name__ == "__main__":
    demo.launch()