Spaces:

cyberkyne
/

quant-knowledge-extractor

Sleeping

App Files Files Community

cyberkyne committed on Mar 19

Commit

6a6d9aa

verified ·

1 Parent(s): 5a3ee96

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -482

app.py DELETED Viewed

@@ -1,482 +0,0 @@
-"""
-app.py — HuggingFace Spaces entry point.
-Architecture:
-  Python  : Gradio UI, Claude API calls, HF I/O, PDF processing
-  Julia   : Indicators, BacktestEngine, WalkForwardOptimizer, SignalCompiler
-Python NEVER does numerical computation. It only:
-  1. Calls Claude API (extraction + strategy code generation)
-  2. Calls Julia via juliacall for all math
-  3. Reads/writes HuggingFace datasets
-  4. Renders Gradio UI
-"""
-import io, json, zipfile, tempfile
-from pathlib import Path
-from datetime import datetime
-import gradio as gr
-from loguru import logger
-import utils.config as cfg
-import utils.hf_io as hf
-from pipeline.pdf_processor import PDFProcessor
-from pipeline.extractor import AIExtractor, Deduplicator
-from pipeline.julia_bridge import full_backtest_pipeline, julia_available
-from pipeline.exporter import (
-    slugify, strategy_md, formula_md,
-    backtest_report_md, optimal_json, mt5_set,
-    julia_config, index_md,
-)
-# ── Lazy KB ───────────────────────────────────────────
-# Module-level cache for the knowledge base; stays None until the first get_kb() call.
-_kb = None
-def get_kb():
-    """Return the cached knowledge base, loading it with hf.kb_load() on first use."""
-    global _kb
-    if _kb is not None:
-        return _kb
-    _kb = hf.kb_load()
-    return _kb
-def reset_kb():
-    """Replace the cached knowledge base with a fresh hf.kb_load() result."""
-    global _kb
-    reloaded = hf.kb_load()
-    _kb = reloaded
-# ═══════════════════════════════════════════════════
-#  TAB 1 — UPLOAD & EXTRACT
-# ═══════════════════════════════════════════════════
-def run_extraction(pdf_files, progress=gr.Progress()):
-    """Extract knowledge from uploaded PDFs, merge it into the KB and publish it.
-    Each PDF is chunked by PDFProcessor; every chunk goes through AIExtractor.extract
-    and Deduplicator.process, and the added/merged/skipped counters are summed per kind.
-    Afterwards every strategy and formula in the KB is rendered to Markdown, the KB is
-    saved with hf.kb_save and, when HF_TOKEN is set, the Markdown files are pushed.
-    Args:
-        pdf_files: uploads from the Gradio File input. Each item is either an object
-            exposing its path as ``.name`` or a plain path string.
-        progress: Gradio progress tracker.
-    Returns:
-        (summary, log): a human-readable summary and the last 40 log lines.
-    """
-    if not pdf_files: return "⚠️ No PDFs uploaded.", ""
-    if not cfg.ANTHROPIC_API_KEY: return "❌ ANTHROPIC_API_KEY secret not set.", ""
-    if not cfg.HF_DATASET_REPO:   return "❌ HF_DATASET_REPO secret not set.", ""
-    kinds   = ("strategies","formulas","systems")
-    actions = ("added","merged","skipped")
-    proc  = PDFProcessor()
-    ai    = AIExtractor()
-    dedup = Deduplicator()
-    kb    = get_kb()
-    log   = []
-    totals = {k: dict.fromkeys(actions, 0) for k in kinds}
-    for i, pdf_file in enumerate(pdf_files):
-        # Accept both upload wrappers (path in .name) and plain path strings;
-        # which one arrives depends on the Gradio File configuration/version.
-        path = Path(getattr(pdf_file, "name", pdf_file))
-        progress(i/len(pdf_files), desc=f"{path.name}")
-        log.append(f"\n📖 [{i+1}/{len(pdf_files)}] {path.name}")
-        try:
-            chunks = list(proc.process(path))
-            log.append(f"  → {len(chunks)} chunks")
-        except Exception as e:
-            # Best effort: a PDF that cannot be processed is logged and skipped.
-            log.append(f"  ❌ {e}"); continue
-        # Counted per PDF so the "New:" line reports this file, not the running total.
-        new_strats = new_formulas = 0
-        for chunk in chunks:
-            extracted = ai.extract(chunk)
-            stats     = dedup.process(extracted, kb)
-            for kind in kinds:
-                for act in actions:
-                    totals[kind][act] += stats[kind][act]
-            new_strats   += stats["strategies"]["added"]
-            new_formulas += stats["formulas"]["added"]
-        log.append(f"  → New: {new_strats} strats, {new_formulas} formulas")
-        if cfg.HF_TOKEN: hf.pdf_upload(path)
-    # Render the whole KB (not only the new records) to Markdown for upload.
-    hf_files = [(f"extracted/strategies/{slugify(rec.get('name',''))}.md",
-                 strategy_md(rec).encode())
-                for rec in kb["strategies"].values()]
-    hf_files += [(f"extracted/formulas/{slugify(rec.get('name',''))}.md",
-                  formula_md(rec).encode())
-                 for rec in kb["formulas"].values()]
-    progress(0.9, desc="Saving to HuggingFace…")
-    hf.kb_save(kb)
-    if hf_files and cfg.HF_TOKEN:
-        pushed = hf.push_batch(hf_files, "Update extracted knowledge")
-        log.append(f"\n☁️ Pushed {pushed} files to HuggingFace")
-    # Refresh the module cache; the summary below still reads the local `kb`.
-    reset_kb()
-    counts = {k: len(kb[k]) for k in kb}
-    summary = f"""✅ Extraction Complete
-PDFs processed: {len(pdf_files)}
-Strategies  — added: {totals['strategies']['added']}  merged: {totals['strategies']['merged']}  skipped: {totals['strategies']['skipped']}
-Formulas    — added: {totals['formulas']['added']}  merged: {totals['formulas']['merged']}  skipped: {totals['formulas']['skipped']}
-Systems     — added: {totals['systems']['added']}  merged: {totals['systems']['merged']}  skipped: {totals['systems']['skipped']}
-KB totals: {counts['strategies']} strategies · {counts['formulas']} formulas · {counts['systems']} systems
-Tokens used: {ai.tokens_used:,}"""
-    return summary, "\n".join(log[-40:])
-# ═══════════════════════════════════════════════════
-#  TAB 2 — BROWSE KB
-# ═══════════════════════════════════════════════════
-def search_strategies(query, category):
-    """Filter KB strategies by category and free-text query for the results table.
-    Returns (rows, label): at most 100 table rows plus a "<n> strategies" label that
-    counts every match, including those beyond the first 100.
-    """
-    matches = list(get_kb()["strategies"].values())
-    if category and category != "All":
-        matches = [s for s in matches if s.get("category") == category]
-    if query:
-        needle = query.lower()
-        def is_hit(s):
-            # Case-insensitive substring match on name, then description.
-            return needle in s.get("name","").lower() or needle in s.get("description","").lower()
-        matches = [s for s in matches if is_hit(s)]
-    rows = []
-    for s in matches[:100]:
-        rows.append([
-            s.get("name","")[:50],
-            s.get("category",""),
-            s.get("description","")[:100],
-            ", ".join(s.get("sources",[]))[:40],
-            len(s.get("layers",[])),
-        ])
-    return rows, f"{len(matches)} strategies"
-def search_formulas(query):
-    """Return up to 100 table rows for KB formulas matching the free-text query."""
-    matches = list(get_kb()["formulas"].values())
-    if query:
-        needle = query.lower()
-        # Case-insensitive substring match on name, then purpose.
-        matches = [f for f in matches
-                   if needle in f.get("name","").lower() or needle in f.get("purpose","").lower()]
-    rows = []
-    for f in matches[:100]:
-        latex_flag = "✅" if f.get("latex") else "—"
-        rows.append([
-            f.get("name","")[:50],
-            f.get("category",""),
-            f.get("purpose","")[:80],
-            latex_flag,
-            ", ".join(f.get("sources",[]))[:40],
-        ])
-    return rows
-def dl_strategy(name):
-    """Write the Markdown for the strategy called `name` to a temp file.
-    The lookup is case-insensitive and ignores surrounding whitespace in `name`.
-    Returns the path of the created ``.md`` file, or None when no strategy matches.
-    """
-    # Tolerate a None value from the UI instead of failing on .strip().
-    wanted = (name or "").strip().lower()
-    kb = get_kb()
-    for rec in kb["strategies"].values():
-        if rec.get("name","").lower() == wanted:
-            # NamedTemporaryFile creates the file atomically; tempfile.mktemp only
-            # returns a name and is deprecated because of the resulting race.
-            with tempfile.NamedTemporaryFile("w", suffix=".md", delete=False,
-                                             encoding="utf-8") as fh:
-                fh.write(strategy_md(rec))
-            return fh.name
-    return None
-def dl_all_strategies_zip(category):
-    """Bundle the Markdown of all KB strategies (optionally one category) into a ZIP.
-    A falsy category or "All" selects every strategy. Returns the path of the ZIP file.
-    """
-    kb = get_kb(); items = list(kb["strategies"].values())
-    if category and category != "All":
-        items = [x for x in items if x.get("category") == category]
-    # Create the file atomically instead of using the deprecated, race-prone
-    # tempfile.mktemp; delete=False keeps it on disk for the download.
-    with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as fh:
-        with zipfile.ZipFile(fh, "w", zipfile.ZIP_DEFLATED) as zf:
-            for rec in items:
-                zf.writestr(f"{slugify(rec.get('name','unknown'))}.md", strategy_md(rec))
-    return fh.name
-# ═══════════════════════════════════════════════════
-#  TAB 3 — BACKTEST (Julia Engine)
-# ═══════════════════════════════════════════════════
-def load_symbols():
-    """Refresh the symbol checkbox group from hf.tick_list_symbols().
-    Pre-selects the first two symbols, or all of them when fewer than two exist.
-    """
-    syms = hf.tick_list_symbols()
-    if len(syms) >= 2:
-        preselected = syms[:2]
-    else:
-        preselected = syms
-    return gr.update(choices=syms, value=preselected)
-def run_backtests(selected_symbols, selected_timeframes,
-                  strategy_filter, max_strategies, viable_only,
-                  progress=gr.Progress()):
-    """Generate Julia code for KB strategies and backtest each symbol/timeframe pair.
-    Args:
-        selected_symbols: symbols to test; when empty, the first two entries of
-            hf.tick_list_symbols() are used.
-        selected_timeframes: timeframes to test; when empty, ["1h"] is used.
-        strategy_filter: optional case-insensitive substring matched against names.
-        max_strategies: when > 0, only the first int(max_strategies) strategies run.
-        viable_only: when true, only results flagged "is_viable" are pushed.
-        progress: Gradio progress tracker.
-    Returns:
-        (summary, log): a summary text and the last 60 log lines.
-    """
-    # Fail fast when a required setting or the Julia runtime is missing.
-    if not cfg.HF_TICK_REPO:       return "❌ HF_TICK_REPO not set.", ""
-    if not cfg.ANTHROPIC_API_KEY:  return "❌ ANTHROPIC_API_KEY not set.", ""
-    if not julia_available():      return "❌ Julia runtime not available. Check build logs.", ""
-    ai   = AIExtractor()
-    kb   = get_kb()
-    strats = list(kb["strategies"].values())
-    if strategy_filter:
-        strats = [s for s in strats if strategy_filter.lower() in s.get("name","").lower()]
-    if max_strategies > 0:
-        strats = strats[:int(max_strategies)]
-    if not strats: return "⚠️ No strategies. Run extraction first.", ""
-    symbols    = selected_symbols or hf.tick_list_symbols()[:2]
-    timeframes = selected_timeframes or ["1h"]
-    log, all_results, viable_count = [], [], 0
-    for si, rec in enumerate(strats):
-        name = rec.get("name","?")
-        progress(si/len(strats), desc=f"[{si+1}/{len(strats)}] {name[:35]}")
-        # 1. Generate Julia signal code via Claude
-        jl_code = ai.compile_strategy_code(rec)
-        if not jl_code:
-            log.append(f"❌ Code gen failed: {name[:40]}"); continue
-        log.append(f"✅ Julia code generated: {name[:40]}")
-        for sym in symbols:
-            for tf in timeframes:
-                df = hf.tick_load(sym, tf)
-                # Missing data or fewer than 200 rows: skip this symbol/timeframe pair.
-                if df is None or len(df) < 200:
-                    log.append(f"  ⚠️ {sym} {tf}: no data"); continue
-                # 2. Full Julia pipeline (compile → optimize → backtest)
-                result = full_backtest_pipeline(
-                    strategy_code  = jl_code,
-                    strategy_name  = name,
-                    open_p         = df["open"].values,
-                    high           = df["high"].values,
-                    low            = df["low"].values,
-                    close          = df["close"].values,
-                    volume         = df["volume"].values,
-                    timeframe      = tf,
-                    symbol         = sym,
-                    n_windows      = cfg.WF_WINDOWS,
-                    is_ratio       = cfg.WF_IS_RATIO,
-                    min_trades     = cfg.MIN_TRADES,
-                    min_sharpe     = cfg.MIN_SHARPE,
-                    max_combos     = cfg.MAX_PARAM_COMBOS,
-                    initial_equity = cfg.INITIAL_EQUITY,
-                    commission_pct = cfg.COMMISSION_PCT,
-                    risk_per_trade = cfg.RISK_PER_TRADE,
-                )
-                all_results.append(result)
-                # 3. Build + push output files
-                if cfg.HF_TOKEN and cfg.HF_DATASET_REPO:
-                    if not viable_only or result.get("is_viable"):
-                        hf.push_result(
-                            name, sym, tf,
-                            backtest_report_md(result, rec),
-                            optimal_json(result, rec),
-                            mt5_set(result, rec),
-                            julia_config(result),
-                        )
-                status = "✅" if result.get("is_viable") else "❌"
-                log.append(
-                    f"  {status} {sym} {tf}: "
-                    f"Sharpe={result.get('oos_sharpe_mean',0):.2f} "
-                    f"DD={result.get('oos_max_dd',0):.1f}% "
-                    f"Score={result.get('robustness',0):.0f}")
-                if result.get("is_viable"): viable_count += 1
-    # 4. Push master index
-    # NOTE: "total_strategies" counts strategy/symbol/timeframe results, not strategies.
-    if all_results and cfg.HF_TOKEN:
-        hf.push_index(index_md(all_results), {
-            "generated": datetime.now().isoformat(),
-            "engine": "Julia 1.10",
-            "total_strategies": len(all_results),
-            "viable_count": viable_count,
-            "strategies": all_results,
-        })
-    # NOTE: "Strategies compiled" reports len(strats), which includes strategies
-    # whose code generation failed; max(..., 1) guards the pass rate against /0.
-    summary = f"""🏁 Julia Backtest Complete
-Engine:               Julia 1.10 BacktestEngine.jl
-Strategies compiled:  {len(strats)}
-Combinations tested:  {len(all_results)}
-Viable strategies:    {viable_count}
-Pass rate:            {viable_count/max(len(all_results),1)*100:.1f}%
-Results on HuggingFace:
-  {cfg.HF_DATASET_REPO}/optimal_sets/BACKTEST_INDEX.md"""
-    return summary, "\n".join(log[-60:])
-# ═══════════════════════════════════════════════════
-#  TAB 4 — RESULTS
-# ═══════════════════════════════════════════════════
-def load_results():
-    """Build the results table from the index returned by hf.fetch_index().
-    Returns (rows, status): one row per viable entry, sorted by "oos_sharpe_mean"
-    in descending order, plus a status line. Without an index: ([], "No results yet.").
-    """
-    data = hf.fetch_index()
-    if not data:
-        return [], "No results yet."
-    tested = data.get("strategies",[])
-    viable = [s for s in tested if s.get("is_viable")]
-    viable.sort(key=lambda s: s.get("oos_sharpe_mean",0), reverse=True)
-    rows = []
-    for s in viable:
-        rows.append([
-            s.get("strategy","")[:45],
-            s.get("symbol",""),
-            s.get("timeframe",""),
-            f'{s.get("oos_sharpe_mean",0):.2f}',
-            f'{s.get("oos_max_dd",0):.1f}%',
-            f'{s.get("oos_win_rate",0):.1f}%',
-            f'{s.get("oos_pf_mean",0):.2f}',
-            f'{s.get("robustness",0):.0f}',
-        ])
-    # Only the first 16 characters of the "generated" value are shown.
-    stamp = data.get('generated','')[:16]
-    status = f"✅ {len(viable)} viable / {len(tested)} tested | " + f"Engine: Julia | {stamp}"
-    return rows, status
-def dl_result_file(name, symbol, tf, ftype):
-    """Fetch one result artefact via hf.fetch_file and save it to a temp file.
-    Args:
-        name: strategy name; slugified to build the remote path.
-        symbol: symbol; upper-cased and stripped.
-        tf: timeframe, used verbatim.
-        ftype: one of "MT5 .set file", "Optimal JSON", "Julia config", "Full report".
-    Returns:
-        Path of the local copy, or None for an unknown `ftype` or a missing remote file.
-    """
-    sl  = slugify(name); sym = symbol.upper().strip()
-    pre = f"{sl}_{sym}_{tf}"
-    ext_map = {"MT5 .set file": f"optimal_sets/{pre}.set",
-               "Optimal JSON":  f"optimal_sets/{pre}_optimal.json",
-               "Julia config":  f"optimal_sets/{pre}_config.jl",
-               "Full report":   f"backtests/{sl}/{pre}_report.md"}
-    remote = ext_map.get(ftype,"")
-    if not remote: return None
-    data = hf.fetch_file(remote)
-    if not data: return None
-    # Create the file atomically instead of using the deprecated, race-prone
-    # tempfile.mktemp; delete=False keeps it on disk for the download.
-    with tempfile.NamedTemporaryFile(suffix=Path(remote).suffix, delete=False) as fh:
-        fh.write(data)
-    return fh.name
-def dl_all_sets():
-    """Zip the ``.set`` file of every viable entry in the hf.fetch_index() index.
-    Entries whose file cannot be fetched are left out. Returns the ZIP path, or
-    None when there is no index.
-    """
-    data = hf.fetch_index()
-    if not data: return None
-    # Create the file atomically instead of using the deprecated, race-prone
-    # tempfile.mktemp; delete=False keeps it on disk for the download.
-    with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as fh:
-        with zipfile.ZipFile(fh,"w",zipfile.ZIP_DEFLATED) as zf:
-            for s in data.get("strategies",[]):
-                if not s.get("is_viable"): continue
-                sl = slugify(s["strategy"]); sym = s["symbol"]; tf = s["timeframe"]
-                content = hf.fetch_file(f"optimal_sets/{sl}_{sym}_{tf}.set")
-                if content: zf.writestr(f"{sl}_{sym}_{tf}.set", content)
-    return fh.name
-# ═══════════════════════════════════════════════════
-#  TAB 5 — SETUP
-# ═══════════════════════════════════════════════════
-def check_config():
-    """Render a Markdown report of secrets, Julia availability, data and settings."""
-    secrets = (
-        ("ANTHROPIC_API_KEY", cfg.ANTHROPIC_API_KEY, "Claude API"),
-        ("HF_TOKEN", cfg.HF_TOKEN, "HF write access"),
-        ("HF_DATASET_REPO", cfg.HF_DATASET_REPO, "Results storage"),
-        ("HF_TICK_REPO", cfg.HF_TICK_REPO, "Tick data source"),
-    )
-    kb = get_kb()
-    # Symbols are only listed when a tick repo is configured.
-    symbols = []
-    if cfg.HF_TICK_REPO:
-        symbols = hf.tick_list_symbols()
-    jl_ok = julia_available()
-    jl_icon = "✅" if jl_ok else "❌"
-    jl_text = "available" if jl_ok else "not available (check build logs)"
-    report = ["## Configuration Status", ""]
-    report.extend(f"{'✅' if val else '❌'} `{name}` — {desc}" for name, val, desc in secrets)
-    report.extend(["", "## Julia Engine", "",
-                   f"{jl_icon} Julia runtime: {jl_text}"])
-    report.extend(["", "## Data Status", "",
-                   f"- Tick symbols: **{len(symbols)}** — {', '.join(symbols[:8])}",
-                   f"- Strategies in KB: **{len(kb['strategies'])}**",
-                   f"- Formulas in KB: **{len(kb['formulas'])}**"])
-    report.extend(["", "## Backtest Settings", "",
-                   f"- WF Windows: `{cfg.WF_WINDOWS}` · IS Ratio: `{cfg.WF_IS_RATIO}`",
-                   f"- Min Trades: `{cfg.MIN_TRADES}` · Min Sharpe: `{cfg.MIN_SHARPE}`",
-                   f"- Commission: `{cfg.COMMISSION_PCT*100:.3f}%` · Risk/trade: `{cfg.RISK_PER_TRADE*100:.1f}%`",
-                   f"- Timeframes: `{', '.join(cfg.BACKTEST_TFS)}`"])
-    return "\n".join(report)
-# ═══════════════════════════════════════════════════
-#  BUILD APP
-# ═══════════════════════════════════════════════════
-# Category choices for the strategy dropdown; "All" disables category filtering.
-CATS = ["All"] + cfg.CATEGORIES
-# The whole UI is declared at import time; `demo` is the Gradio Blocks app object.
-with gr.Blocks(
-    title="Quant Knowledge Extractor — Julia Engine",
-    theme=gr.themes.Base(primary_hue="green", neutral_hue="gray"),
-    css=".status-box{font-family:monospace;font-size:.82em}"
-) as demo:
-    # Page header.
-    gr.HTML("""
-    <div style="text-align:center;padding:1.2em 0 .3em">
-      <h1 style="font-size:2em;color:#16a34a;margin:0">📊 Quant Knowledge Extractor</h1>
-      <p style="color:#6b7280;margin:.4em 0 0">
-        Julia 1.10 Engine · BacktestEngine.jl · WalkForward Optimizer · MT5 .set Output
-      </p>
-    </div>""")
-    with gr.Tabs():
-        # Tab 1 — Extract
-        with gr.Tab("📤 Upload & Extract"):
-            gr.Markdown("### Upload algorithmic trading PDFs — OCR applied automatically")
-            with gr.Row():
-                with gr.Column(scale=2):
-                    pdf_in  = gr.File(label="Drop PDFs here", file_count="multiple", file_types=[".pdf"])
-                    ext_btn = gr.Button("🚀 Extract Knowledge", variant="primary", size="lg")
-                with gr.Column(scale=1):
-                    ext_out = gr.Textbox(label="Result", lines=14, interactive=False, elem_classes=["status-box"])
-            ext_log = gr.Textbox(label="Log", lines=8, interactive=False, elem_classes=["status-box"])
-            # run_extraction returns (summary, log) for the two text boxes.
-            ext_btn.click(fn=run_extraction, inputs=[pdf_in], outputs=[ext_out, ext_log])
-        # Tab 2 — Browse
-        with gr.Tab("📚 Knowledge Base"):
-            with gr.Tabs():
-                with gr.Tab("📈 Strategies"):
-                    with gr.Row():
-                        sq = gr.Textbox(label="Search", placeholder="RSI, breakout, Kelly…")
-                        sc = gr.Dropdown(choices=CATS, value="All", label="Category")
-                        sb = gr.Button("🔍 Search", variant="primary")
-                    st = gr.Dataframe(headers=["Name","Category","Description","Sources","Variants"],
-                                     datatype=["str"]*4+["number"], interactive=False)
-                    sn = gr.Markdown("")
-                    with gr.Row():
-                        sni = gr.Textbox(label="Name to download")
-                        sdb = gr.Button("⬇️ Download MD"); sdf = gr.File(label="")
-                    szb = gr.Button("📦 Category ZIP"); szf = gr.File(label="")
-                    sb.click(fn=search_strategies, inputs=[sq,sc], outputs=[st,sn])
-                    sdb.click(fn=dl_strategy, inputs=[sni], outputs=[sdf])
-                    # The ZIP export reuses the category dropdown as its filter.
-                    szb.click(fn=dl_all_strategies_zip, inputs=[sc], outputs=[szf])
-                with gr.Tab("∑ Formulas"):
-                    with gr.Row():
-                        fq = gr.Textbox(label="Search", placeholder="Sharpe, Kelly, ATR…")
-                        fb = gr.Button("🔍 Search", variant="primary")
-                    ft = gr.Dataframe(headers=["Name","Category","Purpose","LaTeX","Sources"],
-                                     datatype=["str"]*5, interactive=False)
-                    fb.click(fn=search_formulas, inputs=[fq], outputs=[ft])
-        # Tab 3 — Backtest
-        with gr.Tab("🔬 Julia Backtest"):
-            gr.Markdown(
-                "### Walk-Forward Backtest — Julia Engine\n"
-                "Claude generates Julia signal code → Julia compiles + optimizes → "
-                "MT5 `.set` files pushed to HuggingFace."
-            )
-            with gr.Row():
-                with gr.Column(scale=2):
-                    bt_load  = gr.Button("🔄 Load Symbols from HF")
-                    # Starts empty; filled by load_symbols when the button above is clicked.
-                    bt_syms  = gr.CheckboxGroup(label="Symbols", choices=[], value=[])
-                    bt_tfs   = gr.CheckboxGroup(
-                        label="Timeframes", value=["1h","4h"],
-                        choices=["1m","5m","15m","30m","1h","4h","1d"])
-                    bt_filt  = gr.Textbox(label="Strategy filter (optional)")
-                    bt_max   = gr.Slider(0, 500, value=0, step=10, label="Max strategies (0=all)")
-                    bt_viable= gr.Checkbox(label="Push only VIABLE to HuggingFace", value=True)
-                    bt_run   = gr.Button("🚀 Run Julia Backtests", variant="primary", size="lg")
-                with gr.Column(scale=1):
-                    bt_out = gr.Textbox(label="Summary", lines=12, interactive=False, elem_classes=["status-box"])
-            bt_log = gr.Textbox(label="Log", lines=12, interactive=False, elem_classes=["status-box"])
-            bt_load.click(fn=load_symbols, outputs=[bt_syms])
-            # Input order must match the run_backtests positional parameters.
-            bt_run.click(fn=run_backtests,
-                         inputs=[bt_syms, bt_tfs, bt_filt, bt_max, bt_viable],
-                         outputs=[bt_out, bt_log])
-        # Tab 4 — Results
-        with gr.Tab("🏆 Results"):
-            gr.Markdown("### Viable Strategies — Download MT5 `.set` & Julia Configs")
-            res_ref = gr.Button("🔄 Refresh from HuggingFace", variant="primary")
-            res_tbl = gr.Dataframe(
-                headers=["Strategy","Symbol","TF","Sharpe","Max DD","Win%","PF","Score"],
-                datatype=["str"]*8, interactive=False)
-            res_cnt = gr.Markdown("")
-            gr.Markdown("#### Download individual file")
-            with gr.Row():
-                rn = gr.Textbox(label="Strategy name"); rs = gr.Textbox(label="Symbol")
-                rt = gr.Textbox(label="Timeframe")
-                # These choices must match the keys of ext_map in dl_result_file.
-                rf = gr.Dropdown(choices=["MT5 .set file","Optimal JSON",
-                                           "Julia config","Full report"],
-                                 value="MT5 .set file", label="File type")
-            rdb = gr.Button("⬇️ Download", variant="primary"); rdf = gr.File(label="")
-            gr.Markdown("#### Batch download all viable strategies")
-            with gr.Row():
-                rsb = gr.Button("🎯 All MT5 .set (ZIP)"); rsf = gr.File(label="")
-            res_ref.click(fn=load_results, outputs=[res_tbl, res_cnt])
-            rdb.click(fn=dl_result_file, inputs=[rn,rs,rt,rf], outputs=[rdf])
-            rsb.click(fn=dl_all_sets, outputs=[rsf])
-            # Also populate the results table when the page loads.
-            demo.load(fn=load_results, outputs=[res_tbl, res_cnt])
-        # Tab 5 — Setup
-        with gr.Tab("⚙️ Setup & Status"):
-            gr.Markdown("""### Required Secrets (Space Settings → Variables and Secrets)
-| Secret | Description |
-|--------|-------------|
-| `ANTHROPIC_API_KEY` | Claude API key |
-| `HF_TOKEN` | HuggingFace write token |
-| `HF_DATASET_REPO` | `your-username/quant-knowledge-base` |
-| `HF_TICK_REPO` | `your-username/tick-data` |
-### Tick Data Format
-Upload to your `tick-data` dataset:
-```
-EURUSD/ticks.parquet   (columns: timestamp, bid, ask OR open,high,low,close,volume)
-BTCUSDT/1h.parquet     (pre-built OHLCV — faster)
-```
-""")
-            cfg_ref = gr.Button("🔄 Check Status")
-            # check_config() runs once here, while the UI is being built, for the initial text.
-            cfg_out = gr.Markdown(check_config())
-            cfg_ref.click(fn=check_config, outputs=[cfg_out])
-    # Page footer.
-    gr.HTML("""<div style="text-align:center;padding:.8em;color:#9ca3af;font-size:.75em">
-      Quant Knowledge Extractor · Julia 1.10 Engine · HuggingFace Spaces
-    </div>""")
-# Start the Gradio server only when this file is executed as a script.
-if __name__ == "__main__":
-    demo.launch()