Spaces:

broadfield-dev
/

OverThinker

Paused

App Files Files Community

broadfield-dev committed 19 days ago

Commit

0ec3e05

verified ·

1 Parent(s): 63af298

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -30

app.py CHANGED Viewed

@@ -3,6 +3,9 @@
 Overthinker - Local 4B Quantized Edition (Nemotron 3 Nano 4B)
 Uses a local 4B model (NVIDIA Nemotron 3 Nano 4B) loaded in 4-bit quantization if supported,
 otherwise falls back to BF16 (which fits easily on 24GB GPUs).
 """
 import os
@@ -12,14 +15,14 @@ import uuid
 import sqlite3
 import torch
 from pathlib import Path
-from typing import Optional, Dict, List, Any
 from gradio import Server
 from fastapi import HTTPException
 from starlette.responses import HTMLResponse, PlainTextResponse, JSONResponse
 from datasets import Dataset, concatenate_datasets, load_dataset
 import pandas as pd
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
 from bag import (
     BASE_URL,
     LLMS_TXT,
@@ -29,7 +32,7 @@ from bag import (
     VIDEO_PAGE_HTML,
     README_MD
 )
-os.system("pip install torch && git clone https://github.com/state-spaces/mamba.git && cd mamba && python setup.py install")
 # ---------------------------------------------------------------------------
 # Application Setup
 # ---------------------------------------------------------------------------
@@ -39,14 +42,23 @@ DATA_DIR = Path("data")
 DATA_DIR.mkdir(exist_ok=True)
 # ---------- Local Model Configuration ----------
-# Using NVIDIA Nemotron 3 Nano 4B (BF16) - a compact Mamba2-Transformer hybrid SLM
-# 4-bit quantization via BitsAndBytes may not support Mamba layers fully;
-# we attempt it first, then fall back to BF16 (model is ~8GB, fits on A10G/T4)
-MODEL_NAME = "nvidia/NVIDIA-Nemotron-3-Nano-4B-FP8"
 print("[Overthinker] Attempting to load Nemotron 3 Nano 4B with 4-bit quantization...")
-# Try 4-bit first; if incompatibility with Mamba layers, fallback to BF16
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_use_double_quant=True,
@@ -54,22 +66,30 @@ bnb_config = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.bfloat16
 )
 try:
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=False)
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_NAME,
-        torch_dtype=torch.bfloat16,
         trust_remote_code=True,
-        device_map="auto"
     )
     print(f"[Overthinker] Model loaded in 4-bit quantization on device: {model.device}")
     loaded_quantized = True
 except Exception as e:
     print(f"[Overthinker] 4-bit quantization failed: {e}")
     print("[Overthinker] Falling back to BF16 (no quantization) - model is only ~8GB.")
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=False)
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_NAME,
         device_map="auto",
         trust_remote_code=True,
         torch_dtype=torch.bfloat16
@@ -109,7 +129,7 @@ def init_session(session_id: str):
             type TEXT NOT NULL,
             label TEXT NOT NULL,
             description TEXT DEFAULT '',
-            emoji TEXT DEFAULT '\U0001f539',
             tips TEXT DEFAULT '[]',
             order_index INTEGER DEFAULT 0,
             created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
@@ -118,7 +138,7 @@ def init_session(session_id: str):
     root_id = str(uuid.uuid4())
     conn.execute(
         "INSERT INTO nodes (id, parent_id, type, label, description, emoji) VALUES (?, ?, ?, ?, ?, ?)",
-        (root_id, None, "root", "What decision do you want to explore?", "", "\U0001f333")
     )
     conn.commit()
     conn.close()
@@ -162,7 +182,7 @@ def get_children_db(session_id: str, parent_id: str) -> List[Dict]:
     return result
 def add_node_db(session_id: str, parent_id: str, node_type: str, label: str,
-                description: str = "", emoji: str = "\U0001f539",
                 tips: list = None, order_index: int = 0) -> Dict:
     node_id = str(uuid.uuid4())
     tips_json = json.dumps(tips or [])
@@ -219,7 +239,7 @@ def build_path_string(session_id: str, node_id: str) -> str:
             parts.append(f"[INPUT] {label}")
         elif t == "outcome":
             parts.append(f"[OUTCOME] {label}")
-    return " \u2192 ".join(parts)
 def get_root_node(session_id: str) -> Optional[Dict]:
     db_path = get_db_path(session_id)
@@ -378,7 +398,7 @@ def parse_json_response(text: str) -> Optional[dict]:
         return None
 # ---------------------------------------------------------------------------
-# Routes (All POST, no GET except for serving index)
 # ---------------------------------------------------------------------------
 @app.get("/")
@@ -412,7 +432,7 @@ async def create_tree(request: dict):
         raise HTTPException(status_code=500, detail="Failed to generate root node. Please check model availability.")
     label = parsed.get('label', f'Overthinking: {decision[:40]}')
     description = parsed.get('description', f'You are overthinking: {decision}')
-    emoji = parsed.get('emoji', '\U0001f333')
     tips = parsed.get('tips', ['Start by exploring options.'])
     update_root_db(session_id, label, description)
     db_path = get_db_path(session_id)
@@ -472,7 +492,7 @@ async def get_children(request: dict):
     for i, child in enumerate(children_data):
         label = child.get('label', 'Unknown')
         description = child.get('description', '')
-        emoji = child.get('emoji', '\U0001f539')
         tips = child.get('tips', [f'Consider this {next_type}.'])
         existing = get_children_db(session_id, node_id)
         existing_labels = [c['label'] for c in existing]
@@ -513,7 +533,7 @@ async def add_options(request: dict):
     for i, child in enumerate(children_data):
         label = child.get('label', 'Unknown')
         description = child.get('description', '')
-        emoji = child.get('emoji', '\U0001f539')
         tips = child.get('tips', [f'Additional {next_type}.'])
         existing = get_children_db(session_id, node_id)
         existing_labels = [c['label'] for c in existing]
@@ -590,15 +610,15 @@ async def export_path_md(request: dict):
     if not session_id or not node_id:
         raise HTTPException(status_code=400, detail="Missing session_id or node_id")
     path = get_path_db(session_id, node_id)
-    md = '# \U0001f9e0 Overthinker \u2014 Decision Path\n\n'
     for i, node in enumerate(path):
         indent = '  ' * i
-        emoji = {'root': '\U0001f333', 'input': '\U0001f9e0', 'outcome': '\U0001f4ca'}.get(node.get('type', ''), '\U0001f4cc')
         md += f'{indent}{emoji} **{node.get("label", "")}**\n'
         if node.get('description'):
             md += f'{indent}  > {node.get("description", "")}\n'
         if node.get('tips') and len(node['tips']) > 0:
-            md += f'{indent}  > \U0001f4a1 {node["tips"][0]}\n'
         md += '\n'
     return PlainTextResponse(content=md, status_code=200)
@@ -630,17 +650,18 @@ async def get_video():
 # Launch
 # ---------------------------------------------------------------------------
 if __name__ == "__main__":
-    print(f"\U0001f9e0 Overthinker \u2014 Local 4B Quantized Edition on port {PORT}")
-    print(f"\U0001f916 Model: {MODEL_NAME}")
     if loaded_quantized:
-        print("\U0001f4be Quantization: 4-bit NF4 (BitsAndBytes)")
     else:
-        print("\U0001f4be Quantization: None (BF16 fallback)")
-    print(f"\U0001f310 Open http://localhost:{PORT} in your browser")
     if not HF_TOKEN or not HF_DATASET_REPO:
-        print("\u26a0\ufe0f No HF_TOKEN or HF_DATASET_REPO set. Upload will fail.")
     app.launch(
         server_port=PORT,
         show_error=True,
         share=False
-    )

 Overthinker - Local 4B Quantized Edition (Nemotron 3 Nano 4B)
 Uses a local 4B model (NVIDIA Nemotron 3 Nano 4B) loaded in 4-bit quantization if supported,
 otherwise falls back to BF16 (which fits easily on 24GB GPUs).
+Handles mamba-ssm dependency gracefully by disabling use_mamba_kernels in config
+to use transformers' native PyTorch fallback implementation when mamba-ssm is not available.
 """
 import os
 import sqlite3
 import torch
 from pathlib import Path
+from typing import Optional, Dict, List
 from gradio import Server
 from fastapi import HTTPException
 from starlette.responses import HTMLResponse, PlainTextResponse, JSONResponse
 from datasets import Dataset, concatenate_datasets, load_dataset
 import pandas as pd
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig, AutoConfig
 from bag import (
     BASE_URL,
     LLMS_TXT,
     VIDEO_PAGE_HTML,
     README_MD
 )
 # ---------------------------------------------------------------------------
 # Application Setup
 # ---------------------------------------------------------------------------
 DATA_DIR.mkdir(exist_ok=True)
 # ---------- Local Model Configuration ----------
+MODEL_NAME = "nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16"
 print("[Overthinker] Attempting to load Nemotron 3 Nano 4B with 4-bit quantization...")
+# Load config and disable mamba kernels to avoid mamba-ssm dependency
+print("[Overthinker] Loading model config...")
+config = AutoConfig.from_pretrained(MODEL_NAME, trust_remote_code=True)
+# Disable mamba kernels to use transformers' native PyTorch fallback
+# This avoids needing mamba-ssm and causal-conv1d packages
+if hasattr(config, 'use_mamba_kernels'):
+    config.use_mamba_kernels = False
+    print("[Overthinker] Disabled use_mamba_kernels - using PyTorch fallback for Mamba layers")
+else:
+    print("[Overthinker] Warning: Config does not have use_mamba_kernels attribute")
+# Try 4-bit quantization first; if loading raises, fall back to unquantized BF16
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_use_double_quant=True,
     bnb_4bit_compute_dtype=torch.bfloat16
 )
+loaded_quantized = False
 try:
+    print("[Overthinker] Loading tokenizer...")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+    print("[Overthinker] Loading model with 4-bit quantization...")
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_NAME,
+        config=config,
+        quantization_config=bnb_config,
+        device_map="auto",
         trust_remote_code=True,
+        torch_dtype=torch.bfloat16
     )
     print(f"[Overthinker] Model loaded in 4-bit quantization on device: {model.device}")
     loaded_quantized = True
 except Exception as e:
     print(f"[Overthinker] 4-bit quantization failed: {e}")
     print("[Overthinker] Falling back to BF16 (no quantization) - model is only ~8GB.")
+    if hasattr(config, 'use_mamba_kernels'):
+        config.use_mamba_kernels = False
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_NAME,
+        config=config,
         device_map="auto",
         trust_remote_code=True,
         torch_dtype=torch.bfloat16
             type TEXT NOT NULL,
             label TEXT NOT NULL,
             description TEXT DEFAULT '',
+            emoji TEXT DEFAULT '🔹',
             tips TEXT DEFAULT '[]',
             order_index INTEGER DEFAULT 0,
             created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
     root_id = str(uuid.uuid4())
     conn.execute(
         "INSERT INTO nodes (id, parent_id, type, label, description, emoji) VALUES (?, ?, ?, ?, ?, ?)",
+        (root_id, None, "root", "What decision do you want to explore?", "", "🌳")
     )
     conn.commit()
     conn.close()
     return result
 def add_node_db(session_id: str, parent_id: str, node_type: str, label: str,
+                description: str = "", emoji: str = "🔹",
                 tips: list = None, order_index: int = 0) -> Dict:
     node_id = str(uuid.uuid4())
     tips_json = json.dumps(tips or [])
             parts.append(f"[INPUT] {label}")
         elif t == "outcome":
             parts.append(f"[OUTCOME] {label}")
+    return " → ".join(parts)
 def get_root_node(session_id: str) -> Optional[Dict]:
     db_path = get_db_path(session_id)
         return None
 # ---------------------------------------------------------------------------
+# Routes
 # ---------------------------------------------------------------------------
 @app.get("/")
         raise HTTPException(status_code=500, detail="Failed to generate root node. Please check model availability.")
     label = parsed.get('label', f'Overthinking: {decision[:40]}')
     description = parsed.get('description', f'You are overthinking: {decision}')
+    emoji = parsed.get('emoji', '🌳')
     tips = parsed.get('tips', ['Start by exploring options.'])
     update_root_db(session_id, label, description)
     db_path = get_db_path(session_id)
     for i, child in enumerate(children_data):
         label = child.get('label', 'Unknown')
         description = child.get('description', '')
+        emoji = child.get('emoji', '🔹')
         tips = child.get('tips', [f'Consider this {next_type}.'])
         existing = get_children_db(session_id, node_id)
         existing_labels = [c['label'] for c in existing]
     for i, child in enumerate(children_data):
         label = child.get('label', 'Unknown')
         description = child.get('description', '')
+        emoji = child.get('emoji', '🔹')
         tips = child.get('tips', [f'Additional {next_type}.'])
         existing = get_children_db(session_id, node_id)
         existing_labels = [c['label'] for c in existing]
     if not session_id or not node_id:
         raise HTTPException(status_code=400, detail="Missing session_id or node_id")
     path = get_path_db(session_id, node_id)
+    md = '# 🧠 Overthinker — Decision Path\n\n'
     for i, node in enumerate(path):
         indent = '  ' * i
+        emoji = {'root': '🌳', 'input': '🧠', 'outcome': '📊'}.get(node.get('type', ''), '📌')
         md += f'{indent}{emoji} **{node.get("label", "")}**\n'
         if node.get('description'):
             md += f'{indent}  > {node.get("description", "")}\n'
         if node.get('tips') and len(node['tips']) > 0:
+            md += f'{indent}  > 💡 {node["tips"][0]}\n'
         md += '\n'
     return PlainTextResponse(content=md, status_code=200)
 # Launch
 # ---------------------------------------------------------------------------
 if __name__ == "__main__":
+    print(f"🧠 Overthinker — Local 4B Quantized Edition on port {PORT}")
+    print(f"🤖 Model: {MODEL_NAME}")
+    print("🔋 Mamba kernels: Disabled (using PyTorch fallback - no mamba-ssm/causal-conv1d needed)")
     if loaded_quantized:
+        print("💾 Quantization: 4-bit NF4 (BitsAndBytes)")
     else:
+        print("💾 Quantization: None (BF16 fallback - fits in 16GB VRAM)")
+    print(f"🌐 Open http://localhost:{PORT} in your browser")
     if not HF_TOKEN or not HF_DATASET_REPO:
+        print("⚠️ No HF_TOKEN or HF_DATASET_REPO set. Upload will fail.")
     app.launch(
         server_port=PORT,
         show_error=True,
         share=False
+    )