"""
UncheatableEval Visualization - Hugging Face Space

Compare byte-level prediction performance between Qwen3-1.7B-Base and RWKV7-G1C-1.5B.
"""

import gc
import json
import os
from pathlib import Path

import gradio as gr
import torch

# Detect device
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
IS_CPU = DEVICE == "cpu"

# Model configuration
QWEN_MODEL_ID = "Qwen/Qwen3-1.7B-Base"
RWKV_REPO_ID = "BlinkDL/rwkv7-g1"
RWKV_MODEL_URL = "https://huggingface.co/BlinkDL/rwkv7-g1/resolve/main/rwkv7-g1c-1.5b-20260110-ctx8192.pth"
RWKV_MODEL_FILENAME = "rwkv7-g1c-1.5b-20260110-ctx8192.pth"

# Get the directory where this script is located
SCRIPT_DIR = Path(__file__).parent.absolute()
MODELS_DIR = SCRIPT_DIR / "models"
SUPPORT_DIR = SCRIPT_DIR / "support"

# Text length limits (in characters, applied after stripping whitespace)
MAX_TEXT_LENGTH = 8192
MIN_TEXT_LENGTH = 1

# Global model cache, populated by initialize_models()
_qwen_model = None
_qwen_tokenizer = None
_rwkv_model = None
_rwkv_tokenizer = None
_rwkv_model_path = None
_stats_manager = None

# Precomputed example cache, populated by load_precomputed_example()
_precomputed_html = None
_precomputed_text = None
PRECOMPUTED_DIR = SCRIPT_DIR / "precomputed"


def download_rwkv_model(progress=None):
    """Return the local path of the RWKV7 checkpoint, downloading it if missing.

    Args:
        progress: Accepted for interface compatibility; currently unused.

    Returns:
        The checkpoint path as a string: the existing file under MODELS_DIR
        when present, otherwise the path returned by ``hf_hub_download``.
    """
    from huggingface_hub import hf_hub_download

    model_path = MODELS_DIR / RWKV_MODEL_FILENAME
    if model_path.exists():
        return str(model_path)

    MODELS_DIR.mkdir(parents=True, exist_ok=True)

    # Download from HuggingFace Hub
    downloaded_path = hf_hub_download(
        repo_id=RWKV_REPO_ID,
        filename=RWKV_MODEL_FILENAME,
        local_dir=str(MODELS_DIR),
        local_dir_use_symlinks=False,
    )
    return downloaded_path


def load_qwen_model():
    """Load the Qwen3-1.7B-Base model and tokenizer.

    On CPU the model is loaded in float32 without a device map. Otherwise it
    is loaded in bfloat16 with ``device_map="auto"``, first requesting
    flash-attention 2 and retrying without it if that load fails.

    Returns:
        A ``(model, tokenizer)`` tuple with the model in eval mode.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(QWEN_MODEL_ID, trust_remote_code=True)

    # Configure based on device
    if IS_CPU:
        model_kwargs = {
            "torch_dtype": torch.float32,
            "device_map": None,
            "trust_remote_code": True,
            "low_cpu_mem_usage": True,
        }
        model = AutoModelForCausalLM.from_pretrained(QWEN_MODEL_ID, **model_kwargs).eval()
    else:
        model_kwargs = {
            "torch_dtype": torch.bfloat16,
            "device_map": "auto",
            "trust_remote_code": True,
        }
        try:
            model = AutoModelForCausalLM.from_pretrained(
                QWEN_MODEL_ID, attn_implementation="flash_attention_2", **model_kwargs
            ).eval()
        except Exception as exc:
            # Best-effort fallback: keep going with the default attention
            # implementation, but make the reason visible in the logs.
            print(f"flash_attention_2 unavailable ({exc}); using default attention.")
            model = AutoModelForCausalLM.from_pretrained(QWEN_MODEL_ID, **model_kwargs).eval()

    return model, tokenizer


def load_rwkv7_model(model_path: str):
    """Load the RWKV7-G1C-1.5B model and its tokenizer.

    Sets the ``RWKV_JIT_ON``, ``RWKV_V7_ON`` and ``RWKV_CUDA_ON`` environment
    variables before importing the ``rwkv`` package.

    Args:
        model_path: Path to the checkpoint, with or without the ``.pth`` suffix.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    os.environ["RWKV_JIT_ON"] = "1"
    os.environ["RWKV_V7_ON"] = "1"
    # Set CUDA flag based on device
    os.environ["RWKV_CUDA_ON"] = "0" if IS_CPU else "1"

    # Imported only after the environment variables above have been set.
    from rwkv.model import RWKV
    from rwkv.rwkv_tokenizer import TRIE_TOKENIZER

    # Use appropriate strategy for device
    strategy = "cpu fp32" if IS_CPU else "cuda fp16"

    # RWKV library automatically adds .pth extension, so remove it if present
    model_path = model_path.removesuffix(".pth")

    model = RWKV(model=model_path, strategy=strategy)

    vocab_path = str(SUPPORT_DIR / "rwkv_vocab_v20230424.txt")
    tokenizer = TRIE_TOKENIZER(vocab_path)

    return model, tokenizer


def validate_input(text: str) -> tuple[bool, str]:
    """Validate input text.

    Args:
        text: Raw user input.

    Returns:
        ``(True, stripped_text)`` when the text is acceptable, otherwise
        ``(False, error_message)``.
    """
    if not text or not text.strip():
        return False, "Please enter some text to analyze."

    text = text.strip()
    if len(text) < MIN_TEXT_LENGTH:
        return False, f"Text is too short. Minimum {MIN_TEXT_LENGTH} characters required."
    if len(text) > MAX_TEXT_LENGTH:
        return False, f"Text is too long. Maximum {MAX_TEXT_LENGTH} characters allowed. Current: {len(text)}"

    return True, text


def load_precomputed_example():
    """Load the precomputed example visualization into the module cache.

    Reads ``example_visualization.html`` and ``example_metadata.json`` from
    PRECOMPUTED_DIR. The example is optional, so a missing, unreadable or
    malformed file is reported and skipped rather than aborting startup.

    Returns:
        True when both files were loaded, False otherwise.
    """
    global _precomputed_html, _precomputed_text

    html_path = PRECOMPUTED_DIR / "example_visualization.html"
    metadata_path = PRECOMPUTED_DIR / "example_metadata.json"

    if not (html_path.exists() and metadata_path.exists()):
        print("No precomputed example found. Run precompute_example.py first.")
        return False

    try:
        with open(html_path, "r", encoding="utf-8") as f:
            example_html = f.read()
        with open(metadata_path, "r", encoding="utf-8") as f:
            metadata = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Failed to load precomputed example: {exc}")
        return False

    # Assign only after both files loaded, so the cache is never half-filled.
    _precomputed_html = example_html
    _precomputed_text = metadata.get("example_text", "")
    print(f"Loaded precomputed example ({len(_precomputed_text)} chars)")
    return True


def initialize_models():
    """Initialize and cache both models at startup.

    Loads the precomputed example, ensures the RWKV checkpoint is available,
    loads both models with their tokenizers, and creates the inference stats
    manager. Results are stored in the module-level cache variables.
    """
    global _qwen_model, _qwen_tokenizer, _rwkv_model, _rwkv_tokenizer, _rwkv_model_path, _stats_manager

    print("Initializing models...")

    # Load precomputed example first
    load_precomputed_example()

    # Download RWKV model if needed
    print("Checking RWKV7 model...")
    _rwkv_model_path = download_rwkv_model()

    # Load Qwen model
    print("Loading Qwen3-1.7B-Base...")
    _qwen_model, _qwen_tokenizer = load_qwen_model()

    # Load RWKV7 model
    print("Loading RWKV7-G1C-1.5B...")
    _rwkv_model, _rwkv_tokenizer = load_rwkv7_model(_rwkv_model_path)

    # Initialize stats manager
    from core.inference_stats import InferenceStatsManager

    _stats_manager = InferenceStatsManager()

    print("Models loaded successfully!")


def wrap_html_in_iframe(html: str) -> str:
    """Wrap HTML in an iframe for Gradio display.

    Args:
        html: Complete HTML document to embed.

    Returns:
        An ``<iframe>`` element carrying ``html`` in its ``srcdoc`` attribute.
    """
    # For srcdoc attribute, we only need to escape quotes
    # The HTML entities inside (like &quot;, &#10;) should remain as-is
    # NOTE(review): "&" is left unescaped, so entities already present in
    # `html` are decoded once by the outer attribute parser. The HTML
    # standard's srcdoc guidance escapes "&" as well - confirm against
    # generate_comparison_html's output before changing this.
    escaped = html.replace('"', "&quot;")
    # NOTE(review): the iframe attributes were not recoverable from the
    # reviewed source; sizing/border below are a reconstruction - confirm.
    return f"""
<iframe srcdoc="{escaped}" style="width: 100%; height: 800px; border: none;"></iframe>
"""


def _free_gpu_memory():
    """Empty the CUDA cache (when CUDA is available) and run the garbage collector."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()


def _step_description(label, predicted_time):
    """Build the progress text for one model, with the estimate when one exists."""
    if predicted_time is not None:
        return f"Evaluating with {label}... (estimated: {predicted_time:.1f}s)"
    return f"Evaluating with {label}..."


def _print_timing(label, elapsed, predicted_time):
    """Print the measured inference time, next to the prediction when one exists."""
    if predicted_time is not None:
        print(f"{label} completed in {elapsed:.2f}s (predicted: {predicted_time:.2f}s)")
    else:
        print(f"{label} completed in {elapsed:.2f}s")


def run_evaluation(text: str, progress=gr.Progress()):
    """Run evaluation on both models and generate visualization.

    Args:
        text: User-supplied text; validated with ``validate_input``.
        progress: Gradio progress tracker used to report the current step.

    Returns:
        The comparison HTML wrapped in an iframe.

    Raises:
        gr.Error: If the input is invalid, the models have not been
            initialized, the GPU runs out of memory, or evaluation fails.
    """
    from core.evaluator import evaluate_hf_single_sample, evaluate_rwkv7_single_sample
    from visualization.html_generator import generate_comparison_html

    # Validate input
    valid, result = validate_input(text)
    if not valid:
        raise gr.Error(result)
    text = result  # Use cleaned text

    # Fail with a clear message instead of an AttributeError on None when the
    # module is used without initialize_models() having run.
    cached = (_qwen_model, _qwen_tokenizer, _rwkv_model, _rwkv_tokenizer, _stats_manager)
    if any(item is None for item in cached):
        raise gr.Error("Models are not loaded yet. Please try again shortly.")

    try:
        # Get token counts for prediction first
        qwen_inputs = _qwen_tokenizer(text, return_tensors="pt", add_special_tokens=False)
        qwen_token_count = qwen_inputs["input_ids"].shape[-1]
        qwen_predicted_time = _stats_manager.predict_time("qwen", qwen_token_count)

        rwkv_tokenized = _rwkv_tokenizer.encode(text)
        # encode() may return either an object exposing `.ids` or a plain sequence.
        rwkv_token_count = len(rwkv_tokenized.ids if hasattr(rwkv_tokenized, "ids") else rwkv_tokenized)
        rwkv_predicted_time = _stats_manager.predict_time("rwkv", rwkv_token_count)

        # Step 1: Evaluate Qwen (using cached model)
        progress(0.0, desc=_step_description("Qwen3", qwen_predicted_time))
        result_qwen = evaluate_hf_single_sample(
            _qwen_model, _qwen_tokenizer, text, bos_mode="add_newline_token"
        )

        # Save stats and print comparison
        _stats_manager.add_record("qwen", qwen_token_count, result_qwen["inference_time"])
        _print_timing("Qwen3", result_qwen["inference_time"], qwen_predicted_time)

        # Step 2: Evaluate RWKV7 (using cached model)
        progress(0.5, desc=_step_description("RWKV7", rwkv_predicted_time))
        result_rwkv = evaluate_rwkv7_single_sample(_rwkv_model, _rwkv_tokenizer, text)

        # Save stats and print comparison
        _stats_manager.add_record("rwkv", rwkv_token_count, result_rwkv["inference_time"])
        _print_timing("RWKV7", result_rwkv["inference_time"], rwkv_predicted_time)

        # Step 3: Generate visualization
        progress(0.9, desc="Generating visualization...")
        html = generate_comparison_html(
            text=text,
            byte_losses_a=result_rwkv["byte_wise_losses"],
            byte_losses_b=result_qwen["byte_wise_losses"],
            model_a_name="RWKV7-G1C-1.5B",
            model_b_name="Qwen3-1.7B-Base",
            topk_predictions_a=result_rwkv["top5_predictions"],
            topk_predictions_b=result_qwen["top5_predictions"],
            tokenizer_a=result_rwkv["tokenizer"],
            tokenizer_b=result_qwen["tokenizer"],
            model_type_a="rwkv7",
            model_type_b="hf",
        )

        # Wrap HTML for iframe display
        return wrap_html_in_iframe(html)

    except torch.cuda.OutOfMemoryError as e:
        _free_gpu_memory()
        raise gr.Error(
            "GPU memory insufficient. Please try:\n"
            "1. Use shorter text\n"
            "2. Wait a moment and try again"
        ) from e
    except Exception as e:
        _free_gpu_memory()
        raise gr.Error(f"Evaluation failed: {str(e)}") from e


def clear_inputs():
    """Clear all inputs and outputs."""
    return "", None


def get_default_example():
    """Get the default example for display on page load.

    Returns:
        ``(example_text, wrapped_html)`` when a precomputed example is cached,
        otherwise ``("", None)``.
    """
    if _precomputed_html and _precomputed_text:
        return _precomputed_text, wrap_html_in_iframe(_precomputed_html)
    return "", None


# Build Gradio UI
# NOTE(review): the reviewed source was flattened, so the exact nesting of the
# layout containers and the header markup below are a reconstruction.
with gr.Blocks(title="Compression-Lens: RWKV-7 vs Qwen3", theme=gr.themes.Soft()) as demo:
    # NOTE(review): "GitHub Project Leaderboard" presumably carried links; the
    # targets are not visible in the reviewed source, so it stays plain text.
    gr.HTML(
        """
        <h1>🔬 Compression-Lens: RWKV-7 vs Qwen3 Byte-Level Comparison</h1>
        <p>Compare the byte-level prediction performance between RWKV7-G1C-1.5B and Qwen3-1.7B-Base.</p>
        <p>GitHub Project Leaderboard</p>
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Input Text",
                placeholder=f"Enter text to analyze (max {MAX_TEXT_LENGTH} characters)...",
                lines=10,
                max_lines=20,
            )

            with gr.Row():
                clear_btn = gr.Button("Clear", variant="secondary")
                run_btn = gr.Button("▶ Run Comparison", variant="primary")

    gr.Markdown("---")

    with gr.Row():
        with gr.Column():
            output_html = gr.HTML(label="Visualization")

    # Event handlers
    clear_btn.click(fn=clear_inputs, outputs=[text_input, output_html])
    run_btn.click(fn=run_evaluation, inputs=[text_input], outputs=[output_html])

    # Load default example on page load
    demo.load(fn=get_default_example, outputs=[text_input, output_html])


if __name__ == "__main__":
    # Initialize models before launching the app
    initialize_models()

    # Launch the Gradio app
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)