llaa33219 commited on
Commit
60026f3
·
verified ·
1 Parent(s): fe6b518

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +27 -7
  2. app.py +351 -0
  3. requirements.txt +4 -0
README.md CHANGED
@@ -1,14 +1,34 @@
1
  ---
2
  title: Context Window Extender
3
- emoji: 🐨
4
- colorFrom: yellow
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 6.9.0
8
- python_version: '3.12'
9
  app_file: app.py
 
10
  pinned: false
11
- short_description: llm context-window-extender
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Context Window Extender
3
+ emoji: 🧠
4
+ colorFrom: purple
5
+ colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 4.44.0
 
8
  app_file: app.py
9
+ suggested_hardware: cpu-basic
10
  pinned: false
 
11
  ---
12
 
13
+ # Context Window Extender
14
+
15
+ Load any causal language model from Hugging Face Hub and extend its context window.
16
+
17
+ ## Features
18
+
19
+ - **Model Loading**: Enter any Hugging Face model ID
20
+ - **Context Extension**:
21
+ - Raw: Simply increase max_position_embeddings
22
+ - RoPE: Apply RoPE scaling (linear, dynamic, yarn)
23
+ - **CPU Only**: Runs on free CPU hardware
24
+
25
+ ## Usage
26
+
27
+ 1. Enter a Hugging Face model ID (e.g., `gpt2`, `meta-llama/Llama-2-7b-hf`)
28
+ 2. Choose extension method:
29
+ - **None**: Use original context
30
+ - **Raw**: Increase max_position_embeddings
31
+ - **RoPE**: Apply RoPE scaling
32
+ 3. If RoPE selected, choose type and factor
33
+ 4. Set target context length
34
+ 5. Enter prompt and click Generate
app.py ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import warnings
import os  # NOTE(review): imported but unused in this file — TODO confirm it can be dropped

# Silence transformers / torch deprecation noise in the Space logs.
warnings.filterwarnings("ignore")

# Global model cache to avoid reloading.
# Keyed by "{model_id}_{method}_{length}_{rope_type}_{rope_factor}"; values are
# dicts holding the model, tokenizer, and context metadata. Entries are never
# evicted, so memory grows with each distinct configuration loaded.
model_cache = {}
12
def load_model_with_extension(
    model_id: str,
    extension_method: str,
    new_context_length: int,
    rope_type: str,
    rope_factor: float,
):
    """Load a causal LM on CPU, optionally extending its context window.

    Args:
        model_id: Hugging Face model ID.
        extension_method: "none", "raw", or "rope".
        new_context_length: Target context length in tokens.
        rope_type: "linear", "dynamic", or "yarn" (used only for "rope").
        rope_factor: RoPE scaling factor (used only for "rope").

    Returns:
        Dict with keys "model", "tokenizer", "original_context",
        "applied_context", and "extension_method".
    """
    # Cache on every parameter so different extension settings coexist.
    cache_key = f"{model_id}_{extension_method}_{new_context_length}_{rope_type}_{rope_factor}"
    if cache_key in model_cache:
        return model_cache[cache_key]

    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Many causal LMs (e.g. GPT-2) ship without a pad token.
        tokenizer.pad_token = tokenizer.eos_token

    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
    original_context = getattr(config, "max_position_embeddings", 4096)

    if extension_method == "raw":
        # Raw extension: just enlarge the position budget. Positions beyond
        # the original context are untrained, so quality may degrade.
        config.max_position_embeddings = new_context_length

    elif extension_method == "rope":
        config.max_position_embeddings = new_context_length

        # RoPE scaling only applies to rotary-embedding models, detected
        # here via the rope_theta config attribute.
        if hasattr(config, "rope_theta"):
            if rope_type in ("linear", "dynamic"):
                # Use the standard transformers `rope_scaling` mechanism.
                # (Previous code hand-tuned rope_theta: "linear" multiplied
                # theta by the factor — NTK-style base adjustment, not linear
                # position interpolation — and "dynamic" used
                # theta*(2*factor - 1), which matches no published scheme.)
                config.rope_scaling = {"type": rope_type, "factor": rope_factor}
            elif rope_type == "yarn":
                # YaRN: NTK-by-parts interpolation with attention scaling.
                config.rope_scaling = {
                    "type": "yarn",
                    "factor": rope_factor,
                    "original_max_position_embeddings": original_context,
                    "attn_factor": 1.0,
                    "beta_fast": 32.0,
                    "beta_slow": 1.0,
                }

    # Load model on CPU in full precision (Space runs on cpu-basic hardware).
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        config=config,
        torch_dtype=torch.float32,
        device_map="cpu",
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
    model.eval()

    result = {
        "model": model,
        "tokenizer": tokenizer,
        "original_context": original_context,
        "applied_context": new_context_length,
        "extension_method": extension_method,
    }
    model_cache[cache_key] = result
    return result
104
+
105
+
106
def generate(
    model_id: str,
    extension_method: str,
    new_context_length: int,
    rope_type: str,
    rope_factor: float,
    prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
):
    """Generate text with a (possibly context-extended) model.

    Args:
        model_id: Hugging Face model ID.
        extension_method: "none", "raw", or "rope".
        new_context_length: Target context length; values <= 0 fall back to 4096.
        rope_type: RoPE scaling type (used only when extension_method == "rope").
        rope_factor: RoPE scaling factor.
        prompt: Input text for generation.
        max_new_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature; 0 selects greedy decoding.
        top_p: Nucleus-sampling threshold (ignored when greedy).

    Returns:
        The decoded generation (prompt included), or a human-readable
        "Error: ..." string — errors are returned instead of raised so they
        render in the Gradio output textbox.
    """
    # Validate inputs before doing any expensive work.
    if not model_id.strip():
        return "Error: Please enter a model ID"

    if not prompt.strip():
        return "Error: Please enter a prompt"

    # Fall back to a sane default context length.
    if new_context_length <= 0:
        new_context_length = 4096

    # Load the model, or fetch it from the cache.
    try:
        model_data = load_model_with_extension(
            model_id,
            extension_method,
            new_context_length,
            rope_type,
            rope_factor,
        )
    except Exception as e:
        return f"Error loading model: {str(e)}"

    model = model_data["model"]
    tokenizer = model_data["tokenizer"]

    # Tokenize the full prompt (no truncation — the point is long context).
    try:
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=False,
            padding=False,
        )
    except Exception as e:
        return f"Error tokenizing input: {str(e)}"

    # Only pass sampling parameters when actually sampling. The previous
    # version passed temperature/top_p alongside do_sample=False, which makes
    # transformers emit warnings — and temperature == 0.0 raises a
    # ValueError in some transformers versions.
    do_sample = temperature > 0
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        gen_kwargs["temperature"] = temperature
        gen_kwargs["top_p"] = top_p

    # Generate and decode.
    try:
        with torch.no_grad():
            outputs = model.generate(**inputs, **gen_kwargs)

        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # If nothing new was produced, say so rather than echoing the prompt.
        if generated_text.strip() == prompt.strip():
            return "Model generated same text as input. Try adjusting parameters."

        return generated_text

    except Exception as e:
        return f"Error during generation: {str(e)}"
182
+
183
+
184
def update_rope_options(extension_method: str):
    """Show or hide the RoPE controls based on the chosen extension method.

    Both event wirings pass ``outputs=[rope_type, rope_factor]`` (two
    components), so this must return one update per output. The previous
    version returned a single ``gr.update``, which Gradio rejects when the
    event fires.

    Args:
        extension_method: "none", "raw", or "rope".

    Returns:
        A pair of ``gr.update`` objects for (rope_type, rope_factor).
    """
    visible = extension_method == "rope"
    return gr.update(visible=visible), gr.update(visible=visible)
192
+
193
+
194
# Build Gradio UI.
# Layout: model picker + extension method, conditionally-visible RoPE
# controls, generation parameters, then a single output textbox. Both the
# Generate button and Enter in the prompt box trigger generate().
with gr.Blocks(title="Context Window Extender") as demo:
    gr.Markdown("""
    # 🧠 Model Context Window Extender

    Load any causal language model from Hugging Face Hub and extend its context window.
    Supports both **Raw Extension** and **RoPE Scaling** methods.

    **Extension Methods:**
    - **None**: Use model's original context length
    - **Raw**: Simply increase max_position_embeddings (simple but may degrade quality)
    - **RoPE**: Apply RoPE scaling for better quality (supports linear, dynamic, yarn)
    """)

    with gr.Row():
        with gr.Column(scale=2):
            # Free-text model ID plus clickable example IDs below it.
            model_id = gr.Textbox(
                label="🤗 Model ID",
                placeholder="meta-llama/Llama-2-7b-hf, gpt2, EleutherAI/gpt-neo-1.3B",
                value="gpt2",
                info="Enter Hugging Face model ID"
            )
            gr.Examples(
                examples=[
                    ["gpt2"],
                    ["EleutherAI/gpt-neo-1.3B"],
                    ["microsoft/phi-2"],
                    ["facebook/opt-1.3b"],
                ],
                inputs=model_id
            )

        with gr.Column(scale=1):
            extension_method = gr.Radio(
                choices=["none", "raw", "rope"],
                value="none",
                label="Extension Method",
                info="Choose how to extend context window"
            )

    # RoPE options (shown when rope is selected; hidden by default and
    # toggled by the extension_method.change handler below).
    with gr.Row():
        with gr.Column(scale=1):
            rope_type = gr.Dropdown(
                choices=["linear", "dynamic", "yarn"],
                value="linear",
                label="RoPE Type",
                visible=False,
                info="linear: simple scaling, dynamic: better quality, yarn: best quality"
            )
        with gr.Column(scale=1):
            rope_factor = gr.Slider(
                minimum=1.0,
                maximum=8.0,
                step=0.5,
                value=2.0,
                label="RoPE Factor",
                visible=False,
                info="Multiply context by this factor"
            )

    with gr.Row():
        new_context_length = gr.Slider(
            minimum=512,
            maximum=32768,
            step=512,
            value=2048,
            label="Target Context Length",
            info="Desired context window size (tokens)"
        )

    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(
                label="📝 Prompt",
                lines=6,
                placeholder="Enter your prompt here...",
                info="Input text for generation"
            )
        with gr.Column():
            # Generation parameters; temperature 0 means greedy decoding.
            with gr.Row():
                max_new_tokens = gr.Slider(
                    minimum=10,
                    maximum=1024,
                    step=10,
                    value=100,
                    label="Max New Tokens"
                )
            with gr.Row():
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=2.0,
                    step=0.1,
                    value=0.7,
                    label="Temperature"
                )
            with gr.Row():
                top_p = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.9,
                    label="Top-p"
                )

    generate_btn = gr.Button("🚀 Generate", variant="primary", size="lg")

    output = gr.Textbox(
        label="📄 Generated Output",
        lines=10
    )

    # Event handlers.
    # NOTE(review): update_rope_options is wired to TWO outputs here — it
    # must return one update per output component.
    extension_method.change(
        fn=update_rope_options,
        inputs=[extension_method],
        outputs=[rope_type, rope_factor]
    )

    generate_btn.click(
        fn=generate,
        inputs=[
            model_id,
            extension_method,
            new_context_length,
            rope_type,
            rope_factor,
            prompt,
            max_new_tokens,
            temperature,
            top_p
        ],
        outputs=[output]
    )

    # Also allow Enter key to generate (same inputs/outputs as the button).
    prompt.submit(
        fn=generate,
        inputs=[
            model_id,
            extension_method,
            new_context_length,
            rope_type,
            rope_factor,
            prompt,
            max_new_tokens,
            temperature,
            top_p
        ],
        outputs=[output]
    )

if __name__ == "__main__":
    # 0.0.0.0:7860 is the standard bind for Hugging Face Spaces.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ transformers>=4.35.0
3
+ torch>=2.0.0
4
+ accelerate>=0.25.0