llaa33219 commited on
Commit
4b77ff5
·
verified ·
1 Parent(s): e9cb424

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +2 -2
  2. app.py +96 -274
  3. requirements.txt +1 -1
README.md CHANGED
@@ -3,8 +3,8 @@ title: Context Window Extender
3
  emoji: 🧠
4
  colorFrom: purple
5
  colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 4.44.0
8
  app_file: app.py
9
  suggested_hardware: cpu-basic
10
  pinned: false
 
3
  emoji: 🧠
4
  colorFrom: purple
5
  colorTo: indigo
6
+ sdk: streamlit
7
+ sdk_version: 1.40.0
8
  app_file: app.py
9
  suggested_hardware: cpu-basic
10
  pinned: false
app.py CHANGED
@@ -1,12 +1,20 @@
1
- import gradio as gr
2
  import torch
3
  from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
4
- import warnings
5
- import os
6
 
7
- warnings.filterwarnings("ignore")
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- # Global model cache to avoid reloading
10
  model_cache = {}
11
 
12
  def load_model_with_extension(
@@ -16,60 +24,34 @@ def load_model_with_extension(
16
  rope_type: str,
17
  rope_factor: float
18
  ):
19
- """
20
- Load model with optional context window extension.
21
-
22
- Args:
23
- model_id: Hugging Face model ID
24
- extension_method: "none", "raw", or "rope"
25
- new_context_length: Target context length
26
- rope_type: "linear", "dynamic", or "yarn"
27
- rope_factor: RoPE scaling factor
28
- """
29
-
30
- # Create cache key based on all parameters
31
  cache_key = f"{model_id}_{extension_method}_{new_context_length}_{rope_type}_{rope_factor}"
32
 
33
  if cache_key in model_cache:
34
  return model_cache[cache_key]
35
 
36
- # Load tokenizer
37
- tokenizer = AutoTokenizer.from_pretrained(
38
- model_id,
39
- trust_remote_code=True
40
- )
41
 
42
  if tokenizer.pad_token is None:
43
  tokenizer.pad_token = tokenizer.eos_token
44
 
45
- # Load config and modify
46
  config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
47
 
48
  original_context = getattr(config, "max_position_embeddings", 4096)
49
 
50
- # Apply extension based on method
51
  if extension_method == "raw":
52
- # Raw extension: just increase max_position_embeddings
53
  config.max_position_embeddings = new_context_length
54
 
55
  elif extension_method == "rope":
56
- # RoPE scaling extension
57
  config.max_position_embeddings = new_context_length
58
 
59
- # Set RoPE scaling if model supports it
60
  if hasattr(config, "rope_theta"):
61
- # Get original rope_theta
62
  original_theta = getattr(config, "rope_theta", 10000.0)
63
 
64
- # Apply scaling based on type
65
  if rope_type == "linear":
66
- # Linear scaling - adjust theta by factor
67
  config.rope_theta = original_theta * rope_factor
68
  elif rope_type == "dynamic":
69
- # Dynamic scaling - use higher base frequency
70
  config.rope_theta = original_theta * (rope_factor - 1) + original_theta * rope_factor
71
  elif rope_type == "yarn":
72
- # YaRN - more sophisticated scaling
73
  config.rope_scaling = {
74
  "type": "yarn",
75
  "factor": rope_factor,
@@ -80,7 +62,6 @@ def load_model_with_extension(
80
  }
81
  config.rope_theta = original_theta
82
 
83
- # Load model on CPU
84
  model = AutoModelForCausalLM.from_pretrained(
85
  model_id,
86
  config=config,
@@ -96,256 +77,97 @@ def load_model_with_extension(
96
  "tokenizer": tokenizer,
97
  "original_context": original_context,
98
  "applied_context": new_context_length,
99
- "extension_method": extension_method
100
  }
101
 
102
  model_cache[cache_key] = result
103
  return result
104
 
105
 
106
- def generate(
107
- model_id: str,
108
- extension_method: str,
109
- new_context_length: int,
110
- rope_type: str,
111
- rope_factor: float,
112
- prompt: str,
113
- max_new_tokens: int,
114
- temperature: float,
115
- top_p: float,
116
- ):
117
- """
118
- Generate text with the loaded model.
119
- """
120
-
121
- # Validate inputs
122
- if not model_id.strip():
123
- return "Error: Please enter a model ID"
124
-
125
- if not prompt.strip():
126
- return "Error: Please enter a prompt"
127
-
128
- # Set default context length if not provided
129
- if new_context_length <= 0:
130
- new_context_length = 4096
131
-
132
- # Load or get model from cache
133
- try:
134
- model_data = load_model_with_extension(
135
- model_id,
136
- extension_method,
137
- new_context_length,
138
- rope_type,
139
- rope_factor
140
- )
141
- except Exception as e:
142
- return f"Error loading model: {str(e)}"
143
-
144
- model = model_data["model"]
145
- tokenizer = model_data["tokenizer"]
146
-
147
- # Tokenize input
148
- try:
149
- inputs = tokenizer(
150
- prompt,
151
- return_tensors="pt",
152
- truncation=False,
153
- padding=False
154
  )
155
- except Exception as e:
156
- return f"Error tokenizing input: {str(e)}"
157
-
158
- # Generate
159
- try:
160
- with torch.no_grad():
161
- outputs = model.generate(
162
- **inputs,
163
- max_new_tokens=max_new_tokens,
164
- temperature=temperature,
165
- top_p=top_p,
166
- do_sample=temperature > 0,
167
- pad_token_id=tokenizer.pad_token_id,
168
- eos_token_id=tokenizer.eos_token_id,
169
- )
170
-
171
- # Decode output
172
- generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
173
-
174
- # If generation is same as input, return a message
175
- if generated_text.strip() == prompt.strip():
176
- return "Model generated same text as input. Try adjusting parameters."
177
-
178
- return generated_text
179
-
180
- except Exception as e:
181
- return f"Error during generation: {str(e)}"
182
 
 
183
 
184
- def update_rope_options(extension_method: str):
185
- """
186
- Update visibility of RoPE options based on extension method.
187
- """
188
- if extension_method == "rope":
189
- return gr.update(visible=True)
190
- else:
191
- return gr.update(visible=False)
192
 
 
 
193
 
194
- # Build Gradio UI
195
- with gr.Blocks(title="Context Window Extender") as demo:
196
- gr.Markdown("""
197
- # 🧠 Model Context Window Extender
198
-
199
- Load any causal language model from Hugging Face Hub and extend its context window.
200
- Supports both **Raw Extension** and **RoPE Scaling** methods.
201
-
202
- **Extension Methods:**
203
- - **None**: Use model's original context length
204
- - **Raw**: Simply increase max_position_embeddings (simple but may degrade quality)
205
- - **RoPE**: Apply RoPE scaling for better quality (supports linear, dynamic, yarn)
206
- """)
207
-
208
- with gr.Row():
209
- with gr.Column(scale=2):
210
- model_id = gr.Textbox(
211
- label="🤗 Model ID",
212
- placeholder="meta-llama/Llama-2-7b-hf, gpt2, EleutherAI/gpt-neo-1.3B",
213
- value="gpt2",
214
- info="Enter Hugging Face model ID"
215
- )
216
- gr.Examples(
217
- examples=[
218
- ["gpt2"],
219
- ["EleutherAI/gpt-neo-1.3B"],
220
- ["microsoft/phi-2"],
221
- ["facebook/opt-1.3b"],
222
- ],
223
- inputs=model_id
224
- )
225
-
226
- with gr.Column(scale=1):
227
- extension_method = gr.Radio(
228
- choices=["none", "raw", "rope"],
229
- value="none",
230
- label="Extension Method",
231
- info="Choose how to extend context window"
232
- )
233
-
234
- # RoPE options (shown when rope is selected)
235
- with gr.Row():
236
- with gr.Column(scale=1):
237
- rope_type = gr.Dropdown(
238
- choices=["linear", "dynamic", "yarn"],
239
- value="linear",
240
- label="RoPE Type",
241
- visible=False,
242
- info="linear: simple scaling, dynamic: better quality, yarn: best quality"
243
- )
244
- with gr.Column(scale=1):
245
- rope_factor = gr.Slider(
246
- minimum=1.0,
247
- maximum=8.0,
248
- step=0.5,
249
- value=2.0,
250
- label="RoPE Factor",
251
- visible=False,
252
- info="Multiply context by this factor"
253
- )
254
-
255
- with gr.Row():
256
- new_context_length = gr.Slider(
257
- minimum=512,
258
- maximum=32768,
259
- step=512,
260
- value=2048,
261
- label="Target Context Length",
262
- info="Desired context window size (tokens)"
263
- )
264
-
265
- with gr.Row():
266
- with gr.Column():
267
- prompt = gr.Textbox(
268
- label="📝 Prompt",
269
- lines=6,
270
- placeholder="Enter your prompt here...",
271
- info="Input text for generation"
272
- )
273
- with gr.Column():
274
- with gr.Row():
275
- max_new_tokens = gr.Slider(
276
- minimum=10,
277
- maximum=1024,
278
- step=10,
279
- value=100,
280
- label="Max New Tokens"
281
- )
282
- with gr.Row():
283
- temperature = gr.Slider(
284
- minimum=0.0,
285
- maximum=2.0,
286
- step=0.1,
287
- value=0.7,
288
- label="Temperature"
289
- )
290
- with gr.Row():
291
- top_p = gr.Slider(
292
- minimum=0.0,
293
- maximum=1.0,
294
- step=0.05,
295
- value=0.9,
296
- label="Top-p"
297
- )
298
-
299
- generate_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
300
-
301
- output = gr.Textbox(
302
- label="📄 Generated Output",
303
- lines=10
304
- )
305
-
306
- # Event handlers
307
- extension_method.change(
308
- fn=update_rope_options,
309
- inputs=[extension_method],
310
- outputs=[rope_type, rope_factor]
311
- )
312
-
313
- generate_btn.click(
314
- fn=generate,
315
- inputs=[
316
- model_id,
317
- extension_method,
318
- new_context_length,
319
- rope_type,
320
- rope_factor,
321
- prompt,
322
- max_new_tokens,
323
- temperature,
324
- top_p
325
- ],
326
- outputs=[output]
327
- )
328
-
329
- # Also allow Enter key to generate
330
- prompt.submit(
331
- fn=generate,
332
- inputs=[
333
- model_id,
334
- extension_method,
335
- new_context_length,
336
- rope_type,
337
- rope_factor,
338
- prompt,
339
- max_new_tokens,
340
- temperature,
341
- top_p
342
- ],
343
- outputs=[output]
344
- )
345
 
346
- if __name__ == "__main__":
347
- demo.launch(
348
- server_name="0.0.0.0",
349
- server_port=7860,
350
- share=False
351
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Streamlit front-end for loading Hugging Face causal LMs with an
# optionally extended context window (raw or RoPE-scaled).
import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

# set_page_config must be the first Streamlit call executed in the script.
st.set_page_config(page_title="Context Window Extender", page_icon="🧠")

st.title("🧠 Model Context Window Extender")

st.markdown("""
Load any causal language model from Hugging Face Hub and extend its context window.

**Extension Methods:**
- **None**: Use model's original context length
- **Raw**: Simply increase max_position_embeddings (simple but may degrade quality)
- **RoPE**: Apply RoPE scaling for better quality (supports linear, dynamic, yarn)
""")
17
 
 
18
  model_cache = {}
19
 
20
  def load_model_with_extension(
 
24
  rope_type: str,
25
  rope_factor: float
26
  ):
 
 
 
 
 
 
 
 
 
 
 
 
27
  cache_key = f"{model_id}_{extension_method}_{new_context_length}_{rope_type}_{rope_factor}"
28
 
29
  if cache_key in model_cache:
30
  return model_cache[cache_key]
31
 
32
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
 
 
 
 
33
 
34
  if tokenizer.pad_token is None:
35
  tokenizer.pad_token = tokenizer.eos_token
36
 
 
37
  config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
38
 
39
  original_context = getattr(config, "max_position_embeddings", 4096)
40
 
 
41
  if extension_method == "raw":
 
42
  config.max_position_embeddings = new_context_length
43
 
44
  elif extension_method == "rope":
 
45
  config.max_position_embeddings = new_context_length
46
 
 
47
  if hasattr(config, "rope_theta"):
 
48
  original_theta = getattr(config, "rope_theta", 10000.0)
49
 
 
50
  if rope_type == "linear":
 
51
  config.rope_theta = original_theta * rope_factor
52
  elif rope_type == "dynamic":
 
53
  config.rope_theta = original_theta * (rope_factor - 1) + original_theta * rope_factor
54
  elif rope_type == "yarn":
 
55
  config.rope_scaling = {
56
  "type": "yarn",
57
  "factor": rope_factor,
 
62
  }
63
  config.rope_theta = original_theta
64
 
 
65
  model = AutoModelForCausalLM.from_pretrained(
66
  model_id,
67
  config=config,
 
77
  "tokenizer": tokenizer,
78
  "original_context": original_context,
79
  "applied_context": new_context_length,
 
80
  }
81
 
82
  model_cache[cache_key] = result
83
  return result
84
 
85
 
86
# ---- Input widgets --------------------------------------------------------
# Wide column for the model picker, narrow column for the extension method.
col1, col2 = st.columns([2, 1])

with col1:
    model_id = st.text_input(
        "🤗 Model ID",
        value="gpt2",
        help="Enter Hugging Face model ID"
    )
    st.caption("Examples: gpt2, EleutherAI/gpt-neo-1.3B, microsoft/phi-2")

with col2:
    extension_method = st.radio(
        "Extension Method",
        ["none", "raw", "rope"],
        index=0,
        help="Choose how to extend context window"
    )

# RoPE-specific controls are rendered only when the RoPE method is selected;
# otherwise bind neutral defaults so downstream code can always read the names.
if extension_method == "rope":
    col_rope1, col_rope2 = st.columns(2)
    with col_rope1:
        rope_type = st.selectbox(
            "RoPE Type",
            ["linear", "dynamic", "yarn"],
            help="linear: simple scaling, dynamic: better quality, yarn: best quality"
        )
    with col_rope2:
        rope_factor = st.slider("RoPE Factor", 1.0, 8.0, 2.0, 0.5, help="Multiply context by this factor")
else:
    rope_type = "linear"
    rope_factor = 1.0

new_context_length = st.slider("Target Context Length", 512, 32768, 2048, 512, help="Desired context window size (tokens)")

# Prompt on the left, generation hyper-parameters on the right.
col_p1, col_p2 = st.columns(2)

with col_p1:
    prompt = st.text_area("📝 Prompt", height=150, placeholder="Enter your prompt here...")

with col_p2:
    max_new_tokens = st.slider("Max New Tokens", 10, 1024, 100, 10)
    temperature = st.slider("Temperature", 0.0, 2.0, 0.7, 0.1)
    top_p = st.slider("Top-p", 0.0, 1.0, 0.9, 0.05)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
# ---- Generation -----------------------------------------------------------
# Runs on button click: validate input, load (or fetch cached) model, then
# generate and render the output. All failures surface as st.error messages.
if st.button("🚀 Generate", type="primary"):
    if not model_id.strip():
        st.error("Please enter a model ID")
    elif not prompt.strip():
        st.error("Please enter a prompt")
    else:
        with st.spinner("Loading model..."):
            try:
                model_data = load_model_with_extension(
                    model_id,
                    extension_method,
                    new_context_length,
                    rope_type,
                    rope_factor
                )

                model = model_data["model"]
                tokenizer = model_data["tokenizer"]

                st.success(f"Model loaded! Original context: {model_data['original_context']}, Applied: {model_data['applied_context']}")

                with st.spinner("Generating..."):
                    inputs = tokenizer(prompt, return_tensors="pt", truncation=False, padding=False)

                    # truncation=False means an over-long prompt would fail
                    # deep inside the model with an opaque indexing error —
                    # warn the user up front instead.
                    prompt_tokens = inputs["input_ids"].shape[1]
                    if prompt_tokens > model_data["applied_context"]:
                        st.warning(
                            f"Prompt is {prompt_tokens} tokens but the context window "
                            f"is {model_data['applied_context']} tokens; generation may fail."
                        )

                    # Only pass sampling knobs when sampling is enabled:
                    # transformers warns — and some versions raise — when
                    # temperature/top_p are supplied with do_sample=False
                    # or temperature is 0.
                    gen_kwargs = {
                        "max_new_tokens": max_new_tokens,
                        "pad_token_id": tokenizer.pad_token_id,
                        "eos_token_id": tokenizer.eos_token_id,
                    }
                    if temperature > 0:
                        gen_kwargs["do_sample"] = True
                        gen_kwargs["temperature"] = temperature
                        gen_kwargs["top_p"] = top_p
                    else:
                        gen_kwargs["do_sample"] = False

                    with torch.no_grad():
                        outputs = model.generate(**inputs, **gen_kwargs)

                    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

                    if generated_text.strip() == prompt.strip():
                        st.warning("Model generated same text as input. Try adjusting parameters.")
                    else:
                        st.text_area("📄 Generated Output", value=generated_text, height=250)

            except Exception as e:
                st.error(f"Error: {str(e)}")
requirements.txt CHANGED
@@ -1,3 +1,3 @@
1
- gradio>=5.0.0
2
  transformers>=4.35.0
3
  accelerate>=0.25.0
 
1
+ streamlit>=1.40.0
2
  transformers>=4.35.0
3
  accelerate>=0.25.0