Update app.py
app.py CHANGED
@@ -371,14 +371,16 @@ class ModelLoader:
 loader = ModelLoader()
 model, tokenizer = loader.load()
 
-def generate_response(message, history, temperature, top_k, top_p, max_tokens):
-    """Generate …
-    # Encode the …
-    input_ids = tokenizer.encode(…
+def generate_text(prompt, temperature, top_k, top_p, max_tokens):
+    """Generate text completion with streaming."""
+    # Encode the prompt
+    input_ids = tokenizer.encode(prompt).ids
     input_tensor = torch.tensor([input_ids], dtype=torch.long, device=loader.device)
 
+    # Start with the prompt
+    output_text = prompt
+
     # Generate with streaming
-    response = ""
     for token_id in model.generate_stream(
         input_tensor,
         max_new_tokens=max_tokens,
@@ -387,33 +389,45 @@ def generate_response(message, history, temperature, top_k, top_p, max_tokens):
         top_p=top_p
     ):
         token_text = tokenizer.decode([token_id])
-        response += token_text
-        yield response
+        output_text += token_text
+        yield output_text
+
+# Example prompts
+examples = [
+    ["The future of artificial intelligence is", 0.8, 50, 0.9, 200],
+    ["In a world where technology has advanced beyond our wildest dreams,", 0.9, 40, 0.95, 300],
+    ["The key principles of quantum mechanics include", 0.7, 50, 0.9, 250],
+    ["Once upon a time in a distant galaxy,", 1.0, 50, 0.95, 200],
+    ["The most important factors in climate change are", 0.7, 50, 0.9, 200],
+]
 
 # Create Gradio interface
-with gr.Blocks(title="i3-4096ctx Model", theme=gr.themes.Soft()) as demo:
+with gr.Blocks(title="i3-4096ctx Text Completion", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # 🚀 i3-4096ctx Language Model
+    # 🚀 i3-4096ctx Language Model - Text Completion
+
+    A hybrid RWKV-Attention pre-trained model with latent context compression, supporting up to 4096 tokens of context.
 
-    …
+    **Note**: This is a pre-trained base model, not an instruction-tuned chat model. It performs **text completion** - give it a prompt and it will continue the text.
     """)
 
     with gr.Row():
-        with gr.Column(scale=…
-            …
-            …
-            …
+        with gr.Column(scale=2):
+            prompt_input = gr.Textbox(
+                label="Prompt",
+                placeholder="Enter your prompt here... The model will continue from where you leave off.",
+                lines=5
             )
 
-            …
-                label="…
-                …
-                …
+            output_text = gr.Textbox(
+                label="Generated Text",
+                lines=15,
+                interactive=False
             )
 
             with gr.Row():
-                …
-                …
+                generate_btn = gr.Button("Generate", variant="primary", scale=2)
+                clear_btn = gr.Button("Clear", scale=1)
 
         with gr.Column(scale=1):
             gr.Markdown("### Generation Settings")
@@ -424,7 +438,7 @@ with gr.Blocks(title="i3-4096ctx Model", theme=gr.themes.Soft()) as demo:
                 value=0.8,
                 step=0.1,
                 label="Temperature",
-                info="Higher = more creative"
+                info="Higher = more creative, random"
             )
 
             top_k = gr.Slider(
@@ -450,45 +464,45 @@ with gr.Blocks(title="i3-4096ctx Model", theme=gr.themes.Soft()) as demo:
                 maximum=500,
                 value=200,
                 step=10,
-                label="Max tokens",
-                info="Maximum …
+                label="Max new tokens",
+                info="Maximum length to generate"
             )
 
             gr.Markdown("""
             ### Model Info
+            - **Type**: Pre-trained base model
             - **Architecture**: Hybrid RWKV-Attention
            - **Context**: 4096 tokens (compressed)
-            - **Kernel**: 512 tokens
-            - **Compression**: 32 latent tokens
+            - **Kernel**: 512 tokens direct
+            - **Compression**: 32 latent tokens/chunk
+
+            ### Tips for Better Results
+            - Start with a clear, specific prompt
+            - Lower temperature (0.5-0.8) for factual text
+            - Higher temperature (0.9-1.2) for creative writing
+            - Adjust top-k and top-p for diversity control
             """)
 
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-        for response in generate_response(
-            user_message,
-            history[:-1],
-            temperature,
-            top_k,
-            top_p,
-            max_tokens
-        ):
-            history[-1][1] = response
-            yield history
-
-    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
-        bot, [chatbot, temperature, top_k, top_p, max_tokens], chatbot
     )
 
-    …
-    …
+    generate_btn.click(
+        fn=generate_text,
+        inputs=[prompt_input, temperature, top_k, top_p, max_tokens],
+        outputs=output_text
    )
 
-    …
+    clear_btn.click(
+        fn=lambda: ("", ""),
+        inputs=None,
+        outputs=[prompt_input, output_text]
+    )
 
 # Launch
 if __name__ == "__main__":
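The streaming behavior above comes from generate_text being a Python generator: Gradio treats a generator fn as a streaming event handler and re-renders the bound output component on every yield, which is why generate_text appends each decoded token to output_text and yields the full accumulated string rather than a delta. A minimal runnable sketch of that pattern, with a hypothetical echo_stream standing in for generate_text and its model.generate_stream call (defined elsewhere in app.py):

import time
import gradio as gr

def echo_stream(prompt: str):
    # Stand-in for generate_text: accumulate output and yield the
    # growing string; each yield repaints the output Textbox.
    output = prompt
    for word in ["alpha", "beta", "gamma"]:
        time.sleep(0.2)  # simulate per-token latency
        output += " " + word
        yield output

with gr.Blocks() as demo:
    box_in = gr.Textbox(label="Prompt")
    box_out = gr.Textbox(label="Generated Text", interactive=False)
    gr.Button("Generate").click(fn=echo_stream, inputs=box_in, outputs=box_out)

if __name__ == "__main__":
    demo.queue().launch()  # streaming events are delivered through the queue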
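One detail worth flagging in the new generate_text: the call is tokenizer.encode(prompt).ids, and the .ids accessor matches the Hugging Face tokenizers library, whose encode returns an Encoding object rather than a plain list. Assuming ModelLoader does return a tokenizers.Tokenizer (the tokenizer.json path below is a hypothetical stand-in for whatever file the Space actually loads), the round-trip looks like:

from tokenizers import Tokenizer

# Hypothetical path; ModelLoader presumably loads the Space's real tokenizer file.
tokenizer = Tokenizer.from_file("tokenizer.json")

encoding = tokenizer.encode("The future of artificial intelligence is")
print(encoding.ids)     # list of token ids for the prompt
print(encoding.tokens)  # the corresponding string pieces
print(tokenizer.decode(encoding.ids))  # back to text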
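The Temperature, Top-k, and Top-p sliders feed straight into model.generate_stream, whose sampling code is not part of this diff. For reference only, a generic PyTorch sketch of how one decoding step commonly applies those three controls (not the app's actual implementation):

import torch

def sample_next_token(logits: torch.Tensor, temperature=0.8, top_k=50, top_p=0.9) -> int:
    """One decoding step: temperature scaling, then top-k, then top-p filtering."""
    logits = logits / max(temperature, 1e-5)  # <1 sharpens, >1 flattens the distribution

    # Top-k: drop everything below the k-th highest logit.
    if top_k > 0:
        kth_best = torch.topk(logits, min(top_k, logits.size(-1))).values[-1]
        logits[logits < kth_best] = float("-inf")

    # Top-p (nucleus): keep the smallest prefix of the sorted distribution
    # whose cumulative probability reaches top_p, always retaining the top token.
    probs = torch.softmax(logits, dim=-1)
    sorted_probs, sorted_idx = torch.sort(probs, descending=True)
    cutoff = torch.cumsum(sorted_probs, dim=-1) > top_p
    cutoff[1:] = cutoff[:-1].clone()
    cutoff[0] = False
    sorted_probs[cutoff] = 0.0
    sorted_probs /= sorted_probs.sum()

    return sorted_idx[torch.multinomial(sorted_probs, 1)].item()

# Example with a fake 100-token vocabulary:
print(sample_next_token(torch.randn(100), temperature=0.8, top_k=50, top_p=0.9))

Temperature rescales the logits before any filtering, top-k hard-caps the candidate set, and top-p then trims it further to the probability mass the slider requests, which is why the UI tip suggests adjusting top-k and top-p together for diversity control.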