VirtualInsight committed
Commit 47d3bb2 · verified · 1 Parent(s): fff1dce

Create app.py


Inference Implementation

Files changed (1):
  app.py +94 -0
app.py ADDED
@@ -0,0 +1,94 @@
import gradio as gr
import torch
import json
from tokenizers import Tokenizer
from huggingface_hub import hf_hub_download
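# ModelArchitecture is this repo's own module, not a pip package: it is
# assumed to define the Transformer, its ModelConfig, and the generate()
# sampling loop, and presumably ships alongside app.py in the Space.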
from ModelArchitecture import Transformer, ModelConfig, generate
from safetensors.torch import load_file

# -----------------------------
# Load model and tokenizer
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
REPO_ID = "VirtualInsight/Lumen-Instruct"

# Download model assets from Hugging Face Hub
model_path = hf_hub_download(repo_id=REPO_ID, filename="model.safetensors")
tokenizer_path = hf_hub_download(repo_id=REPO_ID, filename="tokenizer.json")
config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json")

# Initialize tokenizer and model
tokenizer = Tokenizer.from_file(tokenizer_path)
with open(config_path) as f:
    config = ModelConfig(**json.load(f))

model = Transformer(config).to(device)
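# Note: strict=False ignores checkpoint keys that don't match the module's
# state dict (and vice versa), so a renamed or missing weight fails silently
# instead of raising; use strict=True to surface mismatches when debugging.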
model.load_state_dict(load_file(model_path, device=str(device)), strict=False)
model.eval()

# -----------------------------
# Special Tokens for Chat Format
# -----------------------------
EOS_TOKEN = "<|im_end|>"
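# Assumes the tokenizer encodes "<|im_end|>" as a single special-token id; if
# it split the marker into several ids, .ids[0] would keep only a fragment.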
EOS_TOKEN_ID = tokenizer.encode(EOS_TOKEN).ids[0]
print(f"EOS token ID: {EOS_TOKEN_ID}")

# -----------------------------
# Generation Function
# -----------------------------
@torch.no_grad()
def generate_response(prompt, max_tokens=200, temperature=0.7, top_p=0.9):
    """
    Generates a chat-style response using the Lumen-Instruct model.
    """
    # Format the input as a structured conversation
    formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

    # Tokenize input
    input_ids = torch.tensor([tokenizer.encode(formatted_prompt).ids], dtype=torch.long, device=device)

    # Generate response with sampling
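    # Note: top_k is fixed at 50 here, while temperature and top_p come from
    # the UI sliders; generate() is the custom sampler from ModelArchitecture.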
    output = generate(
        model,
        input_ids,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_k=50,
        top_p=top_p,
        do_sample=True,
        eos_token_id=EOS_TOKEN_ID,
    )

    # Decode full output text
    full_text = tokenizer.decode(output[0].tolist())

    # Extract only the assistant's part
    if "<|im_start|>assistant" in full_text:
        response = full_text.split("<|im_start|>assistant")[-1]
        if "<|im_end|>" in response:
            response = response.split("<|im_end|>")[0]
        return response.strip()

    return full_text.strip()
# -----------------------------
# Gradio Interface
# -----------------------------
demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(label="User Prompt", placeholder="Ask Lumen anything...", lines=3),
        gr.Slider(10, 500, value=200, label="Max Tokens"),
        gr.Slider(0.1, 2.0, value=0.7, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.9, label="Top-p"),
    ],
    outputs=gr.Textbox(label="Lumen's Response", lines=10),
    title="Lumen Instruct Model",
    description="Chat with Lumen — a fine-tuned instruction-following language model created by Hariom Jangra.",
)

# -----------------------------
# Launch
# -----------------------------
if __name__ == "__main__":
    demo.launch()
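
Once the Space is up, the same interface can be queried programmatically. Below is a minimal sketch using gradio_client; the Space id (assumed to match REPO_ID) and the api_name="/predict" endpoint (gr.Interface's default) are assumptions, not something this commit confirms.

from gradio_client import Client

# Assumed Space id; adjust if the demo is hosted under another name.
client = Client("VirtualInsight/Lumen-Instruct")

# Positional args mirror the gr.Interface inputs above:
# prompt, max tokens, temperature, top-p.
reply = client.predict(
    "Explain instruction tuning in one paragraph.",
    200,   # Max Tokens
    0.7,   # Temperature
    0.9,   # Top-p
    api_name="/predict",
)
print(reply)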