import gradio as gr
import torch
import json
import re
from tokenizers import Tokenizer
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
from ModelArchitecture import Transformer, ModelConfig, generate
# -----------------------------
# Load model and tokenizer
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

REPO_ID = "VirtualInsight/Lumen-Instruct"

# Download model files
model_path = hf_hub_download(repo_id=REPO_ID, filename="model.safetensors")
tokenizer_path = hf_hub_download(repo_id=REPO_ID, filename="tokenizer.json")
config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json")

# Initialize tokenizer and model
tokenizer = Tokenizer.from_file(tokenizer_path)

with open(config_path) as f:
    config = ModelConfig(**json.load(f))

model = Transformer(config).to(device)
model.load_state_dict(load_file(model_path, device=str(device)), strict=False)
model.eval()
# -----------------------------
# Special Tokens
# -----------------------------
EOS_TOKEN = "<|im_end|>"
EOS_TOKEN_ID = tokenizer.encode(EOS_TOKEN).ids[0]
print(f"EOS token ID: {EOS_TOKEN_ID}")
# -----------------------------
# Generation Function
# -----------------------------
def generate_response(prompt, max_tokens=200, temperature=0.7, top_p=0.9):
    """
    Generates a clean assistant-only response, removing any echoed user text.
    """
    # Chat-style prompt
    formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

    # Tokenize
    input_ids = torch.tensor([tokenizer.encode(formatted_prompt).ids], dtype=torch.long, device=device)

    # Generate
    output = generate(
        model,
        input_ids,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_k=50,
        top_p=top_p,
        do_sample=True,
        eos_token_id=EOS_TOKEN_ID,
    )

    # Decode
    full_text = tokenizer.decode(output[0].tolist())

    # Extract the assistant's section
    if "<|im_start|>assistant" in full_text:
        response = full_text.split("<|im_start|>assistant")[-1]
        response = response.split("<|im_end|>")[0] if "<|im_end|>" in response else response
    else:
        response = full_text

    # Remove leftover role tokens and whitespace
    response = re.sub(r"(?i)\buser\b.*", "", response)
    response = re.sub(r"(?i)\bassistant\b.*", "", response)
    response = response.strip()

    # Final cleanup: remove a leading user echo if present
    lines = [line.strip() for line in response.splitlines() if line.strip()]
    if len(lines) >= 2 and (
        lines[0].lower() == prompt.strip().lower()  # exact echo
        or lines[0].rstrip("!?.,").lower() == prompt.strip().rstrip("!?.,").lower()  # punctuation variation
        or len(lines[0].split()) <= 3  # very short echo like "Hello!"
    ):
        lines = lines[1:]  # drop the echoed first line

    clean_response = "\n".join(lines).strip()
    return clean_response
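
# Usage sketch (illustrative only; assumes the checkpoint and tokenizer downloads
# above succeeded and that the custom `generate` helper accepts the arguments used here):
# reply = generate_response("Explain what a tokenizer does.", max_tokens=150)
# print(reply)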
# -----------------------------
# Gradio Interface
# -----------------------------
demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(label="User Prompt", placeholder="Ask Lumen anything...", lines=3),
        gr.Slider(10, 500, value=200, label="Max Tokens"),
        gr.Slider(0.1, 2.0, value=0.7, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.9, label="Top-p"),
    ],
    outputs=gr.Textbox(label="Lumen’s Response", lines=10),
    title="Lumen Instruct Model",
    description="Lumen Instruct — a fine-tuned, instruction-following language model built on the Lumen Foundational Model.",
)
# -----------------------------
# Launch
# -----------------------------
if __name__ == "__main__":
    demo.launch(share=True)