Update app.py
app.py CHANGED
@@ -1,11 +1,11 @@
 import gradio as gr
 import torch
 import json
+import re
 from tokenizers import Tokenizer
 from huggingface_hub import hf_hub_download
-from ModelArchitecture import Transformer, ModelConfig, generate
 from safetensors.torch import load_file
-import re
+from ModelArchitecture import Transformer, ModelConfig, generate
 
 # -----------------------------
 # Load model and tokenizer
@@ -13,7 +13,7 @@ import re
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 REPO_ID = "VirtualInsight/Lumen-Instruct"
 
-# Download model
+# Download model files
 model_path = hf_hub_download(repo_id=REPO_ID, filename="model.safetensors")
 tokenizer_path = hf_hub_download(repo_id=REPO_ID, filename="tokenizer.json")
 config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json")
@@ -40,12 +40,12 @@ print(f"EOS token ID: {EOS_TOKEN_ID}")
 @torch.no_grad()
 def generate_response(prompt, max_tokens=200, temperature=0.7, top_p=0.9):
     """
-    Generates a clean assistant-only response
+    Generates a clean assistant-only response, removing any echoed user text.
     """
-    # Chat-style
+    # Chat-style prompt
     formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
 
-    # Tokenize
+    # Tokenize
     input_ids = torch.tensor([tokenizer.encode(formatted_prompt).ids], dtype=torch.long, device=device)
 
     # Generate
@@ -60,29 +60,33 @@ def generate_response(prompt, max_tokens=200, temperature=0.7, top_p=0.9):
         eos_token_id=EOS_TOKEN_ID,
     )
 
-    # Decode
+    # Decode
     full_text = tokenizer.decode(output[0].tolist())
 
-    # -----------------------------
-    # Clean assistant-only response
-    # -----------------------------
-    # 1. Get part after last assistant marker
+    # Extract assistant’s section
     if "<|im_start|>assistant" in full_text:
         response = full_text.split("<|im_start|>assistant")[-1]
+        response = response.split("<|im_end|>")[0] if "<|im_end|>" in response else response
     else:
         response = full_text
 
-    # 2. Cut at <|im_end|>
-    response = response.split("<|im_end|>")[0]
-
-    # 3. Remove any lingering user/assistant labels or context lines
+    # Remove leftover role tokens and whitespace
     response = re.sub(r"(?i)\buser\b.*", "", response)
     response = re.sub(r"(?i)\bassistant\b.*", "", response)
-
-    # 4. Clean newlines and whitespace
     response = response.strip()
 
-    return response
+    # 🧹 Final cleanup: remove leading user echo if present
+    lines = [line.strip() for line in response.splitlines() if line.strip()]
+    if len(lines) >= 2 and (
+        lines[0].lower() == prompt.strip().lower()  # exact echo
+        or lines[0].rstrip("!?.,").lower() == prompt.strip().rstrip("!?.,").lower()  # punctuation variation
+        or len(lines[0].split()) <= 3  # very short echo like "Hello!"
+    ):
+        lines = lines[1:]  # drop the first echo line
+
+    clean_response = "\n".join(lines).strip()
+
+    return clean_response
 
 # -----------------------------
 # Gradio Interface
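
For reference, below is a standalone sketch of the cleanup path this commit adds to generate_response. The helper clean_assistant_reply is hypothetical (not part of app.py); it mirrors the marker splitting, role-token stripping, and echo removal so the behaviour can be checked without downloading the model, and the sample string is illustrative only.

import re

def clean_assistant_reply(full_text: str, prompt: str) -> str:
    # Hypothetical helper mirroring the cleanup added in this commit.
    # Keep only the text after the last assistant marker, and cut at the
    # end-of-turn marker if the model produced one.
    if "<|im_start|>assistant" in full_text:
        response = full_text.split("<|im_start|>assistant")[-1]
        response = response.split("<|im_end|>")[0] if "<|im_end|>" in response else response
    else:
        response = full_text

    # Delete from any standalone "user"/"assistant" word to the end of that line.
    response = re.sub(r"(?i)\buser\b.*", "", response)
    response = re.sub(r"(?i)\bassistant\b.*", "", response)
    response = response.strip()

    # Drop a leading echo of the prompt: exact match, punctuation-only
    # variation, or any very short first line.
    lines = [line.strip() for line in response.splitlines() if line.strip()]
    if len(lines) >= 2 and (
        lines[0].lower() == prompt.strip().lower()
        or lines[0].rstrip("!?.,").lower() == prompt.strip().rstrip("!?.,").lower()
        or len(lines[0].split()) <= 3
    ):
        lines = lines[1:]
    return "\n".join(lines).strip()

# Illustrative decoded output in which the model echoes the prompt first:
raw = (
    "<|im_start|>user\nHello!<|im_end|>\n"
    "<|im_start|>assistant\nHello!\nHi there, how can I help you today?<|im_end|>"
)
print(clean_assistant_reply(raw, "Hello!"))  # -> Hi there, how can I help you today?

Note that the len(lines[0].split()) <= 3 branch removes any short opening line whenever a second line follows, not only true echoes, so a short model-authored opener such as "Sure!" would be dropped as well.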