Spaces:

LuminLabs
/

flash

Sleeping

App Files Community

nova committed on Jan 17

Commit

a155f45

verified ·

1 Parent(s): 410a8fc

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -14

app.py CHANGED Viewed

@@ -2,8 +2,8 @@ import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
-# Model Configuration: TinyLlama-1.1B (Classic, Fast, Non-Qwen)
-MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 # Check Device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🚀 Loading {MODEL_ID} on {device}...")
@@ -18,19 +18,20 @@ try:
 except Exception as e:
     print(f"❌ Error loading model: {e}")
 def chat(message, history):
-    # Prepare messages list for TinyLlama
-    # TinyLlama format: <|user|>\n...\n<|assistant|>\n...
-    # But applying chat template is safer if available.
     messages = []
-    messages.append({"role": "system", "content": "You are Lumin Flash, a helpful AI assistant."})
     for user_msg, bot_msg in history:
         messages.append({"role": "user", "content": user_msg})
         messages.append({"role": "assistant", "content": bot_msg})
     messages.append({"role": "user", "content": message})
-    # Tokenize with template
     try:
         text = tokenizer.apply_chat_template(
             messages,
@@ -38,17 +39,17 @@ def chat(message, history):
             add_generation_prompt=True
         )
     except:
-        # Fallback manual format if template fails
-        text = f"<|system|>\nYou are Lumin Flash.<|end|>\n<|user|>\n{message}<|end|>\n<|assistant|>\n"
     inputs = tokenizer([text], return_tensors="pt").to(device)
     # Streamer
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-    # Generate
     generation_kwargs = dict(
         inputs,
         streamer=streamer,
-        max_new_tokens=512,
         temperature=0.7,
         do_sample=True,
         top_p=0.9
@@ -64,9 +65,9 @@ def chat(message, history):
 # Gradio Interface
 demo = gr.ChatInterface(
     fn=chat,
-    chatbot=gr.Chatbot(height=500),
     textbox=gr.Textbox(placeholder="Ask Lumin Flash...", container=False, scale=7),
-    title=f"Lumin Flash ({MODEL_ID})"
 )
 if __name__ == "__main__":
     demo.queue().launch(server_name="0.0.0.0", server_port=7860)

 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
+# Model Configuration: Qwen 2.5 1.5B (Much Smarter, still runs on Free CPU)
+MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
 # Check Device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🚀 Loading {MODEL_ID} on {device}...")
 except Exception as e:
     print(f"❌ Error loading model: {e}")
 def chat(message, history):
+    # Prepare messages list
     messages = []
+    # Enhanced System Prompt
+    messages.append({
+        "role": "system",
+        "content": "You are Lumin Flash, an advanced AI assistant created by Lumin Web. You are helpful, precise, and professional. Answer questions clearly and concisely. Do not cut off sentences."
+    })
     for user_msg, bot_msg in history:
         messages.append({"role": "user", "content": user_msg})
         messages.append({"role": "assistant", "content": bot_msg})
     messages.append({"role": "user", "content": message})
+    # Tokenize with chat template
     try:
         text = tokenizer.apply_chat_template(
             messages,
             add_generation_prompt=True
         )
     except:
+        # Fallback manual format for Qwen (ChatML style)
+        text = f"<|im_start|>system\nYou are Lumin Flash.<|im_end|>\n<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
     inputs = tokenizer([text], return_tensors="pt").to(device)
     # Streamer
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    # Generate parameters (Tuned for smarts)
     generation_kwargs = dict(
         inputs,
         streamer=streamer,
+        max_new_tokens=1024,  # Increased to prevent cut-offs
         temperature=0.7,
         do_sample=True,
         top_p=0.9
 # Gradio Interface
 demo = gr.ChatInterface(
     fn=chat,
+    chatbot=gr.Chatbot(height=500, type="messages"), # Updated type for newer Gradio versions
     textbox=gr.Textbox(placeholder="Ask Lumin Flash...", container=False, scale=7),
+    title=f"Lumin Flash (Smart Edition)"
 )
 if __name__ == "__main__":
     demo.queue().launch(server_name="0.0.0.0", server_port=7860)