# flash / app.py
# Source: novapixelentretaiment's Hugging Face Space ("Update app.py", commit 0b329f8, verified)
import os
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
# Model Configuration: Qwen 2.5 0.5B (ULTRA FAST - Low RAM/CPU)
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# Resource Optimization (Save Cores & RAM).
# NOTE(review): OMP/MKL env vars are typically read when the native math
# libraries initialize, so setting them *after* `import torch` may have no
# effect — to be safe, set them in the environment before launching the
# process. torch.set_num_threads() below does apply at runtime.
os.environ["OMP_NUM_THREADS"] = "4"  # limit threads to avoid CPU contention
os.environ["MKL_NUM_THREADS"] = "4"
torch.set_num_threads(4)

# Select device: CUDA when available, otherwise CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Loading {MODEL_ID} on {device}...")
# Load tokenizer and model. The previous version only printed the error and
# fell through, which produced a confusing NameError the first time
# `tokenizer`/`model` was used in chat(); re-raise so startup fails loudly.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        dtype=torch.float16 if device == "cuda" else torch.float32,  # fp16 only on GPU
        device_map="auto",            # let accelerate place weights automatically
        trust_remote_code=True,
        low_cpu_mem_usage=True,       # stream weights in to cut peak RAM during load
    )
except Exception as e:
    print(f"❌ Error loading model: {e}")
    raise
def chat(message, history):
    """Stream an assistant reply for `message` given tuple-style `history`.

    Args:
        message: The latest user message (str).
        history: List of (user_msg, bot_msg) pairs from the Gradio Chatbot.

    Yields:
        str: The progressively accumulated assistant reply, so Gradio renders
        token-by-token streaming output.
    """
    # Build the chat transcript: system prompt, prior turns, then the new message.
    messages = [{
        "role": "system",
        "content": "You are Lumin Flash, an advanced AI assistant created by Lumin Web. You are helpful, precise, and professional. Answer questions clearly and concisely. Do not cut off sentences."
    }]
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    # Render via the model's chat template; fall back to a manual ChatML string.
    # NOTE(review): the fallback drops the history and the full system prompt.
    try:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
    except Exception:  # was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt
        text = f"<|im_start|>system\nYou are Lumin Flash.<|im_end|>\n<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

    inputs = tokenizer([text], return_tensors="pt").to(device)

    # Stream decoded tokens as they are produced; skip the echoed prompt.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Generation parameters (tuned for quality; large budget avoids cut-offs).
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=1024,  # increased to prevent mid-sentence cut-offs
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
    )

    # model.generate() blocks, so run it in a worker thread and consume the
    # streamer on this (generator) side.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text
    thread.join()  # don't leak the worker; streamer is exhausted by now
# Gradio Interface — tuple-style Chatbot (no type="messages") to match the
# (user_msg, bot_msg) history pairs that chat() expects.
demo = gr.ChatInterface(
    fn=chat,
    chatbot=gr.Chatbot(height=500),
    textbox=gr.Textbox(placeholder="Ask Lumin Flash...", container=False, scale=7),
    title="Lumin Flash (Smart Edition)",  # was an f-string with no placeholders
)

if __name__ == "__main__":
    # queue() is required for streaming generator responses; bind on all
    # interfaces on 7860, the standard Hugging Face Spaces port.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)