import os
import logging
import time
import uuid

import spaces
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Disable the tokenizers parallelism warning (must be set before the tokenizer
# is used; the value "false" disables parallelism, which silences the warning)
os.environ["TOKENIZERS_PARALLELISM"] = "false"


def capture_logs(uuid_label, log_file, log_body):
    """Log a request body, tagged with its UUID, to both a file and the console."""
    logger = logging.getLogger('MyApp')
    logger.setLevel(logging.INFO)

    # Check if handlers are already added to avoid duplication
    if not logger.handlers:
        fh = logging.FileHandler(log_file)
        fh.setLevel(logging.INFO)
        ch = logging.StreamHandler()
        ch.setLevel(logging.INFO)
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        ch.setFormatter(formatter)
        logger.addHandler(fh)
        logger.addHandler(ch)

    logger.info('uuid: %s - %s', uuid_label, log_body)


print("CUDA available: ", torch.cuda.is_available())
print("MPS available: ", torch.backends.mps.is_available())

tokenizer = AutoTokenizer.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    trust_remote_code=True,
    torch_dtype=torch.float16,  # Use float16 for better GPU memory efficiency
    device_map="auto",          # Automatically handle device placement
)

# Report the device in use (device_map="auto" already placed the model)
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
else:
    device = torch.device("cpu")
    print("No GPU available, using CPU")


# Handle user input and generate a response
@spaces.GPU
def chatbot_response(query, tokens, top_k, top_p):
    uuid_label = str(uuid.uuid4())
    start_time = time.time()  # Start timer

    # Generate a response using the model
    messages = [{'role': 'user', 'content': query}]
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
    outputs = model.generate(
        inputs,
        max_new_tokens=tokens,
        do_sample=True,
        top_k=top_k,
        top_p=top_p,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens, skipping the prompt
    model_response = tokenizer.decode(
        outputs[0][len(inputs[0]):], skip_special_tokens=True)

    end_time = time.time()  # End timer
    performance_time = round(end_time - start_time, 2)
    log_body = 'query: %s, processTime: %s, tokens: %s, top_k: %s, top_p: %s' % (
        query, performance_time, tokens, top_k, top_p)
    capture_logs(uuid_label, 'query_logs.csv', log_body)

    return model_response


# Set up the Gradio interface
iface = gr.Interface(
    fn=chatbot_response,
    inputs=[
        gr.Textbox(label="Ask our DSChatbot Expert"),
        gr.Slider(label="Max New Tokens", minimum=128, maximum=2048, step=128, value=512),
        gr.Slider(label="Top K", minimum=0, maximum=100, step=10, value=50),
        gr.Slider(label="Top P", minimum=0.0, maximum=1.0, step=0.1, value=0.95),
    ],
    outputs=gr.Textbox(label="Hope it helps!"),
    title="DSChatbot",
)

if __name__ == "__main__":
    iface.launch()