# Import necessary libraries
import os
from threading import Thread
import argparse

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from peft import PeftModel
from huggingface_hub import login

from utils import get_device  # Assuming this function exists

# Authenticate using the Hugging Face API token from the environment
hf_api_token = os.getenv("HF_API_TOKEN")
if hf_api_token is None:
    raise ValueError("Hugging Face API token not found in environment variables. Please set it as a secret in Hugging Face Spaces.")
login(token=hf_api_token)
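# Setup sketch (assumption: the variable name HF_API_TOKEN matches the secret
# configured for this Space). On Spaces, add it under Settings -> Secrets;
# for a local run, export it before starting the app:
#   export HF_API_TOKEN=hf_...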
# Create the argument parser
parser = argparse.ArgumentParser(description='Check model usage.')
# Add a boolean switch that disables the LoRA adapter
parser.add_argument('--baseonly', action='store_true',
                    help='A boolean switch to indicate base-only mode')
# Parse the command-line arguments
args = parser.parse_args()
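# Usage sketch (assumption: this file is the Space's app.py):
#   python app.py             # base model + eliAI adapter
#   python app.py --baseonly  # base model only, adapter skipped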
# Define the model and adapter names and the torch data type
model_name = "microsoft/Phi-3-mini-4k-instruct"
adapters_name = "zurd46/eliAI"
torch_dtype = torch.bfloat16  # Set the appropriate torch data type

# Display device and CPU thread information
device = get_device()
print(f"Number of GPUs available: {torch.cuda.device_count()}")
print(f"Running on device: {device}")
print(f"CPU threads: {torch.get_num_threads()}")

# Fail early if no GPU is available
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available. Ensure that a GPU is available and properly configured.")
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the base model and match its embedding size to the tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch_dtype)
model.resize_token_embeddings(len(tokenizer))

# Load the adapter unless base-only mode was requested
usingAdapter = False
if not args.baseonly:
    usingAdapter = True
    model = PeftModel.from_pretrained(model, adapters_name)

model.to(device)
print(f"Model {model_name} loaded successfully on {device}")
# Function to run the text generation process
def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
    # Wrap the input in the fine-tuning prompt template when the adapter is active
    template = "<|context|><|user|>\n{}<|end|>\n<|assistant|>"
    model_inputs = tokenizer(template.format(user_text) if usingAdapter else user_text, return_tensors="pt")
    model_inputs = model_inputs.to(device)

    # Generate text in a separate thread and stream the tokens back
    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=model_inputs['input_ids'],
        attention_mask=model_inputs['attention_mask'],
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        temperature=float(temperature),
        top_k=top_k,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    # Collect the streamed text and return the full response
    model_output = ""
    for new_text in streamer:
        model_output += new_text
    t.join()
    return model_output
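# Quick smoke-test sketch (assumption: run after the model has loaded; the Space
# itself drives run_generation through the Gradio UI below, not this call):
#   print(run_generation("Hello, who are you?", top_p=0.95, temperature=0.8,
#                        top_k=50, max_new_tokens=32))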
# Gradio UI setup
with gr.Blocks(css="""
    .form.svelte-sfqy0y {
        background: var(--block-background-fill);
        padding: 20px;
    }
    body {
        font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
        color: #e0e0e0;
        margin: 0;
        padding: 0;
        box-sizing: border-box;
    }
    .gradio-container {
        max-width: 900px;
        margin: auto;
        padding: 20px;
        border-radius: 8px;
        box-shadow: 0 0 10px rgba(0,0,0,0.5);
    }
    .gr-button {
        color: white;
        border: none;
        border-radius: 4px;
        padding: 10px 24px;
        cursor: pointer;
    }
    .gr-button:hover {
        background-color: #3700b3;
    }
    .gr-slider input[type=range] {
        -webkit-appearance: none;
        width: 100%;
        height: 8px;
        border-radius: 5px;
        outline: none;
        opacity: 0.9;
        -webkit-transition: .2s;
        transition: opacity .2s;
    }
    .gr-slider input[type=range]:hover {
        opacity: 1;
    }
    .gr-textbox {
        color: white;
        border: none;
        border-radius: 4px;
        padding: 10px;
    }
    .chatbox {
        max-height: 400px;
        overflow-y: auto;
        margin-bottom: 20px;
    }
""") as demo:
    gr.Markdown(
        """
        <div style="text-align: center; padding: 20px;">
            <h1>🌙 eliAI Text Generation Interface</h1>
            <h3>Model: Phi-3-mini-4k-instruct</h3>
            <h4>Developed by Daniel Zurmühle</h4>
        </div>
        """)

    with gr.Row():
        with gr.Column(scale=3):
            user_text = gr.Textbox(placeholder="Enter your question here", label="User Input", lines=3,
                                   elem_classes="gr-textbox")
            button_submit = gr.Button(value="Submit", elem_classes="gr-button")
            max_new_tokens = gr.Slider(minimum=1, maximum=1000, value=1000, step=1, label="Max New Tokens")
            top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p (Nucleus Sampling)")
            top_k = gr.Slider(minimum=1, maximum=50, value=50, step=1, label="Top-k")
            temperature = gr.Slider(minimum=0.1, maximum=5.0, value=0.8, step=0.1, label="Temperature")
        with gr.Column(scale=7):
            model_output = gr.Chatbot(label="Chatbot Output", height=566)

    # Run generation and return a single (user, assistant) chat turn
    def handle_submit(text, top_p, temperature, top_k, max_new_tokens):
        response = run_generation(text, top_p, temperature, top_k, max_new_tokens)
        return [(text, response)]

    # Trigger generation from the button or by submitting the textbox
    button_submit.click(handle_submit, [user_text, top_p, temperature, top_k, max_new_tokens], model_output)
    user_text.submit(handle_submit, [user_text, top_p, temperature, top_k, max_new_tokens], model_output)

demo.queue(max_size=32).launch(server_name="0.0.0.0", server_port=7860)