from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gradio as gr

# Initialize the model and tokenizer.
# Load in fp16; device_map="auto" places the weights on GPU when one is available,
# otherwise falls back to CPU.
model = AutoModelForCausalLM.from_pretrained(
    "deepapaikar/katzbot-phi2",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("deepapaikar/katzbot-phi2", trust_remote_code=True)

# Phi-2's tokenizer ships without a pad token; reuse EOS so padding works
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
def predict_answer(question, token=25):
    messages = [{"role": "user", "content": f"{question}"}]

    # Render the chat template into a plain prompt string
    prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Tokenize the prompt and move the tensors to the model's device
    # (device placement of the model itself is handled by device_map="auto")
    inputs = tokenizer(prompt_text, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Ensure the model is in evaluation mode
    model.eval()

    # Generate up to `token` new tokens beyond the prompt
    with torch.no_grad():
        output_sequences = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=int(token),
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode the full sequence (prompt plus completion) back to text
    output_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
    return output_text
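
# A minimal sanity check, handy when debugging outside the Gradio UI.
# The prompt below is just the UI placeholder question, not a required input:
# print(predict_answer("Where is Yeshiva University located?", token=25))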
def gradio_predict(question, token):
    answer = predict_answer(question, token)
    return answer
# Define the Gradio interface (step=1 keeps the slider value an integer token count)
iface = gr.Interface(
    fn=gradio_predict,
    inputs=[
        gr.Textbox(label="Question", placeholder="e.g. Where is Yeshiva University located?", scale=4),
        gr.Slider(2, 100, value=25, step=1, label="Token Count", info="Choose between 2 and 100"),
    ],
    outputs=gr.TextArea(label="Answer"),
    title="KatzBot",
    description="Phi2-trial1",
)

# Launch the app
iface.queue().launch(debug=True)