import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


@st.cache_resource  # load the model once and reuse it across Streamlit reruns
def load_model():
    tokenizer = AutoTokenizer.from_pretrained("quantized_model")
    model = AutoModelForCausalLM.from_pretrained(
        "quantized_model",
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    return tokenizer, model


tokenizer, model = load_model()

st.title("Quantized Model Inference")

user_input = st.text_input("Enter your prompt:")

if st.button("Generate"):
    if user_input:
        # Tokenize the prompt and move it to the same device the model was placed on,
        # so the app also works on CPU-only hardware.
        inputs = tokenizer(user_input, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        st.write(f"Response: {response}")
    else:
        st.write("Please enter a prompt.")
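Before wiring the checkpoint into the Streamlit UI, the same load-and-generate path can be exercised from a plain Python script. The sketch below is a minimal sanity check, assuming the quantized checkpoint sits in a local quantized_model/ directory as in the app above; the prompt and the max_new_tokens value are illustrative, and device_map="auto" requires the accelerate package to be installed.

# sanity_check.py - exercise the quantized checkpoint outside Streamlit.
# Assumes a local "quantized_model" directory, as in the app above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("quantized_model")
model = AutoModelForCausalLM.from_pretrained(
    "quantized_model",
    device_map="auto",          # needs accelerate; falls back to CPU if no GPU
    torch_dtype=torch.bfloat16,
)

prompt = "Explain quantization in one sentence."  # illustrative prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=64)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

If this script produces sensible text, the Streamlit app should behave the same way, since it uses the identical loading and generation calls.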