import os

import streamlit as st
from huggingface_hub import login
from llama_cpp import Llama
# Load model directly (unused alternative)
# from transformers import AutoModel, AutoModelForCausalLM

# Authenticate with the Hugging Face Hub using the token stored in the HF_TOKEN3 secret
access_token = os.getenv('HF_TOKEN3')
login(token=access_token)
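# If the .gguf file is not already in the working directory, it could be fetched
# from the Hub first. A sketch only; the repo id below is a placeholder, not the
# actual source repo:
# from huggingface_hub import hf_hub_download
# hf_hub_download(repo_id="<gguf-repo-id>", filename="llama-2-7b.Q5_0.gguf", local_dir=".")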
# GGUF model file expected in the working directory
file = 'llama-2-7b.Q5_0.gguf'

llm = Llama(
    model_path="./" + file,
    # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
    # seed=1337,        # Uncomment to set a specific seed
    # n_ctx=2048,       # Uncomment to increase the context window
)
| prompt = "Q: Name the planets in the solar system? A: " | |
| output = llm( | |
| prompt, # Prompt | |
| max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window | |
| stop=["Q:", "\n"], # Stop generating just before the model would generate a new question | |
| echo=True # Echo the prompt back in the output | |
| ) # Generate a completion, can also call create_completion | |
| print(output) | |
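# llm(...) is shorthand for llm.create_completion(...); it returns an OpenAI-style
# completion dict, with the generated text under output['choices'][0]['text'].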
# Alternative loader kept for reference (AutoModelForCausalLM with model_type/gpu_layers,
# i.e. the ctransformers-style API):
# NO_GPU = 0
# GPU_LAYERS = 50
# llm = AutoModelForCausalLM.from_pretrained(file, model_type="llama", gpu_layers=NO_GPU)
# model = AutoModelForCausalLM.from_pretrained("valencar/llamm",
#                                              model_file=file, model_type="llama", gpu_layers=NO_GPU)
# access_token = os.getenv('HF_TOKEN2')
# login(token=access_token)
# prompt = "AI is going to"
with st.container():
    st.write('\n\n')
    st.write(prompt)
    # Extract just the generated text from the completion dict
    answer = output['choices'][0]['text']
    st.write(answer)
    print(answer)
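# To try it locally (assuming this script is saved as app.py): streamlit run app.py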