Gargaz commited on
Commit
cf94df0
·
verified ·
1 Parent(s): 32e5194

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +81 -0
README.md CHANGED
@@ -31,3 +31,84 @@ Install the necessary dependencies with:
31
  ```bash
32
  pip install llama-cpp-python huggingface_hub
33
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  ```bash
32
  pip install llama-cpp-python huggingface_hub
33
  ```
34
+ Load the model with:
35
+
36
+ ```python
37
+ import logging
38
+ import os
39
+ import time # Make sure to import time for measuring durations
40
+ from huggingface_hub import hf_hub_download
41
+ from llama_cpp import Llama
42
+
43
+ # Set up logging
44
+ logging.basicConfig(level=logging.INFO) # Set to INFO to reduce overhead
45
+ logger = logging.getLogger()
46
+
47
+ # Download the GGUF model
48
+ model_name = "Gargaz/GPT-2-gguf"
49
+ model_file = "llama3.1-Q4_K_M.gguf"
50
+ model_path = hf_hub_download(model_name, filename=model_file)
51
+
52
+ # Instantiate the model from the downloaded file
53
+ llm = Llama(
54
+ model_path=model_path,
55
+ n_ctx=16000, # Context length to use
56
+ n_threads=64, # Number of CPU threads
57
+ n_gpu_layers=32 # Number of model layers to offload to GPU
58
+ )
59
+
60
+ # System instructions for the AI
61
+ system_instructions = (
62
+ "You are a friendly conversational AI designed to respond clearly and concisely to user inquiries. "
63
+ "Stay on topic by answering questions directly, use a warm tone and acknowledge gratitude, ask for "
64
+ "clarification on vague questions, provide brief and helpful recommendations, and encourage users "
65
+ "to ask more questions to keep the conversation flowing."
66
+ " Don't speak alone; always respond just to the user input."
67
+ )
68
+
69
+ def chat():
70
+ """Start a chat session with the model."""
71
+ print("Introduceti 'exit' pentru a iesi din chat.")
72
+ while True:
73
+ user_input = input("Tu: ")
74
+ if user_input.lower() == 'exit':
75
+ print("Iesire din chat.")
76
+ break
77
+
78
+ # Prepare the prompt
79
+ full_prompt = f"{system_instructions}\nUser: {user_input}\nAI:"
80
+
81
+ # Limit AI responses to a short length for faster responses
82
+ generation_kwargs = {
83
+ "max_tokens": 40, # Reduced max tokens for faster inference
84
+ "stop": ["AI:"], # Change the stop token to ensure clarity
85
+ "echo": False,
86
+ }
87
+
88
+ try:
89
+ # Start measuring time for response generation
90
+ load_start_time = time.time()
91
+ res = llm(full_prompt, **generation_kwargs) # Res is a dictionary
92
+ load_time = (time.time() - load_start_time) * 1000 # Convert to ms
93
+
94
+ # Log load time
95
+ load_message = f"llama_perf_context_print: load time = {load_time:.2f} ms"
96
+ logger.info(load_message)
97
+
98
+ generated_text = res["choices"][0]["text"].strip()
99
+ print(f"AI: {generated_text}")
100
+
101
+ # Log prompt evaluation time and other metrics
102
+ num_tokens = len(full_prompt.split())
103
+ eval_message = f"llama_perf_context_print: prompt eval time = {load_time:.2f} ms / {num_tokens} tokens"
104
+ logger.info(eval_message)
105
+
106
+ except Exception as e:
107
+ logger.error(f"Error generating response: {e}")
108
+ print("Eroare la generarea răspunsului.")
109
+
110
+ if __name__ == "__main__":
111
+ chat()
112
+
113
+ ```
114
+