Update app.py
app.py CHANGED
@@ -36,10 +36,10 @@ if torch.cuda.is_available():
     )
     model_id = "meta-llama/Llama-2-7b-chat-hf"
     base_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto",quantization_config=bnb_config)
-    model = PeftModel.from_pretrained(
-        base_model,"ranamhamoud/storytell")
+    model = PeftModel.from_pretrained(model,"ranamhamoud/storytell")
     tokenizer = AutoTokenizer.from_pretrained(model_id)
-    tokenizer.
+    tokenizer.pad_token = tokenizer.eos_token
+

 def make_prompt(entry):
     return f"### Human: YOUR INSTRUCTION HERE: {entry} ### Assistant:"

@@ -49,17 +49,20 @@ def generate(
     message: str,
     chat_history: list[tuple[str, str]],
     max_new_tokens: int = 1024,
-
-
-
-    repetition_penalty: float = 1.
+    temperature: float = 0.1, # Lower -> less random
+    top_p: float = 0.1, # Lower -> less random, considering only the top 10% of tokens at each step
+    top_k: int = 1, # Least random, only the most likely next token is considered
+    repetition_penalty: float = 1.0, # No repetition penalty
 ) -> Iterator[str]:
     conversation = []
     for user, assistant in chat_history:
         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
     conversation.append({"role": "user", "content": make_prompt(message)})

-
+    enc = tokenizer(make_prompt(message), return_tensors="pt", padding=True, truncation=True)
+
+
+    input_ids = enc.input_ids
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")

@@ -71,9 +74,9 @@ def generate(
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
-
-
-
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
         num_beams=1,
         repetition_penalty=repetition_penalty,
     )
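For context, here is a minimal runnable sketch of how the touched part of app.py reads after this commit. Anything not visible in the diff is an assumption and is marked as such in the comments: the bnb_config and MAX_INPUT_TOKEN_LENGTH values, and the TextIteratorStreamer / Thread streaming plumbing inferred from the generation kwargs. The added PeftModel line passes `model` itself as the base model, where the removed lines passed `base_model`; the sketch keeps `base_model`.

from threading import Thread
from typing import Iterator

import gradio as gr
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

# Assumed values: bnb_config and MAX_INPUT_TOKEN_LENGTH live earlier in app.py;
# the 4-bit config below is only a plausible stand-in, not taken from the diff.
MAX_INPUT_TOKEN_LENGTH = 4096
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")

model_id = "meta-llama/Llama-2-7b-chat-hf"
base_model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", quantization_config=bnb_config
)
# The commit passes `model` as the first argument here; the sketch uses
# `base_model`, the name the removed lines used.
model = PeftModel.from_pretrained(base_model, "ranamhamoud/storytell")
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # Llama 2's tokenizer ships without a pad token


def make_prompt(entry):
    return f"### Human: YOUR INSTRUCTION HERE: {entry} ### Assistant:"


def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
    temperature: float = 0.1,         # lower -> less random
    top_p: float = 0.1,               # sample only from the top 10% of probability mass
    top_k: int = 1,                   # only the single most likely token is considered
    repetition_penalty: float = 1.0,  # no repetition penalty
) -> Iterator[str]:
    conversation = []
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": make_prompt(message)})

    # The commit tokenizes only the current prompt; the conversation list built
    # above is kept but not passed to the model here.
    enc = tokenizer(make_prompt(message), return_tensors="pt", padding=True, truncation=True)
    input_ids = enc.input_ids
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

    # Streaming plumbing below is assumed; the diff only shows the generation kwargs.
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
    )
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)

Note that with top_k=1 the sampler is effectively greedy decoding, so do_sample=True and the low temperature / top_p defaults have little practical effect unless those values are raised at call time.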