locked the thread before generation on llama
Browse files
app.py
CHANGED
|
@@ -3,8 +3,7 @@ import gradio as gr
|
|
| 3 |
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
|
| 4 |
from huggingface_hub import login
|
| 5 |
import os
|
| 6 |
-
|
| 7 |
-
from openai import OpenAI
|
| 8 |
import spaces
|
| 9 |
# import multiprocessing as mp
|
| 10 |
import sys
|
|
@@ -111,13 +110,19 @@ def llama_generation(input_text: str,
|
|
| 111 |
if temperature == 0:
|
| 112 |
generate_kwargs["do_sample"] = False
|
| 113 |
|
|
|
|
|
|
|
|
|
|
| 114 |
# # Place the generation in a thread so we can access it.
|
| 115 |
# # place the function as target and place the kwargs next as the kwargs
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
| 121 |
thread.start()
|
| 122 |
thread.join()
|
| 123 |
return streamer
|
|
|
|
| 3 |
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
|
| 4 |
from huggingface_hub import login
|
| 5 |
import os
|
| 6 |
+
import threading
|
|
|
|
| 7 |
import spaces
|
| 8 |
# import multiprocessing as mp
|
| 9 |
import sys
|
|
|
|
| 110 |
if temperature == 0:
|
| 111 |
generate_kwargs["do_sample"] = False
|
| 112 |
|
| 113 |
+
# Use a lock object to synchronize access to the llama_model
|
| 114 |
+
lock = threading.Lock()
|
| 115 |
+
|
| 116 |
# # Place the generation in a thread so we can access it.
|
| 117 |
# # place the function as target and place the kwargs next as the kwargs
|
| 118 |
+
def generation_llama():
|
| 119 |
+
with lock:
|
| 120 |
+
# Generate response using Llama3
|
| 121 |
+
response = llama_model.generate(**generate_kwargs)
|
| 122 |
+
return response
|
| 123 |
+
|
| 124 |
+
# start the thread and wait for it to finish
|
| 125 |
+
thread = threading.Thread(target=generation_llama)
|
| 126 |
thread.start()
|
| 127 |
thread.join()
|
| 128 |
return streamer
|