Spaces: Runtime error
kwabs22 committed · Commit cd998d9
1 Parent(s): 7e4c949
RAG Placeholder demo test
app.py
CHANGED
@@ -76,9 +76,120 @@ from sentence_transformers import SentenceTransformer
 # yield response, f"{tokens_per_second:.2f}"
 
 
-
-
-
+#---------
+#----------
+
+# # Initialize GPU tensor
+# zero = torch.Tensor([0]).cuda()
+# print(zero.device) # This will print 'cpu' outside the @spaces.GPU decorated function
+
+# # Load the embedding model
+# embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+# # Load the Qwen model and tokenizer
+# llmguide_model = AutoModelForCausalLM.from_pretrained(
+#     "Qwen/Qwen2-0.5B-Instruct",
+#     torch_dtype="auto",
+#     device_map="auto"
+# )
+# llmguide_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
+
+# # Sample knowledge base (replace with your own data)
+# knowledge_base = [
+#     "The capital of France is Paris.",
+#     "Python is a popular programming language.",
+#     "Machine learning is a subset of artificial intelligence.",
+#     "The Earth orbits around the Sun.",
+#     "orbits are a group of fans of a music group"
+# ]
+
+# # Create embeddings for the knowledge base
+# knowledge_base_embeddings = embedding_model.encode(knowledge_base)
+
+# def retrieve(query, k=2):
+#     query_embedding = embedding_model.encode([query])
+#     similarities = torch.nn.functional.cosine_similarity(torch.tensor(query_embedding), torch.tensor(knowledge_base_embeddings))
+#     top_k_indices = similarities.argsort(descending=True)[:k]
+#     return [knowledge_base[i] for i in top_k_indices]
+
+# def get_resource_usage():
+#     ram_usage = psutil.virtual_memory().percent
+#     gpu_memory_allocated = torch.cuda.memory_allocated() / (1024 ** 3) # Convert to GB
+#     gpu_memory_reserved = torch.cuda.memory_reserved() / (1024 ** 3) # Convert to GB
+#     return f"RAM Usage: {ram_usage:.2f}%, GPU Memory Allocated: {gpu_memory_allocated:.2f}GB, GPU Memory Reserved: {gpu_memory_reserved:.2f}GB"
+
+# @spaces.GPU
+# def llmguide_generate_response(prompt, stream=False):
+#     print(zero.device) # This will print 'cuda:0' inside the @spaces.GPU decorated function
+
+#     messages = [
+#         {"role": "system", "content": "You are a helpful assistant."},
+#         {"role": "user", "content": prompt}
+#     ]
+#     text = llmguide_tokenizer.apply_chat_template(
+#         messages,
+#         tokenize=False,
+#         add_generation_prompt=True
+#     )
+#     model_inputs = llmguide_tokenizer([text], return_tensors="pt").to(zero.device)
+
+#     start_time = time.time()
+#     total_tokens = 0
+
+#     if stream:
+#         streamer = TextIteratorStreamer(llmguide_tokenizer, skip_special_tokens=True)
+#         generation_kwargs = dict(
+#             model_inputs,
+#             streamer=streamer,
+#             max_new_tokens=512,
+#             temperature=0.7,
+#         )
+#         thread = Thread(target=llmguide_model.generate, kwargs=generation_kwargs)
+#         thread.start()
+
+#         generated_text = ""
+#         for new_text in streamer:
+#             generated_text += new_text
+#             total_tokens += 1
+#             current_time = time.time()
+#             tokens_per_second = total_tokens / (current_time - start_time)
+#             yield generated_text, f"{tokens_per_second:.2f}", ""
+
+#         resource_usage = get_resource_usage()
+#         yield generated_text, f"{tokens_per_second:.2f}", resource_usage
+#     else:
+#         generated_ids = llmguide_model.generate(
+#             model_inputs.input_ids,
+#             max_new_tokens=512
+#         )
+#         generated_ids = [
+#             output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+#         ]
+#         response = llmguide_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+#         total_tokens = len(generated_ids[0])
+#         end_time = time.time()
+#         tokens_per_second = total_tokens / (end_time - start_time)
+#         resource_usage = get_resource_usage()
+#         yield response, f"{tokens_per_second:.2f}", resource_usage
+
+# def rag(query, stream=False):
+#     retrieved_docs = retrieve(query)
+#     context = " ".join(retrieved_docs)
+#     prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
+
+#     generator = llmguide_generate_response(prompt, stream)
+
+#     if stream:
+#         def stream_output():
+#             for generated_text, tokens_per_second, ram_usage in generator:
+#                 yield generated_text, tokens_per_second, ram_usage
+#         return stream_output()
+#     else:
+#         # For non-streaming, we just need to get the final output
+#         for generated_text, tokens_per_second, ram_usage in generator:
+#             pass # This will iterate to the last yield
+#         return generated_text, tokens_per_second, ram_usage
+
 
 # Load the embedding model
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
@@ -93,31 +204,28 @@ llmguide_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
 
 # Sample knowledge base (replace with your own data)
 knowledge_base = [
-    "The capital of France is Paris.",
-    "Python is a popular programming language.",
-    "Machine learning is a subset of artificial intelligence.",
-    "The Earth orbits around the Sun.",
+    {"id": "doc1", "content": "The capital of France is Paris."},
+    {"id": "doc2", "content": "Python is a popular programming language."},
+    {"id": "doc3", "content": "Machine learning is a subset of artificial intelligence."},
+    {"id": "doc4", "content": "The Earth orbits around the Sun."},
+    {"id": "doc5", "content": "orbits is the name of a korean fangroup"},
 ]
 
 # Create embeddings for the knowledge base
-knowledge_base_embeddings = embedding_model.encode(knowledge_base)
+knowledge_base_embeddings = embedding_model.encode([doc["content"] for doc in knowledge_base])
 
 def retrieve(query, k=2):
     query_embedding = embedding_model.encode([query])
     similarities = torch.nn.functional.cosine_similarity(torch.tensor(query_embedding), torch.tensor(knowledge_base_embeddings))
     top_k_indices = similarities.argsort(descending=True)[:k]
-    return [knowledge_base[i] for i in top_k_indices]
+    return [(knowledge_base[i]["content"], knowledge_base[i]["id"]) for i in top_k_indices]
 
-def get_resource_usage():
-    ram_usage = psutil.virtual_memory().percent
-    gpu_memory_allocated = torch.cuda.memory_allocated() / (1024 ** 3) # Convert to GB
-    gpu_memory_reserved = torch.cuda.memory_reserved() / (1024 ** 3) # Convert to GB
-    return f"RAM Usage: {ram_usage:.2f}%, GPU Memory Allocated: {gpu_memory_allocated:.2f}GB, GPU Memory Reserved: {gpu_memory_reserved:.2f}GB"
+def get_ram_usage():
+    ram = psutil.virtual_memory()
+    return f"RAM Usage: {ram.percent:.2f}%, Available: {ram.available / (1024 ** 3):.2f}GB, Total: {ram.total / (1024 ** 3):.2f}GB"
 
 @spaces.GPU
-def llmguide_generate_response(prompt, stream=False):
-    print(zero.device) # This will print 'cuda:0' inside the @spaces.GPU decorated function
-
+def llmguide_generate_response(prompt, doc_ids=None, stream=False):
     messages = [
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": prompt}
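The retrieval added in this hunk is plain embedding search: the query is encoded with the same MiniLM model as the knowledge base, scored against the precomputed embeddings by cosine similarity, and the top-k entries (now (content, id) tuples) are returned. A minimal standalone sketch of that pattern, assuming only sentence-transformers and torch are installed; the sample documents mirror the hunk and the query string is illustrative:

    import torch
    from sentence_transformers import SentenceTransformer

    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    docs = ["The capital of France is Paris.", "The Earth orbits around the Sun."]
    doc_embeddings = embedding_model.encode(docs)  # precomputed once, like the hunk does

    query_embedding = embedding_model.encode(["What is the capital of France?"])
    similarities = torch.nn.functional.cosine_similarity(
        torch.tensor(query_embedding), torch.tensor(doc_embeddings))
    top_k = similarities.argsort(descending=True)[:1]
    print([docs[i] for i in top_k])  # -> ["The capital of France is Paris."]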
@@ -127,7 +235,7 @@ def llmguide_generate_response(prompt, stream=False):
         tokenize=False,
         add_generation_prompt=True
     )
-    model_inputs = llmguide_tokenizer([text], return_tensors="pt").to(zero.device)
+    model_inputs = llmguide_tokenizer([text], return_tensors="pt").to(llmguide_model.device)
 
     start_time = time.time()
     total_tokens = 0
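Moving the inputs from zero.device to llmguide_model.device matches the ZeroGPU behaviour the commented-out block describes: CUDA is only attached while a @spaces.GPU function runs, so a module-level .cuda() tensor reports 'cpu' at import time. A minimal sketch of the decorator pattern, assuming the Hugging Face spaces package on a ZeroGPU Space:

    import spaces
    import torch

    @spaces.GPU  # GPU is allocated only for the duration of this call
    def on_gpu():
        # Inside the decorated function, CUDA is available.
        return torch.tensor([0.0]).cuda().device  # device(type='cuda', index=0)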
@@ -149,10 +257,10 @@ def llmguide_generate_response(prompt, stream=False):
             total_tokens += 1
             current_time = time.time()
             tokens_per_second = total_tokens / (current_time - start_time)
-            yield generated_text, f"{tokens_per_second:.2f}", ""
+            yield generated_text, f"{tokens_per_second:.2f}", "", ", ".join(doc_ids) if doc_ids else "N/A"
 
-        resource_usage = get_resource_usage()
-        yield generated_text, f"{tokens_per_second:.2f}", resource_usage
+        ram_usage = get_ram_usage()
+        yield generated_text, f"{tokens_per_second:.2f}", ram_usage, ", ".join(doc_ids) if doc_ids else "N/A"
     else:
         generated_ids = llmguide_model.generate(
             model_inputs.input_ids,
@@ -165,36 +273,32 @@ def llmguide_generate_response(prompt, stream=False):
         total_tokens = len(generated_ids[0])
         end_time = time.time()
         tokens_per_second = total_tokens / (end_time - start_time)
-        resource_usage = get_resource_usage()
-        yield response, f"{tokens_per_second:.2f}", resource_usage
-
-
-
-
-
-
-
-
-
-
-
-def rag(query, stream=False):
-    retrieved_docs = retrieve(query)
-    context = " ".join(retrieved_docs)
-    prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
+        ram_usage = get_ram_usage()
+        yield response, f"{tokens_per_second:.2f}", ram_usage, ", ".join(doc_ids) if doc_ids else "N/A"
+
+def process_query(query, use_rag, stream=False):
+    if use_rag:
+        retrieved_docs = retrieve(query)
+        context = " ".join([doc for doc, _ in retrieved_docs])
+        doc_ids = [doc_id for _, doc_id in retrieved_docs]
+        prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
+    else:
+        prompt = query
+        doc_ids = None
 
-    generator = llmguide_generate_response(prompt, stream)
+    generator = llmguide_generate_response(prompt, doc_ids, stream)
 
     if stream:
         def stream_output():
-            for generated_text, tokens_per_second, ram_usage in generator:
-                yield generated_text, tokens_per_second, ram_usage
+            for generated_text, tokens_per_second, ram_usage, doc_references in generator:
+                yield generated_text, tokens_per_second, ram_usage, doc_references
         return stream_output()
     else:
         # For non-streaming, we just need to get the final output
-        for generated_text, tokens_per_second, ram_usage in generator:
+        for generated_text, tokens_per_second, ram_usage, doc_references in generator:
            pass # This will iterate to the last yield
-        return generated_text, tokens_per_second, ram_usage
+        return generated_text, tokens_per_second, ram_usage, doc_references
+
 
 #--------------------------------------------------------------------------------------------------------------------------------
 
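process_query is now the single entry point for both modes: with use_rag=True it packs the retrieved contents into the prompt and threads the document IDs through every yield; otherwise it passes the raw query and the references come back as "N/A". A hedged usage sketch, assuming the functions above are in scope (the question is just an example):

    # Non-streaming: process_query drains the generator to its last yield
    # and returns the four outputs wired to the Gradio interface below.
    text, tps, ram, refs = process_query(
        "What is the capital of France?", use_rag=True, stream=False)
    print(refs)  # e.g. "doc1, doc4" - IDs of the retrieved documents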
@@ -838,24 +942,43 @@ with gr.Blocks() as demo:
         <div style="width: 20%; text-align: center">HF + Gradio allows for api use so this my prototype tool for tool use test</div>
         </div>""")
     with gr.Accordion("Qwen 0.5B as Space Guide Tests", open=False):
-        gr.HTML("Placeholder for FAQ type - front end as prompt engineering for the first message to force direction of conversion")
         gr.HTML("Placeholder for weak RAG Type - Explanations through an opensource embeddings engine")
+        # gr.Interface(
+        #     fn=rag,
+        #     inputs=[
+        #         gr.Textbox(lines=2, placeholder="Enter your question here..."),
+        #         gr.Checkbox(label="Stream output")
+        #     ],
+        #     outputs=[
+        #         gr.Textbox(label="Generated Response"),
+        #         gr.Textbox(label="Tokens per second"),
+        #         gr.Textbox(label="Resource Usage")
+        #     ],
+        #     title="RAG Q&A System with GPU Acceleration and Resource Monitoring",
+        #     description="Ask a question and get an answer based on the retrieved context. The response is generated using a GPU-accelerated model. Resource usage is logged at the end of generation."
+        # )
+
         gr.Interface(
-            fn=rag,
+            fn=process_query,
             inputs=[
                 gr.Textbox(lines=2, placeholder="Enter your question here..."),
+                gr.Checkbox(label="Use RAG"),
                 gr.Checkbox(label="Stream output")
             ],
             outputs=[
                 gr.Textbox(label="Generated Response"),
                 gr.Textbox(label="Tokens per second"),
-                gr.Textbox(label="Resource Usage")
+                gr.Textbox(label="RAM Usage"),
+                gr.Textbox(label="Referenced Documents")
             ],
-            title="RAG Q&A System with GPU Acceleration and Resource Monitoring",
-            description="Ask a question and get an answer based on the retrieved context. The response is generated using a GPU-accelerated model. Resource usage is logged at the end of generation."
+            title="RAG/Non-RAG Q&A System",
+            description="Ask a question with or without using RAG. The response is generated using a GPU-accelerated model. RAM usage and referenced document IDs (for RAG) are logged."
         )
-        ("Placeholder for
-
+        gr.HTML("Placeholder for FAQ type (merge as buttons on the above interface) - front end as prompt engineering for the first message to force direction of conversion")
+
+
+        gr.HTML("Placeholder for https://huggingface.co/h2oai/h2o-danube3-500m-chat-GGUF as alternative")
+        gr.HTML("Placeholder for qwen 2 72b as alternative use checkbox and gradio client api call")
         gr.Markdown("# Qwen-0.5B-Instruct Language Model")
         gr.Markdown("This demo uses the Qwen-0.5B-Instruct model to generate responses based on your input.")
         gr.HTML("Example prompts: <br>I am writing a story about a chef. please write dishes to appear on the menu. <br>What are the most common decisions that a chef story would include? <br>What are the kinds problems that a chef story would include? <br>What are the kinds of out of reach goals that a chef story would include? <br>Continue this config - Paste any complete block of the config")
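Since the page header stresses that "HF + Gradio allows for api use", the updated interface can also be driven remotely once the Space is running. A sketch using the official gradio_client package; the Space ID and api_name are assumptions, not taken from the commit:

    from gradio_client import Client

    client = Client("kwabs22/your-space-name")  # hypothetical Space ID
    result = client.predict(
        "What is the capital of France?",  # question Textbox
        True,                              # "Use RAG" checkbox
        False,                             # "Stream output" checkbox
        api_name="/predict",               # assumed default endpoint name
    )
    print(result)  # (response, tokens per second, RAM usage, referenced documents)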