Spaces:
Sleeping
Sleeping
Update run.py
Browse files
run.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
# Title: Gradio Interface to LLM-chatbot with dynamic RAG-functionality and ChromaDB
|
| 3 |
# Author: Andreas Fischer
|
| 4 |
# Date: October 10th, 2024
|
| 5 |
-
# Last update: October
|
| 6 |
##########################################################################################
|
| 7 |
|
| 8 |
import os
|
|
@@ -16,17 +16,18 @@ import ocrmypdf #convertPDF
|
|
| 16 |
from pypdf import PdfReader #convertPDF
|
| 17 |
import re #format_prompt
|
| 18 |
import gradio as gr # multimodal_response
|
| 19 |
-
from huggingface_hub import InferenceClient #multimodal_response
|
| 20 |
-
|
|
|
|
| 21 |
|
| 22 |
#---------------------------------------------------
|
| 23 |
# Specify models for text generation and embeddings
|
| 24 |
#---------------------------------------------------
|
| 25 |
|
| 26 |
myModel="mistralai/Mixtral-8x7b-instruct-v0.1"
|
| 27 |
-
#myModel="princeton-nlp/gemma-2-9b-it-SimPO"
|
| 28 |
-
#myModel="google/gemma-2-2b-it"
|
| 29 |
#myModel="meta-llama/Llama-3.1-8B-Instruct"
|
|
|
|
|
|
|
| 30 |
#mod=myModel
|
| 31 |
#tok=AutoTokenizer.from_pretrained(mod) #,token="hf_...")
|
| 32 |
#cha=[{"role":"system","content":"A"},{"role":"user","content":"B"},{"role":"assistant","content":"C"}]
|
|
@@ -34,6 +35,27 @@ myModel="mistralai/Mixtral-8x7b-instruct-v0.1"
|
|
| 34 |
#res=tok.apply_chat_template(cha)
|
| 35 |
#print(tok.decode(res))
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
jina = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True, torch_dtype=torch.bfloat16)
|
| 38 |
#jina.save_pretrained("jinaai_jina-embeddings-v2-base-de")
|
| 39 |
device='cuda:0' if torch.cuda.is_available() else 'cpu'
|
|
@@ -84,8 +106,8 @@ def format_prompt0(message, history):
|
|
| 84 |
|
| 85 |
def format_prompt(message, history=None, system=None, RAGAddon=None, system2=None, zeichenlimit=None,historylimit=4, removeHTML=False,
|
| 86 |
startOfString="<s>", template0=" [INST] {system} [/INST] </s>",template1=" [INST] {message} [/INST]",template2=" {response}</s>"): # mistralai/Mixtral-8x7B-Instruct-v0.1
|
| 87 |
-
#startOfString="<bos>",template0="<start_of_turn>user\n{system}<end_of_turn>\n<start_of_turn>model\n<end_of_turn>\n",template1="<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n",template2="<end_of_turn>\n"): # google/gemma-2-2b-it
|
| 88 |
-
#startOfString="", template0="<|start_header_id|>system<|end_header_id|>\n\n{system}\n<|eot_id|>", template1="<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id
|
| 89 |
if zeichenlimit is None: zeichenlimit=1000000000 # :-)
|
| 90 |
prompt = ""
|
| 91 |
if RAGAddon is not None:
|
|
@@ -256,17 +278,144 @@ def add_doc(path, session):
|
|
| 256 |
print(len(x))
|
| 257 |
if(len(x)==0):
|
| 258 |
chunkSize=40000
|
| 259 |
-
for i in range(round(len(corpus)/chunkSize+0.5)):
|
| 260 |
print("embed batch "+str(i)+" of "+str(round(len(corpus)/chunkSize+0.5)))
|
| 261 |
ids=list(range(i*chunkSize,(i*chunkSize+chunkSize)))
|
| 262 |
batch=corpus[i*chunkSize:(i*chunkSize+chunkSize)]
|
| 263 |
textIDs=[str(id) for id in ids[0:len(batch)]]
|
| 264 |
-
ids=[str(id+len(x)+1) for id in ids[0:len(batch)]]
|
| 265 |
collection.add(documents=batch, ids=ids,
|
| 266 |
-
metadatas=[{"date": str("2024-10-10")} for b in batch])
|
| 267 |
print("finished batch "+str(i)+" of "+str(round(len(corpus)/40000+0.5)))
|
| 268 |
now = datetime.now()
|
| 269 |
gr.Info(f"Indexing complete!")
|
| 270 |
-
print(now-then)
|
| 271 |
return(collection)
|
| 272 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
# Title: Gradio Interface to LLM-chatbot with dynamic RAG-functionality and ChromaDB
|
| 3 |
# Author: Andreas Fischer
|
| 4 |
# Date: October 10th, 2024
|
| 5 |
+
# Last update: October 26th, 2024
|
| 6 |
##########################################################################################
|
| 7 |
|
| 8 |
import os
|
|
|
|
| 16 |
from pypdf import PdfReader #convertPDF
|
| 17 |
import re #format_prompt
|
| 18 |
import gradio as gr # multimodal_response
|
| 19 |
+
from huggingface_hub import InferenceClient # multimodal_response
|
| 20 |
+
import json # multimodal_response (on-prem)
|
| 21 |
+
import requests # multimodal_response (on-prem)
|
| 22 |
|
| 23 |
#---------------------------------------------------
|
| 24 |
# Specify models for text generation and embeddings
|
| 25 |
#---------------------------------------------------
|
| 26 |
|
| 27 |
myModel="mistralai/Mixtral-8x7b-instruct-v0.1"
|
|
|
|
|
|
|
| 28 |
#myModel="meta-llama/Llama-3.1-8B-Instruct"
|
| 29 |
+
#myModel="QuantFactory/gemma-2-9b-it-SimPO-GGUF"
|
| 30 |
+
#myModel="bartowski/gemma-2-9b-it-GGUF"
|
| 31 |
#mod=myModel
|
| 32 |
#tok=AutoTokenizer.from_pretrained(mod) #,token="hf_...")
|
| 33 |
#cha=[{"role":"system","content":"A"},{"role":"user","content":"B"},{"role":"assistant","content":"C"}]
|
|
|
|
| 35 |
#res=tok.apply_chat_template(cha)
|
| 36 |
#print(tok.decode(res))
|
| 37 |
|
| 38 |
+
if "GGUF" in myModel:  # start a llama-cpp server for GGUF models on premises
    # NOTE(review): the diff view this was recovered from carries no indentation;
    # the warm-up request at the end is assumed to belong to this branch, because
    # nothing listens on port 2600 unless the server below was started.
    import subprocess
    import time

    #modelPath="/home/af/gguf/models/bartowski/gemma-2-9b-it-GGUF/gemma-2-9b-it-Q4_K_M.gguf"
    modelPath = "/home/af/gguf/models/QuantFactory/gemma-2-9b-it-SimPO-GGUF/gemma-2-9b-it-SimPO.Q4_K_M.gguf"
    if not os.path.exists(modelPath):
        # No local copy at the on-prem path: download the weights next to the script.
        #url="https://huggingface.co/bartowski/gemma-2-9b-it-GGUF/resolve/main/gemma-2-9b-it-Q4_K_M.gguf?download=true"
        url = "https://huggingface.co/QuantFactory/gemma-2-9b-it-SimPO-GGUF/resolve/main/gemma-2-9b-it-SimPO.Q4_K_M.gguf?download=true"
        # Stream to disk in chunks; the file is several GB, so holding it in
        # memory via response.content is not an option. raise_for_status()
        # prevents an HTML error page from being saved as "model.gguf".
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            with open("./model.gguf", mode="wb") as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)
        print("Model downloaded")
        modelPath = "./model.gguf"
    print(modelPath)

    command = ["python3", "-m", "llama_cpp.server", "--model", modelPath, "--host", "0.0.0.0", "--port", "2600", "--n_threads", "4", "--n_gpu_layers", "42"]  #20
    llamaServer = subprocess.Popen(command)

    # Popen returns immediately while the server is still loading the model, so
    # retry the warm-up request until the port accepts connections.
    url = "http://0.0.0.0:2600/v1/completions"
    body = {"prompt": "test", "max_tokens": 1000, "echo": False, "stream": False}  #e.g. Mixtral-Instruct
    test = None
    deadline = time.monotonic() + 600
    while test is None:
        if llamaServer.poll() is not None:
            raise RuntimeError("llama_cpp.server exited with code " + str(llamaServer.returncode))
        try:
            test = requests.post(url, json=body, stream=False)
        except requests.exceptions.ConnectionError:
            if time.monotonic() > deadline:
                raise
            time.sleep(1)
    print("Server ready!")
|
| 58 |
+
|
| 59 |
jina = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True, torch_dtype=torch.bfloat16)
|
| 60 |
#jina.save_pretrained("jinaai_jina-embeddings-v2-base-de")
|
| 61 |
device='cuda:0' if torch.cuda.is_available() else 'cpu'
|
|
|
|
| 106 |
|
| 107 |
def format_prompt(message, history=None, system=None, RAGAddon=None, system2=None, zeichenlimit=None,historylimit=4, removeHTML=False,
|
| 108 |
startOfString="<s>", template0=" [INST] {system} [/INST] </s>",template1=" [INST] {message} [/INST]",template2=" {response}</s>"): # mistralai/Mixtral-8x7B-Instruct-v0.1
|
| 109 |
+
#startOfString="<bos>",template0="<start_of_turn>user\n{system}<end_of_turn>\n<start_of_turn>model\n<end_of_turn>\n",template1="<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n",template2="{response}<end_of_turn>\n"): # google/gemma-2-2b-it
|
| 110 |
+
#startOfString="<|begin_of_text|>", template0="<|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n{system}\n<|eot_id|>", template1="<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", template2="{response}<|eot_id|>"): # meta-llama/Llama-3.1-8B-Instruct
|
| 111 |
if zeichenlimit is None: zeichenlimit=1000000000 # :-)
|
| 112 |
prompt = ""
|
| 113 |
if RAGAddon is not None:
|
|
|
|
| 278 |
print(len(x))
|
| 279 |
if(len(x)==0):
|
| 280 |
chunkSize=40000
|
| 281 |
+
for i in range(round(len(corpus)/chunkSize+0.5)): #0 is first batch, 3 is last (incomplete) batch given 133497 texts
|
| 282 |
print("embed batch "+str(i)+" of "+str(round(len(corpus)/chunkSize+0.5)))
|
| 283 |
ids=list(range(i*chunkSize,(i*chunkSize+chunkSize)))
|
| 284 |
batch=corpus[i*chunkSize:(i*chunkSize+chunkSize)]
|
| 285 |
textIDs=[str(id) for id in ids[0:len(batch)]]
|
| 286 |
+
ids=[str(id+len(x)+1) for id in ids[0:len(batch)]] # id refers to chromadb-unique ID
|
| 287 |
collection.add(documents=batch, ids=ids,
|
| 288 |
+
metadatas=[{"date": str("2024-10-10")} for b in batch]) #"textID":textIDs, "id":ids,
|
| 289 |
print("finished batch "+str(i)+" of "+str(round(len(corpus)/40000+0.5)))
|
| 290 |
now = datetime.now()
|
| 291 |
gr.Info(f"Indexing complete!")
|
| 292 |
+
print(now-then) #zu viel GB für sentences (GPU), bzw. 0:00:10.375087 für chunks
|
| 293 |
return(collection)
|
| 294 |
|
| 295 |
+
|
| 296 |
+
#--------------------------------------------------------
|
| 297 |
+
# Function for response to user queries and pot. addenda
|
| 298 |
+
#--------------------------------------------------------
|
| 299 |
+
|
| 300 |
+
def _parse_completion_event(line):
    """Return the text carried by one server-sent-event line, or None.

    Lines that do not start with "data:" (blank separators, ": ping" keep-alive
    comments) and the "[DONE]" terminator carry no text and yield None.
    Raises ValueError/KeyError/IndexError/TypeError on a malformed payload.
    """
    if isinstance(line, bytes):
        line = line.decode("utf-8")
    line = line.strip()
    if not line.startswith("data:"):
        return None
    payload = line[len("data:"):].strip()
    if not payload or payload == "[DONE]":
        return None
    return str(json.loads(payload)["choices"][0]["text"])


def _stream_local_completion(url, payload):
    """Yield text fragments from a streaming POST to the local completions endpoint."""
    # The context manager releases the connection even if the consumer stops early.
    with requests.post(url, json=payload, stream=True) as response:
        response.raise_for_status()
        # iter_lines() reassembles complete lines across network chunks, so no
        # event is ever cut in half or dropped.
        for line in response.iter_lines():
            try:
                piece = _parse_completion_event(line)
            except (ValueError, KeyError, IndexError, TypeError) as e:
                print("Exception:"+str(e))
                continue
            if piece:
                yield piece


def multimodal_response(message, history, dropdown, hfToken, request: gr.Request):
    """Answer a chat message using retrieved context and stream the reply.

    Generator used as the Gradio ChatInterface callback. Each yield is the
    full reply accumulated so far; the last yield appends a collapsible HTML
    list of the retrieved passages ("Quellen").

    Args:
        message: dict read via message["text"] (the query) and
            message["files"] (attachments; only the first one is indexed).
        history: previous chat turns, forwarded to format_prompt.
        dropdown: value of the "Variante" dropdown; not used by this function.
        hfToken: Hugging Face token; only used when it starts with "hf_".
        request: Gradio request; its session_hash selects the session database
            ("0" is used when no request is available).
    """
    print("def multimodal response!")
    global databases
    session = request.session_hash if request else "0"
    print(databases)
    if databases[-1][1] != session:
        databases.append((date.today(), session))

    query = message["text"]
    if len(message["files"]) > 0:  # is there at least one file attached?
        collection = add_doc(message["files"][0], session)
    else:  # otherwise, still fetch the collection of the session-based db
        collection = add_doc(message["text"], session)
    client = chromadb.PersistentClient(path=dbPath)
    print(str(client.list_collections()))

    # Retrieval query: with history, embed a plain string of the user queries
    # only; without history, embed the raw query.
    if len(history) > 0:
        ragQuery = [format_prompt(query, history, historylimit=2,
                                  startOfString="", template1="{message}\n\n", template2="")]
    else:
        ragQuery = [query]
    hits = collection.query(query_texts=ragQuery, n_results=3)
    context = ["Kontext "+str(i+1)+": \""+re.sub("\"", "'", str(c))+"\""
               for i, c in enumerate(hits["documents"][0])]
    gr.Info("Kontext:\n"+str(context))

    system = "Mit Blick auf das folgende Gespräch und den relevanten Kontext, antworte auf die aktuelle Frage des Nutzers. "+\
        "Antworte ausschließlich auf Basis der Informationen im Kontext.\n\nKontext:\n\n"+\
        str("\n\n".join(context))
    print(system)
    formatted_prompt = format_prompt(query, history, system=system)
    print(formatted_prompt)

    fallbackHint = "\nBis dahin helfen dir hoffentlich die folgenden Quellen weiter:"
    output = ""
    if "GGUF" not in myModel:
        # Hosted inference via the Hugging Face hub.
        if hfToken and hfToken.startswith("hf_"):  # use custom token if one is provided
            inferenceClient = InferenceClient(model=myModel, token=hfToken)
        else:
            inferenceClient = InferenceClient(myModel)
        generate_kwargs = dict(
            temperature=0.9,
            max_new_tokens=5000,
            top_p=0.95,
            repetition_penalty=1.0,
            do_sample=True,
            seed=42,
        )
        try:
            stream = inferenceClient.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
            for response in stream:
                output += response.token.text
                yield output
        except Exception as e:  # UI boundary: show a hint instead of crashing the chat
            print(str(e))
            output = "Für weitere Antworten von der KI gebe bitte einen gültigen HuggingFace-Token an."
            if len(context) > 0:
                output += fallbackHint
            yield output
    else:
        # On-prem inference via the local llama-cpp server.
        try:
            # JSON booleans, not the strings "False"/"True".
            generate_kwargs = {"prompt": formatted_prompt, "max_tokens": 1000, "echo": False, "stream": True}  #e.g. Mixtral-Instruct
            url = "http://0.0.0.0:2600/v1/completions"
            print("URL: "+url)
            print("User: "+str(message)+"\nAssistant: ")
            for part in _stream_local_completion(url, generate_kwargs):
                print(part, end="", flush=True)
                output += part
                yield output
        except Exception as e:  # UI boundary: show a hint instead of crashing the chat
            print(str(e))
            output = "Die KI antwortet gerade nicht."
            if len(context) > 0:
                output += fallbackHint
            yield output

    if len(context) > 0:
        output = output+"\n\n<br><details open><summary><strong>Quellen</strong></summary><br><ul>"+"".join(["<li>" + c + "</li>" for c in context])+"</ul></details>"
    yield output
|
| 402 |
+
|
| 403 |
+
#------------------------------
|
| 404 |
+
# Launch Gradio-ChatInterface
|
| 405 |
+
#------------------------------
|
| 406 |
+
|
| 407 |
+
|
| 408 |
+
# Chat UI: multimodal input (text plus file upload) handled by multimodal_response.
# The two additional inputs are passed to the callback after (message, history),
# in list order: the variant dropdown first, then the token textbox.
i = gr.ChatInterface(
    fn=multimodal_response,
    multimodal=True,
    title="Frag dein PDF",
    additional_inputs=[
        gr.Dropdown(
            label="Variante",
            value="1",
            choices=["1", "2", "3"],
            info="Wähle eine Variante",
        ),
        gr.Textbox(
            label="HF_token",
            value="",
        ),
    ],
)
i.launch()  #allowed_paths=["."])
|