Spaces:
No application file
No application file
Create src/utils/chatbot.py
Browse files- src/utils/chatbot.py +108 -0
src/utils/chatbot.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
from openai import OpenAI
|
| 3 |
+
import os
|
| 4 |
+
from langchain_community.vectorstores import Chroma
|
| 5 |
+
from typing import List, Tuple
|
| 6 |
+
import re
|
| 7 |
+
import ast
|
| 8 |
+
import html
|
| 9 |
+
from utils.load_config import LoadConfig
|
| 10 |
+
|
| 11 |
+
# Module-level OpenAI client; constructed with no arguments, so it
# presumably picks up credentials (e.g. OPENAI_API_KEY) from the
# environment — TODO confirm against deployment config.
client = OpenAI()
# Application configuration loaded once at import time and shared by ChatBot:
# the class reads persist_directory, custom_persist_directory,
# embedding_model, k, llm_engine, llm_system_role and number_of_q_a_pairs.
APPCFG = LoadConfig()
| 13 |
+
|
| 14 |
+
class ChatBot:
    """
    Retrieval-augmented chatbot.

    Looks up the user's question in a Chroma vector store, builds a prompt
    from recent chat history plus the retrieved chunks, and answers with an
    OpenAI chat completion.
    """

    @staticmethod
    def respond(chatbot: List, message: str, data_type: str = "Preprocessed doc", temperature: float = 0.0) -> Tuple:
        """
        Answer `message` with RAG and append the exchange to `chatbot`.

        Args:
            chatbot: Gradio-style history — a list of (user, assistant)
                pairs. Mutated in place.
            message: The user's new question.
            data_type: "Preprocessed doc" uses the pre-built vector DB at
                APPCFG.persist_directory; "Upload doc: Process for RAG" uses
                the session-uploaded DB at APPCFG.custom_persist_directory.
            temperature: Sampling temperature for the completion.

        Returns:
            A tuple ("", updated history, retrieved references markdown) on
            success, or ("", updated history, None) when no vector DB exists.
        """
        # Select the vector store for the requested data source; bail out
        # with a user-facing message if it has not been created yet.
        if data_type == "Preprocessed doc":
            if not os.path.exists(APPCFG.persist_directory):
                chatbot.append(
                    (message, "VectorDB does not exist. Please first execute the 'upload_data_manually.py' module."))
                return "", chatbot, None
            vectordb = Chroma(persist_directory=APPCFG.persist_directory,
                              embedding_function=APPCFG.embedding_model)
        elif data_type == "Upload doc: Process for RAG":
            if not os.path.exists(APPCFG.custom_persist_directory):
                chatbot.append(
                    (message, "No file was uploaded. Please first upload your files using the 'upload' button."))
                return "", chatbot, None
            vectordb = Chroma(persist_directory=APPCFG.custom_persist_directory,
                              embedding_function=APPCFG.embedding_model)
        else:
            # Bug fix: the original fell through with `vectordb` unbound,
            # raising NameError for any unrecognized data_type value.
            chatbot.append((message, f"Unknown data type: {data_type}"))
            return "", chatbot, None

        docs = vectordb.similarity_search(message, k=APPCFG.k)
        question = "# User new question:\n" + message
        retrieved_content = ChatBot.clean_references(docs)
        # Short-term memory: include the last `number_of_q_a_pairs` entries
        # of the history in the prompt.
        chat_history = f"Chat history:\n {str(chatbot[-APPCFG.number_of_q_a_pairs:])}\n\n"
        prompt = f"{chat_history}{retrieved_content}{question}"
        response = client.chat.completions.create(
            model=APPCFG.llm_engine,
            messages=[
                {"role": "system", "content": APPCFG.llm_system_role},
                {"role": "user", "content": prompt}
            ],
            temperature=temperature,
        )
        chatbot.append((message, response.choices[0].message.content))
        # NOTE(review): fixed 2 s delay — presumably to pace the UI update;
        # confirm it is still needed.
        time.sleep(2)
        return "", chatbot, retrieved_content

    @staticmethod
    def clean_references(documents: List) -> str:
        """
        Format retrieved documents as markdown with source/page/PDF links.

        Assumes each item's str() looks like the LangChain Document repr,
        `page_content=... metadata={...}` — TODO confirm. Items that do not
        match are skipped (the original raised AttributeError on them).

        Args:
            documents: Retrieved documents (any objects whose str() matches
                the shape above).

        Returns:
            One markdown section per document:
            "# Retrieved content N:" + cleaned text + source/page/PDF line.
        """
        server_url = "http://localhost:8000"
        markdown_documents = ""
        for counter, doc in enumerate((str(x) + "\n\n" for x in documents), start=1):
            match = re.match(r"page_content=(.*?)( metadata=\{.*\})", doc)
            if match is None:
                continue  # defensive: unexpected repr shape
            content, metadata = match.groups()
            # Drop the leading " metadata=" and parse the dict literal safely.
            metadata_dict = ast.literal_eval(metadata.split('=', 1)[1])
            content = ChatBot._clean_content(content)
            source = os.path.basename(metadata_dict['source'])
            pdf_url = f"{server_url}/{source}"
            markdown_documents += (
                f"# Retrieved content {counter}:\n{content}\n\n"
                f"Source: {source} | Page number: {metadata_dict['page']} | [View PDF]({pdf_url})\n\n"
            )
        return markdown_documents

    @staticmethod
    def _clean_content(content: str) -> str:
        """Decode escape sequences and repair common mojibake in one chunk."""
        # Decode backslash escape sequences produced by repr() (\n, \xNN, ...).
        content = bytes(content, "utf-8").decode("unicode_escape")
        # Replace any literal escaped newlines that survived.
        content = re.sub(r'\\n', '\n', content)
        # Strip model special tokens.
        content = re.sub(r'\s*<EOS>\s*<pad>\s*', ' ', content)
        # Collapse runs of whitespace.
        content = re.sub(r'\s+', ' ', content).strip()
        # Decode HTML entities (&amp; etc.).
        content = html.unescape(content)
        # Repair UTF-8 text that was mis-decoded as latin-1 (mojibake).
        # Bug fix: 'ignore' on the encode side too, so characters outside
        # latin-1 no longer raise UnicodeEncodeError.
        content = content.encode('latin1', 'ignore').decode('utf-8', 'ignore')
        # Best-effort replacements for residual mangled symbols.
        # NOTE(review): the original applied the same pattern several times
        # with different replacements ('â' -> '-' then 'â' -> '∈' twice;
        # 'ï¬' -> 'fi' then 'ï¬' -> 'fl') plus an identity sub ('·' -> '·');
        # only the first substitution per pattern could ever fire, so the
        # dead duplicates are dropped here.
        for bad, good in (('â', '-'), ('Ã', '×'), ('ï¬', 'fi')):
            content = content.replace(bad, good)
        return content
|