sid-0313 committed on
Commit
0fefc76
·
verified ·
1 Parent(s): 0659b87

Create src/utils/chatbot.py

Browse files
Files changed (1) hide show
  1. src/utils/chatbot.py +108 -0
src/utils/chatbot.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from openai import OpenAI
3
+ import os
4
+ from langchain_community.vectorstores import Chroma
5
+ from typing import List, Tuple
6
+ import re
7
+ import ast
8
+ import html
9
+ from utils.load_config import LoadConfig
10
+
11
+ client = OpenAI()
12
+ APPCFG = LoadConfig()
13
+
14
class ChatBot:
    """Answer user questions with retrieval-augmented generation.

    Passages are retrieved from a persisted Chroma vector store and sent to
    the OpenAI chat-completions API together with the recent chat history.
    The class keeps no instance state; every method is static.
    """

    # Base URL used to build the "View PDF" links in the rendered references.
    _SERVER_URL = "http://localhost:8000"
    # Parses the ``page_content=... metadata={...}`` text form of a document.
    # DOTALL lets the content span several lines.
    _DOCUMENT_TEXT = re.compile(r"page_content=(.*?) metadata=(\{.*\})", re.DOTALL)
    _SPECIAL_TOKENS = re.compile(r'\s*<EOS>\s*<pad>\s*')
    _WHITESPACE_RUN = re.compile(r'\s+')
    # En dash -> hyphen, plus the fi/fl ligatures that PDF text extraction
    # commonly produces -> plain letters.
    _CHAR_REPLACEMENTS = str.maketrans({"\u2013": "-", "\ufb01": "fi", "\ufb02": "fl"})

    @staticmethod
    def respond(chatbot: List, message: str, data_type: str = "Preprocessed doc", temperature: float = 0.0) -> Tuple:
        """Answer ``message`` using retrieved passages and append the reply to ``chatbot``.

        Args:
            chatbot: Chat history as a list of ``(user_message, bot_reply)``
                tuples. It is mutated in place.
            message: The user's new question.
            data_type: Which vector store to query: ``"Preprocessed doc"``
                (``APPCFG.persist_directory``) or
                ``"Upload doc: Process for RAG"``
                (``APPCFG.custom_persist_directory``).
            temperature: Sampling temperature forwarded to the chat model.

        Returns:
            A 3-tuple ``("", chatbot, retrieved_content)``. The first item is
            always an empty string (presumably used to clear the UI input box
            -- confirm against the caller). ``retrieved_content`` is the
            Markdown rendering of the retrieved passages, or ``None`` when no
            retrieval happened (unknown ``data_type`` or missing vector store).
        """
        if data_type == "Preprocessed doc":
            persist_directory = APPCFG.persist_directory
            missing_store_reply = (
                "VectorDB does not exist. Please first execute the 'upload_data_manually.py' module.")
        elif data_type == "Upload doc: Process for RAG":
            persist_directory = APPCFG.custom_persist_directory
            missing_store_reply = (
                "No file was uploaded. Please first upload your files using the 'upload' button.")
        else:
            # Without this branch ``vectordb`` would be unbound below and the
            # call would fail with UnboundLocalError.
            chatbot.append(
                (message,
                 f"Unknown data type '{data_type}'. Please choose 'Preprocessed doc' "
                 "or 'Upload doc: Process for RAG'."))
            return "", chatbot, None

        if not os.path.exists(persist_directory):
            chatbot.append((message, missing_store_reply))
            return "", chatbot, None

        vectordb = Chroma(persist_directory=persist_directory,
                          embedding_function=APPCFG.embedding_model)
        docs = vectordb.similarity_search(message, k=APPCFG.k)
        print(docs)

        question = "# User new question:\n" + message
        retrieved_content = ChatBot.clean_references(docs)

        # Memory: the last N question/answer pairs. ``chatbot[-0:]`` would be
        # the whole list, so a non-positive N must be handled explicitly.
        pairs_to_keep = APPCFG.number_of_q_a_pairs
        recent_history = chatbot[-pairs_to_keep:] if pairs_to_keep > 0 else []
        chat_history = f"Chat history:\n {str(recent_history)}\n\n"

        prompt = f"{chat_history}{retrieved_content}{question}"
        print("========================")
        print(prompt)

        response = client.chat.completions.create(
            model=APPCFG.llm_engine,
            messages=[
                {"role": "system", "content": APPCFG.llm_system_role},
                {"role": "user", "content": prompt},
            ],
            temperature=temperature,
        )
        chatbot.append((message, response.choices[0].message.content))
        # NOTE(review): fixed two-second pause before returning; its purpose is
        # not evident from this module (rate limiting?) -- confirm before removing.
        time.sleep(2)

        return "", chatbot, retrieved_content

    @staticmethod
    def clean_references(documents: List) -> str:
        """Render retrieved documents as a Markdown string.

        Each document becomes a ``# Retrieved content N:`` section holding the
        cleaned text, followed by a line with the source file name, the page
        number and a "View PDF" link.

        Args:
            documents: Retrieved documents. Items exposing ``page_content``
                (str) and ``metadata`` (dict) attributes are read directly;
                anything else is converted with ``str()`` and parsed from the
                ``page_content=... metadata={...}`` text form.

        Returns:
            The concatenated Markdown sections, or ``""`` for an empty list.

        Raises:
            ValueError: If an item is neither document-like nor parseable text.
            KeyError: If a document's metadata lacks ``source`` or ``page``.
        """
        from urllib.parse import quote

        sections = []
        for counter, document in enumerate(documents, start=1):
            raw_content, metadata = ChatBot._split_document(document)
            content = ChatBot._clean_content(raw_content)
            source_name = os.path.basename(metadata['source'])
            # Percent-encode the file name so spaces or parentheses in it do
            # not break the Markdown link.
            pdf_url = f"{ChatBot._SERVER_URL}/{quote(source_name)}"
            sections.append(
                f"# Retrieved content {counter}:\n{content}\n\n"
                f"Source: {source_name} | "
                f"Page number: {str(metadata['page'])} | "
                f"[View PDF]({pdf_url})\n\n"
            )
        return "".join(sections)

    @staticmethod
    def _split_document(document) -> Tuple:
        """Return ``(text, metadata_dict)`` for one retrieved document."""
        page_content = getattr(document, "page_content", None)
        metadata = getattr(document, "metadata", None)
        if isinstance(page_content, str) and isinstance(metadata, dict):
            # Read the fields directly: the str() form of a document is not a
            # stable interface, and decoding it back is lossy.
            return page_content, metadata

        parsed = ChatBot._DOCUMENT_TEXT.match(str(document))
        if parsed is None:
            raise ValueError(
                "Expected a document with 'page_content' and 'metadata', got: "
                f"{str(document)[:80]!r}")
        quoted_text, metadata_literal = parsed.groups()
        try:
            # literal_eval only accepts Python literals, so it is safe on text input.
            metadata = ast.literal_eval(metadata_literal)
        except (ValueError, SyntaxError) as err:
            raise ValueError(f"Could not parse document metadata: {metadata_literal[:80]!r}") from err

        try:
            # When the text is a quoted Python literal this recovers the real
            # characters (newlines, non-ASCII) without any re-encoding tricks.
            text = ast.literal_eval(quoted_text)
        except (ValueError, SyntaxError):
            text = quoted_text
        if not isinstance(text, str):
            text = quoted_text
        return text, metadata

    @staticmethod
    def _clean_content(content: str) -> str:
        """Normalise one passage: strip model tokens, collapse whitespace, decode entities."""
        # Turn literal backslash-n sequences into real newlines (collapsed below).
        content = content.replace("\\n", "\n")
        # Remove special tokens.
        content = ChatBot._SPECIAL_TOKENS.sub(' ', content)
        # Collapse every run of whitespace (including newlines) into one space.
        content = ChatBot._WHITESPACE_RUN.sub(' ', content).strip()
        # Decode HTML entities such as ``&amp;`` or ``&ndash;``.
        content = html.unescape(content)
        # Single pass for the character-level replacements.
        return content.translate(ChatBot._CHAR_REPLACEMENTS)