kanha-upadhyay committed
Commit 6586d52 · 1 Parent(s): ed86bf8

Add initial project structure with .gitignore, environment example, and utility modules

Files changed (8)
  1. .env.example +2 -0
  2. .gitignore +5 -0
  3. app.py +173 -0
  4. poetry.lock +0 -0
  5. pyproject.toml +23 -0
  6. requirements.txt +9 -0
  7. utils/__init__.py +5 -0
  8. utils/_file_parser.py +119 -0
.env.example ADDED
@@ -0,0 +1,2 @@
+ OPENAI_API_KEY=
+ LLAMA_CLOUD_API_KEY=
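
Note: python-dotenv is a declared dependency, but app.py never loads a .env file itself, so these keys must already be present in the process environment (for example via a hosting platform's secrets mechanism). For local runs, a minimal bootstrap sketch, assuming the standard python-dotenv API:

    # Hypothetical local bootstrap: copy .env.example to .env, fill in the keys,
    # then load them into the environment before starting the app.
    import os
    from dotenv import load_dotenv

    load_dotenv()  # reads .env from the current working directory
    assert os.environ.get("OPENAI_API_KEY"), "OPENAI_API_KEY is not set"
    assert os.environ.get("LLAMA_CLOUD_API_KEY"), "LLAMA_CLOUD_API_KEY is not set"
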
.gitignore ADDED
@@ -0,0 +1,5 @@
+ .env
+ .venv
+ __pycache__/
+ *.faiss
+ *.pkl
app.py ADDED
@@ -0,0 +1,173 @@
+ import os
+ import tempfile
+
+ import streamlit as st
+ from langchain_community.vectorstores import FAISS
+ from langchain_core.messages import AIMessage, HumanMessage
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_openai.chat_models import ChatOpenAI
+ from langchain_openai.embeddings import OpenAIEmbeddings
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+ from utils import FileParser
+
+ vector_database_name = "rag-poc"
+ vector_database_path = (
+     f"{os.environ.get('VECTOR_DATABASE_PATH', '.')}/{vector_database_name}"
+ )
+
+ RETRIEVER = None
+
+
+ def load_and_split(file, ocr_enabled):
+     # Write the uploaded file into a temporary directory so FileParser can read it from disk.
+     with tempfile.TemporaryDirectory() as temp_pdf_folder:
+         local_filepath = os.path.join(temp_pdf_folder, file.name)
+         with open(local_filepath, "wb") as f:
+             f.write(file.getvalue())
+         text = FileParser().parse(input_dir=temp_pdf_folder, ocr_enabled=ocr_enabled)
+     docs = []
+     if text:
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=512, chunk_overlap=100
+         )
+         texts = text_splitter.split_text(text)
+         docs = text_splitter.create_documents(
+             texts=texts, metadatas=[{"file_name": file.name}] * len(texts)
+         )
+     return docs
+
+
+ def initialize_vector_db():
+     # Seed the store with a single empty text so there is always a valid index on disk.
+     vector_database = FAISS.from_texts([""], OpenAIEmbeddings())
+     vector_database.save_local(vector_database_path)
+     return vector_database
+
+
+ def load_vector_db():
+     if os.path.exists(vector_database_path):
+         return FAISS.load_local(
+             vector_database_path,
+             OpenAIEmbeddings(),
+             allow_dangerous_deserialization=True,
+         )
+     return initialize_vector_db()
+
+
+ def append_to_vector_db(docs: list):
+     global RETRIEVER
+     existing_vector_db = load_vector_db()
+     new_vector_db = FAISS.from_documents(docs, OpenAIEmbeddings())
+     existing_vector_db.merge_from(new_vector_db)
+     existing_vector_db.save_local(vector_database_path)
+     RETRIEVER = existing_vector_db.as_retriever(search_kwargs={"k": 20})
+
+
+ def create_embeddings(files: list, ocr_enabled: bool = False):
+     for file in files:
+         docs = load_and_split(file=file, ocr_enabled=ocr_enabled)
+         if docs:
+             append_to_vector_db(docs=docs)
+             st.session_state.last_uploaded_files.append(file.name)
+             st.toast(f"{file.name} processed successfully")
+             print(f"{file.name} processed successfully")
+         else:
+             st.toast(f"{file.name} could not be processed")
+             print(f"{file.name} could not be processed")
+
+
+ def get_response(user_query, chat_history):
+     # Retrieve for the current query, then again for the concatenated history of
+     # user questions, and merge the two result sets without duplicates.
+     docs = RETRIEVER.invoke(user_query)
+     additional_info = RETRIEVER.invoke(
+         " ".join(
+             [
+                 message.content
+                 for message in chat_history
+                 if isinstance(message, HumanMessage)
+             ]
+         )
+     )
+     docs_content = [doc.page_content for doc in docs]
+     for doc in additional_info:
+         if doc.page_content not in docs_content:
+             docs.append(doc)
+     template = """
+     You are Sifa, a virtual assistant designed by Sifars.
+     Follow these mandatory considerations when responding to inquiries:
+     --- Tone - Respectful, Patient, and Encouraging:
+     Maintain a tone that is not only polite but also encouraging. Positive language can help build confidence, especially when users are trying to learn something new.
+     Be mindful of cultural references or idioms that may not be universally understood or may date back to a different era, ensuring relatability.
+     --- Clarity - Simple, Direct, and Unambiguous:
+     Avoid abbreviations, slang, or colloquialisms that might be confusing. Stick to standard language.
+     Use bullet points or numbered lists to break down instructions or information, which can aid comprehension.
+     --- Structure - Organized, Consistent, and Considerate:
+     Include relevant examples or analogies that relate to experiences common in the user's lifetime, which can aid in understanding complex topics.
+     --- Empathy and Understanding - Compassionate and Responsive:
+     Recognize and validate the user's feelings or concerns. Phrases like, “It’s completely normal to find this challenging,” can be comforting.
+     Be aware of the potential need for more frequent repetition or rephrasing of information for clarity.
+     Answer the following question considering the documents and/or the history of the conversation.
+     Chat history: {chat_history}
+     Documents from files: {retrieved_info}
+     User question: {user_question}
+     """
+
+     prompt = ChatPromptTemplate.from_template(template)
+     llm = ChatOpenAI(model="gpt-4o", streaming=True)
+
+     chain = prompt | llm | StrOutputParser()
+
+     return chain.stream(
+         {
+             "chat_history": chat_history,
+             "retrieved_info": docs,
+             "user_question": user_query,
+         }
+     )
+
+
+ def main():
+     st.set_page_config(page_title="RAG POC")
+     st.title("RAG POC")
+     if "last_uploaded_files" not in st.session_state:
+         st.session_state.last_uploaded_files = []
+     if "chat_history" not in st.session_state:
+         st.session_state.chat_history = [
+             AIMessage(content="Hello, I am Sifa. How can I help you?"),
+         ]
+     for message in st.session_state.chat_history:
+         if isinstance(message, AIMessage):
+             with st.chat_message("AI"):
+                 st.write(message.content)
+         elif isinstance(message, HumanMessage):
+             with st.chat_message("Human"):
+                 st.write(message.content)
+     user_query = st.chat_input("Type your message here...")
+     if user_query is not None and user_query != "":
+         st.session_state.chat_history.append(HumanMessage(content=user_query))
+         with st.chat_message("Human"):
+             st.markdown(user_query)
+         with st.chat_message("AI"):
+             response = st.write_stream(
+                 get_response(
+                     user_query=user_query, chat_history=st.session_state.chat_history
+                 )
+             )
+         st.session_state.chat_history.append(AIMessage(content=response))
+     uploaded_files = st.sidebar.file_uploader(
+         label="Upload files", accept_multiple_files=True
+     )
+     ocr_enabled = st.sidebar.checkbox("Enable OCR", value=False)
+     # Re-embed only files that have not already been processed in this session.
+     to_be_vectorised_files = [
+         item
+         for item in uploaded_files
+         if item.name not in st.session_state.last_uploaded_files
+     ]
+     if to_be_vectorised_files:
+         create_embeddings(files=to_be_vectorised_files, ocr_enabled=ocr_enabled)
+
+
+ if __name__ == "__main__":
+     # `streamlit run app.py` executes this script as __main__, so the retriever is
+     # initialized from the persisted index before the UI renders.
+     RETRIEVER = load_vector_db().as_retriever(search_kwargs={"k": 20})
+     main()
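
Note: the index is persisted across sessions via save_local/load_local and extended with merge_from. A minimal round-trip of that persistence flow, for reference (the path and texts here are illustrative, not part of the app):

    from langchain_community.vectorstores import FAISS
    from langchain_openai.embeddings import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings()
    db = FAISS.from_texts(["seed chunk"], embeddings)
    db.merge_from(FAISS.from_texts(["another chunk"], embeddings))  # same merge used by append_to_vector_db
    db.save_local("./rag-poc-demo")

    restored = FAISS.load_local(
        "./rag-poc-demo", embeddings, allow_dangerous_deserialization=True
    )
    print(restored.as_retriever(search_kwargs={"k": 1}).invoke("chunk")[0].page_content)
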
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,23 @@
+ [tool.poetry]
+ name = "rag-poc"
+ version = "0.1.0"
+ description = ""
+ authors = ["Kanha Upadhyay <kanha.upadhyay@sifars.com>"]
+ readme = "README.md"
+
+ [tool.poetry.dependencies]
+ python = "3.12.*"
+ streamlit = "^1.44.0"
+ python-dotenv = "^1.1.0"
+ openai = "^1.70.0"
+ llama-index = "^0.12.27"
+ langchain = "^0.3.22"
+ langchain-openai = "^0.3.11"
+ faiss-cpu = "^1.10.0"
+ langchain-core = "^0.3.49"
+ langchain-community = "^0.3.20"
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ streamlit==1.44.0
+ python-dotenv==1.1.0
+ openai==1.70.0
+ llama-index==0.12.27
+ langchain==0.3.22
+ langchain-openai==0.3.11
+ faiss-cpu==1.10.0
+ langchain-core==0.3.49
+ langchain-community==0.3.20
utils/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from ._file_parser import FileParser
+
+ __all__ = ["FileParser"]
+
+ __version__ = "0.1.0"
utils/_file_parser.py ADDED
@@ -0,0 +1,119 @@
+ from llama_index.core import SimpleDirectoryReader
+ from llama_index.core.schema import Document
+ from llama_parse import LlamaParse
+
+
+ class FileParser:
+     def __init__(self):
+         # Auto mode re-parses pages that contain images or tables with
+         # LlamaParse's higher-quality pipeline.
+         self.parser = LlamaParse(
+             result_type="markdown",
+             auto_mode=True,
+             auto_mode_trigger_on_image_in_page=True,
+             auto_mode_trigger_on_table_in_page=True,
+         )
+         # Route every supported extension through LlamaParse.
+         self.file_extractor = {
+             # Base types
+             ".pdf": self.parser,
+             # Documents and presentations
+             ".abw": self.parser,
+             ".cgm": self.parser,
+             ".cwk": self.parser,
+             ".doc": self.parser,
+             ".docx": self.parser,
+             ".docm": self.parser,
+             ".dot": self.parser,
+             ".dotm": self.parser,
+             ".hwp": self.parser,
+             ".key": self.parser,
+             ".lwp": self.parser,
+             ".mw": self.parser,
+             ".mcw": self.parser,
+             ".pages": self.parser,
+             ".pbd": self.parser,
+             ".ppt": self.parser,
+             ".pptm": self.parser,
+             ".pptx": self.parser,
+             ".pot": self.parser,
+             ".potm": self.parser,
+             ".potx": self.parser,
+             ".rtf": self.parser,
+             ".sda": self.parser,
+             ".sdd": self.parser,
+             ".sdp": self.parser,
+             ".sdw": self.parser,
+             ".sgl": self.parser,
+             ".sti": self.parser,
+             ".sxi": self.parser,
+             ".sxw": self.parser,
+             ".stw": self.parser,
+             ".sxg": self.parser,
+             ".uof": self.parser,
+             ".uop": self.parser,
+             ".uot": self.parser,
+             ".vor": self.parser,
+             ".wpd": self.parser,
+             ".wps": self.parser,
+             ".xml": self.parser,
+             ".zabw": self.parser,
+             ".epub": self.parser,
+             # Images and web pages
+             ".jpg": self.parser,
+             ".jpeg": self.parser,
+             ".png": self.parser,
+             ".gif": self.parser,
+             ".bmp": self.parser,
+             ".svg": self.parser,
+             ".tiff": self.parser,
+             ".webp": self.parser,
+             ".web": self.parser,
+             ".htm": self.parser,
+             ".html": self.parser,
+             # Spreadsheets
+             ".xlsx": self.parser,
+             ".xls": self.parser,
+             ".xlsm": self.parser,
+             ".xlsb": self.parser,
+             ".xlw": self.parser,
+             ".csv": self.parser,
+             ".dif": self.parser,
+             ".sylk": self.parser,
+             ".slk": self.parser,
+             ".prn": self.parser,
+             ".numbers": self.parser,
+             ".et": self.parser,
+             ".ods": self.parser,
+             ".fods": self.parser,
+             ".uos1": self.parser,
+             ".uos2": self.parser,
+             ".dbf": self.parser,
+             ".wk1": self.parser,
+             ".wk2": self.parser,
+             ".wk3": self.parser,
+             ".wk4": self.parser,
+             ".wks": self.parser,
+             ".123": self.parser,
+             ".wq1": self.parser,
+             ".wq2": self.parser,
+             ".wb1": self.parser,
+             ".wb2": self.parser,
+             ".wb3": self.parser,
+             ".qpw": self.parser,
+             ".xlr": self.parser,
+             ".eth": self.parser,
+             ".tsv": self.parser,
+             # Audio and video
+             ".mp3": self.parser,
+             ".mp4": self.parser,
+             ".mpeg": self.parser,
+             ".mpga": self.parser,
+             ".m4a": self.parser,
+             ".wav": self.parser,
+             ".webm": self.parser,
+         }
+
+     def parse(self, input_dir: str, ocr_enabled: bool = False) -> str:
+         # With OCR enabled, every supported file type goes through LlamaParse;
+         # otherwise SimpleDirectoryReader falls back to its default readers.
+         documents: list[Document] = SimpleDirectoryReader(
+             input_dir=input_dir,
+             file_extractor=self.file_extractor if ocr_enabled else None,
+         ).load_data()
+         return "\n".join([doc.text for doc in documents])