TorchLLM commited on
Commit
d9e3edb
·
1 Parent(s): 5927b78

Initial commit for deploying the project

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ src/chroma_rag/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
src/.gitignore ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore all MP3 files
2
+ *.mp3
3
+
4
+ # Ignore all PDF files
5
+ *.pdf
6
+
7
+ # Ignore all MP4 files
8
+ *.mp4
9
+
10
+ # Ignore all TXT files
11
+ *.txt
12
+
13
+ # Ignore environment files
14
+ .env
15
+
16
+ # Ignore Python cache files
17
+ __pycache__/
18
+ *.pyc
src/Dockerfile ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use Python 3.10 as base image
FROM python:3.10-slim

# Set working directory in container
WORKDIR /app/src

# Copy requirements first so dependency installation is cached independently
# of source changes.
COPY requirements.txt .

# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the entire project
COPY . .

# Expose ports for API and Streamlit
EXPOSE 8000 8501

# Create script to run both services. printf is used instead of `echo` with
# backslash escapes because echo's escape handling differs between /bin/sh
# implementations (dash vs bash), which made the generated script fragile.
RUN printf '#!/bin/bash\npython response_api.py &\nsleep 5\nstreamlit run app.py\n' > ./start.sh

# Make the script executable
RUN chmod +x ./start.sh

# NOTE(fix): the original `ENV $(cat .env | xargs)` is not valid Dockerfile
# syntax -- ENV does not execute shell commands, so it could never load .env
# values (and baking secrets into image layers is unsafe anyway). Provide the
# environment at runtime instead:
#   docker run --env-file .env <image>

# Run the start script
CMD ["./start.sh"]
src/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Gemini_RAG
2
+ Create a multi-functionality LLM application which supports RAG with multiple agents.
src/__init__.py ADDED
File without changes
src/agents_rag/__init__.py ADDED
File without changes
src/agents_rag/agents.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Agent definitions for the RAG crew.

Builds one shared Gemini-backed LLM and four crewai Agents: a senior
researcher, a tech writer, a web-search query answerer, and a
context-only RAG answerer.
"""

import os

from crewai import Agent
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI

from agents_rag.tools import tool

# Load environment variables (GEMINI_API_KEY) from .env before building the LLM.
load_dotenv()

## call the gemini models
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash-exp",
    verbose=True,
    temperature=0.5,
    google_api_key=os.getenv("GEMINI_API_KEY"),
)

# Creating a senior researcher agent with memory and verbose mode
news_researcher = Agent(
    role="Senior Researcher",
    # fixed prompt typo: "Unccover" -> "Uncover"
    goal="Uncover ground breaking technologies in {topic}",
    verbose=True,
    memory=True,
    backstory=(
        "Driven by curiosity, you're at the forefront of"
        "innovation, eager to explore and share knowledge that could change"
        "the world."
    ),
    tools=[tool],
    llm=llm,
    allow_delegation=True,
)

## creating a write agent with custom tools responsible in writing news blog
news_writer = Agent(
    role="Writer",
    goal="Narrate compelling tech stories about {topic}",
    verbose=True,
    memory=True,
    backstory=(
        "With a flair for simplifying complex topics, you craft"
        "engaging narratives that captivate and educate, bringing new"
        "discoveries to light in an accessible manner."
    ),
    tools=[tool],
    llm=llm,
    allow_delegation=False,
)

writer_rag = Agent(
    role="Query Answerer using Web Search",
    goal="Create comprehensive and well-structured answers to given queries based on web search content.",
    backstory=(
        "Specialized in synthesizing information from multiple sources into "
        "coherent and engaging content while maintaining accuracy."
    ),
    tools=[tool],  # Ensure 'tool' is suitable for web search tasks
    llm=llm,  # Ensure 'llm' is configured for this agent's needs
    verbose=True,
    memory=True,
    allow_delegation=False,
)

# Context-only answerer: the original role/goal prompts were badly garbled
# English ("don't search on only of you don't found relvatent context then
# denie..."), which degrades LLM instruction-following; rewritten for clarity
# with the same intent.
rag_agent = Agent(
    role=(
        "RAG agent that answers the given query quickly, with citations, "
        "using only the provided context. Do not search the web; if no "
        "relevant context is found, decline to answer. Always respond in "
        "the requested format."
    ),
    goal=(
        "Create comprehensive and well-structured answers to given queries, "
        "with citations, based only on the given context, in the requested format."
    ),
    backstory=(
        "Specialized in synthesizing information from multiple sources into "
        "coherent and engaging content while maintaining accuracy."
    ),
    llm=llm,  # Ensure 'llm' is configured for this agent's needs
    verbose=True,
    memory=True,
    allow_delegation=False,
)
src/agents_rag/crew.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from crewai import Crew, Process
2
+
3
+ from agents_rag.agents import rag_agent, writer_rag
4
+ from agents_rag.tasks import rag_task, write_rag_task
5
+
6
+
7
def _strip_code_fences(result):
    """Return *result* as plain text with Markdown ```json fences removed.

    `Crew.kickoff` may return a CrewOutput object rather than a str in
    recent crewai versions, so the value is coerced with str() first
    (the original code called str methods directly and would crash).
    """
    text = str(result)
    text = text.removeprefix("```json")
    text = text.removesuffix("```")
    return text.replace("```", "")


def get_crew_response(query=None, context=None, url=None):
    """Run the appropriate crew for *query* and return its cleaned output.

    When *url* is a real URL (anything other than the literal sentinel
    string "None" used by the caller), the web-search crew answers from
    that URL; otherwise the context-only RAG crew answers from *context*.
    """
    if url != "None":
        ## Forming the tech focused crew with some enhanced configuration
        crew_search = Crew(
            agents=[writer_rag],
            tasks=[write_rag_task],
            process=Process.sequential,
        )

        ## starting the task execution process with enhanced feedback
        result_web = crew_search.kickoff(
            inputs={
                "query": f"{query}",
                "url": f"{url}",
            }
        )
        print(result_web)
        return _strip_code_fences(result_web)

    ## Forming the tech focused crew with some enhanced configuration
    crew_rag = Crew(
        agents=[rag_agent],
        tasks=[rag_task],
        process=Process.sequential,
    )

    ## starting the task execution process with enhanced feedback
    result_rag = crew_rag.kickoff(
        inputs={
            "query": f"{query}",
            "context": f"{context}",
        }
    )
    return _strip_code_fences(result_rag)
src/agents_rag/tasks.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Task definitions for the RAG crew.

The web-search and RAG task prompts contained many typos ("Give query",
"manention", "qurey", "generate generate", "two field" vs three fields)
that degrade LLM instruction-following; they are corrected here with the
same intent.
"""

from crewai import Task

from agents_rag.agents import news_researcher, news_writer, rag_agent, writer_rag
from agents_rag.tools import tool

# Research task
research_task = Task(
    description=(
        "Identify the next big trend in {topic}."
        "Focus on identifying pros and cons and the overall narrative."
        "Your final report should clearly articulate the key points,"
        "its market opportunities, and potential risks."
    ),
    expected_output="A comprehensive 3 paragraphs long report on the latest AI trends.",
    tools=[tool],
    agent=news_researcher,
)

# Writing task with language model configuration
write_task = Task(
    description=(
        "Compose an insightful article on {topic}."
        "Focus on the latest trends and how it's impacting the industry."
        "This article should be easy to understand, engaging, and positive."
    ),
    expected_output="A 4 paragraph article on {topic} advancements formatted as markdown.",
    tools=[tool],
    agent=news_writer,
    async_execution=False,
    # output_file="new-blog-post.md", # Example of output customization
)

# Web-search answering task: answer from a user-supplied URL when given,
# otherwise fall back to open web search.
write_rag_task = Task(
    description=(
        "Answer the given query: {query} like a RAG application from the given url: {url} only. "
        "If the user mentions a url then use only that url to answer the query; "
        "otherwise you can do a web-search to answer the query. "
        "Focus on the latest trends. "
        "The answer should be easy to understand, engaging, and positive. "
        "Also show the sources of the content that were used to generate the answer. "
        "All sources are shown as citations; each citation has a number, and at the "
        "end there is a map of citation numbers to sources."
    ),
    expected_output=(
        "Give a normal-length answer; if the user asked for a specific length "
        "(e.g. 'small' or 'big'), match that length. Answer {query} formatted as a "
        "single JSON object with three fields: 'Answer' containing the generated "
        "answer, 'context' containing the context used to generate it, and "
        "'citations'. If required, use a table format."
    ),
    agent=writer_rag,
    tools=[tool],  # Ensure 'tool' is appropriate for the task
    async_execution=False,
    output_file="new-blog-post.md",  # Specify the desired output file path
)

# Context-only RAG answering task: answer strictly from the provided context.
rag_task = Task(
    description=(
        "Answer the given query: {query} like a RAG application using the given context: {context}. "
        "Focus on the latest trends. "
        "The answer should be easy to understand, engaging, and positive. "
        "Also show the sources of the content that were used to generate the answer. "
        "All sources are shown as citations; each citation has a number, and at the "
        "end there is a map of citation numbers to sources."
    ),
    expected_output=(
        "Give a normal-length answer; if the user asked for a specific length "
        "(e.g. 'small' or 'big'), match that length. Answer {query} formatted as "
        "JSON only, with three fields: 'Answer' containing the generated answer, "
        "'context' containing the context used to generate it, and 'citations'. "
        "If required, use a table format."
    ),
    agent=rag_agent,
    async_execution=False,
    # output_file="new-blog-post.md", # Specify the desired output file path
)
src/agents_rag/tools.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
## https://serper.dev/
"""Web-search tool setup: exposes a SerperDevTool instance as `tool`."""

import os

from dotenv import load_dotenv

# Load SERPER_API_KEY from .env before initializing the search tool.
load_dotenv()

_serper_key = os.getenv("SERPER_API_KEY")
if _serper_key is None:
    # Fail fast with a clear message. The original code assigned the result of
    # os.getenv directly into os.environ, which raises an opaque
    # "TypeError: str expected, not NoneType" when the variable is unset.
    raise RuntimeError("SERPER_API_KEY is not set; add it to your .env file.")
os.environ["SERPER_API_KEY"] = _serper_key

from crewai_tools import SerperDevTool

# Initialize the tool for internet searching capabilities
tool = SerperDevTool()
src/app.py ADDED
@@ -0,0 +1,476 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import json
2
+ # import os
3
+ # import re
4
+ # import time
5
+ # from pathlib import Path
6
+ # from typing import Dict, List
7
+
8
+ # import requests
9
+ # import streamlit as st
10
+
11
+
12
+ # def extract_and_verify_url(string):
13
+ # """Extract the URL from the string and verify if it points to content."""
14
+ # url_pattern = re.compile(
15
+ # r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
16
+ # )
17
+ # match = url_pattern.search(string)
18
+ # if match:
19
+ # url = match.group()
20
+ # try:
21
+ # response = requests.head(url, allow_redirects=True, timeout=5)
22
+ # if response.status_code == 200:
23
+ # return {
24
+ # "url": url,
25
+ # "status": "Valid and content exists",
26
+ # "status_code": 200,
27
+ # }
28
+ # else:
29
+ # return {
30
+ # "url": url,
31
+ # "status": f"Invalid (HTTP {response.status_code})",
32
+ # "status_code": response.status_code,
33
+ # }
34
+ # except requests.RequestException as e:
35
+ # return {"url": url, "status": f"Error: {e}"}
36
+ # return {"url": "", "status": "No URL found"}
37
+
38
+
39
+ # def stream_data(data_val):
40
+ # for word in data_val.split(" "):
41
+ # yield word + " "
42
+ # time.sleep(0.02)
43
+
44
+
45
+ # def list_files_in_directory(directory):
46
+ # """List all files in the given directory."""
47
+ # try:
48
+ # return os.listdir(directory)
49
+ # except FileNotFoundError:
50
+ # return []
51
+
52
+
53
+ # def save_uploaded_file(uploaded_file, directory):
54
+ # """Save the uploaded file to the specified directory."""
55
+ # with open(os.path.join(directory, uploaded_file.name), "wb") as f:
56
+ # f.write(uploaded_file.getbuffer())
57
+
58
+ # return os.path.join(directory, uploaded_file.name)
59
+
60
+
61
+ # def call_rag_api(
62
+ # query: str,
63
+ # url: str,
64
+ # is_uploaded: bool = False,
65
+ # ) -> Dict:
66
+ # """Call the RAG API and get a response."""
67
+ # endpoint = f"http://127.0.0.1:8000/get-response"
68
+ # payload = {"query": query, "is_uploaded": is_uploaded, "url": url}
69
+
70
+ # try:
71
+ # response = requests.post(endpoint, json=payload)
72
+ # response.raise_for_status()
73
+ # result = response.json()
74
+ # print(type(result))
75
+ # print(result)
76
+ # return {
77
+ # "status": "success",
78
+ # "response": result["response"],
79
+ # "context": result["context"],
80
+ # "citations": result["citations"],
81
+ # }
82
+ # except requests.exceptions.RequestException as e:
83
+ # return {"status": "error", "message": str(e)}
84
+
85
+
86
+ # # Main Streamlit app
87
+ # def main():
88
+ # st.title("🤖 RAG Chat Assistant")
89
+
90
+ # # Sidebar inputs and actions
91
+ # with st.sidebar:
92
+ # st.header("📚 Document Control")
93
+
94
+ # # Input directory
95
+ # directory = st.text_input("Enter the directory path:", value="data")
96
+
97
+ # # Ensure directory exists
98
+ # Path(directory).mkdir(parents=True, exist_ok=True)
99
+
100
+ # # Display files in the directory
101
+ # st.subheader("Files in Directory")
102
+ # files = list_files_in_directory(directory)
103
+ # if files:
104
+ # st.write(files)
105
+ # else:
106
+ # st.write("No files found.")
107
+
108
+ # # Upload file
109
+ # st.subheader("Upload a File")
110
+ # uploaded_file = st.file_uploader(
111
+ # "Choose a file", type=["txt", "pdf", "doc", "docx", "mp3", "mp4"]
112
+ # )
113
+ # if uploaded_file:
114
+ # file_path = save_uploaded_file(uploaded_file, directory)
115
+ # with st.spinner:
116
+ # endpoint = f"http://127.0.0.1:8000/process-file"
117
+ # payload = {"file_path": file_path}
118
+ # result = requests.post(endpoint, json=payload)
119
+ # st.success(result["response"])
120
+ # st.success(f"File '{uploaded_file.name}' uploaded successfully!")
121
+
122
+ # # Delete a file
123
+ # st.subheader("Delete a File")
124
+ # if files:
125
+ # file_to_delete = st.selectbox("Select a file to delete:", options=files)
126
+ # if st.button("Delete File"):
127
+ # try:
128
+ # os.remove(os.path.join(directory, file_to_delete))
129
+ # st.success(f"File '{file_to_delete}' deleted successfully!")
130
+ # except Exception as e:
131
+ # st.error(f"Error deleting file: {e}")
132
+
133
+ # # Chat system status
134
+ # st.divider()
135
+ # st.markdown("### System Status")
136
+ # if uploaded_file:
137
+ # st.success("Document loaded")
138
+ # else:
139
+ # st.info("No document uploaded")
140
+
141
+ # # Initialize chat history
142
+ # if "messages" not in st.session_state:
143
+ # st.session_state.messages = []
144
+
145
+ # # Display chat messages in streaming manner
146
+ # chat_placeholder = st.container()
147
+
148
+ # for message in st.session_state.messages:
149
+ # with chat_placeholder.container():
150
+ # with st.chat_message(message["role"]):
151
+ # st.markdown(message["content"])
152
+ # if message["role"] == "assistant" and "context" in message:
153
+ # with st.expander("View source context"):
154
+ # st.info(message["context"])
155
+
156
+ # # Chat input
157
+ # if prompt := st.chat_input("Ask me anything about your documents..."):
158
+ # res = extract_and_verify_url(prompt)
159
+ # print(res)
160
+ # if res["url"] != None:
161
+ # print(res["url"])
162
+ # st.session_state.messages.append({"role": "user", "content": prompt})
163
+ # with chat_placeholder.container():
164
+ # with st.chat_message("user"):
165
+ # st.markdown(prompt)
166
+
167
+ # with chat_placeholder.container():
168
+ # with st.chat_message("assistant"):
169
+ # with st.spinner("Thinking..."):
170
+ # result = call_rag_api(
171
+ # url=res["url"],
172
+ # query=prompt,
173
+ # is_uploaded=uploaded_file is not None,
174
+ # )
175
+
176
+ # if result["status"] == "success":
177
+
178
+ # response_content = result["response"]
179
+ # context = result["context"]
180
+ # citations = result["citations"]
181
+
182
+ # # st.markdown(response_content)
183
+ # st.write_stream(stream_data(response_content))
184
+ # with st.expander("View source context"):
185
+ # st.json(citations)
186
+
187
+ # st.session_state.messages.append(
188
+ # {
189
+ # "role": "assistant",
190
+ # "content": response_content,
191
+ # "context": context,
192
+ # "citations": citations,
193
+ # }
194
+ # )
195
+ # else:
196
+ # st.error(
197
+ # f"Error: {result.get('message', 'Unknown error occurred')}"
198
+ # )
199
+
200
+
201
+ # if __name__ == "__main__":
202
+ # main()
203
+
204
+
205
+ import json
206
+ import os
207
+ import re
208
+ import time
209
+ from pathlib import Path
210
+ from typing import Dict, List
211
+
212
+ import requests
213
+ import streamlit as st
214
+ from streamlit_js_eval import streamlit_js_eval
215
+
216
+
217
def extract_and_verify_url(string):
    """Extract the first URL from *string* and verify that it is reachable.

    Returns a dict with keys:
      - "url": the matched URL, or "" when none is found
      - "status": human-readable result
      - "status_code": HTTP status (present only when a HEAD request completed)
    """
    # NOTE(fix): the original pattern wrote "\\(" / "\\)" inside a raw string,
    # which matches a literal backslash followed by a paren -- parentheses need
    # no escaping inside a character class, so they are listed bare here.
    url_pattern = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    match = url_pattern.search(string)
    if not match:
        return {"url": "", "status": "No URL found"}

    url = match.group()
    try:
        # HEAD avoids downloading the body; follow redirects to the final target.
        response = requests.head(url, allow_redirects=True, timeout=5)
    except requests.RequestException as e:
        return {"url": url, "status": f"Error: {e}"}

    if response.status_code == 200:
        return {
            "url": url,
            "status": "Valid and content exists",
            "status_code": 200,
        }
    return {
        "url": url,
        "status": f"Invalid (HTTP {response.status_code})",
        "status_code": response.status_code,
    }
242
+
243
+
244
def stream_data(data_val):
    """Yield each space-delimited word of *data_val* followed by a single
    space, pausing briefly after every word to simulate streamed output."""
    tokens = data_val.split(" ")
    for token in tokens:
        yield f"{token} "
        time.sleep(0.02)
248
+
249
+
250
def list_files_in_directory(directory):
    """Return the entries of *directory*; an empty list if it does not exist."""
    try:
        entries = os.listdir(directory)
    except FileNotFoundError:
        entries = []
    return entries
256
+
257
+
258
def save_uploaded_file(uploaded_file, directory):
    """Write *uploaded_file*'s bytes into *directory*; return the saved path."""
    destination = os.path.join(directory, uploaded_file.name)
    with open(destination, "wb") as out:
        out.write(uploaded_file.getbuffer())
    return destination
263
+
264
+
265
def call_rag_api(query: str, url: str, is_uploaded: bool = False) -> Dict:
    """POST *query* to the local RAG endpoint and normalize the reply.

    Returns {"status": "success", "response", "context", "citations"} on
    success, or {"status": "error", "message"} when the request fails.
    """
    payload = {"query": query, "is_uploaded": is_uploaded, "url": url}
    try:
        reply = requests.post("http://127.0.0.1:8000/get-response", json=payload)
        reply.raise_for_status()
        body = reply.json()
        return {
            "status": "success",
            "response": body["response"],
            "context": body["context"],
            "citations": body["citations"],
        }
    except requests.exceptions.RequestException as exc:
        return {"status": "error", "message": str(exc)}
282
+
283
+
284
def call_llm_api(query: str) -> Dict:
    """POST *query* to the local LLM endpoint.

    Returns {"status": "success", "response"} on success, or
    {"status": "error", "message"} when the request fails.
    """
    try:
        reply = requests.post(
            "http://127.0.0.1:8000/llm-response", json={"query": query}
        )
        reply.raise_for_status()
        return {"status": "success", "response": reply.json()["response"]}
    except requests.exceptions.RequestException as exc:
        return {"status": "error", "message": str(exc)}
296
+
297
+
298
+ # Main Streamlit app
299
def main():
    """Streamlit entry point for the multi-functional chat assistant.

    Renders a sidebar for mode selection and document management
    (list / upload / delete files), then a chat interface whose behavior
    depends on the selected mode: plain LLM answering, a web-search
    agent, or context-only RAG.
    """
    st.title("🤖 Multi-Functional Chat Assistant")

    # Sidebar inputs and actions
    with st.sidebar:

        # Chat functionality selection
        mode = st.radio("Select Mode:", ["LLM Answering", "Web Search Agent", "RAG"])

        st.header("📂 Document Control")

        # Input directory
        directory = st.text_input("Enter the directory path:", value="data")

        # Ensure directory exists
        Path(directory).mkdir(parents=True, exist_ok=True)

        # Display files in the directory
        st.subheader("Files in Directory")
        files = list_files_in_directory(directory)
        if files:
            st.write(files)
        else:
            st.write("No files found.")

        # Upload file: save locally, then ask the backend to index it.
        st.subheader("Upload a File")
        uploaded_file = st.file_uploader(
            "Choose a file", type=["txt", "pdf", "doc", "docx", "mp3", "mp4"]
        )
        if uploaded_file:
            file_path = save_uploaded_file(uploaded_file, directory)
            endpoint = f"http://127.0.0.1:8000/process-file"
            payload = {"file_path": file_path}

            with st.spinner("File is in process..."):
                # NOTE(review): the backend response is not checked for errors
                # here -- a failed indexing still shows "uploaded successfully".
                response = requests.post(endpoint, json=payload)

            st.success(f"File '{uploaded_file.name}' uploaded successfully!")
            time.sleep(3)
            # Force a full browser reload so the file list reflects the upload.
            streamlit_js_eval(js_expressions="parent.window.location.reload()")

        # Delete a file: tell the backend to drop it, then remove it locally.
        st.subheader("Delete a File")
        if files:
            file_to_delete = st.selectbox("Select a file to delete:", options=files)
            if st.button("Delete File"):
                try:
                    payload = {"file_path": file_to_delete}
                    endpoint = f"http://127.0.0.1:8000/delete-file"

                    with st.spinner("File is deleting..."):
                        response = requests.post(endpoint, json=payload)
                        os.remove(os.path.join(directory, file_to_delete))

                    st.success(f"File '{file_to_delete}' deleted successfully!")
                    time.sleep(3)
                    # Reload so the sidebar file list is refreshed.
                    streamlit_js_eval(js_expressions="parent.window.location.reload()")
                except Exception as e:
                    st.error(f"Error deleting file: {e}")

        # Chat system status
        st.divider()
        st.markdown("### System Status")
        if uploaded_file:
            st.success("Document loaded")
        else:
            st.info("No document uploaded")

    # Initialize chat history (persists across Streamlit reruns).
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Display chat messages in streaming manner
    chat_placeholder = st.container()

    for message in st.session_state.messages:
        with chat_placeholder.container():
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

    # # Chat functionality selection
    # mode = st.radio("Select Mode:", ["LLM Answering", "Web Search Agent", "RAG"])

    # Chat input
    if prompt := st.chat_input("Ask me anything..."):
        st.session_state.messages.append({"role": "user", "content": prompt})
        with chat_placeholder.container():
            with st.chat_message("user"):
                st.markdown(prompt)

        with chat_placeholder.container():
            with st.chat_message("assistant"):
                with st.spinner("Thinking..."):
                    # Default so the citations expander below works even for
                    # modes (LLM Answering) that produce no citations.
                    citations = []
                    if mode == "LLM Answering":
                        result = call_llm_api(prompt)
                        if result["status"] == "success":
                            response_content = result.get("response", "")
                            st.write_stream(stream_data(response_content))

                            st.session_state.messages.append(
                                {"role": "assistant", "content": response_content}
                            )

                        else:
                            st.error(
                                f"Error: {result.get('message', 'Unknown error occurred')}"
                            )
                    elif mode == "Web Search Agent":
                        # Pass any URL found in the prompt ("" when none) so the
                        # backend can restrict the search to it.
                        res = extract_and_verify_url(prompt)
                        result = call_rag_api(
                            url=res.get("url", ""),
                            query=prompt,
                            is_uploaded=uploaded_file is not None,
                        )
                        if result["status"] == "success":

                            response_content = result["response"]
                            context = result["context"]
                            citations = result["citations"]

                            # st.markdown(response_content)
                            st.write_stream(stream_data(response_content))
                            # with st.expander("View source context"):
                            #     st.json(citations)

                            st.session_state.messages.append(
                                {
                                    "role": "assistant",
                                    "content": response_content,
                                    "context": context,
                                    "citations": citations,
                                }
                            )
                        else:
                            st.error(
                                f"Error: {result.get('message', 'Unknown error occurred')}"
                            )

                    elif mode == "RAG":
                        res = extract_and_verify_url(prompt)
                        # NOTE(review): the literal string "None" appears to be
                        # the sentinel the backend uses to select the
                        # context-only RAG path -- confirm against the API.
                        result = call_rag_api(
                            url="None",
                            query=prompt,
                            is_uploaded=uploaded_file is not None,
                        )

                        if result["status"] == "success":

                            response_content = result["response"]
                            context = result["context"]
                            citations = result["citations"]

                            # st.markdown(response_content)
                            st.write_stream(stream_data(response_content))
                            # with st.expander("View source context"):
                            #     st.json(citations)

                            st.session_state.messages.append(
                                {
                                    "role": "assistant",
                                    "content": response_content,
                                    "context": context,
                                    "citations": citations,
                                }
                            )
                        else:
                            st.error(
                                f"Error: {result.get('message', 'Unknown error occurred')}"
                            )

                    # Shown for every mode; empty list for LLM Answering.
                    with st.expander("View source context"):
                        st.json(citations)


if __name__ == "__main__":
    main()
src/assistants/__init__.py ADDED
File without changes
src/assistants/assistant_v1.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import os
2
+ # from typing import List
3
+
4
+ # import google.generativeai as genai
5
+ # from dotenv import load_dotenv
6
+ # from langchain.prompts import PromptTemplate
7
+ # from langchain.schema import StrOutputParser
8
+ # from pydantic import BaseModel, Field
9
+
10
+ # from .gemini_assistant import GeminiAssistant
11
+
12
+ # load_dotenv()
13
+
14
+ # api_key = os.getenv("GEMINI_API_KEY")
15
+ # model_name = "gemini-2.0-flash-exp"
16
+
17
+
18
+ # class GenerativeAnswer(BaseModel):
19
+ # answer: str = Field(
20
+ # ...,
21
+ # description="""The answer to the user question, which is based only on the given sources.\n
22
+ # example: Artificial neural networks were inspired by information processing and distributed communication nodes in biological systems. ANNs have various differences from biological brains. Specifically, artificial neural networks tend to be static and symbolic, while the biological brain of most living organisms is dynamic (plastic) and analog (vec-001). However, ANNs have demonstrated significant capabilities in tasks such as pattern recognition, classification, and prediction, making them valuable tools in various fields (vec-002). Despite their effectiveness in certain applications, ANNs still lack the complexity and adaptability of biological brains, leading some researchers to view them as low-quality models for brain function (vec-003). Furthermore, advancements in neuroscience continue to provide insights into the workings of the brain, which may further differentiate artificial neural networks from their biological counterparts (vec-004). However, ANNs remain a powerful computational tool, offering practical solutions to a wide range of problems, albeit with inherent limitations compared to biological systems (vec-005) and so on.....
23
+ # """,
24
+ # )
25
+ # source_id: List[str] = Field(
26
+ # ...,
27
+ # description="The ID of a SPECIFIC source which justifies the answer.",
28
+ # )
29
+ # quote: List[str] = Field(
30
+ # ...,
31
+ # description="The VERBATIM quote from the specified source that justifies the answer.",
32
+ # )
33
+ # related_questions: List[str] = Field(
34
+ # ...,
35
+ # description="Related questions based on the user question and generated answer.",
36
+ # )
37
+
38
+ # # [INST]<<SYS>>You are an Intelligent Assistant for question-answering tasks.
39
+ # # Your knowledge base consists of documents related to Convene and its resources.
40
+ # # Convene is a community of Christian business leaders who gather monthly in advisory teams worldwide to share insights, face challenges, and grow in faith through executive learning. Members can join or lead a Convene Team, participate in One2One™ Coaching, and access exclusive events. The organization, founded in 1996 by Brian Thatcher and others from Saddleback Church, aims to help CEOs integrate faith with business to build profitable, faith-based enterprises. Services include peer advisory, executive coaching, and business consulting, focusing on enhancing business and leadership skills while maintaining a faith foundation. Convene emphasizes balancing business success with spiritual growth and family leadership.
41
+
42
+ # # Your [Task] is to answer questions accurately and concisely based on the context provided. You will strictly discuss topics within the scope of the context provided and Convene's activities, history, services, and community impacts as outlined.
43
+ # # Identify if the User question aligns with the context or Convene's interests. If the information requested is outside of the context or Convene's provided data or knowledge base, You should politely decline the request and tell the user to ask related questions.
44
+ # # Only provide answers based on the information available in your context and the above topics.
45
+
46
+ # # Use the following steps and pieces of retrieved context to answer the question. The answer should be detailed and concise.
47
+
48
+
49
+ # prompt = """
50
+ # You are a friendly and helpful assistant.
51
+ # Ensure your answers are complete, unless the user requests a more concise approach.
52
+ # When generating code, offer explanations for code segments as necessary and maintain good coding practices.
53
+ # When presented with inquiries seeking information, provide answers that reflect a deep understanding of the field, guaranteeing their correctness.
54
+ # For any non-english queries, respond in the same language as the prompt unless otherwise specified by the user.
55
+ # For prompts involving reasoning, provide a clear explanation of each step in the reasoning process before presenting the final answer.
56
+
57
+ # [Task]
58
+ # Your task is to answer questions accurately and concisely based on the context provided. You will strictly discuss topics within the scope of the provided context, including its activities, history, services, and impacts.
59
+ # Assess whether the user’s question aligns with the context or the specified areas of interest.
60
+ # If the information requested falls outside the provided context or scope, politely decline the request and advise the user to ask relevant questions within the given context.
61
+ # Only provide answers based on the information available in the given context and the specified areas of interest.
62
+
63
+ # The answer should look like:
64
+ # Example: Artificial neural networks were inspired by information processing and distributed communication nodes in biological systems. ANNs have various differences from biological brains. Specifically, artificial neural networks tend to be static and symbolic, while the biological brain of most living organisms is dynamic (plastic) and analog (vec-001). However, ANNs have demonstrated significant capabilities in tasks such as pattern recognition, classification, and prediction, making them valuable tools in various fields (vec-002). Despite their effectiveness in certain applications, ANNs still lack the complexity and adaptability of biological brains, leading some researchers to view them as low-quality models for brain function (vec-003). Furthermore, advancements in neuroscience continue to provide insights into the workings of the brain, which may further differentiate artificial neural networks from their biological counterparts (vec-004). However, ANNs remain a powerful computational tool, offering practical solutions to a wide range of problems, albeit with inherent limitations compared to biological systems (vec-005) and so on.....
65
+ # means your answer must contains source_id in given format.
66
+ # Output should follow the pattern defined in schema and the output should be in json format only so that it can be directly used with json.loads()
67
+ # <</SYS>>
68
+ # steps:
69
+ # 1. use the provided data.
70
+ # 2. Check if the question is related to Convene or related to the context provided. If the question is not related, ask them politely to ask related questions. Do not follow further steps but use the wrong_question_schema to output the answer.
71
+ # 3. use one source and extract the sentences from it that justify the answer. All the extracted source sentences must be unique.
72
+ # 4. use the sentences to generate the answer and append its source_id to the answer with prefix "vec-". You must append its source_id only if it exists. Do not hallucinate or add from your imagination. Only add source_id which you get from the context. This is a strict requirement.
73
+ # 5. use other source and extract the sentences from it that justify the answer. NOTE: use at least 3 sources to generate the answer.
74
+ # 6. If the user has asked question related to the context or Convene, use the whole answer and user's question to generate 3 related questions. If the question is outside the context, do not generate related questions.
75
+
76
+ # NOTE: Make sure no out of bounds questions or questions that are not in the context or knowledge base are answered. This is mandatory that no out of context questions are answered."""
77
+
78
+ # gemini_rag_assistant = GeminiAssistant(
79
+ # model=model_name,
80
+ # api_key=api_key,
81
+ # system_prompt=prompt,
82
+ # output_model=GenerativeAnswer,
83
+ # )
84
+
85
+
86
+ # import os
87
+ # from typing import List
88
+
89
+ # import google.generativeai as genai
90
+ # from dotenv import load_dotenv
91
+ # from langchain.prompts import PromptTemplate
92
+ # from langchain.schema import StrOutputParser
93
+ # from pydantic import BaseModel, Field
94
+
95
+ # from .gemini_assistant import GeminiAssistant
96
+
97
+ # # Load environment variables
98
+ # load_dotenv()
99
+
100
+ # # Load the API key from the environment
101
+ # api_key = os.getenv("GEMINI_API_KEY")
102
+ # model_name = "gemini-2.0-flash-exp"
103
+
104
+
105
+ # # Define the output model schema using Pydantic
106
+ # class GenerativeAnswer(BaseModel):
107
+ # answer: str = Field(
108
+ # ...,
109
+ # description="""The answer to the user question, which is based only on the given sources.
110
+ # Example:
111
+ # Artificial neural networks were inspired by information processing and distributed communication nodes in biological systems.
112
+ # ANNs have various differences from biological brains. Specifically, artificial neural networks tend to be static and symbolic,
113
+ # while the biological brain of most living organisms is dynamic (plastic) and analog (vec-001). However, ANNs have demonstrated
114
+ # significant capabilities in tasks such as pattern recognition, classification, and prediction, making them valuable tools in various fields
115
+ # (vec-002). Despite their effectiveness in certain applications, ANNs still lack the complexity and adaptability of biological brains,
116
+ # leading some researchers to view them as low-quality models for brain function (vec-003). Furthermore, advancements in neuroscience
117
+ # continue to provide insights into the workings of the brain, which may further differentiate artificial neural networks from their biological counterparts
118
+ # (vec-004). However, ANNs remain a powerful computational tool, offering practical solutions to a wide range of problems, albeit with inherent limitations
119
+ # compared to biological systems (vec-005).""",
120
+ # )
121
+ # source_id: List[str] = Field(
122
+ # ...,
123
+ # description="The ID of a SPECIFIC source which justifies the answer.",
124
+ # )
125
+ # quote: List[str] = Field(
126
+ # ...,
127
+ # description="The VERBATIM quote from the specified source that justifies the answer.",
128
+ # )
129
+ # related_questions: List[str] = Field(
130
+ # ...,
131
+ # description="Related questions based on the user question and generated answer.",
132
+ # )
133
+
134
+
135
+ # # Define the system prompt
136
+ # prompt = """
137
+ # You are a friendly and helpful assistant.
138
+ # Ensure your answers are complete, unless the user requests a more concise approach.
139
+ # When generating code, offer explanations for code segments as necessary and maintain good coding practices.
140
+ # When presented with inquiries seeking information, provide answers that reflect a deep understanding of the field, guaranteeing their correctness.
141
+ # For any non-English queries, respond in the same language as the prompt unless otherwise specified by the user.
142
+ # For prompts involving reasoning, provide a clear explanation of each step in the reasoning process before presenting the final answer.
143
+
144
+ # [Task]
145
+ # Your task is to answer questions accurately and concisely based on the context provided. You will strictly discuss topics within the scope of the provided context, including its activities, history, services, and impacts.
146
+ # Assess whether the user’s question aligns with the context or the specified areas of interest.
147
+ # If the information requested falls outside the provided context or scope, politely decline the request and advise the user to ask relevant questions within the given context.
148
+ # Only provide answers based on the information available in the given context and the specified areas of interest.
149
+
150
+ # The answer should look like:
151
+ # Example: Artificial neural networks were inspired by information processing and distributed communication nodes in biological systems.
152
+ # ANNs have various differences from biological brains. Specifically, artificial neural networks tend to be static and symbolic,
153
+ # while the biological brain of most living organisms is dynamic (plastic) and analog (vec-001). However, ANNs have demonstrated
154
+ # significant capabilities in tasks such as pattern recognition, classification, and prediction, making them valuable tools in various fields
155
+ # (vec-002). Despite their effectiveness in certain applications, ANNs still lack the complexity and adaptability of biological brains,
156
+ # leading some researchers to view them as low-quality models for brain function (vec-003). Furthermore, advancements in neuroscience
157
+ # continue to provide insights into the workings of the brain, which may further differentiate artificial neural networks from their biological counterparts
158
+ # (vec-004). However, ANNs remain a powerful computational tool, offering practical solutions to a wide range of problems, albeit with inherent limitations
159
+ # compared to biological systems (vec-005).
160
+ # Make sure your answer contains source_id in the given format.
161
+
162
+ # Output should follow the pattern defined in the schema, and the output should be in JSON format only, so that it can be directly used with `json.loads()`.
163
+
164
+ # Steps:
165
+ # 1. Use the provided data.
166
+ # 2. Check if the question is related to the context or related to the provided scope. If the question is not related, politely ask the user to ask relevant questions. Do not proceed with other steps and use the wrong_question_schema to output the answer.
167
+ # 3. Use one source and extract the sentences from it that justify the answer. All source sentences must be unique.
168
+ # 4. Use the sentences to generate the answer and append its source_id to the answer with prefix "vec-". You must append its source_id only if it exists. Do not hallucinate or add from your imagination. Only add source_id which you get from the context. This is a strict requirement.
169
+ # 5. Use other sources and extract the sentences from them that justify the answer. NOTE: Use at least 3 sources to generate the answer.
170
+ # 6. If the user has asked a question related to the context or scope, use the entire answer and the user's question to generate 3 related questions. If the question is outside the context, do not generate related questions.
171
+
172
+ # NOTE: Ensure that no out-of-bounds questions or questions outside the context are answered. It is mandatory that no out-of-context questions are addressed.
173
+ # """
174
+
175
+ # # Instantiate the Gemini RAG assistant
176
+ # gemini_rag_assistant = GeminiAssistant(
177
+ # model=model_name,
178
+ # api_key=api_key,
179
+ # system_prompt=prompt,
180
+ # output_model=GenerativeAnswer,
181
+ # )
182
+
183
+ import os
184
+ from typing import List
185
+
186
+ from dotenv import load_dotenv
187
+ from pydantic import BaseModel, Field
188
+
189
+ from .gemini_assistant import GeminiAssistant
190
+
191
# Load environment variables from a local .env file into os.environ (no-op
# if the file is absent).
load_dotenv()

# Gemini API key read from the environment; None when GEMINI_API_KEY is unset.
api_key = os.getenv("GEMINI_API_KEY")
# Model identifier handed to GeminiAssistant below.
model_name = "gemini-2.0-flash-exp"
197
+
198
+
199
+ # Define the output model schema using Pydantic
200
+ class GenerativeAnswer(BaseModel):
201
+ answer: str = Field(
202
+ ...,
203
+ description="""The answer to the user question, which is based only on the given sources.
204
+ Example:
205
+ Artificial neural networks were inspired by information processing and distributed communication nodes in biological systems.
206
+ ANNs have various differences from biological brains. Specifically, artificial neural networks tend to be static and symbolic,
207
+ while the biological brain of most living organisms is dynamic (plastic) and analog (vec-001). However, ANNs have demonstrated
208
+ significant capabilities in tasks such as pattern recognition, classification, and prediction, making them valuable tools in various fields
209
+ (vec-002). Despite their effectiveness in certain applications, ANNs still lack the complexity and adaptability of biological brains,
210
+ leading some researchers to view them as low-quality models for brain function (vec-003). Furthermore, advancements in neuroscience
211
+ continue to provide insights into the workings of the brain, which may further differentiate artificial neural networks from their biological counterparts
212
+ (vec-004). However, ANNs remain a powerful computational tool, offering practical solutions to a wide range of problems, albeit with inherent limitations
213
+ compared to biological systems (vec-005).""",
214
+ )
215
+ related_questions: List[str] = Field(
216
+ ...,
217
+ description="Related questions based on the user question and generated answer.",
218
+ )
219
+
220
+
221
# System-prompt string passed verbatim to GeminiAssistant (a plain str,
# despite the history of this file — no Langchain PromptTemplate is involved).
# NOTE(review): the literal { } braces in the format example below would break
# str.format()/PromptTemplate substitution — confirm this string is never
# run through a formatter.
prompt_template = """
You are a friendly and helpful assistant.
Ensure your answers are complete, unless the user requests a more concise approach.
When generating code, offer explanations for code segments as necessary and maintain good coding practices.
When presented with inquiries seeking information, provide answers that reflect a deep understanding of the field, guaranteeing their correctness.
For any non-English queries, respond in the same language as the prompt unless otherwise specified by the user.
For prompts involving reasoning, provide a clear explanation of each step in the reasoning process before presenting the final answer.

[Task]
Your task is to answer questions accurately and concisely based on the context provided. You will strictly discuss topics within the scope of the provided context, including its activities, history, services, and impacts.
Assess whether the user’s question aligns with the context or the specified areas of interest.
If the information requested falls outside the provided context or scope, politely decline the request and advise the user to ask relevant questions within the given context.
Only provide answers based on the information available in the given context and the specified areas of interest.

The answer should look like:
Example: Artificial neural networks were inspired by information processing and distributed communication nodes in biological systems.
ANNs have various differences from biological brains. Specifically, artificial neural networks tend to be static and symbolic,
while the biological brain of most living organisms is dynamic (plastic) and analog (vec-001). However, ANNs have demonstrated
significant capabilities in tasks such as pattern recognition, classification, and prediction, making them valuable tools in various fields
(vec-002). Despite their effectiveness in certain applications, ANNs still lack the complexity and adaptability of biological brains,
leading some researchers to view them as low-quality models for brain function (vec-003). Furthermore, advancements in neuroscience
continue to provide insights into the workings of the brain, which may further differentiate artificial neural networks from their biological counterparts
(vec-004). However, ANNs remain a powerful computational tool, offering practical solutions to a wide range of problems, albeit with inherent limitations
compared to biological systems (vec-005).

The final output should be structured like this:
1. The complete answer to the user question.
2. The source IDs appended to the answer, e.g., (vec-001, vec-002, vec-003).
3. Related questions that might arise from the user's question.

Output should be in the following format:
{
    "answer": "<generated_answer_with_sources>",
    "related_questions": [<related_question_1>, <related_question_2>, <related_question_3>]
}
"""
258
+
259
+
260
# Module-level singleton: the Gemini RAG assistant shared by importers.
# NOTE(review): api_key is None when GEMINI_API_KEY is unset — confirm
# GeminiAssistant / genai.configure fails loudly in that case.
gemini_rag_assistant = GeminiAssistant(
    model=model_name,  # "gemini-2.0-flash-exp" (set above)
    api_key=api_key,  # read from the environment at import time
    system_prompt=prompt_template,  # plain system-prompt string
    output_model=GenerativeAnswer,  # pydantic schema for structured JSON output
)
+ )
src/assistants/gemini_assistant.py ADDED
@@ -0,0 +1,481 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import json
2
+ # import os
3
+ # from dataclasses import dataclass
4
+ # from enum import Enum
5
+ # from typing import Any, Dict, List, Optional, Union
6
+
7
+ # import google.generativeai as genai
8
+ # from pydantic import BaseModel
9
+
10
+
11
+ # class Role(Enum):
12
+ # USER = "user"
13
+ # ASSISTANT = "assistant"
14
+ # SYSTEM = "system"
15
+
16
+
17
+ # @dataclass
18
+ # class Message:
19
+ # role: Role
20
+ # content: str
21
+
22
+
23
+ # class GeminiAssistant:
24
+ # def __init__(
25
+ # self,
26
+ # api_key: str,
27
+ # model: str = "gemini-pro",
28
+ # temperature: float = 0.7,
29
+ # top_p: float = 0.95,
30
+ # top_k: int = 40,
31
+ # max_output_tokens: int = 2048,
32
+ # system_prompt: Optional[str] = None,
33
+ # output_model: Optional[type[BaseModel]] = None,
34
+ # ):
35
+ # genai.configure(api_key=api_key)
36
+
37
+ # self.model = genai.GenerativeModel(
38
+ # model_name=model,
39
+ # system_instruction=system_prompt,
40
+ # generation_config={
41
+ # "temperature": temperature,
42
+ # "top_p": top_p,
43
+ # "top_k": top_k,
44
+ # "max_output_tokens": max_output_tokens,
45
+ # "response_mime_type": "application/json",
46
+ # },
47
+ # )
48
+
49
+ # self.system_prompt = system_prompt
50
+ # self.output_model = output_model
51
+ # self.conversation_history: List[Message] = []
52
+ # if system_prompt:
53
+ # self.conversation_history.append(
54
+ # Message(role=Role.SYSTEM, content=system_prompt)
55
+ # )
56
+
57
+ # def add_message(self, role: Union[Role, str], content: str) -> None:
58
+ # if isinstance(role, str):
59
+ # role = Role(role)
60
+ # self.conversation_history.append(Message(role=role, content=content))
61
+
62
+ # def _format_messages_for_gemini(self) -> List[Dict]:
63
+ # """Convert internal message format to Gemini's expected format."""
64
+ # formatted_history = []
65
+
66
+ # for msg in self.conversation_history:
67
+ # if msg.role == Role.USER:
68
+ # formatted_history.append(
69
+ # {"role": "user", "parts": [{"text": msg.content}]}
70
+ # )
71
+ # elif msg.role == Role.ASSISTANT:
72
+ # formatted_history.append(
73
+ # {"role": "model", "parts": [{"text": msg.content}]}
74
+ # )
75
+ # # System messages are handled differently
76
+
77
+ # return formatted_history
78
+
79
+ # async def get_response(
80
+ # self,
81
+ # message: str,
82
+ # context_data: Optional[str] = None,
83
+ # ) -> Union[str, BaseModel]:
84
+ # """Get a response from the assistant for the given message."""
85
+ # # Prepare the full message with system prompt if it exists
86
+ # full_message = message
87
+ # if self.system_prompt and not self.conversation_history:
88
+ # full_message = (
89
+ # f"{self.system_prompt}\n\n{message}\n\nContext:\n{context_data}\n"
90
+ # )
91
+ # else:
92
+
93
+ # full_message = f"{message}\n\nContext:\n{context_data}\n"
94
+
95
+ # print("********************")
96
+ # print(full_message)
97
+ # print("********************")
98
+
99
+ # # Add user message to history
100
+ # self.add_message(Role.USER, full_message)
101
+
102
+ # try:
103
+ # # Create chat session with properly formatted history
104
+ # chat = self.model.start_chat(history=self._format_messages_for_gemini())
105
+
106
+ # # Generate response
107
+ # response = await chat.send_message_async(full_message)
108
+ # response_text = response.text
109
+
110
+ # # Handle structured output if output_model is specified
111
+ # if self.output_model:
112
+ # try:
113
+ # # Try to parse the response as JSON
114
+ # response_data = json.loads(response_text)
115
+ # # Validate with Pydantic model
116
+ # structured_response = self.output_model(**response_data)
117
+ # # Add the formatted response to history
118
+ # self.add_message(
119
+ # Role.ASSISTANT, json.dumps(response_data, indent=2)
120
+ # )
121
+ # return structured_response
122
+ # except (json.JSONDecodeError, ValueError) as e:
123
+ # # If parsing fails, try to fix the response format
124
+ # retry_prompt = (
125
+ # "Please format your previous response as a valid JSON object "
126
+ # "that matches the specified schema. Include all required fields."
127
+ # )
128
+ # response = await chat.send_message_async(retry_prompt)
129
+ # try:
130
+ # print(response.text)
131
+ # response_data = json.loads(response.text)
132
+ # structured_response = self.output_model(**response_data)
133
+ # self.add_message(
134
+ # Role.ASSISTANT, json.dumps(response_data, indent=2)
135
+ # )
136
+ # return structured_response
137
+ # except (json.JSONDecodeError, ValueError) as e:
138
+ # raise ValueError(
139
+ # f"Failed to generate properly formatted response: {str(e)}"
140
+ # )
141
+ # else:
142
+ # # Handle regular string response
143
+ # self.add_message(Role.ASSISTANT, response_text)
144
+ # return response_text
145
+
146
+ # except Exception as e:
147
+ # raise Exception(f"Error generating response: {str(e)}")
148
+
149
+ # def clear_history(self) -> None:
150
+ # """Clear the conversation history."""
151
+ # self.conversation_history = []
152
+ # if self.system_prompt:
153
+ # self.add_message(Role.SYSTEM, self.system_prompt)
154
+
155
+
156
+ # # Example usage
157
+ # async def main():
158
+ # # Initialize assistant
159
+ # assistant = GeminiAssistant(
160
+ # api_key="your-api-key-here", system_prompt="You are a helpful AI assistant."
161
+ # )
162
+
163
+ # # Get response
164
+ # response = await assistant.get_response("What is machine learning?")
165
+ # print(response)
166
+
167
+ # # Continue conversation
168
+ # response = await assistant.get_response("Can you give me an example?")
169
+ # print(response)
170
+
171
+
172
+ # if __name__ == "__main__":
173
+ # import asyncio
174
+
175
+ # asyncio.run(main())
176
+
177
+
178
+ # import json
179
+ # import os
180
+ # from dataclasses import dataclass
181
+ # from enum import Enum
182
+ # from typing import Any, Dict, List, Optional, Union
183
+
184
+ # import google.generativeai as genai
185
+ # from pydantic import BaseModel
186
+
187
+
188
+ # class Role(Enum):
189
+ # USER = "user"
190
+ # ASSISTANT = "assistant"
191
+ # SYSTEM = "system"
192
+
193
+
194
+ # @dataclass
195
+ # class Message:
196
+ # role: Role
197
+ # content: str
198
+
199
+
200
+ # class GeminiAssistant:
201
+ # def __init__(
202
+ # self,
203
+ # api_key: str,
204
+ # model: str = "gemini-pro",
205
+ # temperature: float = 0.7,
206
+ # top_p: float = 0.95,
207
+ # top_k: int = 40,
208
+ # max_output_tokens: int = 2048,
209
+ # system_prompt: Optional[str] = None,
210
+ # output_model: Optional[type[BaseModel]] = None,
211
+ # ):
212
+ # genai.configure(api_key=api_key)
213
+
214
+ # self.model = genai.GenerativeModel(
215
+ # model_name=model,
216
+ # system_instruction=system_prompt,
217
+ # generation_config={
218
+ # "temperature": temperature,
219
+ # "top_p": top_p,
220
+ # "top_k": top_k,
221
+ # "max_output_tokens": max_output_tokens,
222
+ # "response_mime_type": "application/json",
223
+ # },
224
+ # )
225
+
226
+ # self.system_prompt = system_prompt
227
+ # self.output_model = output_model
228
+ # self.conversation_history: List[Message] = []
229
+ # if system_prompt:
230
+ # self.conversation_history.append(
231
+ # Message(role=Role.SYSTEM, content=system_prompt)
232
+ # )
233
+
234
+ # def add_message(self, role: Union[Role, str], content: str) -> None:
235
+ # if isinstance(role, str):
236
+ # role = Role(role)
237
+ # self.conversation_history.append(Message(role=role, content=content))
238
+
239
+ # def _format_messages_for_gemini(self) -> List[Dict]:
240
+ # """Convert internal message format to Gemini's expected format."""
241
+ # formatted_history = []
242
+
243
+ # for msg in self.conversation_history:
244
+ # if msg.role == Role.USER:
245
+ # formatted_history.append(
246
+ # {"role": "user", "parts": [{"text": msg.content}]}
247
+ # )
248
+ # elif msg.role == Role.ASSISTANT:
249
+ # formatted_history.append(
250
+ # {"role": "model", "parts": [{"text": msg.content}]}
251
+ # )
252
+ # # System messages are handled differently
253
+
254
+ # return formatted_history
255
+
256
+ # async def get_response_gemini(
257
+ # self,
258
+ # message: str,
259
+ # context_data: Optional[str] = None,
260
+ # ) -> Union[str, BaseModel]:
261
+ # """Get a response from the assistant for the given message."""
262
+ # # Prepare the full message with system prompt if it exists
263
+ # full_message = message
264
+ # if self.system_prompt and not self.conversation_history:
265
+ # full_message = (
266
+ # f"{self.system_prompt}\n\n{message}\n\nContext:\n{context_data}\n"
267
+ # )
268
+ # else:
269
+ # full_message = f"{message}\n\nContext:\n{context_data}\n"
270
+
271
+ # print("********************")
272
+ # print(full_message)
273
+ # print("********************")
274
+
275
+ # # Add user message to history
276
+ # self.add_message(Role.USER, full_message)
277
+
278
+ # try:
279
+ # # Create chat session with properly formatted history
280
+ # chat = self.model.start_chat(history=self._format_messages_for_gemini())
281
+
282
+ # # Generate response
283
+ # response = await chat.send_message_async(full_message)
284
+ # response_text = response.text
285
+
286
+ # # Handle structured output if output_model is specified
287
+ # if self.output_model:
288
+ # try:
289
+ # # Try to parse the response as JSON
290
+ # response_data = json.loads(response_text)
291
+
292
+ # # Ensure required fields are present
293
+ # if "source_id" not in response_data:
294
+ # response_data["source_id"] = "default_source_id"
295
+ # if "quote" not in response_data:
296
+ # response_data["quote"] = "default_quote"
297
+
298
+ # # Validate with Pydantic model
299
+ # structured_response = self.output_model(**response_data)
300
+ # # Add the formatted response to history
301
+ # self.add_message(
302
+ # Role.ASSISTANT, json.dumps(response_data, indent=2)
303
+ # )
304
+ # return structured_response
305
+ # except (json.JSONDecodeError, ValueError) as e:
306
+ # # If parsing fails, try to fix the response format
307
+ # retry_prompt = (
308
+ # "Please format your previous response as a valid JSON object "
309
+ # "that matches the specified schema. Include all required fields."
310
+ # )
311
+ # response = await chat.send_message_async(retry_prompt)
312
+ # try:
313
+ # print(response.text)
314
+ # response_data = json.loads(response.text)
315
+
316
+ # # Ensure required fields are present
317
+ # if "source_id" not in response_data:
318
+ # response_data["source_id"] = "default_source_id"
319
+ # if "quote" not in response_data:
320
+ # response_data["quote"] = "default_quote"
321
+
322
+ # structured_response = self.output_model(**response_data)
323
+ # self.add_message(
324
+ # Role.ASSISTANT, json.dumps(response_data, indent=2)
325
+ # )
326
+ # return structured_response
327
+ # except (json.JSONDecodeError, ValueError) as e:
328
+ # raise ValueError(
329
+ # f"Failed to generate properly formatted response: {str(e)}"
330
+ # )
331
+ # else:
332
+ # # Handle regular string response
333
+ # self.add_message(Role.ASSISTANT, response_text)
334
+ # return response_text
335
+
336
+ # except Exception as e:
337
+ # raise Exception(f"Error generating response: {str(e)}")
338
+
339
+ # def clear_history(self) -> None:
340
+ # """Clear the conversation history."""
341
+ # self.conversation_history = []
342
+ # if self.system_prompt:
343
+ # self.add_message(Role.SYSTEM, self.system_prompt)
344
+
345
+
346
+ import json
347
+ from dataclasses import dataclass
348
+ from enum import Enum
349
+ from typing import Any, Dict, List, Optional, Union
350
+
351
+ import google.generativeai as genai
352
+ from pydantic import BaseModel
353
+
354
+
355
class Role(Enum):
    """Conversation participant roles understood by the assistant."""

    USER = "user"
    ASSISTANT = "assistant"
    SYSTEM = "system"
359
+
360
+
361
@dataclass
class Message:
    """A single conversation turn: who said it and what was said."""

    role: Role      # speaker (user / assistant / system)
    content: str    # raw message text
365
+
366
+
367
class GeminiAssistant:
    """Stateful wrapper around the google.generativeai chat API.

    Keeps an in-memory conversation history, forwards it to Gemini on each
    call, and optionally validates JSON replies against a Pydantic model.
    """

    def __init__(
        self,
        api_key: str,
        model: str = "gemini-pro",
        temperature: float = 0.7,
        top_p: float = 0.95,
        top_k: int = 40,
        max_output_tokens: int = 2048,
        system_prompt: Optional[str] = None,
        output_model: Optional[type[BaseModel]] = None,
    ):
        genai.configure(api_key=api_key)

        # JSON mime type forces Gemini to emit machine-parseable output.
        self.model = genai.GenerativeModel(
            model_name=model,
            system_instruction=system_prompt,
            generation_config={
                "temperature": temperature,
                "top_p": top_p,
                "top_k": top_k,
                "max_output_tokens": max_output_tokens,
                "response_mime_type": "application/json",
            },
        )

        self.system_prompt = system_prompt
        self.output_model = output_model
        self.conversation_history: List[Message] = []
        if system_prompt:
            self.conversation_history.append(
                Message(role=Role.SYSTEM, content=system_prompt)
            )

    def add_message(self, role: Union[Role, str], content: str) -> None:
        """Append a turn to the local history; accepts Role or its string value."""
        if isinstance(role, str):
            role = Role(role)
        self.conversation_history.append(Message(role=role, content=content))

    def _format_messages_for_gemini(self) -> List[Dict]:
        """Convert internal message format to Gemini's expected format.

        System messages are skipped: they are supplied separately via
        ``system_instruction`` at model construction time.
        """
        formatted_history = []

        for msg in self.conversation_history:
            if msg.role == Role.USER:
                formatted_history.append(
                    {"role": "user", "parts": [{"text": msg.content}]}
                )
            elif msg.role == Role.ASSISTANT:
                formatted_history.append(
                    {"role": "model", "parts": [{"text": msg.content}]}
                )

        return formatted_history

    def get_response_gemini(
        self,
        message: str,
        context_data: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Send *message* (plus retrieval context) to Gemini and return a dict.

        Raises ValueError when the model reply is not valid JSON, and a
        generic Exception wrapping any SDK/transport failure.
        """
        # NOTE: with a system_prompt set, __init__ already seeded the history,
        # so this first branch only fires for prompt-less assistants.
        if self.system_prompt and not self.conversation_history:
            full_message = (
                f"{self.system_prompt}\n\n{message}\n\nContext:\n{context_data}\n"
            )
        else:
            full_message = f"{message}\n\nContext:\n{context_data}\n"

        # BUG FIX: snapshot the history BEFORE recording the new user turn.
        # Previously the just-added user message was included in the history
        # passed to start_chat() AND re-sent via send_message(), so Gemini
        # received every user turn twice.
        prior_history = self._format_messages_for_gemini()
        self.add_message(Role.USER, full_message)

        try:
            chat = self.model.start_chat(history=prior_history)
            response = chat.send_message(full_message)
            response_text = response.text

            try:
                response_data = json.loads(response_text)

                # Backfill fields the downstream schema requires.
                if "source_id" not in response_data:
                    response_data["source_id"] = "default_source_id"
                if "quote" not in response_data:
                    response_data["quote"] = "default_quote"

                if self.output_model:
                    # Validate against the caller-supplied Pydantic schema.
                    structured_response = self.output_model(**response_data)
                    self.add_message(
                        Role.ASSISTANT, json.dumps(response_data, indent=2)
                    )
                    # .dict() is the pydantic-v1 spelling; presumably the
                    # project pins v1 — confirm before migrating to model_dump().
                    return structured_response.dict()

                self.add_message(Role.ASSISTANT, response_text)
                return response_data

            except (json.JSONDecodeError, ValueError) as e:
                raise ValueError(
                    f"Failed to parse response as JSON: {str(e)}. Response: {response_text}"
                )

        except Exception as e:
            raise Exception(f"Error generating response: {str(e)}")

    def clear_history(self) -> None:
        """Clear the conversation history."""
        self.conversation_history = []
        if self.system_prompt:
            self.add_message(Role.SYSTEM, self.system_prompt)
src/chroma_rag/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b91a33bf0cab01f5a6e14731ab350378bc396ef1ee7d6c339b279327f33ac337
3
+ size 30126080
src/chroma_rag/e8532e8c-980b-42a2-acd8-df1effcd172a/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a13e72541800c513c73dccea69f79e39cf4baef4fa23f7e117c0d6b0f5f99670
3
+ size 3212000
src/chroma_rag/e8532e8c-980b-42a2-acd8-df1effcd172a/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ec6df10978b056a10062ed99efeef2702fa4a1301fad702b53dd2517103c746
3
+ size 100
src/chroma_rag/e8532e8c-980b-42a2-acd8-df1effcd172a/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d7d015a7ab0459c0f4812e91eb55167f930ea9df777241555219a13bc4f9512
3
+ size 4000
src/chroma_rag/e8532e8c-980b-42a2-acd8-df1effcd172a/link_lists.bin ADDED
File without changes
src/notebooks/read_pdf.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import fitz  # PyMuPDF

# Open the source PDF (AI: A Modern Approach, 4th ed.)
pdf_path = "../data/Artificial.Intelligence.A.Modern.Approach.4th.Edition.Peter.Norvig. Stuart.Russell.Pearson.9780134610993.EBooksWorld.ir.pdf"
doc = fitz.open(pdf_path)

# Dump the text of every page, separated by a rule for readability.
for page_index, page in enumerate(doc, start=1):
    print(f"Page {page_index}:")
    print(page.get_text())
    print("-" * 50)

# Release the document handle.
doc.close()
src/notebooks/temp.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import pytesseract
3
+ from PIL import Image
4
+
5
+
6
+ # Define a function to extract text from all pages of a PDF
7
def extract_text_from_pdf(pdf_path, dpi=300):
    """
    Extract text from all pages of a PDF by rasterising each page and OCR-ing it.

    Args:
        pdf_path (str): Path to the PDF file.
        dpi (int): Resolution for converting PDF pages to images (default: 300).

    Returns:
        dict: A dictionary where keys are page numbers (1-based) and values are
        the text Tesseract extracted from that page.
    """
    pdf_document = fitz.open(pdf_path)
    extracted_text = {}

    # BUG FIX: close the document even if rendering/OCR raises mid-loop;
    # previously an exception leaked the open file handle.
    try:
        for page_number in range(len(pdf_document)):
            page = pdf_document[page_number]

            # Rasterise the page at the requested resolution.
            pixmap = page.get_pixmap(dpi=dpi)
            image = Image.frombytes(
                "RGB", [pixmap.width, pixmap.height], pixmap.samples
            )

            # OCR the rendered page.
            text = pytesseract.image_to_string(image)
            print(text)

            extracted_text[page_number + 1] = text
    finally:
        pdf_document.close()

    return extracted_text
44
+
45
+
46
+ # Usage example
47
+ if __name__ == "__main__":
48
+ # pdf_path = "c:/Abhi-MTech/Sem-1/AI/Books/Artificial.Intelligence.A.Modern.Approach.4th.Edition.Peter.Norvig. Stuart.Russell.Pearson.9780134610993.EBooksWorld.ir.pdf" # Path to your PDF file
49
+ pdf_path = "c:/Abhi-MTech/Sem-1/AI/AI Technical.pdf" # Path to your PDF file
50
+
51
+ try:
52
+ all_text = extract_text_from_pdf(pdf_path)
53
+ for page_num, text in all_text.items():
54
+ print(f"Page {page_num} Text:")
55
+ print(text)
56
+ print("-" * 80) # Separator for readability
57
+ except Exception as e:
58
+ print(f"Error: {e}")
src/response_api.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import os
2
+ # import sys
3
+
4
+ # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
5
+ # import json
6
+
7
+ # from pydantic import ValidationError
8
+ # from src.assistants.assistant_v1 import gemini_rag_assistant
9
+ # from src.utils.knowledge_base import AgenticRAG
10
+
11
+
12
+ # async def get_response(query=True, is_uploaded=False):
13
+ # rag = AgenticRAG(query_value=query, is_uploaded=is_uploaded)
14
+
15
+ # # query = input("Enter your query: ")
16
+ # context = rag.query(query_text=query, n_results=10)
17
+ # print("\nQuery Results:")
18
+ # print(json.dumps(context, indent=2))
19
+
20
+ # try:
21
+ # response = await gemini_rag_assistant.get_response(
22
+ # message=query, context_data=context
23
+ # )
24
+ # print("\nAssistant Response:")
25
+ # print(response)
26
+
27
+ # return {"response": response, "context": context}
28
+
29
+ # except ValidationError as e:
30
+ # print("Validation Error:", e)
31
+ # return {
32
+ # "source_id": "validation_error",
33
+ # "content": str(e),
34
+ # }
35
+
36
+ # except Exception as e:
37
+ # print("Internal Server Error:", e)
38
+ # return {
39
+ # "source_id": "internal_error",
40
+ # "content": "Internal Server Error",
41
+ # }
42
+
43
+
44
+ # async def main():
45
+ # rag = AgenticRAG(query_value=True)
46
+ # # while True:
47
+ # query = input("Enter your query: ")
48
+ # results = rag.query(query_text=query, n_results=10)
49
+ # print("\nQuery Results:")
50
+ # print(json.dumps(results, indent=2))
51
+
52
+ # try:
53
+ # print("gemini start generating answer")
54
+ # response = await gemini_rag_assistant.get_response(
55
+ # message=query, context_data=results
56
+ # )
57
+ # print("\nAssistant Response:")
58
+ # print(response)
59
+
60
+ # except ValidationError as e:
61
+ # print("Validation Error:", e)
62
+ # return {
63
+ # "source_id": "validation_error",
64
+ # "content": str(e),
65
+ # }
66
+
67
+ # except Exception as e:
68
+ # print("Internal Server Error:", e)
69
+ # return {
70
+ # "source_id": "internal_error",
71
+ # "content": "Internal Server Error",
72
+ # }
73
+
74
+
75
+ # if __name__ == "__main__":
76
+ # import asyncio
77
+
78
+ # asyncio.run(main())
79
+
80
+
81
+ import json
82
+ import os
83
+ import traceback
84
+
85
+ import google.generativeai as genai
86
+ from dotenv import load_dotenv
87
+ from fastapi import FastAPI, HTTPException
88
+ from pydantic import BaseModel, ValidationError
89
+
90
+ from agents_rag.crew import get_crew_response
91
+ from assistants.assistant_v1 import gemini_rag_assistant
92
+ from utils.knowledge_base import AgenticRAG
93
+ from utils.vectorDB import VectorStore
94
+
95
+ load_dotenv()
96
+
97
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
98
+ genai.configure(api_key=GEMINI_API_KEY)
99
+
100
+ # Initialize FastAPI app
101
+ app = FastAPI(title="Gemini RAG Assistant API", version="1.0.0")
102
+
103
+
104
+ # Pydantic model for request body
105
class QueryRequest(BaseModel):
    """Payload for /get-response: the user question plus retrieval hints."""

    query: str                # natural-language question
    is_uploaded: bool = False # search the user-uploaded collection instead
    url: str                  # source URL forwarded to the crew agents
109
+
110
+
111
+ # Pydantic model for response
112
class QueryResponse(BaseModel):
    """Shape of a successful answer: the text plus its retrieval context."""

    response: str   # generated answer
    context: dict   # supporting chunks used to produce it
115
+
116
+
117
+ def llm_answer(query=""):
118
+ try:
119
+ # Initialize Gemini RAG assistant
120
+ model = genai.GenerativeModel(model_name="gemini-2.0-flash-exp")
121
+
122
+ print("query", query)
123
+
124
+ response = model.generate_content(query)
125
+ print(response.text)
126
+
127
+ return {"response": response.text, "status": "success"}
128
+
129
+ except Exception as e:
130
+ print(f"Error in Gemini chunking: {e}")
131
+ return [
132
+ {
133
+ "response": "",
134
+ "status": "fail",
135
+ }
136
+ ]
137
+
138
+
139
+ @app.post("/get-response")
140
+ def get_response(request: QueryRequest):
141
+ """
142
+ Endpoint to process a query and get a response from the assistant.
143
+ """
144
+ try:
145
+ # Initialize AgenticRAG and fetch context
146
+ rag = AgenticRAG(query_value=request.query, is_uploaded=request.is_uploaded)
147
+ context = rag.query(query_text=request.query, n_results=15)
148
+
149
+ print("Generate answer form gemini")
150
+ # Fetch response from gemini assistant
151
+ # response = gemini_rag_assistant.get_response_gemini(
152
+ # message=request.query, context_data=context
153
+ # )
154
+
155
+ response = get_crew_response(
156
+ query=request.query, context=context, url=request.url
157
+ )
158
+
159
+ print(response)
160
+
161
+ cleaned_text = "".join(
162
+ char for char in response if ord(char) >= 32 or char in "\n\r\t"
163
+ )
164
+ result = json.loads(cleaned_text)
165
+ print(result)
166
+
167
+ result = {
168
+ "response": result["Answer"],
169
+ "context": result["context"],
170
+ "citations": result["citations"],
171
+ }
172
+ # print(result)
173
+
174
+ return result
175
+
176
+ except ValidationError as e:
177
+ raise HTTPException(status_code=422, detail=f"Validation Error: {e}")
178
+
179
+ except ValueError as e:
180
+ raise HTTPException(status_code=400, detail=f"Value Error: {e}")
181
+
182
+ except Exception as e:
183
+ traceback.print_exc() # Log the full traceback
184
+ raise HTTPException(status_code=500, detail=f"Internal Server Error: {str(e)}")
185
+
186
+
187
+ @app.post("/llm-response")
188
+ def get_response_llm(request: dict):
189
+ """
190
+ Endpoint to process a query and get a response from the assistant.
191
+ """
192
+ try:
193
+ # Initialize AgenticRAG and fetch context
194
+
195
+ print("Generate answer form gemini")
196
+ # Fetch response from gemini assistant
197
+
198
+ result = llm_answer(query=request["query"])
199
+
200
+ result = {
201
+ "response": result["response"],
202
+ }
203
+ # print(result)
204
+
205
+ return result
206
+
207
+ except ValidationError as e:
208
+ raise HTTPException(status_code=422, detail=f"Validation Error: {e}")
209
+
210
+ except ValueError as e:
211
+ raise HTTPException(status_code=400, detail=f"Value Error: {e}")
212
+
213
+ except Exception as e:
214
+ traceback.print_exc() # Log the full traceback
215
+ raise HTTPException(status_code=500, detail=f"Internal Server Error: {str(e)}")
216
+
217
+
218
+ @app.get("/health")
219
+ def health_check():
220
+ """
221
+ Endpoint for health check.
222
+ """
223
+ return {"status": "ok"}
224
+
225
+
226
+ @app.post("/delete-file")
227
+ async def process_upload_data(request: dict):
228
+ """
229
+ Endpoint to retrieve do emedding of new file and store the result in Vector database.
230
+ """
231
+ try:
232
+ db = VectorStore()
233
+
234
+ print("deletion started.")
235
+ db.delete_documents_by_filename(request["file_path"])
236
+ print("deletion end.")
237
+
238
+ return {"response": 200}
239
+
240
+ except ValidationError as e:
241
+ raise HTTPException(status_code=422, detail=f"Validation Error: {e}")
242
+
243
+ except ValueError as e:
244
+ raise HTTPException(status_code=400, detail=f"Value Error: {e}")
245
+
246
+ except Exception as e:
247
+ traceback.print_exc() # Log the full traceback
248
+ raise HTTPException(status_code=500, detail=f"Internal Server Error: {str(e)}")
249
+
250
+
251
+ @app.post("/process-file")
252
+ async def process_upload_data(request: dict):
253
+ """
254
+ Endpoint to retrieve do emedding of new file and store the result in Vector database.
255
+ """
256
+ try:
257
+ # Initialize AgenticRAG and fetch context
258
+ rag = AgenticRAG(is_uploaded=False)
259
+
260
+ print("process started.")
261
+ rag.process_file(request["file_path"])
262
+ print("process end.")
263
+
264
+ # rag = AgenticRAG(query_value=request.query)
265
+ # context = rag.query(query_text=request.query, n_results=10)
266
+
267
+ # # Fetch response from gemini assistant
268
+ # response = await gemini_rag_assistant.get_response(
269
+ # message=request.query, context_data=context
270
+ # )
271
+
272
+ # # Ensure response is in correct format
273
+ # if not isinstance(response, str):
274
+ # raise ValueError("Unexpected response format from gemini_rag_assistant.")
275
+
276
+ return {"response": 200}
277
+
278
+ except ValidationError as e:
279
+ raise HTTPException(status_code=422, detail=f"Validation Error: {e}")
280
+
281
+ except ValueError as e:
282
+ raise HTTPException(status_code=400, detail=f"Value Error: {e}")
283
+
284
+ except Exception as e:
285
+ traceback.print_exc() # Log the full traceback
286
+ raise HTTPException(status_code=500, detail=f"Internal Server Error: {str(e)}")
287
+
288
+
289
+ if __name__ == "__main__":
290
+ import uvicorn
291
+
292
+ uvicorn.run(app, host="0.0.0.0", port=8000)
src/run_app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
5
+ import json
6
+
7
+ from pydantic import ValidationError
8
+
9
+ from src.assistants.assistant_v1 import gemini_rag_assistant
10
+ from src.utils.knowledge_base import AgenticRAG
11
+
12
+
13
def main():
    """Interactive REPL: retrieve context for each query and answer via Gemini."""
    rag = AgenticRAG(query_value=True)
    while True:
        query = input("Enter your query: ")
        results = rag.query(query_text=query, n_results=10)
        print("\nQuery Results:")
        print(json.dumps(results, indent=2))

        try:
            print("gemini start generating answer")
            response = gemini_rag_assistant.get_response_gemini(
                message=query, context_data=results
            )
            print("\nAssistant Response:")
            print(response)

        except ValidationError as err:
            print("Validation Error:", err)
            return {"source_id": "validation_error", "content": str(err)}

        except Exception as err:
            print("Internal Server Error:", err)
            return {"source_id": "internal_error", "content": "Internal Server Error"}
42
+
43
+
44
+ if __name__ == "__main__":
45
+ main()
src/utils/__init__.py ADDED
File without changes
src/utils/download.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import traceback
3
+
4
+ import requests
5
+ import yt_dlp
6
+ from bs4 import BeautifulSoup
7
+ from download_video import downlaod_video_from_url
8
+ from pytube import YouTube
9
+
10
+
11
def download_youtube_video(url, download_path="../data/"):
    """Download the highest-resolution progressive MP4 stream of a video."""
    try:
        yt = YouTube(url)

        # Progressive streams bundle audio+video in one MP4 file; pick the
        # highest resolution available.
        best = (
            yt.streams.filter(progressive=True, file_extension="mp4")
            .order_by("resolution")
            .desc()
            .first()
        )

        if best is None:
            print("No suitable video stream found")
            return

        best.download(output_path=download_path)
        print(f"Video downloaded successfully to {download_path}")
    except Exception as e:
        print(f"Error in downloading YouTube video: {e}")
31
+
32
+
33
def download_audio(url, download_path="../data/"):
    """
    Download audio from YouTube and convert to MP3 format.

    Args:
        url: YouTube video URL
        download_path: Path where the MP3 file will be saved
    """
    # BUG FIX: the yt-dlp *Python API* expects underscore option names; the
    # previous CLI-style keys ("geo-bypass", "force-ipv4") and the bare
    # "headers" key were silently ignored. The sibling module
    # download_video.py already uses the underscore spellings.
    ydl_opts = {
        "outtmpl": f"{download_path}%(title)s.%(ext)s",
        "format": "bestaudio/best",
        "geo_bypass": True,
        "noplaylist": True,
        # CLI --force-ipv4 maps to source_address in the embedded API.
        "source_address": "0.0.0.0",
        # Convert the downloaded audio track to MP3.
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
                "preferredquality": "192",
            }
        ],
        "http_headers": {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        },
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        print(f"Audio downloaded and converted to MP3 successfully at {download_path}")
    except Exception as e:
        print(f"An error occurred: {e}")
66
+
67
+
68
+ # Function to download PDF, DOC, or other files
69
def download_file(url, download_path="../data/"):
    """Download a binary file (PDF, DOC, ...) from *url* into *download_path*."""
    try:
        # Stream the body so large files are never held fully in memory;
        # `with` guarantees the connection is released.
        with requests.get(url, stream=True) as response:
            response.raise_for_status()  # Check if the request was successful
            filename = os.path.join(download_path, url.split("/")[-1])

            with open(filename, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        file.write(chunk)
        # BUG FIX: the message previously printed the literal "(unknown)"
        # instead of the destination path.
        print(f"File downloaded successfully to {filename}")
    except Exception as e:
        print(f"An error occurred: {e}")
82
+
83
+
84
+ # Function to download text or webpage content
85
def download_text_or_webpage(url, download_path="../data/", is_text=False):
    """Save *url* either verbatim as .txt or prettified HTML as .html.

    Args:
        url: Page or text resource to fetch.
        download_path: Directory where the file is written.
        is_text: True to store the raw body as text, False to parse and
            pretty-print it as HTML.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()

        if is_text:
            filename = os.path.join(download_path, url.split("/")[-1] + ".txt")
            # BUG FIX: write with an explicit encoding (matching the HTML
            # branch) so non-ASCII bodies don't fail on locale defaults.
            with open(filename, "w", encoding="utf-8") as file:
                file.write(response.text)
            # BUG FIX: print the real path instead of the literal "(unknown)".
            print(f"Text file downloaded successfully to {filename}")
        else:
            soup = BeautifulSoup(response.text, "html.parser")
            filename = os.path.join(download_path, url.split("/")[-1] + ".html")
            with open(filename, "w", encoding="utf-8") as file:
                file.write(soup.prettify())
            print(f"Webpage downloaded successfully to {filename}")

    except Exception as e:
        print(f"An error occurred: {e}")
104
+
105
+
106
def main():
    """Example driver: fetch the audio track of a sample YouTube video.

    The other helpers (downlaod_video_from_url, download_file,
    download_text_or_webpage) can be exercised the same way with their own
    URLs.
    """
    url_audio = "https://www.youtube.com/watch?v=8OHYynw7Yh4"
    download_audio(url_audio)


if __name__ == "__main__":
    main()
src/utils/download_video.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import random
4
+ import time
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ import requests
8
+ import yt_dlp
9
+ from bs4 import BeautifulSoup
10
+ from pytube import YouTube
11
+
12
+ # Configure logging
13
+ logging.basicConfig(
14
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
15
+ )
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class ContentDownloader:
    """Fetches YouTube audio/video: yt-dlp first, pytube as fallback."""

    def __init__(self, download_path: str = "./downloads/"):
        self.download_path = download_path
        self.create_download_directory()

    def create_download_directory(self) -> None:
        """Create download directory if it doesn't exist."""
        os.makedirs(self.download_path, exist_ok=True)

    def _get_available_formats(self, url: str) -> List[Dict]:
        """Get list of available formats for a YouTube video."""
        probe_opts = {"quiet": True, "no_warnings": True, "extract_flat": True}

        try:
            with yt_dlp.YoutubeDL(probe_opts) as ydl:
                info = ydl.extract_info(url, download=False)
                all_formats = info.get("formats", [])
                # Keep only formats carrying both a video and an audio track.
                return [
                    fmt
                    for fmt in all_formats
                    if fmt.get("vcodec") != "none" and fmt.get("acodec") != "none"
                ]
        except Exception as e:
            logger.error(f"Error getting formats: {str(e)}")
            return []

    def download_youtube_content(
        self, url: str, download_audio: bool = False
    ) -> Optional[str]:
        """
        Download YouTube content with automatic format selection.

        Returns the download directory on success, or the pytube fallback's
        file path / None when yt-dlp fails.
        """
        if download_audio:
            dl_opts = {
                "outtmpl": os.path.join(self.download_path, "%(title)s.%(ext)s"),
                "format": "bestaudio/best",
                "postprocessors": [
                    {
                        "key": "FFmpegExtractAudio",
                        "preferredcodec": "mp3",
                    }
                ],
            }
        else:
            # Probe first so we can bail out early on unplayable links.
            if not self._get_available_formats(url):
                logger.error("No suitable formats found")
                return None

            dl_opts = {
                "outtmpl": os.path.join(self.download_path, "%(title)s.%(ext)s"),
                "format": "bv*[ext=mp4]+ba[ext=m4a]/b[ext=mp4] / bv*+ba/b",  # Prefer MP4 format
                "merge_output_format": "mp4",
                "postprocessors": [
                    {
                        "key": "FFmpegVideoRemuxer",
                        "preferedformat": "mp4",
                    }
                ],
                "quiet": False,
                "no_warnings": False,
                "max_filesize": 2048 * 1024 * 1024,  # 2GB max
                "geo_bypass": True,
                "nocheckcertificate": True,
                "http_headers": {
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Accept-Language": "en-us,en;q=0.5",
                    "Sec-Fetch-Mode": "navigate",
                },
            }

        try:
            # Best-effort self-update of yt-dlp before attempting the fetch.
            os.system("yt-dlp -U")

            with yt_dlp.YoutubeDL(dl_opts) as ydl:
                logger.info("Attempting download with yt-dlp...")
                ydl.download([url])
                return self.download_path

        except Exception as e:
            logger.warning(f"yt-dlp download failed: {str(e)}")
            logger.info("Attempting fallback to direct stream download...")
            return self._download_with_direct_stream(url)

    def _download_with_direct_stream(
        self, url: str, max_retries: int = 3
    ) -> Optional[str]:
        """Alternative download method using direct stream access (pytube)."""
        for try_num in range(max_retries):
            try:
                if try_num > 0:
                    # Randomised back-off between retries.
                    time.sleep(random.uniform(2, 5))

                yt = YouTube(url)
                # Highest-resolution progressive MP4 (audio+video in one file).
                best = (
                    yt.streams.filter(progressive=True, file_extension="mp4")
                    .order_by("resolution")
                    .desc()
                    .first()
                )

                if not best:
                    logger.error("No suitable stream found")
                    return None

                timestamp = int(time.time())
                # NOTE(review): Stream.url is normally a read-only property in
                # pytube, so this cache-busting assignment may itself raise and
                # trigger a retry — confirm against the pinned pytube version.
                best.url = f"{best.url}&_={timestamp}"

                file_path = best.download(
                    output_path=self.download_path,
                    filename_prefix=f"video_{timestamp}_",
                )
                logger.info(f"Successfully downloaded to: {file_path}")
                return file_path

            except Exception as e:
                logger.error(f"Download attempt {try_num + 1} failed: {str(e)}")
                if try_num == max_retries - 1:
                    logger.error("All download attempts failed")
                    return None
+ return None
144
+
145
+
146
def downlaod_video_from_url(youtube_url="", download_path="./downloads/"):
    """Convenience wrapper: update yt-dlp, list formats, then download.

    (Name typo is preserved deliberately — download.py imports it as-is.)
    """
    # Update yt-dlp first
    os.system("yt-dlp -U")

    downloader = ContentDownloader(download_path=download_path)

    # Show the user what combined (audio+video) formats exist.
    available = downloader._get_available_formats(youtube_url)
    if available:
        print("\nAvailable formats:")
        for fmt in available:
            print(
                f"Format ID: {fmt.get('format_id')} - "
                f"Resolution: {fmt.get('resolution')} - "
                f"Filesize: {fmt.get('filesize_approx', 'unknown')} bytes"
            )

    # Download video with audio
    video_path = downloader.download_youtube_content(youtube_url)
    if video_path:
        print(f"\nVideo downloaded to: {video_path}")
    else:
        print("\nDownload failed")
src/utils/json_file_record.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "esnXVYfJ",
4
+ "file_path": "data\\What Makes Hanuman Chalisa So POWERFUL - Tantric Explains TRS Clips.mp3"
5
+ },
6
+ {
7
+ "id": "iVeOlEJl",
8
+ "file_path": "data\\The Science Of Building Extreme Discipline - Andrew Huberman.mp3"
9
+ },
10
+ {
11
+ "id": "nrrAW71c",
12
+ "file_path": "data\\attenstion_is_all_you_need.pdf"
13
+ }
14
+ ]
src/utils/knowledge_base.py ADDED
@@ -0,0 +1,523 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ # Add the project root to Python path
5
+ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
6
+ sys.path.append(project_root)
7
+
8
+ import json
9
+ import os
10
+ import re
11
+ import time
12
+ from typing import Any, Dict, List, Optional
13
+ from urllib.parse import urlparse
14
+
15
+ import docx
16
+ import exceptiongroup
17
+ import google.generativeai as genai
18
+ import numpy as np
19
+ import pandas as pd
20
+ import PyPDF2
21
+ import requests
22
+ from bs4 import BeautifulSoup
23
+ from dotenv import load_dotenv
24
+ from nanoid import generate
25
+ from pydantic import BaseModel, Field
26
+
27
+ from src.utils.vectorDB import VectorStore
28
+
29
+ load_dotenv()
30
+
31
+
32
class MetadataPDF(BaseModel):
    """Per-segment metadata Gemini returns for PDF/DOCX content.

    Serialized with the alias "page-number" so it matches the JSON key
    used in the chunking prompt/schema sent to Gemini.
    """

    # Topics Gemini identified within the segment.
    key_concepts: List[str] = Field(
        ..., description="Key concepts related to the topic"
    )
    # Source page of the segment; populated from the "page-number" JSON key.
    page_number: int = Field(
        ...,
        alias="page-number",
        description="The page number where this content is located",
    )
41
+
42
+
43
class SegmentPDF(BaseModel):
    """One semantically complete chunk of a PDF/DOCX document."""

    content: str = Field(..., description="The main text of the segment")
    # Nested metadata object (key concepts + page number).
    metadata: MetadataPDF
46
+
47
+
48
class AnalyzedContentPDF(BaseModel):
    """Top-level response schema for Gemini's PDF/DOCX chunking call."""

    segments: List[SegmentPDF] = Field(
        ..., description="List of meaningful content segments"
    )
52
+
53
+
54
+ # for text data
55
+
56
+
57
class MetadataTxt(BaseModel):
    """Per-segment metadata for plain-text-like content (no page numbers)."""

    # Topics Gemini identified within the segment.
    key_concepts: List[str] = Field(
        ..., description="Key concepts related to the topic"
    )
61
+
62
+
63
class SegmentTxt(BaseModel):
    """One semantically complete chunk of plain-text-like content."""

    content: str = Field(..., description="The main text of the segment")
    # Nested metadata object (key concepts only).
    metadata: MetadataTxt
66
+
67
+
68
class AnalyzedContentTxt(BaseModel):
    """Top-level response schema for Gemini's text-content chunking call."""

    segments: List[SegmentTxt] = Field(
        ..., description="List of meaningful content segments"
    )
72
+
73
+
74
class GeminiChunker:
    """Uses Gemini to split text or media content into semantically complete chunks.

    Text/PDF content is chunked via a structured-output prompt; audio/video is
    uploaded through the Gemini Files API and transcribed/described into
    timestamped segments. All methods return a list of
    ``{"content": str, "metadata": dict}`` chunk dicts.
    """

    def __init__(self):
        # GEMINI_API_KEY must be present in the environment (loaded via dotenv
        # at module import). genai.configure tolerates None but calls will fail.
        self.api_key = os.getenv("GEMINI_API_KEY")
        genai.configure(api_key=self.api_key)
        # self.model = genai.GenerativeModel(model_name="gemini-2.0-flash-exp")
        self.model = genai.GenerativeModel(model_name="gemini-1.5-flash")
        # self.model = genai.GenerativeModel(model_name="gemini-1.5-pro")

    def check_file_ready(self, file):
        """Block until an uploaded file leaves the PROCESSING state.

        Polls every 10 seconds; raises ValueError if processing failed.
        """
        while file.state.name == "PROCESSING":
            print(".", end="")
            time.sleep(10)
            file = genai.get_file(file.name)

        if file.state.name == "FAILED":
            raise ValueError(f"File processing failed: {file.state.name}")

    def chunk_with_gemini(
        self, content: str, content_type: str
    ) -> List[Dict[str, Any]]:
        """Ask Gemini to segment `content` into self-contained chunks.

        PDF/DOCX content uses the schema with page numbers; everything else
        uses the text schema. Falls back to one chunk containing the whole
        content if the model call or JSON parsing fails.
        """
        if content_type == "pdf" or content_type == "docx":
            """Use Gemini to intelligently chunk content based on semantic understanding"""
            prompt = f"""
            Analyze the following {content_type} content first means read whole content first then after divide it into complete and meaningful segments (chunks).
            Each chunk size has 512 token should:
            1. Be self-contained and end at logical boundaries (e.g., complete sentences or sections).
            2. Include all text that belongs to a single segment without truncation.
            3. Ensure the last chunk is fully complete and not cut off.

            Return the response strictly in the specified schema format:

            {{
            "content": "segment text here",
            "metadata": {{
                "key_concepts": ["concept1", "concept2"],
                "page-number": 64
            }}
            }},
            // more segments...

            Content to analyze:
            {content}

            Keep the response as pure JSON without any additional text or explanation. Avoid splitting content mid-sentence or mid-thought.
            All chunks should be complete.
            """
            schema = AnalyzedContentPDF
        else:
            prompt = f"""
            Analyze the following {content_type} content first means read whole content first then after divide it into complete and meaningful segments (chunks).
            Each chunk should:
            1. Be self-contained and end at logical boundaries (e.g., complete sentences or sections).
            2. Include all text that belongs to a single segment without truncation.
            3. Ensure the last chunk is fully complete and not cut off.

            Return the response strictly in the specified schema format:

            {{
            "content": "segment text here",
            "metadata": {{
                "key_concepts": ["concept1", "concept2"],
                "page-number": NA
            }}
            }},
            // more segments...

            Content to analyze:
            {content}

            Keep the response as pure JSON without any additional text or explanation. Avoid splitting content mid-sentence or mid-thought.
            All chunks should be complete.
            """
            schema = AnalyzedContentTxt

        print(schema)

        try:
            response = self.model.generate_content(
                prompt,
                generation_config=genai.GenerationConfig(
                    response_mime_type="application/json",
                    response_schema=schema,
                ),
            )
            # Strip control characters that would break json.loads.
            cleaned_text = "".join(
                char for char in response.text if ord(char) >= 32 or char in "\n\r\t"
            )
            # Debug artifact: dump the raw (cleaned) model output for inspection.
            with open("chunking_text.txt", "w", encoding="utf-8") as file_text:
                print("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
                file_text.write(cleaned_text)
                print("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")

            cleaned_text = cleaned_text.encode("utf-8", errors="replace").decode(
                "utf-8"
            )

            result = json.loads(cleaned_text)

            print("pdf parsing")

            chunks = []

            for segment in result.get("segments", []):
                # BUG FIX: for every content type the model nests key_concepts
                # and page-number under segment["metadata"]; the previous code
                # read them from the segment itself for non-PDF content, so
                # topics/page were always empty for text/docx/csv/url input.
                temp_metadata = segment["metadata"]
                chunk = {
                    "content": segment.get("content", ""),
                    "metadata": {
                        "topics": temp_metadata.get("key_concepts", []),
                        # MetadataTxt has no page number, so this falls back
                        # to "" for non-PDF content — same default as before.
                        "page-number": temp_metadata.get("page-number", ""),
                        # Preserve the original type labels: only true PDFs
                        # are tagged "pdf"; everything else (incl. docx) "text".
                        "type": "pdf" if content_type == "pdf" else "text",
                    },
                }
                chunks.append(chunk)

            return chunks

        except Exception as e:
            print(f"Error in Gemini chunking: {e}")
            # Fallback: a single chunk with the whole content and empty metadata.
            return [
                {
                    "content": content,
                    "metadata": {
                        "topics": "",
                        "page-number": "0",
                    },
                }
            ]

    def process_media_file(
        self, file_path: str, media_type: str
    ) -> List[Dict[str, Any]]:
        """Process video or audio file using Gemini's media understanding.

        Uploads the file via the Files API, waits for it to be ready, then
        requests a structured (JSON-schema) description/transcription and
        converts it into the standard chunk format. Returns a single error
        chunk on failure.
        """
        try:
            print("here-0!!!")
            media_file = genai.upload_file(path=file_path)
            self.check_file_ready(media_file)
            print("here-1!!!")

            if media_type == "video":
                schema = {
                    "type": "object",
                    "properties": {
                        "segments": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "timestamp": {"type": "string"},
                                    "description": {"type": "string"},
                                    "topics": {
                                        "type": "array",
                                        "items": {"type": "string"},
                                    },
                                },
                            },
                        }
                    },
                }

                prompt = "Describe this video in detail, breaking it into timestamped segments. Include key events and actions."
            else:  # audio
                schema = {
                    "type": "object",
                    "properties": {
                        "segments": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "timestamp": {"type": "string"},
                                    "transcription": {"type": "string"},
                                    "speaker": {"type": "string"},
                                    "topics": {
                                        "type": "array",
                                        "items": {"type": "string"},
                                    },
                                },
                            },
                        }
                    },
                }

                prompt = "Transcribe this audio, identifying speakers and key topics discussed."

            print("Here-2!!!")
            response = self.model.generate_content(
                [media_file, prompt],
                generation_config=genai.GenerationConfig(
                    response_schema=schema, response_mime_type="application/json"
                ),
            )
            print("Here-3!!!")

            # Convert Gemini's media response to our standard chunk format
            print(response.text)
            print("Here-4!!!")
            cleaned_text = "".join(
                char for char in response.text if ord(char) >= 32 or char in "\n\r\t"
            )
            result = json.loads(cleaned_text)
            chunks = []
            print("Here-5!!!")

            for segment in result.get("segments", []):
                if media_type == "video":
                    chunk = {
                        "content": segment.get("description", ""),
                        "metadata": {
                            "timestamp": segment.get("timestamp", ""),
                            # BUG FIX: the video schema declares "topics", not
                            # "key_events" — the old lookup always returned [].
                            "topics": segment.get("topics", []),
                            "type": "video",
                        },
                    }
                else:
                    chunk = {
                        "content": segment.get("transcription", ""),
                        "metadata": {
                            "timestamp": segment.get("timestamp", ""),
                            "speaker": segment.get("speaker", ""),
                            "topics": segment.get("topics", []),
                            "type": "audio",
                        },
                    }
                chunks.append(chunk)
            print("Here-6!!!")

            return chunks

        except Exception as e:
            print(f"Error processing {media_type} file: {e}")
            return [
                {
                    "content": f"Error processing {media_type} file",
                    "metadata": {"type": media_type, "error": str(e)},
                }
            ]
342
+
343
+
344
class ContentProcessor:
    """Dispatches raw content of various formats into Gemini-based chunking.

    Each ``process_*`` method extracts plain text (or hands media straight to
    Gemini) and returns the standard list of chunk dicts.
    """

    def __init__(self):
        self.gemini_chunker = GeminiChunker()

    def process_text(self, text: str, source_type: str) -> List[Dict[str, Any]]:
        """Chunk any text content via Gemini and tag chunks with source_type."""
        chunks = self.gemini_chunker.chunk_with_gemini(text, source_type)
        for chunk in chunks:
            chunk["metadata"]["source_type"] = source_type
        return chunks

    def process_pdf(self, file_path: str) -> List[Dict[str, Any]]:
        """Extract all page text from a PDF and chunk it."""
        with open(file_path, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Join at C speed instead of quadratic `+=` concatenation.
            full_text = " ".join(page.extract_text() for page in pdf_reader.pages)
        return self.process_text(full_text, "pdf")

    def process_docx(self, file_path: str) -> List[Dict[str, Any]]:
        """Extract paragraph text from a .docx file and chunk it."""
        doc = docx.Document(file_path)
        full_text = " ".join(paragraph.text for paragraph in doc.paragraphs)
        return self.process_text(full_text, "docx")

    def process_csv(self, file_path: str) -> List[Dict[str, Any]]:
        """Render a CSV as a readable table string and chunk it."""
        df = pd.read_csv(file_path)
        # Convert DataFrame to a more readable format for Gemini
        text_content = df.to_string()
        return self.process_text(text_content, "csv")

    def process_webpage(self, url: str) -> List[Dict[str, Any]]:
        """Fetch a web page, strip scripts/styles, and chunk its visible text."""
        # timeout added so an unresponsive server cannot hang ingestion forever.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, "html.parser")
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text()
        return self.process_text(text, "webpage")

    def process_video(self, file_path: str) -> List[Dict[str, Any]]:
        """Process video using Gemini's video understanding capabilities."""
        print("in process function of video")
        # NOTE: process_media_file uploads the file and waits for readiness
        # itself; the previous extra upload here doubled upload time and quota.
        return self.gemini_chunker.process_media_file(
            file_path=file_path, media_type="video"
        )

    def process_audio(self, file_path: str) -> List[Dict[str, Any]]:
        """Process audio using Gemini's audio understanding capabilities."""
        print("in process function of audio")
        # NOTE: as above, the upload is owned by process_media_file.
        return self.gemini_chunker.process_media_file(
            file_path=file_path, media_type="audio"
        )
403
+
404
+
405
class AgenticRAG:
    """Facade tying the content processors to the vector store.

    Detects file types, skips already-ingested files via a JSON ledger, chunks
    new files, and stores/queries embeddings through VectorStore.
    """

    def __init__(self, query_value=False, is_uploaded=False):
        self.processor = ContentProcessor()
        self.vector_store = VectorStore(query=query_value, is_uploaded=is_uploaded)
        # Ledger location mirrors VectorStore's convention: ingestion mode
        # writes next to the CWD, query mode reads from utils/.
        if not query_value and is_uploaded:
            self.json_file_path = "json_file_record.json"
        else:
            self.json_file_path = "utils/json_file_record.json"

    def process_file(self, file_path: str, file_type: Optional[str] = None) -> bool:
        """Chunk `file_path` and add the chunks to the vector store.

        Returns True if the file was ingested now or previously recorded in
        the ledger; False if nothing was produced or an error occurred.
        """
        if file_type is None:
            file_type = self._detect_file_type(file_path)

        # Skip files already recorded in the ingestion ledger.
        if os.path.exists(self.json_file_path):
            with open(self.json_file_path, "r") as json_file:
                json_data = json.load(json_file)
                for record in json_data:
                    if record["file_path"] == file_path:
                        return True  # File path exists
        try:
            # Dispatch table reads cleaner than a long if/elif chain; "text"
            # is handled separately because it needs the file read here.
            dispatch = {
                "pdf": self.processor.process_pdf,
                "docx": self.processor.process_docx,
                "csv": self.processor.process_csv,
                "url": self.processor.process_webpage,
                "video": self.processor.process_video,
                "audio": self.processor.process_audio,
            }
            chunks = []
            if file_type in dispatch:
                chunks = dispatch[file_type](file_path)
            elif file_type == "text":
                with open(file_path, "r") as file:
                    chunks = self.processor.process_text(file.read(), "text")

            if chunks:
                # Add source information to metadata
                print("in processfile fucntion file.")
                for chunk in chunks:
                    chunk["metadata"]["source"] = file_path

                print(chunks)

                # Add to Vector Database.
                self.vector_store.add_documents(chunks)
                print(f"Successfully processed {file_path} with {len(chunks)} chunks")

                return True
            return False

        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            # BUG FIX: previously fell through and implicitly returned None.
            return False

    def _detect_file_type(self, file_path: str) -> str:
        """Map a path/URL to a processor type; 'unknown' when unrecognized."""
        if file_path.startswith("http"):
            return "url"

        extension = file_path.split(".")[-1].lower()
        type_mapping = {
            "pdf": "pdf",
            "docx": "docx",
            "doc": "docx",
            "csv": "csv",
            "txt": "text",
            "mp3": "audio",
            "wav": "audio",
            "mp4": "video",
            "mov": "video",
        }
        return type_mapping.get(extension, "unknown")

    def query(self, query_text: str, n_results: int = 5) -> Dict:
        """Semantic search over the vector store; returns Chroma's result dict."""
        return self.vector_store.query(query_text, n_results)
480
+
481
+
482
+ # Define a function to determine the file type based on the extension
483
def get_file_type(file_name: str) -> str:
    """Classify a file name (or URL) into a processor type.

    Extension checks run before the ``http`` prefix check, matching the
    original precedence (a URL ending in ``.pdf`` classifies as ``pdf``).
    Anything unrecognized is "unknown".
    """
    suffix_to_type = {
        ".mp3": "audio",
        ".mp4": "video",
        ".csv": "csv",
        ".pdf": "pdf",
        ".docx": "docx",
    }
    for suffix, file_type in suffix_to_type.items():
        if file_name.endswith(suffix):
            return file_type
    return "url" if file_name.startswith("http") else "unknown"
498
+
499
+
500
def main():
    """Ingest every regular file found in the ../data directory into the RAG store."""
    rag = AgenticRAG(is_uploaded=True)

    data_directory = "../data"

    # Pair each regular file with the type inferred from its name.
    test_files = [
        (os.path.join(data_directory, name), get_file_type(name))
        for name in os.listdir(data_directory)
        if os.path.isfile(os.path.join(data_directory, name))
    ]

    for file_path, file_type in test_files:
        print(f"\nProcessing {file_path}...")
        time.sleep(5)  # crude pacing between Gemini calls
        rag.process_file(file_path, file_type)


if __name__ == "__main__":
    main()
src/utils/vectorDB.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import sys
4
+ from typing import Any, Dict, List
5
+
6
+ # Add the project root to Python path
7
+ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
8
+ sys.path.append(project_root)
9
+
10
+ import chromadb
11
+ import numpy as np
12
+ import torch
13
+ from nanoid import generate
14
+ from transformers import AutoModel, AutoTokenizer
15
+
16
+
17
def append_to_json(new_entries, filename="json_file_record.json"):
    """
    Append new entries to an existing JSON array file, or create a new one if it doesn't exist.

    Args:
        new_entries (list): List of dictionaries to append
        filename (str): Name of the JSON file
    """
    try:
        # Read existing data if file exists; treat non-list content as empty.
        if os.path.exists(filename) and os.path.getsize(filename) > 0:
            with open(filename, "r") as f:
                data = json.load(f)
            if not isinstance(data, list):
                data = []
        else:
            data = []

        # BUG FIX: the extend step had been commented out, so the function
        # rewrote the file with its old contents and never added new_entries —
        # contradicting its documented purpose.
        data.extend(new_entries)

        # Write back the updated data
        with open(filename, "w") as f:
            json.dump(data, f, indent=4)

    except json.JSONDecodeError:
        # File exists but holds invalid JSON: start over with just the new entries.
        with open(filename, "w") as f:
            json.dump(new_entries, f, indent=4)
48
+
49
+
50
class BERTEmbedder:
    """Produces mean-pooled sentence embeddings from bert-base-uncased."""

    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.model = AutoModel.from_pretrained("bert-base-uncased")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def get_embeddings(self, texts: List[str]) -> np.ndarray:
        """Embed each text independently and stack into (len(texts), hidden) array.

        Each text is tokenized (truncated at 512 tokens), run through BERT,
        and mean-pooled over the sequence dimension.
        """
        self.model.eval()
        pooled_vectors = []
        with torch.no_grad():
            for passage in texts:
                encoded = self.tokenizer(
                    passage,
                    padding=True,
                    truncation=True,
                    max_length=512,
                    return_tensors="pt",
                )
                encoded = {name: tensor.to(self.device) for name, tensor in encoded.items()}
                hidden_states = self.model(**encoded).last_hidden_state
                pooled_vectors.append(hidden_states.mean(dim=1).cpu().numpy())
        return np.vstack(pooled_vectors)
73
+
74
+
75
class VectorStore:
    """ChromaDB-backed vector store using BERT embeddings.

    Keeps a JSON ledger mapping a per-file nanoid to the source file path so
    documents can later be deleted by filename. Two modes exist: ingestion
    ("store") mode and query mode, which differ only in the Chroma directory
    and the ledger location.
    """

    def __init__(
        self, persist_directory: str = "../chroma_rag", query=False, is_uploaded=False
    ):
        try:
            # The two modes previously duplicated the whole setup; only the
            # directory and ledger path actually differ.
            if not query and is_uploaded:
                print("Embbeding store mode.")
                self.json_file_path = "json_file_record.json"
            else:
                print("query mode")
                persist_directory = "chroma_rag"
                self.json_file_path = "utils/json_file_record.json"

            print(f"Initializing ChromaDB with directory: {persist_directory}")
            self.client = chromadb.PersistentClient(path=persist_directory)
            print("ChromaDB client created successfully")

            self.collection = self.client.get_or_create_collection(
                name="documents",
                metadata={"hnsw:space": "cosine"},
                embedding_function=None,  # We're using our own embeddings
            )
            print("Collection 'documents' ready")

            self.embedder = BERTEmbedder()
            print("BERT embedder initialized")

            # Check if collection has documents
            content = self.collection.get()
            print(f"Collection contains {len(content['documents'])} documents")

        except Exception as e:
            print(f"Error initializing VectorStore: {e}")
            raise

    def is_collection_empty(self) -> bool:
        """Return True if the collection has no documents (or cannot be read)."""
        try:
            content = self.collection.get()
            return len(content["documents"]) == 0
        except Exception as e:
            print(f"Error checking collection: {e}")
            return True

    def add_documents(self, chunks: List[Dict[str, Any]]) -> bool:
        """Embed `chunks` and add them to the collection; record the source in the ledger.

        All chunks are assumed to come from one source file; they share a
        single nanoid prefix and get sequential ids ``{id_val}{n}``. Returns
        True on success, False on failure or empty input.
        """
        try:
            if not chunks:
                # BUG FIX: the old code raised NameError (`temp` unbound) /
                # failed in np.vstack when given an empty chunk list.
                print("No chunks to add.")
                return False

            texts = [chunk["content"] for chunk in chunks]
            metadatas = [chunk["metadata"] for chunk in chunks]

            print(f"Generating embeddings for {len(texts)} documents...")
            print(texts)
            embeddings = self.embedder.get_embeddings(texts)

            id_val = str(generate(size=8))
            print(f"Generated ID: {id_val}")

            # Record (id, source file) in the ledger. BUG FIX: the old code
            # opened the ledger twice in "w" mode (two live handles on the
            # same file) when creating it; one read/modify/write is enough.
            entry = {"id": id_val, "file_path": chunks[0]["metadata"]["source"]}
            if os.path.exists(self.json_file_path):
                with open(self.json_file_path, "r") as f:
                    data = json.load(f)
            else:
                data = []
            data.append(entry)
            with open(self.json_file_path, "w") as file:
                json.dump(data, file, indent=4)

            print("*************")
            ids = []
            # Clean metadata: Chroma metadata values must be scalar, so the
            # topics list is stringified.
            for count, metadata in enumerate(metadatas):
                metadata["topics"] = str(metadata["topics"])
                ids.append(f"{id_val}{count}")
            print(metadatas)
            print("------------------------")
            print(len(metadatas))

            print(f"Adding {len(texts)} documents to collection...")
            self.collection.add(
                embeddings=embeddings.tolist(),
                documents=texts,
                metadatas=metadatas,
                ids=ids,
            )

            # Verify addition
            collection_content = self.collection.get()
            print(
                f"Collection now contains {len(collection_content['documents'])} documents"
            )

            return True
        except Exception as e:
            print(f"Error adding documents: {e}")
            return False

    def query(self, query_text: str, n_results: int = 3) -> Dict:
        """Embed `query_text` and return the nearest documents from Chroma.

        n_results is capped at the collection size. Returns Chroma's result
        dict, or {"error": ...} on failure.
        """
        try:
            print(f"Generating embedding for query: {query_text}")
            query_embedding = self.embedder.get_embeddings([query_text])

            print("Checking collection content:")
            collection_content = self.collection.get()
            print(
                f"Number of documents in collection: {len(collection_content['documents'])}"
            )

            print("Executing query...")
            query_vector = query_embedding.tolist()
            results = self.collection.query(
                n_results=min(n_results, len(collection_content["documents"])),
                query_embeddings=query_vector,
            )

            print(f"Query results: {json.dumps(results, indent=2)}")
            return results
        except Exception as e:
            print(f"Error during query: {e}")
            return {"error": str(e)}

    def delete_documents_by_filename(self, file_substring: str):
        """
        Delete documents from the collection and JSON ledger by matching a substring in the file path.

        Args:
            file_substring (str): Substring to match in the file paths.
        """
        try:
            print(file_substring)
            json_file = self.json_file_path
            if not os.path.exists(json_file):
                print(f"JSON file {json_file} does not exist.")
                return

            with open(json_file, "r") as f:
                data = json.load(f)

            # Find matching records
            matching_records = [
                record for record in data if file_substring in record["file_path"]
            ]
            if not matching_records:
                print(f"No records found matching substring: {file_substring}")
                return

            # Get IDs of matching records
            matching_ids = [record["id"] for record in matching_records]
            print("maching_ids", matching_ids)

            # Remove matching records from JSON file
            updated_data = [
                record for record in data if record["id"] not in matching_ids
            ]

            print("updated data", updated_data)

            with open(json_file, "w") as f:
                json.dump(updated_data, f, indent=4)

            print(f"Deleted {len(matching_records)} records from JSON file.")

            # Retrieve all IDs in the collection. BUG FIX: the old code only
            # deleted Chroma docs whose id contained matching_ids[0], leaving
            # orphaned documents when several ledger records matched (and its
            # log message hard-coded the literal id 'LDtz9CG5').
            all_ids = self.collection.get()["ids"]
            ids_to_delete = [
                doc_id
                for doc_id in all_ids
                if any(prefix in doc_id for prefix in matching_ids)
            ]

            # Delete those IDs from the collection
            if ids_to_delete:
                self.collection.delete(ids=ids_to_delete)
                print(
                    f"Deleted {len(ids_to_delete)} records with IDs matching prefixes {matching_ids}."
                )
            else:
                print("No matching IDs found.")
        except Exception as e:
            print(f"Error deleting documents: {e}")